From a893a84e8799270fbec5c3708d001650aab47138 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Tue, 23 Jun 2009 21:25:32 +0200
Subject: mm_for_maps: simplify, use ptrace_may_access()

It would be nice to kill __ptrace_may_access(). It requires task_lock(),
but this lock is only needed to read mm->flags in the middle.

Convert mm_for_maps() to use ptrace_may_access(), this also simplifies
the code a little bit.

Also, we do not need to take ->mmap_sem in advance. In fact I think
mm_for_maps() should not play with ->mmap_sem at all, the caller should
take this lock.

With or without this patch, without ->cred_guard_mutex held we can race
with exec() and get the new ->mm but check old creds.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 fs/proc/base.c | 23 +++++++++++------------
 1 file changed, 11 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 3ce5ae9e3d2d..917f338a6739 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -237,20 +237,19 @@ struct mm_struct *mm_for_maps(struct task_struct *task)
 	struct mm_struct *mm = get_task_mm(task);
 	if (!mm)
 		return NULL;
+	if (mm != current->mm) {
+		/*
+		 * task->mm can be changed before security check,
+		 * in that case we must notice the change after.
+		 */
+		if (!ptrace_may_access(task, PTRACE_MODE_READ) ||
+		    mm != task->mm) {
+			mmput(mm);
+			return NULL;
+		}
+	}
 	down_read(&mm->mmap_sem);
-	task_lock(task);
-	if (task->mm != mm)
-		goto out;
-	if (task->mm != current->mm &&
-	    __ptrace_may_access(task, PTRACE_MODE_READ) < 0)
-		goto out;
-	task_unlock(task);
 	return mm;
-out:
-	task_unlock(task);
-	up_read(&mm->mmap_sem);
-	mmput(mm);
-	return NULL;
 }
 
 static int proc_pid_cmdline(struct task_struct *task, char * buffer)
-- 
cgit v1.2.3-70-g09d2


From 024eab4d5bf7e3168a2b71038b3e04e6b1f376ed Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Fri, 17 Jul 2009 09:01:04 -0400
Subject: ext4: Fix memory leak fix when mounting an ext4 filesystem

The allocation of the ext4_group_info array was moved to a new
function ext4_mb_add_group_info() in commit 5f21b0e6 so that online
resize would use a common (and correct) codepath.  Unfortunately, the
call to the new ext4_mb_add_group_info() function was added without
removing the code which originally allocated the array.  This caused a
memory leak each time an ext4 filesystem was mounted.

The fix is simple; remove the code that did the original allocation,
since it is no longer needed.

Reported-by: Catalin Marinas <catalin.marinas@arm.com>
Tested-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/mballoc.c | 19 -------------------
 1 file changed, 19 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index cd258463e2a9..cf9972090ad9 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2571,13 +2571,11 @@ static int ext4_mb_init_backend(struct super_block *sb)
 {
 	ext4_group_t ngroups = ext4_get_groups_count(sb);
 	ext4_group_t i;
-	int metalen;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct ext4_super_block *es = sbi->s_es;
 	int num_meta_group_infos;
 	int num_meta_group_infos_max;
 	int array_size;
-	struct ext4_group_info **meta_group_info;
 	struct ext4_group_desc *desc;
 
 	/* This is the number of blocks used by GDT */
@@ -2622,22 +2620,6 @@ static int ext4_mb_init_backend(struct super_block *sb)
 		goto err_freesgi;
 	}
 	EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
-
-	metalen = sizeof(*meta_group_info) << EXT4_DESC_PER_BLOCK_BITS(sb);
-	for (i = 0; i < num_meta_group_infos; i++) {
-		if ((i + 1) == num_meta_group_infos)
-			metalen = sizeof(*meta_group_info) *
-				(ngroups -
-					(i << EXT4_DESC_PER_BLOCK_BITS(sb)));
-		meta_group_info = kmalloc(metalen, GFP_KERNEL);
-		if (meta_group_info == NULL) {
-			printk(KERN_ERR "EXT4-fs: can't allocate mem for a "
-			       "buddy group\n");
-			goto err_freemeta;
-		}
-		sbi->s_group_info[i] = meta_group_info;
-	}
-
 	for (i = 0; i < ngroups; i++) {
 		desc = ext4_get_group_desc(sb, i, NULL);
 		if (desc == NULL) {
@@ -2655,7 +2637,6 @@ err_freebuddy:
 	while (i-- > 0)
 		kfree(ext4_get_group_info(sb, i));
 	i = num_meta_group_infos;
-err_freemeta:
 	while (i-- > 0)
 		kfree(sbi->s_group_info[i]);
 	iput(sbi->s_buddy_cache);
-- 
cgit v1.2.3-70-g09d2


From 43b38520296d0c1730c78c878e1d6d1a49122091 Mon Sep 17 00:00:00 2001
From: Manish Katiyar <mkatiyar@gmail.com>
Date: Mon, 27 Jul 2009 21:38:17 -0400
Subject: ext4: Fix typo in ext4/Kconfig

Signed-off-by: Manish Katiyar <mkatiyar@gmail.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index 418b6f3b0ae8..152304624644 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -37,7 +37,7 @@ config EXT4DEV_COMPAT
 
 	  To enable backwards compatibility so that systems that are
 	  still expecting to mount ext4 filesystems using ext4dev,
-	  chose Y here.   This feature will go away by 2.6.31, so
+	  choose Y here.   This feature will go away by 2.6.31, so
 	  please arrange to get your userspace programs fixed!
 
 config EXT4_FS_XATTR
-- 
cgit v1.2.3-70-g09d2


From 78f1ddbb498283c2445c11b0dfa666424c301803 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 27 Jul 2009 23:09:47 -0400
Subject: ext4: Avoid null pointer dereference when decoding EROFS w/o a
 journal

We need to check to make sure a journal is present before checking the
journal flags in ext4_decode_error().

Signed-off-by: Eric Sesterhenn <eric.sesterhenn@lsexperts.de>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/super.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 8f4f079e6b9a..fe3f376b7df2 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -344,7 +344,8 @@ static const char *ext4_decode_error(struct super_block *sb, int errno,
 		errstr = "Out of memory";
 		break;
 	case -EROFS:
-		if (!sb || EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT)
+		if (!sb || (EXT4_SB(sb)->s_journal &&
+			    EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT))
 			errstr = "Journal has aborted";
 		else
 			errstr = "Readonly filesystem";
-- 
cgit v1.2.3-70-g09d2


From f6f50e28f0cb8d7bcdfaacc83129f005dede11b1 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Fri, 17 Jul 2009 10:40:01 -0400
Subject: jbd2: Fail to load a journal if it is too short

Due to on disk corruption, it can happen that journal is too short. Fail
to load it in such case so that we don't oops somewhere later.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/jbd2/journal.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs')

diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index e378cb383979..a8a358bc0f21 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1187,6 +1187,12 @@ static int journal_reset(journal_t *journal)
 
 	first = be32_to_cpu(sb->s_first);
 	last = be32_to_cpu(sb->s_maxlen);
+	if (first + JBD2_MIN_JOURNAL_BLOCKS > last + 1) {
+		printk(KERN_ERR "JBD: Journal too short (blocks %llu-%llu).\n",
+		       first, last);
+		journal_fail_superblock(journal);
+		return -EINVAL;
+	}
 
 	journal->j_first = first;
 	journal->j_last = last;
-- 
cgit v1.2.3-70-g09d2


From 6487a9d3b5476ffd1bbbe97e58cf26dbeb7a5d4a Mon Sep 17 00:00:00 2001
From: Curt Wohlgemuth <curtw@google.com>
Date: Fri, 17 Jul 2009 10:54:08 -0400
Subject: ext4: More buffer head reference leaks

After the patch I posted last week regarding buffer head ref leaks in
no-journal mode, I looked at all the code that uses buffer heads and
searched for more potential leaks.

The patch below fixes the issues I found; these can occur even when a
journal is present.

The change to inode.c fixes a double release if
ext4_journal_get_create_access() fails.

The changes to namei.c are more complicated.  add_dirent_to_buf() will
release the input buffer head EXCEPT when it returns -ENOSPC.  There are
some callers of this routine that don't always do the brelse() in the event
that -ENOSPC is returned.  Unfortunately, to put this fix into ext4_add_entry()
required capturing the return value of make_indexed_dir() and
add_dirent_to_buf().

Signed-off-by: Curt Wohlgemuth <curtw@google.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c |  3 ++-
 fs/ext4/namei.c | 16 ++++++++++++----
 2 files changed, 14 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index f9c642b22efa..deb14a728791 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -762,8 +762,9 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
 		BUFFER_TRACE(bh, "call get_create_access");
 		err = ext4_journal_get_create_access(handle, bh);
 		if (err) {
+			/* Don't brelse(bh) here; it's done in
+			 * ext4_journal_forget() below */
 			unlock_buffer(bh);
-			brelse(bh);
 			goto failed;
 		}
 
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 114abe5d2c1d..fea14dbd3c22 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1518,8 +1518,12 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
 			return retval;
 
 		if (blocks == 1 && !dx_fallback &&
-		    EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX))
-			return make_indexed_dir(handle, dentry, inode, bh);
+		    EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) {
+			retval = make_indexed_dir(handle, dentry, inode, bh);
+			if (retval == -ENOSPC)
+				brelse(bh);
+			return retval;
+		}
 		brelse(bh);
 	}
 	bh = ext4_append(handle, dir, &block, &retval);
@@ -1528,7 +1532,10 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
 	de = (struct ext4_dir_entry_2 *) bh->b_data;
 	de->inode = 0;
 	de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize);
-	return add_dirent_to_buf(handle, dentry, inode, de, bh);
+	retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
+	if (retval == -ENOSPC)
+		brelse(bh);
+	return retval;
 }
 
 /*
@@ -1657,7 +1664,8 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
 	if (!de)
 		goto cleanup;
 	err = add_dirent_to_buf(handle, dentry, inode, de, bh);
-	bh = NULL;
+	if (err != -ENOSPC)
+		bh = NULL;
 	goto cleanup;
 
 journal_error:
-- 
cgit v1.2.3-70-g09d2


From 5a4a798937f92413cb7dbdb6bd554186024092e9 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Sun, 5 Jul 2009 22:33:08 -0400
Subject: ext4: Remove unnecessary semicolons in mballoc.c

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/mballoc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index cf9972090ad9..ba49e0c596f2 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2532,7 +2532,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
 
 	INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
 	init_rwsem(&meta_group_info[i]->alloc_sem);
-	meta_group_info[i]->bb_free_root.rb_node = NULL;;
+	meta_group_info[i]->bb_free_root.rb_node = NULL;
 
 #ifdef DOUBLE_CHECK
 	{
-- 
cgit v1.2.3-70-g09d2


From 1c718505171b06dbb60eafcb3fddba877dae5f7b Mon Sep 17 00:00:00 2001
From: Akira Fujita <a-fujita@rs.jp.nec.com>
Date: Sun, 5 Jul 2009 23:04:36 -0400
Subject: ext4: Fix compile warnings with MB_DEBUG

When MB_DEBUG is enabled, we get some compile warnings because
ext4_group_t is unsigned int.  This patch fixes them.

Signed-off-by Akira Fujita <a-fujita@rs.jp.nec.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/mballoc.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index ba49e0c596f2..68cde598b3bf 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1851,7 +1851,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
 	struct inode *inode = sbi->s_buddy_cache;
 	struct page *page = NULL, *bitmap_page = NULL;
 
-	mb_debug("init group %lu\n", group);
+	mb_debug("init group %u\n", group);
 	blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
 	this_grp = ext4_get_group_info(sb, group);
 	/*
@@ -4120,14 +4120,14 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
 			ext4_get_group_no_and_offset(sb, pa->pa_pstart,
 						     NULL, &start);
 			spin_unlock(&pa->pa_lock);
-			printk(KERN_ERR "PA:%lu:%d:%u \n", i,
-							start, pa->pa_len);
+			printk(KERN_ERR "PA:%u:%d:%u \n", i,
+			       start, pa->pa_len);
 		}
 		ext4_unlock_group(sb, i);
 
 		if (grp->bb_free == 0)
 			continue;
-		printk(KERN_ERR "%lu: %d/%d \n",
+		printk(KERN_ERR "%u: %d/%d \n",
 		       i, grp->bb_free, grp->bb_fragments);
 	}
 	printk(KERN_ERR "\n");
-- 
cgit v1.2.3-70-g09d2


From 7a6d3c8b3049d07123628f2bf57127bba2cc878f Mon Sep 17 00:00:00 2001
From: Csaba Henk <csaba@gluster.com>
Date: Wed, 1 Jul 2009 17:28:41 -0700
Subject: fuse: make the number of max background requests and congestion
 threshold tunable

The practical values for these limits depend on the design of the
filesystem server so let userspace set them at initialization time.

Signed-off-by: Csaba Henk <csaba@gluster.com>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/fuse/dev.c        | 10 +++++-----
 fs/fuse/fuse_i.h     | 12 ++++++------
 fs/fuse/inode.c      | 14 ++++++++++++++
 include/linux/fuse.h |  9 +++++++--
 4 files changed, 32 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index f58ecbc416c8..b152761c1bf6 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -250,7 +250,7 @@ static void queue_request(struct fuse_conn *fc, struct fuse_req *req)
 
 static void flush_bg_queue(struct fuse_conn *fc)
 {
-	while (fc->active_background < FUSE_MAX_BACKGROUND &&
+	while (fc->active_background < fc->max_background &&
 	       !list_empty(&fc->bg_queue)) {
 		struct fuse_req *req;
 
@@ -280,11 +280,11 @@ __releases(&fc->lock)
 	list_del(&req->intr_entry);
 	req->state = FUSE_REQ_FINISHED;
 	if (req->background) {
-		if (fc->num_background == FUSE_MAX_BACKGROUND) {
+		if (fc->num_background == fc->max_background) {
 			fc->blocked = 0;
 			wake_up_all(&fc->blocked_waitq);
 		}
-		if (fc->num_background == FUSE_CONGESTION_THRESHOLD &&
+		if (fc->num_background == fc->congestion_threshold &&
 		    fc->connected && fc->bdi_initialized) {
 			clear_bdi_congested(&fc->bdi, READ);
 			clear_bdi_congested(&fc->bdi, WRITE);
@@ -410,9 +410,9 @@ static void fuse_request_send_nowait_locked(struct fuse_conn *fc,
 {
 	req->background = 1;
 	fc->num_background++;
-	if (fc->num_background == FUSE_MAX_BACKGROUND)
+	if (fc->num_background == fc->max_background)
 		fc->blocked = 1;
-	if (fc->num_background == FUSE_CONGESTION_THRESHOLD &&
+	if (fc->num_background == fc->congestion_threshold &&
 	    fc->bdi_initialized) {
 		set_bdi_congested(&fc->bdi, READ);
 		set_bdi_congested(&fc->bdi, WRITE);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 52b641fc0faf..6bcfab04396f 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -25,12 +25,6 @@
 /** Max number of pages that can be used in a single read request */
 #define FUSE_MAX_PAGES_PER_REQ 32
 
-/** Maximum number of outstanding background requests */
-#define FUSE_MAX_BACKGROUND 12
-
-/** Congestion starts at 75% of maximum */
-#define FUSE_CONGESTION_THRESHOLD (FUSE_MAX_BACKGROUND * 75 / 100)
-
 /** Bias for fi->writectr, meaning new writepages must not be sent */
 #define FUSE_NOWRITE INT_MIN
 
@@ -349,6 +343,12 @@ struct fuse_conn {
 	/** rbtree of fuse_files waiting for poll events indexed by ph */
 	struct rb_root polled_files;
 
+	/** Maximum number of outstanding background requests */
+	unsigned max_background;
+
+	/** Number of background requests at which congestion starts */
+	unsigned congestion_threshold;
+
 	/** Number of requests currently in the background */
 	unsigned num_background;
 
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index f91ccc4a189d..9aa6f46d0c32 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -32,6 +32,12 @@ DEFINE_MUTEX(fuse_mutex);
 
 #define FUSE_DEFAULT_BLKSIZE 512
 
+/** Maximum number of outstanding background requests */
+#define FUSE_DEFAULT_MAX_BACKGROUND 12
+
+/** Congestion starts at 75% of maximum */
+#define FUSE_DEFAULT_CONGESTION_THRESHOLD (FUSE_DEFAULT_MAX_BACKGROUND * 3 / 4)
+
 struct fuse_mount_data {
 	int fd;
 	unsigned rootmode;
@@ -517,6 +523,8 @@ void fuse_conn_init(struct fuse_conn *fc)
 	INIT_LIST_HEAD(&fc->bg_queue);
 	INIT_LIST_HEAD(&fc->entry);
 	atomic_set(&fc->num_waiting, 0);
+	fc->max_background = FUSE_DEFAULT_MAX_BACKGROUND;
+	fc->congestion_threshold = FUSE_DEFAULT_CONGESTION_THRESHOLD;
 	fc->khctr = 0;
 	fc->polled_files = RB_ROOT;
 	fc->reqctr = 0;
@@ -736,6 +744,12 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
 	else {
 		unsigned long ra_pages;
 
+		if (arg->minor >= 13) {
+			if (arg->max_background)
+				fc->max_background = arg->max_background;
+			if (arg->congestion_threshold)
+				fc->congestion_threshold = arg->congestion_threshold;
+		}
 		if (arg->minor >= 6) {
 			ra_pages = arg->max_readahead / PAGE_CACHE_SIZE;
 			if (arg->flags & FUSE_ASYNC_READ)
diff --git a/include/linux/fuse.h b/include/linux/fuse.h
index cf593bf9fd32..b3700f0ac268 100644
--- a/include/linux/fuse.h
+++ b/include/linux/fuse.h
@@ -30,6 +30,10 @@
  *  - add umask flag to input argument of open, mknod and mkdir
  *  - add notification messages for invalidation of inodes and
  *    directory entries
+ *
+ * 7.13
+ *  - make max number of background requests and congestion threshold
+ *    tunables
  */
 
 #ifndef _LINUX_FUSE_H
@@ -41,7 +45,7 @@
 #define FUSE_KERNEL_VERSION 7
 
 /** Minor version number of this interface */
-#define FUSE_KERNEL_MINOR_VERSION 12
+#define FUSE_KERNEL_MINOR_VERSION 13
 
 /** The node ID of the root inode */
 #define FUSE_ROOT_ID 1
@@ -427,7 +431,8 @@ struct fuse_init_out {
 	__u32	minor;
 	__u32	max_readahead;
 	__u32	flags;
-	__u32	unused;
+	__u16   max_background;
+	__u16   congestion_threshold;
 	__u32	max_write;
 };
 
-- 
cgit v1.2.3-70-g09d2


From 8b712cd58adfe6aeeb0be4ecc011dc35620719e7 Mon Sep 17 00:00:00 2001
From: Jeff Mahoney <jeffm@suse.com>
Date: Tue, 7 Jul 2009 17:22:12 -0400
Subject: ocfs2: Fixup orphan scan cleanup after failed mount

If the mount fails for any reason, ocfs2_dismount_volume calls
ocfs2_orphan_scan_stop. It requires that ocfs2_orphan_scan_init
be called to setup the mutex and work queues, but that doesn't
happen if the mount has failed and we oops accessing an uninitialized
work queue.

This patch splits the init and startup of the orphan scan, eliminating
the oops.

Signed-off-by: Jeff Mahoney <jeffm@suse.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/journal.c | 8 +++++++-
 fs/ocfs2/journal.h | 1 +
 fs/ocfs2/super.c   | 4 +++-
 3 files changed, 11 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index f033760ecbea..c48b93ac6b65 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1954,10 +1954,16 @@ void ocfs2_orphan_scan_init(struct ocfs2_super *osb)
 	os->os_osb = osb;
 	os->os_count = 0;
 	os->os_seqno = 0;
-	os->os_scantime = CURRENT_TIME;
 	mutex_init(&os->os_lock);
 	INIT_DELAYED_WORK(&os->os_orphan_scan_work, ocfs2_orphan_scan_work);
+}
 
+void ocfs2_orphan_scan_start(struct ocfs2_super *osb)
+{
+	struct ocfs2_orphan_scan *os;
+
+	os = &osb->osb_orphan_scan;
+	os->os_scantime = CURRENT_TIME;
 	if (ocfs2_is_hard_readonly(osb) || ocfs2_mount_local(osb))
 		atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE);
 	else {
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 5432c7f79cc6..81e8abcd246e 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -145,6 +145,7 @@ static inline void ocfs2_inode_set_new(struct ocfs2_super *osb,
 
 /* Exported only for the journal struct init code in super.c. Do not call. */
 void ocfs2_orphan_scan_init(struct ocfs2_super *osb);
+void ocfs2_orphan_scan_start(struct ocfs2_super *osb);
 void ocfs2_orphan_scan_stop(struct ocfs2_super *osb);
 void ocfs2_orphan_scan_exit(struct ocfs2_super *osb);
 
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 7efb349fb9bd..63af2e36d834 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1182,7 +1182,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
 	wake_up(&osb->osb_mount_event);
 
 	/* Start this when the mount is almost sure of being successful */
-	ocfs2_orphan_scan_init(osb);
+	ocfs2_orphan_scan_start(osb);
 
 	mlog_exit(status);
 	return status;
@@ -1981,6 +1981,8 @@ static int ocfs2_initialize_super(struct super_block *sb,
 	snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u",
 		 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
 
+	ocfs2_orphan_scan_init(osb);
+
 	status = ocfs2_recovery_init(osb);
 	if (status) {
 		mlog(ML_ERROR, "Unable to initialize recovery state\n");
-- 
cgit v1.2.3-70-g09d2


From 17ae26b669886efe237b77439e43eb390fceb119 Mon Sep 17 00:00:00 2001
From: Jeff Liu <jeff.liu@oracle.com>
Date: Tue, 7 Jul 2009 15:51:40 +0800
Subject: ocfs2: trivial fix for s/migrate/migration/ in dlmrecovery.c logging

in dlmrecovery.c:1121, replace 'migrate' to 'migration' to keep the consistency
by comparing to other lines with the similar log info in the same file.

Signed-off-by: Jeff Liu <jeff.liu@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/dlm/dlmrecovery.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index bcb9260c3735..43e6e3280569 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -1118,7 +1118,7 @@ static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm,
 
 	mlog(0, "%s:%.*s: sending mig lockres (%s) to %u\n",
 	     dlm->name, res->lockname.len, res->lockname.name,
-	     orig_flags & DLM_MRES_MIGRATION ? "migrate" : "recovery",
+	     orig_flags & DLM_MRES_MIGRATION ? "migration" : "recovery",
 	     send_to);
 
 	/* send it */
-- 
cgit v1.2.3-70-g09d2


From 812e7a6a43fc34bc8f70c2b80db4ea5997d66ea8 Mon Sep 17 00:00:00 2001
From: Wengang Wang <wen.gang.wang@oracle.com>
Date: Fri, 10 Jul 2009 13:26:04 +0800
Subject: ocfs2: log the actual return value of ocfs2_file_aio_write()

in ocfs2_file_aio_write(), log_exit() could don't log the value
which is really returned. this patch fixes it.

Signed-off-by: Wengang Wang <wen.gang.wang@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/file.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 62442e413a00..a49fa44aea1f 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1918,8 +1918,10 @@ out_sems:
 
 	mutex_unlock(&inode->i_mutex);
 
+	if (written)
+		ret = written;
 	mlog_exit(ret);
-	return written ? written : ret;
+	return ret;
 }
 
 static int ocfs2_splice_to_file(struct pipe_inode_info *pipe,
-- 
cgit v1.2.3-70-g09d2


From 134e63756d5f3d0f7604dfcca847b09d1b14fd66 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Fri, 10 Jul 2009 09:51:34 +0000
Subject: genetlink: make netns aware

This makes generic netlink network namespace aware. No
generic netlink families except for the controller family
are made namespace aware, they need to be checked one by
one and then set the family->netnsok member to true.

A new function genlmsg_multicast_netns() is introduced to
allow sending a multicast message in a given namespace,
for example when it applies to an object that lives in
that namespace, a new function genlmsg_multicast_allns()
to send a message to all network namespaces (for objects
that do not have an associated netns).

The function genlmsg_multicast() is changed to multicast
the message in just init_net, which is currently correct
for all generic netlink families since they only work in
init_net right now. Some will later want to work in all
net namespaces because they do not care about the netns
at all -- those will have to be converted to use one of
the new functions genlmsg_multicast_allns() or
genlmsg_multicast_netns() whenever they are made netns
aware in some way.

After this patch families can easily decide whether or
not they should be available in all net namespaces. Many
genl families us it for objects not related to networking
and should therefore be available in all namespaces, but
that will have to be done on a per family basis.

Note that this doesn't touch on the checkpoint/restart
problem where network namespaces could be used, genl
families and multicast groups are numbered globally and
I see no easy way of changing that, especially since it
must be possible to multicast to all network namespaces
for those families that do not care about netns.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 fs/dlm/netlink.c               |   2 +-
 include/net/genetlink.h        |  66 +++++++++++++--
 include/net/net_namespace.h    |   2 +
 kernel/taskstats.c             |  10 +--
 net/irda/irnetlink.c           |   2 +-
 net/netfilter/ipvs/ip_vs_ctl.c |   2 +-
 net/netlink/genetlink.c        | 186 ++++++++++++++++++++++++++++++++---------
 net/tipc/netlink.c             |   2 +-
 net/wireless/nl80211.c         |  14 ++--
 9 files changed, 223 insertions(+), 63 deletions(-)

(limited to 'fs')

diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c
index ccc9d62c462d..55ea369f43a9 100644
--- a/fs/dlm/netlink.c
+++ b/fs/dlm/netlink.c
@@ -63,7 +63,7 @@ static int send_data(struct sk_buff *skb)
 		return rv;
 	}
 
-	return genlmsg_unicast(skb, listener_nlpid);
+	return genlmsg_unicast(&init_net, skb, listener_nlpid);
 }
 
 static int user_cmd(struct sk_buff *skb, struct genl_info *info)
diff --git a/include/net/genetlink.h b/include/net/genetlink.h
index 1b0e3ee4ddd8..2a1c06874c42 100644
--- a/include/net/genetlink.h
+++ b/include/net/genetlink.h
@@ -3,6 +3,7 @@
 
 #include <linux/genetlink.h>
 #include <net/netlink.h>
+#include <net/net_namespace.h>
 
 /**
  * struct genl_multicast_group - generic netlink multicast group
@@ -27,6 +28,8 @@ struct genl_multicast_group
  * @name: name of family
  * @version: protocol version
  * @maxattr: maximum number of attributes supported
+ * @netnsok: set to true if the family can handle network
+ *	namespaces and should be presented in all of them
  * @attrbuf: buffer to store parsed attributes
  * @ops_list: list of all assigned operations
  * @family_list: family list
@@ -39,6 +42,7 @@ struct genl_family
 	char			name[GENL_NAMSIZ];
 	unsigned int		version;
 	unsigned int		maxattr;
+	bool			netnsok;
 	struct nlattr **	attrbuf;	/* private */
 	struct list_head	ops_list;	/* private */
 	struct list_head	family_list;	/* private */
@@ -62,8 +66,32 @@ struct genl_info
 	struct genlmsghdr *	genlhdr;
 	void *			userhdr;
 	struct nlattr **	attrs;
+#ifdef CONFIG_NET_NS
+	struct net *		_net;
+#endif
 };
 
+#ifdef CONFIG_NET_NS
+static inline struct net *genl_info_net(struct genl_info *info)
+{
+	return info->_net;
+}
+
+static inline void genl_info_net_set(struct genl_info *info, struct net *net)
+{
+	info->_net = net;
+}
+#else
+static inline struct net *genl_info_net(struct genl_info *info)
+{
+	return &init_net;
+}
+
+static inline void genl_info_net_set(struct genl_info *info, struct net *net)
+{
+}
+#endif
+
 /**
  * struct genl_ops - generic netlink operations
  * @cmd: command identifier
@@ -98,8 +126,6 @@ extern int genl_register_mc_group(struct genl_family *family,
 extern void genl_unregister_mc_group(struct genl_family *family,
 				     struct genl_multicast_group *grp);
 
-extern struct sock *genl_sock;
-
 /**
  * genlmsg_put - Add generic netlink header to netlink message
  * @skb: socket buffer holding the message
@@ -170,7 +196,21 @@ static inline void genlmsg_cancel(struct sk_buff *skb, void *hdr)
 }
 
 /**
- * genlmsg_multicast - multicast a netlink message
+ * genlmsg_multicast_netns - multicast a netlink message to a specific netns
+ * @net: the net namespace
+ * @skb: netlink message as socket buffer
+ * @pid: own netlink pid to avoid sending to yourself
+ * @group: multicast group id
+ * @flags: allocation flags
+ */
+static inline int genlmsg_multicast_netns(struct net *net, struct sk_buff *skb,
+					  u32 pid, unsigned int group, gfp_t flags)
+{
+	return nlmsg_multicast(net->genl_sock, skb, pid, group, flags);
+}
+
+/**
+ * genlmsg_multicast - multicast a netlink message to the default netns
  * @skb: netlink message as socket buffer
  * @pid: own netlink pid to avoid sending to yourself
  * @group: multicast group id
@@ -179,17 +219,29 @@ static inline void genlmsg_cancel(struct sk_buff *skb, void *hdr)
 static inline int genlmsg_multicast(struct sk_buff *skb, u32 pid,
 				    unsigned int group, gfp_t flags)
 {
-	return nlmsg_multicast(genl_sock, skb, pid, group, flags);
+	return genlmsg_multicast_netns(&init_net, skb, pid, group, flags);
 }
 
+/**
+ * genlmsg_multicast_allns - multicast a netlink message to all net namespaces
+ * @skb: netlink message as socket buffer
+ * @pid: own netlink pid to avoid sending to yourself
+ * @group: multicast group id
+ * @flags: allocation flags
+ *
+ * This function must hold the RTNL or rcu_read_lock().
+ */
+int genlmsg_multicast_allns(struct sk_buff *skb, u32 pid,
+			    unsigned int group, gfp_t flags);
+
 /**
  * genlmsg_unicast - unicast a netlink message
  * @skb: netlink message as socket buffer
  * @pid: netlink pid of the destination socket
  */
-static inline int genlmsg_unicast(struct sk_buff *skb, u32 pid)
+static inline int genlmsg_unicast(struct net *net, struct sk_buff *skb, u32 pid)
 {
-	return nlmsg_unicast(genl_sock, skb, pid);
+	return nlmsg_unicast(net->genl_sock, skb, pid);
 }
 
 /**
@@ -199,7 +251,7 @@ static inline int genlmsg_unicast(struct sk_buff *skb, u32 pid)
  */
 static inline int genlmsg_reply(struct sk_buff *skb, struct genl_info *info)
 {
-	return genlmsg_unicast(skb, info->snd_pid);
+	return genlmsg_unicast(genl_info_net(info), skb, info->snd_pid);
 }
 
 /**
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index b34a6ee73754..3173d12d946b 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -26,6 +26,7 @@ struct net_device;
 struct sock;
 struct ctl_table_header;
 struct net_generic;
+struct sock;
 
 struct net {
 	atomic_t		count;		/* To decided when the network
@@ -57,6 +58,7 @@ struct net {
 	spinlock_t		rules_mod_lock;
 
 	struct sock 		*rtnl;			/* rtnetlink socket */
+	struct sock		*genl_sock;
 
 	struct netns_core	core;
 	struct netns_mib	mib;
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 888adbcca30c..ea8384d3caa7 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -108,7 +108,7 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
 /*
  * Send taskstats data in @skb to listener with nl_pid @pid
  */
-static int send_reply(struct sk_buff *skb, pid_t pid)
+static int send_reply(struct sk_buff *skb, struct genl_info *info)
 {
 	struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb));
 	void *reply = genlmsg_data(genlhdr);
@@ -120,7 +120,7 @@ static int send_reply(struct sk_buff *skb, pid_t pid)
 		return rc;
 	}
 
-	return genlmsg_unicast(skb, pid);
+	return genlmsg_reply(skb, info);
 }
 
 /*
@@ -150,7 +150,7 @@ static void send_cpu_listeners(struct sk_buff *skb,
 			if (!skb_next)
 				break;
 		}
-		rc = genlmsg_unicast(skb_cur, s->pid);
+		rc = genlmsg_unicast(&init_net, skb_cur, s->pid);
 		if (rc == -ECONNREFUSED) {
 			s->valid = 0;
 			delcount++;
@@ -418,7 +418,7 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
 		goto err;
 	}
 
-	rc = send_reply(rep_skb, info->snd_pid);
+	rc = send_reply(rep_skb, info);
 
 err:
 	fput_light(file, fput_needed);
@@ -487,7 +487,7 @@ free_return_rc:
 	} else
 		goto err;
 
-	return send_reply(rep_skb, info->snd_pid);
+	return send_reply(rep_skb, info);
 err:
 	nlmsg_free(rep_skb);
 	return rc;
diff --git a/net/irda/irnetlink.c b/net/irda/irnetlink.c
index 8dd7ed7e7c1f..476b307bd801 100644
--- a/net/irda/irnetlink.c
+++ b/net/irda/irnetlink.c
@@ -115,7 +115,7 @@ static int irda_nl_get_mode(struct sk_buff *skb, struct genl_info *info)
 
 	genlmsg_end(msg, hdr);
 
-	return genlmsg_unicast(msg, info->snd_pid);
+	return genlmsg_reply(msg, info);
 
  err_out:
 	nlmsg_free(msg);
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 7c1333c67ff3..2d24d81474ce 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -3231,7 +3231,7 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
 	}
 
 	genlmsg_end(msg, reply);
-	ret = genlmsg_unicast(msg, info->snd_pid);
+	ret = genlmsg_reply(msg, info);
 	goto out;
 
 nla_put_failure:
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index eed4c6a8afc0..575c64341508 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -18,8 +18,6 @@
 #include <net/sock.h>
 #include <net/genetlink.h>
 
-struct sock *genl_sock = NULL;
-
 static DEFINE_MUTEX(genl_mutex); /* serialization of message processing */
 
 static inline void genl_lock(void)
@@ -175,10 +173,31 @@ int genl_register_mc_group(struct genl_family *family,
 		mc_groups_longs++;
 	}
 
-	err = netlink_change_ngroups(genl_sock,
-				     mc_groups_longs * BITS_PER_LONG);
-	if (err)
-		goto out;
+	if (family->netnsok) {
+		struct net *net;
+
+		rcu_read_lock();
+		for_each_net_rcu(net) {
+			err = netlink_change_ngroups(net->genl_sock,
+					mc_groups_longs * BITS_PER_LONG);
+			if (err) {
+				/*
+				 * No need to roll back, can only fail if
+				 * memory allocation fails and then the
+				 * number of _possible_ groups has been
+				 * increased on some sockets which is ok.
+				 */
+				rcu_read_unlock();
+				goto out;
+			}
+		}
+		rcu_read_unlock();
+	} else {
+		err = netlink_change_ngroups(init_net.genl_sock,
+					     mc_groups_longs * BITS_PER_LONG);
+		if (err)
+			goto out;
+	}
 
 	grp->id = id;
 	set_bit(id, mc_groups);
@@ -195,8 +214,14 @@ EXPORT_SYMBOL(genl_register_mc_group);
 static void __genl_unregister_mc_group(struct genl_family *family,
 				       struct genl_multicast_group *grp)
 {
+	struct net *net;
 	BUG_ON(grp->family != family);
-	netlink_clear_multicast_users(genl_sock, grp->id);
+
+	rcu_read_lock();
+	for_each_net_rcu(net)
+		netlink_clear_multicast_users(net->genl_sock, grp->id);
+	rcu_read_unlock();
+
 	clear_bit(grp->id, mc_groups);
 	list_del(&grp->list);
 	genl_ctrl_event(CTRL_CMD_DELMCAST_GRP, grp);
@@ -467,6 +492,7 @@ static int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 {
 	struct genl_ops *ops;
 	struct genl_family *family;
+	struct net *net = sock_net(skb->sk);
 	struct genl_info info;
 	struct genlmsghdr *hdr = nlmsg_data(nlh);
 	int hdrlen, err;
@@ -475,6 +501,10 @@ static int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 	if (family == NULL)
 		return -ENOENT;
 
+	/* this family doesn't exist in this netns */
+	if (!family->netnsok && !net_eq(net, &init_net))
+		return -ENOENT;
+
 	hdrlen = GENL_HDRLEN + family->hdrsize;
 	if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen))
 		return -EINVAL;
@@ -492,7 +522,7 @@ static int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 			return -EOPNOTSUPP;
 
 		genl_unlock();
-		err = netlink_dump_start(genl_sock, skb, nlh,
+		err = netlink_dump_start(net->genl_sock, skb, nlh,
 					 ops->dumpit, ops->done);
 		genl_lock();
 		return err;
@@ -514,6 +544,7 @@ static int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 	info.genlhdr = nlmsg_data(nlh);
 	info.userhdr = nlmsg_data(nlh) + GENL_HDRLEN;
 	info.attrs = family->attrbuf;
+	genl_info_net_set(&info, net);
 
 	return ops->doit(skb, &info);
 }
@@ -534,6 +565,7 @@ static struct genl_family genl_ctrl = {
 	.name = "nlctrl",
 	.version = 0x2,
 	.maxattr = CTRL_ATTR_MAX,
+	.netnsok = true,
 };
 
 static int ctrl_fill_info(struct genl_family *family, u32 pid, u32 seq,
@@ -650,6 +682,7 @@ static int ctrl_dumpfamily(struct sk_buff *skb, struct netlink_callback *cb)
 
 	int i, n = 0;
 	struct genl_family *rt;
+	struct net *net = sock_net(skb->sk);
 	int chains_to_skip = cb->args[0];
 	int fams_to_skip = cb->args[1];
 
@@ -658,6 +691,8 @@ static int ctrl_dumpfamily(struct sk_buff *skb, struct netlink_callback *cb)
 			continue;
 		n = 0;
 		list_for_each_entry(rt, genl_family_chain(i), family_list) {
+			if (!rt->netnsok && !net_eq(net, &init_net))
+				continue;
 			if (++n < fams_to_skip)
 				continue;
 			if (ctrl_fill_info(rt, NETLINK_CB(cb->skb).pid,
@@ -729,6 +764,7 @@ static int ctrl_getfamily(struct sk_buff *skb, struct genl_info *info)
 	if (info->attrs[CTRL_ATTR_FAMILY_ID]) {
 		u16 id = nla_get_u16(info->attrs[CTRL_ATTR_FAMILY_ID]);
 		res = genl_family_find_byid(id);
+		err = -ENOENT;
 	}
 
 	if (info->attrs[CTRL_ATTR_FAMILY_NAME]) {
@@ -736,49 +772,61 @@ static int ctrl_getfamily(struct sk_buff *skb, struct genl_info *info)
 
 		name = nla_data(info->attrs[CTRL_ATTR_FAMILY_NAME]);
 		res = genl_family_find_byname(name);
+		err = -ENOENT;
 	}
 
-	if (res == NULL) {
-		err = -ENOENT;
-		goto errout;
+	if (res == NULL)
+		return err;
+
+	if (!res->netnsok && !net_eq(genl_info_net(info), &init_net)) {
+		/* family doesn't exist here */
+		return -ENOENT;
 	}
 
 	msg = ctrl_build_family_msg(res, info->snd_pid, info->snd_seq,
 				    CTRL_CMD_NEWFAMILY);
-	if (IS_ERR(msg)) {
-		err = PTR_ERR(msg);
-		goto errout;
-	}
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
 
-	err = genlmsg_reply(msg, info);
-errout:
-	return err;
+	return genlmsg_reply(msg, info);
 }
 
 static int genl_ctrl_event(int event, void *data)
 {
 	struct sk_buff *msg;
+	struct genl_family *family;
+	struct genl_multicast_group *grp;
 
-	if (genl_sock == NULL)
+	/* genl is still initialising */
+	if (!init_net.genl_sock)
 		return 0;
 
 	switch (event) {
 	case CTRL_CMD_NEWFAMILY:
 	case CTRL_CMD_DELFAMILY:
-		msg = ctrl_build_family_msg(data, 0, 0, event);
-		if (IS_ERR(msg))
-			return PTR_ERR(msg);
-
-		genlmsg_multicast(msg, 0, GENL_ID_CTRL, GFP_KERNEL);
+		family = data;
+		msg = ctrl_build_family_msg(family, 0, 0, event);
 		break;
 	case CTRL_CMD_NEWMCAST_GRP:
 	case CTRL_CMD_DELMCAST_GRP:
+		grp = data;
+		family = grp->family;
 		msg = ctrl_build_mcgrp_msg(data, 0, 0, event);
-		if (IS_ERR(msg))
-			return PTR_ERR(msg);
-
-		genlmsg_multicast(msg, 0, GENL_ID_CTRL, GFP_KERNEL);
 		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
+
+	if (!family->netnsok) {
+		genlmsg_multicast_netns(&init_net, msg, 0,
+					GENL_ID_CTRL, GFP_KERNEL);
+	} else {
+		rcu_read_lock();
+		genlmsg_multicast_allns(msg, 0, GENL_ID_CTRL, GFP_ATOMIC);
+		rcu_read_unlock();
 	}
 
 	return 0;
@@ -795,6 +843,33 @@ static struct genl_multicast_group notify_grp = {
 	.name		= "notify",
 };
 
+static int __net_init genl_pernet_init(struct net *net)
+{
+	/* we'll bump the group number right afterwards */
+	net->genl_sock = netlink_kernel_create(net, NETLINK_GENERIC, 0,
+					       genl_rcv, &genl_mutex,
+					       THIS_MODULE);
+
+	if (!net->genl_sock && net_eq(net, &init_net))
+		panic("GENL: Cannot initialize generic netlink\n");
+
+	if (!net->genl_sock)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static void __net_exit genl_pernet_exit(struct net *net)
+{
+	netlink_kernel_release(net->genl_sock);
+	net->genl_sock = NULL;
+}
+
+static struct pernet_operations genl_pernet_ops = {
+	.init = genl_pernet_init,
+	.exit = genl_pernet_exit,
+};
+
 static int __init genl_init(void)
 {
 	int i, err;
@@ -804,36 +879,67 @@ static int __init genl_init(void)
 
 	err = genl_register_family(&genl_ctrl);
 	if (err < 0)
-		goto errout;
+		goto problem;
 
 	err = genl_register_ops(&genl_ctrl, &genl_ctrl_ops);
 	if (err < 0)
-		goto errout_register;
+		goto problem;
 
 	netlink_set_nonroot(NETLINK_GENERIC, NL_NONROOT_RECV);
 
-	/* we'll bump the group number right afterwards */
-	genl_sock = netlink_kernel_create(&init_net, NETLINK_GENERIC, 0,
-					  genl_rcv, &genl_mutex, THIS_MODULE);
-	if (genl_sock == NULL)
-		panic("GENL: Cannot initialize generic netlink\n");
+	err = register_pernet_subsys(&genl_pernet_ops);
+	if (err)
+		goto problem;
 
 	err = genl_register_mc_group(&genl_ctrl, &notify_grp);
 	if (err < 0)
-		goto errout_register;
+		goto problem;
 
 	return 0;
 
-errout_register:
-	genl_unregister_family(&genl_ctrl);
-errout:
+problem:
 	panic("GENL: Cannot register controller: %d\n", err);
 }
 
 subsys_initcall(genl_init);
 
-EXPORT_SYMBOL(genl_sock);
 EXPORT_SYMBOL(genl_register_ops);
 EXPORT_SYMBOL(genl_unregister_ops);
 EXPORT_SYMBOL(genl_register_family);
 EXPORT_SYMBOL(genl_unregister_family);
+
+static int genlmsg_mcast(struct sk_buff *skb, u32 pid, unsigned long group,
+			 gfp_t flags)
+{
+	struct sk_buff *tmp;
+	struct net *net, *prev = NULL;
+	int err;
+
+	for_each_net_rcu(net) {
+		if (prev) {
+			tmp = skb_clone(skb, flags);
+			if (!tmp) {
+				err = -ENOMEM;
+				goto error;
+			}
+			err = nlmsg_multicast(prev->genl_sock, tmp,
+					      pid, group, flags);
+			if (err)
+				goto error;
+		}
+
+		prev = net;
+	}
+
+	return nlmsg_multicast(prev->genl_sock, skb, pid, group, flags);
+ error:
+	kfree_skb(skb);
+	return err;
+}
+
+int genlmsg_multicast_allns(struct sk_buff *skb, u32 pid, unsigned int group,
+			    gfp_t flags)
+{
+	return genlmsg_mcast(skb, pid, group, flags);
+}
+EXPORT_SYMBOL(genlmsg_multicast_allns);
diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c
index 3c57005e44d1..7bda8e3d1398 100644
--- a/net/tipc/netlink.c
+++ b/net/tipc/netlink.c
@@ -62,7 +62,7 @@ static int handle_cmd(struct sk_buff *skb, struct genl_info *info)
 		rep_nlh = nlmsg_hdr(rep_buf);
 		memcpy(rep_nlh, req_nlh, hdr_space);
 		rep_nlh->nlmsg_len = rep_buf->len;
-		genlmsg_unicast(rep_buf, NETLINK_CB(skb).pid);
+		genlmsg_unicast(&init_net, rep_buf, NETLINK_CB(skb).pid);
 	}
 
 	return 0;
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 9deb12f73c44..2a04beba4369 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -413,7 +413,7 @@ static int nl80211_get_wiphy(struct sk_buff *skb, struct genl_info *info)
 
 	cfg80211_unlock_rdev(dev);
 
-	return genlmsg_unicast(msg, info->snd_pid);
+	return genlmsg_reply(msg, info);
 
  out_free:
 	nlmsg_free(msg);
@@ -739,7 +739,7 @@ static int nl80211_get_interface(struct sk_buff *skb, struct genl_info *info)
 	dev_put(netdev);
 	cfg80211_unlock_rdev(dev);
 
-	return genlmsg_unicast(msg, info->snd_pid);
+	return genlmsg_reply(msg, info);
 
  out_free:
 	nlmsg_free(msg);
@@ -1030,7 +1030,7 @@ static int nl80211_get_key(struct sk_buff *skb, struct genl_info *info)
 		goto nla_put_failure;
 
 	genlmsg_end(msg, hdr);
-	err = genlmsg_unicast(msg, info->snd_pid);
+	err = genlmsg_reply(msg, info);
 	goto out;
 
  nla_put_failure:
@@ -1618,7 +1618,7 @@ static int nl80211_get_station(struct sk_buff *skb, struct genl_info *info)
 				 dev, mac_addr, &sinfo) < 0)
 		goto out_free;
 
-	err = genlmsg_unicast(msg, info->snd_pid);
+	err = genlmsg_reply(msg, info);
 	goto out;
 
  out_free:
@@ -2087,7 +2087,7 @@ static int nl80211_get_mpath(struct sk_buff *skb, struct genl_info *info)
 				 dev, dst, next_hop, &pinfo) < 0)
 		goto out_free;
 
-	err = genlmsg_unicast(msg, info->snd_pid);
+	err = genlmsg_reply(msg, info);
 	goto out;
 
  out_free:
@@ -2436,7 +2436,7 @@ static int nl80211_get_mesh_params(struct sk_buff *skb,
 			cur_params.dot11MeshHWMPnetDiameterTraversalTime);
 	nla_nest_end(msg, pinfoattr);
 	genlmsg_end(msg, hdr);
-	err = genlmsg_unicast(msg, info->snd_pid);
+	err = genlmsg_reply(msg, info);
 	goto out;
 
  nla_put_failure:
@@ -2624,7 +2624,7 @@ static int nl80211_get_reg(struct sk_buff *skb, struct genl_info *info)
 	nla_nest_end(msg, nl_reg_rules);
 
 	genlmsg_end(msg, hdr);
-	err = genlmsg_unicast(msg, info->snd_pid);
+	err = genlmsg_reply(msg, info);
 	goto out;
 
 nla_put_failure:
-- 
cgit v1.2.3-70-g09d2


From 7447a668a3860b66b3c9db86fdea91e355ba59ac Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 15 Jul 2009 20:36:08 +0200
Subject: jbd: Fail to load a journal if it is too short

Due to on disk corruption, it can happen that journal is too short. Fail
to load it in such case so that we don't oops somewhere later.

Reported-by: Nageswara R Sastry <rnsastry@linux.vnet.ibm.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/jbd/journal.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs')

diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 737f7246a4b5..94a64a199a63 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -848,6 +848,12 @@ static int journal_reset(journal_t *journal)
 
 	first = be32_to_cpu(sb->s_first);
 	last = be32_to_cpu(sb->s_maxlen);
+	if (first + JFS_MIN_JOURNAL_BLOCKS > last + 1) {
+		printk(KERN_ERR "JBD: Journal too short (blocks %lu-%lu).\n",
+		       first, last);
+		journal_fail_superblock(journal);
+		return -EINVAL;
+	}
 
 	journal->j_first = first;
 	journal->j_last = last;
-- 
cgit v1.2.3-70-g09d2


From 9eaaa2d5759837402ec5eee13b2a97921808c3eb Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 13 Jul 2009 20:26:52 +0200
Subject: ext3: Fix truncation of symlinks after failed write

Contents of long symlinks is written via standard write methods. So when the
write fails, we add inode to orphan list. But symlinks don't have .truncate
method defined so nobody properly removes them from the orphan list (both on
disk and in memory).

Fix this by calling ext3_truncate() directly instead of calling vmtruncate()
(which is saner anyway since we don't need anything vmtruncate() does except
from calling .truncate in these paths).  We also add inode to orphan list only
if ext3_can_truncate() is true (currently, it can be false for symlinks when
there are no blocks allocated) - otherwise orphan list processing will complain
and ext3_truncate() will not remove inode from on-disk orphan list.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext3/inode.c | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 5f51fed5c750..4d7da6f6184b 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1193,15 +1193,16 @@ write_begin_failed:
 		 * i_size_read because we hold i_mutex.
 		 *
 		 * Add inode to orphan list in case we crash before truncate
-		 * finishes.
+		 * finishes. Do this only if ext3_can_truncate() agrees so
+		 * that orphan processing code is happy.
 		 */
-		if (pos + len > inode->i_size)
+		if (pos + len > inode->i_size && ext3_can_truncate(inode))
 			ext3_orphan_add(handle, inode);
 		ext3_journal_stop(handle);
 		unlock_page(page);
 		page_cache_release(page);
 		if (pos + len > inode->i_size)
-			vmtruncate(inode, inode->i_size);
+			ext3_truncate(inode);
 	}
 	if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
 		goto retry;
@@ -1287,7 +1288,7 @@ static int ext3_ordered_write_end(struct file *file,
 	 * There may be allocated blocks outside of i_size because
 	 * we failed to copy some data. Prepare for truncate.
 	 */
-	if (pos + len > inode->i_size)
+	if (pos + len > inode->i_size && ext3_can_truncate(inode))
 		ext3_orphan_add(handle, inode);
 	ret2 = ext3_journal_stop(handle);
 	if (!ret)
@@ -1296,7 +1297,7 @@ static int ext3_ordered_write_end(struct file *file,
 	page_cache_release(page);
 
 	if (pos + len > inode->i_size)
-		vmtruncate(inode, inode->i_size);
+		ext3_truncate(inode);
 	return ret ? ret : copied;
 }
 
@@ -1315,14 +1316,14 @@ static int ext3_writeback_write_end(struct file *file,
 	 * There may be allocated blocks outside of i_size because
 	 * we failed to copy some data. Prepare for truncate.
 	 */
-	if (pos + len > inode->i_size)
+	if (pos + len > inode->i_size && ext3_can_truncate(inode))
 		ext3_orphan_add(handle, inode);
 	ret = ext3_journal_stop(handle);
 	unlock_page(page);
 	page_cache_release(page);
 
 	if (pos + len > inode->i_size)
-		vmtruncate(inode, inode->i_size);
+		ext3_truncate(inode);
 	return ret ? ret : copied;
 }
 
@@ -1358,7 +1359,7 @@ static int ext3_journalled_write_end(struct file *file,
 	 * There may be allocated blocks outside of i_size because
 	 * we failed to copy some data. Prepare for truncate.
 	 */
-	if (pos + len > inode->i_size)
+	if (pos + len > inode->i_size && ext3_can_truncate(inode))
 		ext3_orphan_add(handle, inode);
 	EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
 	if (inode->i_size > EXT3_I(inode)->i_disksize) {
@@ -1375,7 +1376,7 @@ static int ext3_journalled_write_end(struct file *file,
 	page_cache_release(page);
 
 	if (pos + len > inode->i_size)
-		vmtruncate(inode, inode->i_size);
+		ext3_truncate(inode);
 	return ret ? ret : copied;
 }
 
-- 
cgit v1.2.3-70-g09d2


From 1e9fd53b783ea646de3ee09a4574afeb6778d504 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 24 Jun 2009 17:31:40 +0200
Subject: jbd: Fix a race between checkpointing code and
 journal_get_write_access()

The following race can happen:

  CPU1                          CPU2
                                checkpointing code checks the buffer, adds
                                  it to an array for writeback
do_get_write_access()
  ...
  lock_buffer()
  unlock_buffer()
                                  flush_batch() submits the buffer for IO
  __jbd_journal_file_buffer()

  So a buffer under writeout is returned from do_get_write_access(). Since
the filesystem code relies on the fact that journaled buffers cannot be
written out, it does not take the buffer lock and so it can modify buffer
while it is under writeout. That can lead to a filesystem corruption
if we crash at the right moment. The similar problem can happen with
the journal_get_create_access() path.
  We fix the problem by clearing the buffer dirty bit under buffer_lock
even if the buffer is on BJ_None list. Actually, we clear the dirty bit
regardless the list the buffer is in and warn about the fact if
the buffer is already journalled.

Thanks for spotting the problem goes to dingdinghua <dingdinghua85@gmail.com>.

Reported-by: dingdinghua <dingdinghua85@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/jbd/transaction.c | 68 +++++++++++++++++++++++++++-------------------------
 1 file changed, 35 insertions(+), 33 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 73242ba7c7b1..c03ac11f74be 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -489,34 +489,15 @@ void journal_unlock_updates (journal_t *journal)
 	wake_up(&journal->j_wait_transaction_locked);
 }
 
-/*
- * Report any unexpected dirty buffers which turn up.  Normally those
- * indicate an error, but they can occur if the user is running (say)
- * tune2fs to modify the live filesystem, so we need the option of
- * continuing as gracefully as possible.  #
- *
- * The caller should already hold the journal lock and
- * j_list_lock spinlock: most callers will need those anyway
- * in order to probe the buffer's journaling state safely.
- */
-static void jbd_unexpected_dirty_buffer(struct journal_head *jh)
+static void warn_dirty_buffer(struct buffer_head *bh)
 {
-	int jlist;
-
-	/* If this buffer is one which might reasonably be dirty
-	 * --- ie. data, or not part of this journal --- then
-	 * we're OK to leave it alone, but otherwise we need to
-	 * move the dirty bit to the journal's own internal
-	 * JBDDirty bit. */
-	jlist = jh->b_jlist;
+	char b[BDEVNAME_SIZE];
 
-	if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
-	    jlist == BJ_Shadow || jlist == BJ_Forget) {
-		struct buffer_head *bh = jh2bh(jh);
-
-		if (test_clear_buffer_dirty(bh))
-			set_buffer_jbddirty(bh);
-	}
+	printk(KERN_WARNING
+	       "JBD: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). "
+	       "There's a risk of filesystem corruption in case of system "
+	       "crash.\n",
+	       bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr);
 }
 
 /*
@@ -583,14 +564,16 @@ repeat:
 			if (jh->b_next_transaction)
 				J_ASSERT_JH(jh, jh->b_next_transaction ==
 							transaction);
+			warn_dirty_buffer(bh);
 		}
 		/*
 		 * In any case we need to clean the dirty flag and we must
 		 * do it under the buffer lock to be sure we don't race
 		 * with running write-out.
 		 */
-		JBUFFER_TRACE(jh, "Unexpected dirty buffer");
-		jbd_unexpected_dirty_buffer(jh);
+		JBUFFER_TRACE(jh, "Journalling dirty buffer");
+		clear_buffer_dirty(bh);
+		set_buffer_jbddirty(bh);
 	}
 
 	unlock_buffer(bh);
@@ -826,6 +809,15 @@ int journal_get_create_access(handle_t *handle, struct buffer_head *bh)
 	J_ASSERT_JH(jh, buffer_locked(jh2bh(jh)));
 
 	if (jh->b_transaction == NULL) {
+		/*
+		 * Previous journal_forget() could have left the buffer
+		 * with jbddirty bit set because it was being committed. When
+		 * the commit finished, we've filed the buffer for
+		 * checkpointing and marked it dirty. Now we are reallocating
+		 * the buffer so the transaction freeing it must have
+		 * committed and so it's safe to clear the dirty bit.
+		 */
+		clear_buffer_dirty(jh2bh(jh));
 		jh->b_transaction = transaction;
 
 		/* first access by this transaction */
@@ -1782,8 +1774,13 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
 
 	if (jh->b_cp_transaction) {
 		JBUFFER_TRACE(jh, "on running+cp transaction");
+		/*
+		 * We don't want to write the buffer anymore, clear the
+		 * bit so that we don't confuse checks in
+		 * __journal_file_buffer
+		 */
+		clear_buffer_dirty(bh);
 		__journal_file_buffer(jh, transaction, BJ_Forget);
-		clear_buffer_jbddirty(bh);
 		may_free = 0;
 	} else {
 		JBUFFER_TRACE(jh, "on running transaction");
@@ -2041,12 +2038,17 @@ void __journal_file_buffer(struct journal_head *jh,
 	if (jh->b_transaction && jh->b_jlist == jlist)
 		return;
 
-	/* The following list of buffer states needs to be consistent
-	 * with __jbd_unexpected_dirty_buffer()'s handling of dirty
-	 * state. */
-
 	if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
 	    jlist == BJ_Shadow || jlist == BJ_Forget) {
+		/*
+		 * For metadata buffers, we track dirty bit in buffer_jbddirty
+		 * instead of buffer_dirty. We should not see a dirty bit set
+		 * here because we clear it in do_get_write_access but e.g.
+		 * tune2fs can modify the sb and set the dirty bit at any time
+		 * so we try to gracefully handle that.
+		 */
+		if (buffer_dirty(bh))
+			warn_dirty_buffer(bh);
 		if (test_clear_buffer_dirty(bh) ||
 		    test_clear_buffer_jbddirty(bh))
 			was_dirty = 1;
-- 
cgit v1.2.3-70-g09d2


From 43237b5490e8f2f4679decd660064ff35ce490cc Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 20 May 2009 18:41:58 +0200
Subject: ext3: Get rid of extenddisksize parameter of ext3_get_blocks_handle()

Get rid of extenddisksize parameter of ext3_get_blocks_handle(). This seems to
be a relict from some old days and setting disksize in this function does not
make much sence. Currently it was set only by ext3_getblk().  Since the
parameter has some effect only if create == 1, it is easy to check that the
three callers which end up calling ext3_getblk() with create == 1 (ext3_append,
ext3_quota_write, ext3_mkdir) do the right thing and set disksize themselves.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext3/dir.c           |  3 +--
 fs/ext3/inode.c         | 13 +++----------
 include/linux/ext3_fs.h |  2 +-
 3 files changed, 5 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index 3d724a95882f..373fa90c796a 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -130,8 +130,7 @@ static int ext3_readdir(struct file * filp,
 		struct buffer_head *bh = NULL;
 
 		map_bh.b_state = 0;
-		err = ext3_get_blocks_handle(NULL, inode, blk, 1,
-						&map_bh, 0, 0);
+		err = ext3_get_blocks_handle(NULL, inode, blk, 1, &map_bh, 0);
 		if (err > 0) {
 			pgoff_t index = map_bh.b_blocknr >>
 					(PAGE_CACHE_SHIFT - inode->i_blkbits);
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 4d7da6f6184b..b49908a167ae 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -788,7 +788,7 @@ err_out:
 int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
 		sector_t iblock, unsigned long maxblocks,
 		struct buffer_head *bh_result,
-		int create, int extend_disksize)
+		int create)
 {
 	int err = -EIO;
 	int offsets[4];
@@ -911,13 +911,6 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
 	if (!err)
 		err = ext3_splice_branch(handle, inode, iblock,
 					partial, indirect_blks, count);
-	/*
-	 * i_disksize growing is protected by truncate_mutex.  Don't forget to
-	 * protect it if you're about to implement concurrent
-	 * ext3_get_block() -bzzz
-	*/
-	if (!err && extend_disksize && inode->i_size > ei->i_disksize)
-		ei->i_disksize = inode->i_size;
 	mutex_unlock(&ei->truncate_mutex);
 	if (err)
 		goto cleanup;
@@ -972,7 +965,7 @@ static int ext3_get_block(struct inode *inode, sector_t iblock,
 	}
 
 	ret = ext3_get_blocks_handle(handle, inode, iblock,
-					max_blocks, bh_result, create, 0);
+					max_blocks, bh_result, create);
 	if (ret > 0) {
 		bh_result->b_size = (ret << inode->i_blkbits);
 		ret = 0;
@@ -1005,7 +998,7 @@ struct buffer_head *ext3_getblk(handle_t *handle, struct inode *inode,
 	dummy.b_blocknr = -1000;
 	buffer_trace_init(&dummy.b_history);
 	err = ext3_get_blocks_handle(handle, inode, block, 1,
-					&dummy, create, 1);
+					&dummy, create);
 	/*
 	 * ext3_get_blocks_handle() returns number of blocks
 	 * mapped. 0 in case of a HOLE.
diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h
index 634a5e5aba3e..7499b3667798 100644
--- a/include/linux/ext3_fs.h
+++ b/include/linux/ext3_fs.h
@@ -874,7 +874,7 @@ struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *);
 struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *);
 int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
 	sector_t iblock, unsigned long maxblocks, struct buffer_head *bh_result,
-	int create, int extend_disksize);
+	int create);
 
 extern struct inode *ext3_iget(struct super_block *, unsigned long);
 extern int  ext3_write_inode (struct inode *, int);
-- 
cgit v1.2.3-70-g09d2


From 713c0ecdb888e9ef6f085e828555455c5916b07f Mon Sep 17 00:00:00 2001
From: Sten Spans <Sten_Spans@genua.de>
Date: Thu, 16 Jul 2009 09:41:39 +0200
Subject: security: fix security_file_lock cmd argument

Pass posix-translated lock operations to security_file_lock
when invoked via sys_flock.

Signed-off-by: Sten Spans <Sten_Spans@genua.de>
Signed-off-by: James Morris <jmorris@namei.org>
---
 fs/locks.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/locks.c b/fs/locks.c
index b6440f52178f..52366e877d76 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1591,7 +1591,7 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
 	if (can_sleep)
 		lock->fl_flags |= FL_SLEEP;
 
-	error = security_file_lock(filp, cmd);
+	error = security_file_lock(filp, lock->fl_type);
 	if (error)
 		goto out_free;
 
-- 
cgit v1.2.3-70-g09d2


From 613afbf83298efaead05ebcac23d2285609d7160 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 16 Jul 2009 15:44:29 +0200
Subject: sched: Pull up the might_sleep() check into cond_resched()

might_sleep() is called late-ish in cond_resched(), after the
need_resched()/preempt enabled/system running tests are
checked.

It's better to check the sleeps while atomic earlier and not
depend on some environment datas that reduce the chances to
detect a problem.

Also define cond_resched_*() helpers as macros, so that the
FILE/LINE reported in the sleeping while atomic warning
displays the real origin and not sched.h

Changes in v2:

 - Call __might_sleep() directly instead of might_sleep() which
   may call cond_resched()

 - Turn cond_resched() into a macro so that the file:line
   couple reported refers to the caller of cond_resched() and
   not __cond_resched() itself.

Changes in v3:

 - Also propagate this __might_sleep() pull up to
   cond_resched_lock() and cond_resched_softirq()

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1247725694-6082-6-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 fs/dcache.c           |  1 +
 include/linux/sched.h | 29 +++++++++++++++++++----------
 kernel/sched.c        | 12 +++++-------
 3 files changed, 25 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index 9e5cd3c3a6ba..a100fa35a48f 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -32,6 +32,7 @@
 #include <linux/swap.h>
 #include <linux/bootmem.h>
 #include <linux/fs_struct.h>
+#include <linux/hardirq.h>
 #include "internal.h"
 
 int sysctl_vfs_cache_pressure __read_mostly = 100;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e2bdf18e05c4..c41d424db887 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2286,17 +2286,26 @@ static inline int need_resched(void)
  */
 extern int _cond_resched(void);
 
-static inline int cond_resched(void)
-{
-	return _cond_resched();
-}
+#define cond_resched() ({			\
+	__might_sleep(__FILE__, __LINE__, 0);	\
+	_cond_resched();			\
+})
 
-extern int cond_resched_lock(spinlock_t * lock);
-extern int cond_resched_softirq(void);
-static inline int cond_resched_bkl(void)
-{
-	return _cond_resched();
-}
+extern int __cond_resched_lock(spinlock_t *lock);
+
+#define cond_resched_lock(lock) ({				\
+	__might_sleep(__FILE__, __LINE__, PREEMPT_OFFSET);	\
+	__cond_resched_lock(lock);				\
+})
+
+extern int __cond_resched_softirq(void);
+
+#define cond_resched_softirq() ({				\
+	__might_sleep(__FILE__, __LINE__, SOFTIRQ_OFFSET);	\
+	__cond_resched_softirq();				\
+})
+
+#define cond_resched_bkl()	cond_resched()
 
 /*
  * Does a critical section need to be broken due to another
diff --git a/kernel/sched.c b/kernel/sched.c
index 3ff4d004bd95..1f7919add8ae 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6610,8 +6610,6 @@ static inline int should_resched(void)
 
 static void __cond_resched(void)
 {
-	__might_sleep(__FILE__, __LINE__, 0);
-
 	add_preempt_count(PREEMPT_ACTIVE);
 	schedule();
 	sub_preempt_count(PREEMPT_ACTIVE);
@@ -6628,14 +6626,14 @@ int __sched _cond_resched(void)
 EXPORT_SYMBOL(_cond_resched);
 
 /*
- * cond_resched_lock() - if a reschedule is pending, drop the given lock,
+ * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
  * call schedule, and on return reacquire the lock.
  *
  * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
  * operations here to prevent schedule() from being called twice (once via
  * spin_unlock(), once by hand).
  */
-int cond_resched_lock(spinlock_t *lock)
+int __cond_resched_lock(spinlock_t *lock)
 {
 	int resched = should_resched();
 	int ret = 0;
@@ -6651,9 +6649,9 @@ int cond_resched_lock(spinlock_t *lock)
 	}
 	return ret;
 }
-EXPORT_SYMBOL(cond_resched_lock);
+EXPORT_SYMBOL(__cond_resched_lock);
 
-int __sched cond_resched_softirq(void)
+int __sched __cond_resched_softirq(void)
 {
 	BUG_ON(!in_softirq());
 
@@ -6665,7 +6663,7 @@ int __sched cond_resched_softirq(void)
 	}
 	return 0;
 }
-EXPORT_SYMBOL(cond_resched_softirq);
+EXPORT_SYMBOL(__cond_resched_softirq);
 
 /**
  * yield - yield the current processor to other threads.
-- 
cgit v1.2.3-70-g09d2


From def01bc53d03881acfc393bd10a5c7575187e008 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 16 Jul 2009 15:44:29 +0200
Subject: sched: Convert the only user of cond_resched_bkl to use
 cond_resched()

fs/locks.c:flock_lock_file() is the only user of
cond_resched_bkl()

This helper doesn't do anything more than cond_resched(). The
latter naming is enough to explain that we are rescheduling if
needed.

The bkl suffix suggests another semantics but it's actually a
synonym of cond_resched().

Reported-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1247725694-6082-7-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 fs/locks.c            | 2 +-
 include/linux/sched.h | 2 --
 2 files changed, 1 insertion(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/locks.c b/fs/locks.c
index b6440f52178f..2eb81975c99c 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -768,7 +768,7 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
 	 * give it the opportunity to lock the file.
 	 */
 	if (found)
-		cond_resched_bkl();
+		cond_resched();
 
 find_conflict:
 	for_each_lock(inode, before) {
diff --git a/include/linux/sched.h b/include/linux/sched.h
index c41d424db887..cbbfca69aa4a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2305,8 +2305,6 @@ extern int __cond_resched_softirq(void);
 	__cond_resched_softirq();				\
 })
 
-#define cond_resched_bkl()	cond_resched()
-
 /*
  * Does a critical section need to be broken due to another
  * task waiting?: (technically does not depend on CONFIG_PREEMPT,
-- 
cgit v1.2.3-70-g09d2


From cefcb800fa9536bb6f29485c53e6c82a65b0c022 Mon Sep 17 00:00:00 2001
From: Goldwyn Rodrigues <rgoldwyn@gmail.com>
Date: Sat, 11 Jul 2009 10:57:27 -0500
Subject: ocfs2: Initialize count in aio_write before generic_write_checks

generic_write_checks() expects count to be initialized to the size of
the write.  Writes to files open with O_DIRECT|O_LARGEFILE write 0 bytes
because count is uninitialized.

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.de>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/file.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index a49fa44aea1f..aa501d3f93f1 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1851,6 +1851,7 @@ relock:
 		if (ret)
 			goto out_dio;
 
+		count = ocount;
 		ret = generic_write_checks(file, ppos, &count,
 					   S_ISBLK(inode->i_mode));
 		if (ret)
-- 
cgit v1.2.3-70-g09d2


From 44d8e4e1e9b941065eb7578669fe51eb65c73e63 Mon Sep 17 00:00:00 2001
From: Subrata Modak <subrata@linux.vnet.ibm.com>
Date: Tue, 14 Jul 2009 01:19:31 +0530
Subject: ocfs2: Fix compilation warning for fs/ocfs2/xattr.c
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

gcc 4.4.1 generates the following build warning on i386:

CC [M]  fs/ocfs2/xattr.o
fs/ocfs2/xattr.c: In function ‘ocfs2_xattr_block_get’:
fs/ocfs2/xattr.c:1055: warning: ‘block_off’ may be used uninitialized in this function

The following fix is based on a similar approach by David Howells
few days back: http://lkml.org/lkml/2009/7/9/109,

Signed-off-by: Subrata Modak<subrata@linux.vnet.ibm.com>,
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/xattr.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index ba320e250747..d1a27cda984f 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -1052,7 +1052,8 @@ static int ocfs2_xattr_block_get(struct inode *inode,
 	struct ocfs2_xattr_block *xb;
 	struct ocfs2_xattr_value_root *xv;
 	size_t size;
-	int ret = -ENODATA, name_offset, name_len, block_off, i;
+	int ret = -ENODATA, name_offset, name_len, i;
+	int uninitialized_var(block_off);
 
 	xs->bucket = ocfs2_xattr_bucket_new(inode);
 	if (!xs->bucket) {
-- 
cgit v1.2.3-70-g09d2


From cbfa9639aea5007313b75ec43b689d5f9e0c037d Mon Sep 17 00:00:00 2001
From: Wengang Wang <wen.gang.wang@oracle.com>
Date: Mon, 13 Jul 2009 11:38:23 +0800
Subject: ocfs2: Fix error return in ocfs2_write_cluster()

A typo caused ocfs2_write_cluster() to return 0 in some error cases.
Fix it.

Signed-off-by: Wengang Wang <wen.gang.wang@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/aops.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index b2c52b3a1484..25aced3b0c7d 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1301,7 +1301,7 @@ static int ocfs2_write_cluster(struct address_space *mapping,
 		if (tmpret) {
 			mlog_errno(tmpret);
 			if (ret == 0)
-				tmpret = ret;
+				ret = tmpret;
 		}
 	}
 
-- 
cgit v1.2.3-70-g09d2


From 1f4cea3790bf44137192f441ee84af256e3bf809 Mon Sep 17 00:00:00 2001
From: Wengang Wang <wen.gang.wang@oracle.com>
Date: Mon, 13 Jul 2009 11:38:58 +0800
Subject: ocfs2: Fail ocfs2_get_block() immediately when a block needs
 allocation

ocfs2_get_block() does no allocation.  Hole filling for writes should
have happened farther up in the call chain.  We detect this case and
print an error, but we then continue with the function.  We should be
exiting immediately.

Signed-off-by: Wengang Wang <wen.gang.wang@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/aops.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 25aced3b0c7d..e511df101451 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -193,6 +193,7 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
 			     (unsigned long long)OCFS2_I(inode)->ip_blkno);
 			mlog(ML_ERROR, "Size %llu, clusters %u\n", (unsigned long long)i_size_read(inode), OCFS2_I(inode)->ip_clusters);
 			dump_stack();
+			goto bail;
 		}
 
 		past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
-- 
cgit v1.2.3-70-g09d2


From f1015c447781729060c415f5133164c638561f25 Mon Sep 17 00:00:00 2001
From: dingdinghua <dingdinghua85@gmail.com>
Date: Wed, 15 Jul 2009 21:42:05 +0200
Subject: jbd: fix race between write_metadata_buffer and get_write_access

The function journal_write_metadata_buffer() calls jbd_unlock_bh_state(bh_in)
too early; this could potentially allow another thread to call get_write_access
on the buffer head, modify the data, and dirty it, and allowing the wrong data
to be written into the journal.  Fortunately, if we lose this race, the only
time this will actually cause filesystem corruption is if there is a system
crash or other unclean shutdown of the system before the next commit can take
place.

Signed-off-by: dingdinghua <dingdinghua85@gmail.com>
Acked-by: "Theodore Ts'o" <tytso@mit.edu>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/jbd/journal.c | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 94a64a199a63..f96f85092d1c 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -287,6 +287,7 @@ int journal_write_metadata_buffer(transaction_t *transaction,
 	struct page *new_page;
 	unsigned int new_offset;
 	struct buffer_head *bh_in = jh2bh(jh_in);
+	journal_t *journal = transaction->t_journal;
 
 	/*
 	 * The buffer really shouldn't be locked: only the current committing
@@ -300,6 +301,11 @@ int journal_write_metadata_buffer(transaction_t *transaction,
 	J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in));
 
 	new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL);
+	/* keep subsequent assertions sane */
+	new_bh->b_state = 0;
+	init_buffer(new_bh, NULL, NULL);
+	atomic_set(&new_bh->b_count, 1);
+	new_jh = journal_add_journal_head(new_bh);	/* This sleeps */
 
 	/*
 	 * If a new transaction has already done a buffer copy-out, then
@@ -361,14 +367,6 @@ repeat:
 		kunmap_atomic(mapped_data, KM_USER0);
 	}
 
-	/* keep subsequent assertions sane */
-	new_bh->b_state = 0;
-	init_buffer(new_bh, NULL, NULL);
-	atomic_set(&new_bh->b_count, 1);
-	jbd_unlock_bh_state(bh_in);
-
-	new_jh = journal_add_journal_head(new_bh);	/* This sleeps */
-
 	set_bh_page(new_bh, new_page, new_offset);
 	new_jh->b_transaction = NULL;
 	new_bh->b_size = jh2bh(jh_in)->b_size;
@@ -385,7 +383,11 @@ repeat:
 	 * copying is moved to the transaction's shadow queue.
 	 */
 	JBUFFER_TRACE(jh_in, "file as BJ_Shadow");
-	journal_file_buffer(jh_in, transaction, BJ_Shadow);
+	spin_lock(&journal->j_list_lock);
+	__journal_file_buffer(jh_in, transaction, BJ_Shadow);
+	spin_unlock(&journal->j_list_lock);
+	jbd_unlock_bh_state(bh_in);
+
 	JBUFFER_TRACE(new_jh, "file as BJ_IO");
 	journal_file_buffer(new_jh, transaction, BJ_IO);
 
-- 
cgit v1.2.3-70-g09d2


From 5549f7cdf84c02939fd368d0842aa2f472bb6e98 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Tue, 7 Jul 2009 10:28:23 -0400
Subject: inotify: drop user watch count when a watch is removed

The inotify rewrite forgot to drop the inotify watch use cound when a watch
was removed.  This means that a single inotify fd can only ever register a
maximum of /proc/sys/fs/max_user_watches even if some of those had been
freed.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/inotify/inotify_user.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index ff27a2965844..1a870f9157b3 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -404,6 +404,8 @@ skip_send_ignore:
 
 	/* removed from idr, drop that reference */
 	fsnotify_put_mark(entry);
+
+	atomic_dec(&group->inotify_data.user->inotify_watches);
 }
 
 /* ding dong the mark is dead */
-- 
cgit v1.2.3-70-g09d2


From 75fe2b26394c59c8e16bd7b76f4be5d048103ad1 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Tue, 7 Jul 2009 10:28:23 -0400
Subject: inotify: do not leak inode marks in inotify_add_watch

inotify_add_watch had a couple of problems.  The biggest being that if
inotify_add_watch was called on the same inode twice (to update or change the
event mask) a refence was taken on the original inode mark by
fsnotify_find_mark_entry but was not being dropped at the end of the
inotify_add_watch call.  Thus if inotify_rm_watch was called although the mark
was removed from the inode, the refcnt wouldn't hit zero and we would leak
memory.

Reported-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/inotify/inotify_user.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 1a870f9157b3..aff4214f16c3 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -463,9 +463,6 @@ retry:
 			goto out_err;
 
 		spin_lock(&group->inotify_data.idr_lock);
-		/* if entry is added to the idr we keep the reference obtained
-		 * through fsnotify_mark_add.  remember to drop this reference
-		 * when entry is removed from idr */
 		ret = idr_get_new_above(&group->inotify_data.idr, entry,
 					++group->inotify_data.last_wd,
 					&ientry->wd);
@@ -476,8 +473,13 @@ retry:
 			goto out_err;
 		}
 		atomic_inc(&group->inotify_data.user->inotify_watches);
+
+		/* we put the mark on the idr, take a reference */
+		fsnotify_get_mark(entry);
 	}
 
+	ret = ientry->wd;
+
 	spin_lock(&entry->lock);
 
 	old_mask = entry->mask;
@@ -508,7 +510,11 @@ retry:
 			fsnotify_recalc_group_mask(group);
 	}
 
-	return ientry->wd;
+	/* this either matches fsnotify_find_mark_entry, or init_mark_entry
+	 * depending on which path we took... */
+	fsnotify_put_mark(entry);
+
+	return ret;
 
 out_err:
 	/* see this isn't supposed to happen, just kill the watch */
-- 
cgit v1.2.3-70-g09d2


From 7e790dd5fc937bc8d2400c30a05e32a9e9eef276 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Tue, 7 Jul 2009 10:28:24 -0400
Subject: inotify: fix error paths in inotify_update_watch

inotify_update_watch could leave things in a horrid state on a number of
error paths.  We could try to remove idr entries that didn't exist, we
could send an IN_IGNORED to userspace for watches that don't exist, and a
bit of other stupidity.  Clean these up by doing the idr addition before we
put the mark on the inode since we can clean that up on error and getting
off the inode's mark list is hard.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/inotify/inotify_user.c | 79 +++++++++++++++++++++++++---------------
 1 file changed, 49 insertions(+), 30 deletions(-)

(limited to 'fs')

diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index aff4214f16c3..726118a5845b 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -365,6 +365,17 @@ static int inotify_find_inode(const char __user *dirname, struct path *path, uns
 	return error;
 }
 
+static void inotify_remove_from_idr(struct fsnotify_group *group,
+				    struct inotify_inode_mark_entry *ientry)
+{
+	struct idr *idr;
+
+	spin_lock(&group->inotify_data.idr_lock);
+	idr = &group->inotify_data.idr;
+	idr_remove(idr, ientry->wd);
+	spin_unlock(&group->inotify_data.idr_lock);
+	ientry->wd = -1;
+}
 /*
  * Send IN_IGNORED for this wd, remove this wd from the idr, and drop the
  * internal reference help on the mark because it is in the idr.
@@ -375,7 +386,6 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry,
 	struct inotify_inode_mark_entry *ientry;
 	struct inotify_event_private_data *event_priv;
 	struct fsnotify_event_private_data *fsn_event_priv;
-	struct idr *idr;
 
 	ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry);
 
@@ -397,10 +407,7 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry,
 skip_send_ignore:
 
 	/* remove this entry from the idr */
-	spin_lock(&group->inotify_data.idr_lock);
-	idr = &group->inotify_data.idr;
-	idr_remove(idr, ientry->wd);
-	spin_unlock(&group->inotify_data.idr_lock);
+	inotify_remove_from_idr(group, ientry);
 
 	/* removed from idr, drop that reference */
 	fsnotify_put_mark(entry);
@@ -420,6 +427,7 @@ static int inotify_update_watch(struct fsnotify_group *group, struct inode *inod
 {
 	struct fsnotify_mark_entry *entry = NULL;
 	struct inotify_inode_mark_entry *ientry;
+	struct inotify_inode_mark_entry *tmp_ientry;
 	int ret = 0;
 	int add = (arg & IN_MASK_ADD);
 	__u32 mask;
@@ -430,50 +438,60 @@ static int inotify_update_watch(struct fsnotify_group *group, struct inode *inod
 	if (unlikely(!mask))
 		return -EINVAL;
 
-	ientry = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL);
-	if (unlikely(!ientry))
+	tmp_ientry = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL);
+	if (unlikely(!tmp_ientry))
 		return -ENOMEM;
 	/* we set the mask at the end after attaching it */
-	fsnotify_init_mark(&ientry->fsn_entry, inotify_free_mark);
-	ientry->wd = 0;
+	fsnotify_init_mark(&tmp_ientry->fsn_entry, inotify_free_mark);
+	tmp_ientry->wd = -1;
 
 find_entry:
 	spin_lock(&inode->i_lock);
 	entry = fsnotify_find_mark_entry(group, inode);
 	spin_unlock(&inode->i_lock);
 	if (entry) {
-		kmem_cache_free(inotify_inode_mark_cachep, ientry);
 		ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry);
 	} else {
-		if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches) {
-			ret = -ENOSPC;
-			goto out_err;
-		}
-
-		ret = fsnotify_add_mark(&ientry->fsn_entry, group, inode);
-		if (ret == -EEXIST)
-			goto find_entry;
-		else if (ret)
+		ret = -ENOSPC;
+		if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches)
 			goto out_err;
-
-		entry = &ientry->fsn_entry;
 retry:
 		ret = -ENOMEM;
 		if (unlikely(!idr_pre_get(&group->inotify_data.idr, GFP_KERNEL)))
 			goto out_err;
 
 		spin_lock(&group->inotify_data.idr_lock);
-		ret = idr_get_new_above(&group->inotify_data.idr, entry,
-					++group->inotify_data.last_wd,
-					&ientry->wd);
+		ret = idr_get_new_above(&group->inotify_data.idr, &tmp_ientry->fsn_entry,
+					group->inotify_data.last_wd,
+					&tmp_ientry->wd);
 		spin_unlock(&group->inotify_data.idr_lock);
 		if (ret) {
 			if (ret == -EAGAIN)
 				goto retry;
 			goto out_err;
 		}
+
+		ret = fsnotify_add_mark(&tmp_ientry->fsn_entry, group, inode);
+		if (ret) {
+			inotify_remove_from_idr(group, tmp_ientry);
+			if (ret == -EEXIST)
+				goto find_entry;
+			goto out_err;
+		}
+
+		/* tmp_ientry has been added to the inode, so we are all set up.
+		 * now we just need to make sure tmp_ientry doesn't get freed and
+		 * we need to set up entry and ientry so the generic code can
+		 * do its thing. */
+		ientry = tmp_ientry;
+		entry = &ientry->fsn_entry;
+		tmp_ientry = NULL;
+
 		atomic_inc(&group->inotify_data.user->inotify_watches);
 
+		/* update the idr hint */
+		group->inotify_data.last_wd = ientry->wd;
+
 		/* we put the mark on the idr, take a reference */
 		fsnotify_get_mark(entry);
 	}
@@ -514,14 +532,15 @@ retry:
 	 * depending on which path we took... */
 	fsnotify_put_mark(entry);
 
-	return ret;
-
 out_err:
-	/* see this isn't supposed to happen, just kill the watch */
-	if (entry) {
-		fsnotify_destroy_mark_by_entry(entry);
-		fsnotify_put_mark(entry);
+	/* could be an error, could be that we found an existing mark */
+	if (tmp_ientry) {
+		/* on the idr but didn't make it on the inode */
+		if (tmp_ientry->wd != -1)
+			inotify_remove_from_idr(group, tmp_ientry);
+		kmem_cache_free(inotify_inode_mark_cachep, tmp_ientry);
 	}
+
 	return ret;
 }
 
-- 
cgit v1.2.3-70-g09d2


From 520dc2a526fd681337883b6ff1ddcf7c23b1b063 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Mon, 13 Jul 2009 15:56:54 -0400
Subject: fsnotify: use def_bool in kconfig instead of letting the user choose

fsnotify doens't give the user anything.  If someone chooses inotify or
dnotify it should build fsnotify, if they don't select one it shouldn't be
built.  This patch changes fsnotify to be a def_bool=n and makes everything
else select it.  Also fixes the issue people complained about on lwn where
gdm hung because they didn't have inotify and they didn't get the inotify
build option.....

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/Kconfig         | 12 +-----------
 fs/notify/dnotify/Kconfig |  2 +-
 fs/notify/inotify/Kconfig |  2 +-
 3 files changed, 3 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/notify/Kconfig b/fs/notify/Kconfig
index 31dac7e3b0f1..dffbb0911d02 100644
--- a/fs/notify/Kconfig
+++ b/fs/notify/Kconfig
@@ -1,15 +1,5 @@
 config FSNOTIFY
-	bool "Filesystem notification backend"
-	default y
-	---help---
-	   fsnotify is a backend for filesystem notification.  fsnotify does
-	   not provide any userspace interface but does provide the basis
-	   needed for other notification schemes such as dnotify, inotify,
-	   and fanotify.
-
-	   Say Y here to enable fsnotify suport.
-
-	   If unsure, say Y.
+	def_bool n
 
 source "fs/notify/dnotify/Kconfig"
 source "fs/notify/inotify/Kconfig"
diff --git a/fs/notify/dnotify/Kconfig b/fs/notify/dnotify/Kconfig
index 904ff8d5405a..f9c1ca139d8f 100644
--- a/fs/notify/dnotify/Kconfig
+++ b/fs/notify/dnotify/Kconfig
@@ -1,6 +1,6 @@
 config DNOTIFY
 	bool "Dnotify support"
-	depends on FSNOTIFY
+	select FSNOTIFY
 	default y
 	help
 	  Dnotify is a directory-based per-fd file change notification system
diff --git a/fs/notify/inotify/Kconfig b/fs/notify/inotify/Kconfig
index 5356884289a1..3e56dbffe729 100644
--- a/fs/notify/inotify/Kconfig
+++ b/fs/notify/inotify/Kconfig
@@ -15,7 +15,7 @@ config INOTIFY
 
 config INOTIFY_USER
 	bool "Inotify support for userspace"
-	depends on FSNOTIFY
+	select FSNOTIFY
 	default y
 	---help---
 	  Say Y here to enable inotify support for userspace, including the
-- 
cgit v1.2.3-70-g09d2


From 4a148ba988988b9c400ad0f2cbccc155289b954b Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Mon, 13 Jul 2009 15:56:55 -0400
Subject: inotify: check filename before dropping repeat events

inotify drops events if the last event on the queue is the same as the
current event.  But it does 2 things wrong.  First it is comparing old->inode
with new->inode.  But after an event if put on the queue the ->inode is no
longer allowed to be used.  It's possible between the last event and this new
event the inode could be reused and we would falsely match the inode's memory
address between two differing events.

The second problem is that when a file is removed fsnotify is passed the
negative dentry for the removed object rather than the postive dentry from
immediately before the removal.  This mean the (broken) inotify tail drop code
was matching the NULL ->inode of differing events.

The fix is to check the file name which is stored with events when doing the
tail drop instead of wrongly checking the address of the stored ->inode.

Reported-by: Scott James Remnant <scott@ubuntu.com>
Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/notification.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index 959b73e756fd..69391fe8efb1 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -136,10 +136,15 @@ static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new
 {
 	if ((old->mask == new->mask) &&
 	    (old->to_tell == new->to_tell) &&
-	    (old->data_type == new->data_type)) {
+	    (old->data_type == new->data_type) &&
+	    (old->name_len == new->name_len)) {
 		switch (old->data_type) {
 		case (FSNOTIFY_EVENT_INODE):
-			if (old->inode == new->inode)
+			/* remember, after old was put on the wait_q we aren't
+			 * allowed to look at the inode any more, only thing
+			 * left to check was if the file_name is the same */
+			if (old->name_len &&
+			    !strcmp(old->file_name, new->file_name))
 				return true;
 			break;
 		case (FSNOTIFY_EVENT_PATH):
-- 
cgit v1.2.3-70-g09d2


From c05594b62125c528d93af3a78229793aae36df7f Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Mon, 13 Jul 2009 15:56:55 -0400
Subject: fsnotify: fix inotify tail drop check with path entries

fsnotify drops new events when they are the same as the tail event on the
queue to be sent to userspace.  The problem is that if the event comes with
a path we forget to break out of the switch statement and fall into the
code path which matches on events that do not have any type of file backed
information (things like IN_UNMOUNT and IN_Q_OVERFLOW).  The problem is
that this code thinks all such events should be dropped.  Fix is to add a
break.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/notification.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index 69391fe8efb1..2b20feaf263a 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -151,6 +151,7 @@ static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new
 			if ((old->path.mnt == new->path.mnt) &&
 			    (old->path.dentry == new->path.dentry))
 				return true;
+			break;
 		case (FSNOTIFY_EVENT_NONE):
 			return true;
 		};
-- 
cgit v1.2.3-70-g09d2


From f44aebcc566d1d6275f7191867b9633dc11de2ee Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Wed, 15 Jul 2009 15:49:52 -0400
Subject: inotify: use GFP_NOFS under potential memory pressure

inotify can have a watchs removed under filesystem reclaim.

=================================
[ INFO: inconsistent lock state ]
2.6.31-rc2 #16
---------------------------------
inconsistent {IN-RECLAIM_FS-W} -> {RECLAIM_FS-ON-W} usage.
khubd/217 [HC0[0]:SC0[0]:HE1:SE1] takes:
 (iprune_mutex){+.+.?.}, at: [<c10ba899>] invalidate_inodes+0x20/0xe3
{IN-RECLAIM_FS-W} state was registered at:
  [<c10536ab>] __lock_acquire+0x2c9/0xac4
  [<c1053f45>] lock_acquire+0x9f/0xc2
  [<c1308872>] __mutex_lock_common+0x2d/0x323
  [<c1308c00>] mutex_lock_nested+0x2e/0x36
  [<c10ba6ff>] shrink_icache_memory+0x38/0x1b2
  [<c108bfb6>] shrink_slab+0xe2/0x13c
  [<c108c3e1>] kswapd+0x3d1/0x55d
  [<c10449b5>] kthread+0x66/0x6b
  [<c1003fdf>] kernel_thread_helper+0x7/0x10
  [<ffffffff>] 0xffffffff

Two things are needed to fix this.  First we need a method to tell
fsnotify_create_event() to use GFP_NOFS and second we need to stop using
one global IN_IGNORED event and allocate them one at a time.  This solves
current issues with multiple IN_IGNORED on a queue having tail drop
problems and simplifies the allocations since we don't have to worry about
two tasks opperating on the IGNORED event concurrently.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/fsnotify.c             |  4 +++-
 fs/notify/inotify/inotify_user.c | 18 ++++++++++++------
 fs/notify/notification.c         |  9 +++++----
 include/linux/fsnotify_backend.h |  2 +-
 4 files changed, 21 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index ec2f7bd76818..037e878e03fc 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -159,7 +159,9 @@ void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, const
 			if (!group->ops->should_send_event(group, to_tell, mask))
 				continue;
 			if (!event) {
-				event = fsnotify_create_event(to_tell, mask, data, data_is, file_name, cookie);
+				event = fsnotify_create_event(to_tell, mask, data,
+							      data_is, file_name, cookie,
+							      GFP_KERNEL);
 				/* shit, we OOM'd and now we can't tell, maybe
 				 * someday someone else will want to do something
 				 * here */
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 726118a5845b..f30d9bbc2e1b 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -57,7 +57,6 @@ int inotify_max_user_watches __read_mostly;
 
 static struct kmem_cache *inotify_inode_mark_cachep __read_mostly;
 struct kmem_cache *event_priv_cachep __read_mostly;
-static struct fsnotify_event *inotify_ignored_event;
 
 /*
  * When inotify registers a new group it increments this and uses that
@@ -384,12 +383,19 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry,
 				    struct fsnotify_group *group)
 {
 	struct inotify_inode_mark_entry *ientry;
+	struct fsnotify_event *ignored_event;
 	struct inotify_event_private_data *event_priv;
 	struct fsnotify_event_private_data *fsn_event_priv;
 
+	ignored_event = fsnotify_create_event(NULL, FS_IN_IGNORED, NULL,
+					      FSNOTIFY_EVENT_NONE, NULL, 0,
+					      GFP_NOFS);
+	if (!ignored_event)
+		return;
+
 	ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry);
 
-	event_priv = kmem_cache_alloc(event_priv_cachep, GFP_KERNEL);
+	event_priv = kmem_cache_alloc(event_priv_cachep, GFP_NOFS);
 	if (unlikely(!event_priv))
 		goto skip_send_ignore;
 
@@ -398,7 +404,7 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry,
 	fsn_event_priv->group = group;
 	event_priv->wd = ientry->wd;
 
-	fsnotify_add_notify_event(group, inotify_ignored_event, fsn_event_priv);
+	fsnotify_add_notify_event(group, ignored_event, fsn_event_priv);
 
 	/* did the private data get added? */
 	if (list_empty(&fsn_event_priv->event_list))
@@ -406,6 +412,9 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry,
 
 skip_send_ignore:
 
+	/* matches the reference taken when the event was created */
+	fsnotify_put_event(ignored_event);
+
 	/* remove this entry from the idr */
 	inotify_remove_from_idr(group, ientry);
 
@@ -748,9 +757,6 @@ static int __init inotify_user_setup(void)
 
 	inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark_entry, SLAB_PANIC);
 	event_priv_cachep = KMEM_CACHE(inotify_event_private_data, SLAB_PANIC);
-	inotify_ignored_event = fsnotify_create_event(NULL, FS_IN_IGNORED, NULL, FSNOTIFY_EVENT_NONE, NULL, 0);
-	if (!inotify_ignored_event)
-		panic("unable to allocate the inotify ignored event\n");
 
 	inotify_max_queued_events = 16384;
 	inotify_max_user_instances = 128;
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index 2b20feaf263a..521368574e97 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -153,7 +153,7 @@ static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new
 				return true;
 			break;
 		case (FSNOTIFY_EVENT_NONE):
-			return true;
+			return false;
 		};
 	}
 	return false;
@@ -345,18 +345,19 @@ static void initialize_event(struct fsnotify_event *event)
  * @name the filename, if available
  */
 struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, void *data,
-					     int data_type, const char *name, u32 cookie)
+					     int data_type, const char *name, u32 cookie,
+					     gfp_t gfp)
 {
 	struct fsnotify_event *event;
 
-	event = kmem_cache_alloc(fsnotify_event_cachep, GFP_KERNEL);
+	event = kmem_cache_alloc(fsnotify_event_cachep, gfp);
 	if (!event)
 		return NULL;
 
 	initialize_event(event);
 
 	if (name) {
-		event->file_name = kstrdup(name, GFP_KERNEL);
+		event->file_name = kstrdup(name, gfp);
 		if (!event->file_name) {
 			kmem_cache_free(fsnotify_event_cachep, event);
 			return NULL;
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 6c3de999fb34..4d6f47b51189 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -352,7 +352,7 @@ extern void fsnotify_unmount_inodes(struct list_head *list);
 /* put here because inotify does some weird stuff when destroying watches */
 extern struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask,
 						    void *data, int data_is, const char *name,
-						    u32 cookie);
+						    u32 cookie, gfp_t gfp);
 
 #else
 
-- 
cgit v1.2.3-70-g09d2


From 3c5e10683e684ef45614c9071847e48f633d9806 Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Tue, 21 Jul 2009 15:42:05 +0800
Subject: ocfs2: Add extra credits and access the modified bh in
 update_edge_lengths.

In normal tree rotation left process, we will never touch the tree
branch above subtree_index and ocfs2_extend_rotate_transaction doesn't
reserve the credits for them either.

But when we want to delete the rightmost extent block, we have to update
the rightmost records for all the rightmost branch(See
ocfs2_update_edge_lengths), so we have to allocate extra credits for them.
What's more, we have to access them also.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/alloc.c | 44 +++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 39 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 9edcde4974aa..11085af71247 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -2476,15 +2476,37 @@ out_ret_path:
 	return ret;
 }
 
-static void ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle,
-				      struct ocfs2_path *path)
+static int ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle,
+				     int subtree_index, struct ocfs2_path *path)
 {
-	int i, idx;
+	int i, idx, ret;
 	struct ocfs2_extent_rec *rec;
 	struct ocfs2_extent_list *el;
 	struct ocfs2_extent_block *eb;
 	u32 range;
 
+	/*
+	 * In normal tree rotation process, we will never touch the
+	 * tree branch above subtree_index and ocfs2_extend_rotate_transaction
+	 * doesn't reserve the credits for them either.
+	 *
+	 * But we do have a special case here which will update the rightmost
+	 * records for all the bh in the path.
+	 * So we have to allocate extra credits and access them.
+	 */
+	ret = ocfs2_extend_trans(handle,
+				 handle->h_buffer_credits + subtree_index);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access_path(inode, handle, path);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
 	/* Path should always be rightmost. */
 	eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
 	BUG_ON(eb->h_next_leaf_blk != 0ULL);
@@ -2505,6 +2527,8 @@ static void ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle,
 
 		ocfs2_journal_dirty(handle, path->p_node[i].bh);
 	}
+out:
+	return ret;
 }
 
 static void ocfs2_unlink_path(struct inode *inode, handle_t *handle,
@@ -2717,7 +2741,12 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
 	if (del_right_subtree) {
 		ocfs2_unlink_subtree(inode, handle, left_path, right_path,
 				     subtree_index, dealloc);
-		ocfs2_update_edge_lengths(inode, handle, left_path);
+		ret = ocfs2_update_edge_lengths(inode, handle, subtree_index,
+						left_path);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
 
 		eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
 		ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
@@ -3034,7 +3063,12 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
 
 		ocfs2_unlink_subtree(inode, handle, left_path, path,
 				     subtree_index, dealloc);
-		ocfs2_update_edge_lengths(inode, handle, left_path);
+		ret = ocfs2_update_edge_lengths(inode, handle, subtree_index,
+						left_path);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
 
 		eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
 		ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
-- 
cgit v1.2.3-70-g09d2


From f7b1aa69be138ad9d7d3f31fa56f4c9407f56b6a Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 20 Jul 2009 12:12:36 +0200
Subject: ocfs2: Fix deadlock on umount

In commit ea455f8ab68338ba69f5d3362b342c115bea8e13, we moved the dentry lock
put process into ocfs2_wq. This causes problems during umount because ocfs2_wq
can drop references to inodes while they are being invalidated by
invalidate_inodes() causing all sorts of nasty things (invalidate_inodes()
ending in an infinite loop, "Busy inodes after umount" messages etc.).

We fix the problem by stopping ocfs2_wq from doing any further releasing of
inode references on the superblock being unmounted, wait until it finishes
the current round of releasing and finally cleaning up all the references in
dentry_lock_list from ocfs2_put_super().

The issue was tracked down by Tao Ma <tao.ma@oracle.com>.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/dcache.c | 35 +++++++++++++++++++++++++++--------
 fs/ocfs2/dcache.h |  3 +++
 fs/ocfs2/ocfs2.h  | 22 ++++++++++++++++++----
 fs/ocfs2/super.c  | 25 ++++++++++++++++++++++---
 4 files changed, 70 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index b574431a031d..2f28b7de2c8d 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -310,22 +310,19 @@ out_attach:
 	return ret;
 }
 
-static DEFINE_SPINLOCK(dentry_list_lock);
+DEFINE_SPINLOCK(dentry_list_lock);
 
 /* We limit the number of dentry locks to drop in one go. We have
  * this limit so that we don't starve other users of ocfs2_wq. */
 #define DL_INODE_DROP_COUNT 64
 
 /* Drop inode references from dentry locks */
-void ocfs2_drop_dl_inodes(struct work_struct *work)
+static void __ocfs2_drop_dl_inodes(struct ocfs2_super *osb, int drop_count)
 {
-	struct ocfs2_super *osb = container_of(work, struct ocfs2_super,
-					       dentry_lock_work);
 	struct ocfs2_dentry_lock *dl;
-	int drop_count = DL_INODE_DROP_COUNT;
 
 	spin_lock(&dentry_list_lock);
-	while (osb->dentry_lock_list && drop_count--) {
+	while (osb->dentry_lock_list && (drop_count < 0 || drop_count--)) {
 		dl = osb->dentry_lock_list;
 		osb->dentry_lock_list = dl->dl_next;
 		spin_unlock(&dentry_list_lock);
@@ -333,11 +330,32 @@ void ocfs2_drop_dl_inodes(struct work_struct *work)
 		kfree(dl);
 		spin_lock(&dentry_list_lock);
 	}
-	if (osb->dentry_lock_list)
+	spin_unlock(&dentry_list_lock);
+}
+
+void ocfs2_drop_dl_inodes(struct work_struct *work)
+{
+	struct ocfs2_super *osb = container_of(work, struct ocfs2_super,
+					       dentry_lock_work);
+
+	__ocfs2_drop_dl_inodes(osb, DL_INODE_DROP_COUNT);
+	/*
+	 * Don't queue dropping if umount is in progress. We flush the
+	 * list in ocfs2_dismount_volume
+	 */
+	spin_lock(&dentry_list_lock);
+	if (osb->dentry_lock_list &&
+	    !ocfs2_test_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED))
 		queue_work(ocfs2_wq, &osb->dentry_lock_work);
 	spin_unlock(&dentry_list_lock);
 }
 
+/* Flush the whole work queue */
+void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb)
+{
+	__ocfs2_drop_dl_inodes(osb, -1);
+}
+
 /*
  * ocfs2_dentry_iput() and friends.
  *
@@ -368,7 +386,8 @@ static void ocfs2_drop_dentry_lock(struct ocfs2_super *osb,
 	/* We leave dropping of inode reference to ocfs2_wq as that can
 	 * possibly lead to inode deletion which gets tricky */
 	spin_lock(&dentry_list_lock);
-	if (!osb->dentry_lock_list)
+	if (!osb->dentry_lock_list &&
+	    !ocfs2_test_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED))
 		queue_work(ocfs2_wq, &osb->dentry_lock_work);
 	dl->dl_next = osb->dentry_lock_list;
 	osb->dentry_lock_list = dl;
diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h
index faa12e75f98d..f5dd1789acf1 100644
--- a/fs/ocfs2/dcache.h
+++ b/fs/ocfs2/dcache.h
@@ -49,10 +49,13 @@ struct ocfs2_dentry_lock {
 int ocfs2_dentry_attach_lock(struct dentry *dentry, struct inode *inode,
 			     u64 parent_blkno);
 
+extern spinlock_t dentry_list_lock;
+
 void ocfs2_dentry_lock_put(struct ocfs2_super *osb,
 			   struct ocfs2_dentry_lock *dl);
 
 void ocfs2_drop_dl_inodes(struct work_struct *work);
+void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb);
 
 struct dentry *ocfs2_find_local_alias(struct inode *inode, u64 parent_blkno,
 				      int skip_unhashed);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index c9345ebb8493..39e1d5a39505 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -224,10 +224,12 @@ enum ocfs2_mount_options
 	OCFS2_MOUNT_GRPQUOTA = 1 << 10, /* We support group quotas */
 };
 
-#define OCFS2_OSB_SOFT_RO	0x0001
-#define OCFS2_OSB_HARD_RO	0x0002
-#define OCFS2_OSB_ERROR_FS	0x0004
-#define OCFS2_DEFAULT_ATIME_QUANTUM	60
+#define OCFS2_OSB_SOFT_RO			0x0001
+#define OCFS2_OSB_HARD_RO			0x0002
+#define OCFS2_OSB_ERROR_FS			0x0004
+#define OCFS2_OSB_DROP_DENTRY_LOCK_IMMED	0x0008
+
+#define OCFS2_DEFAULT_ATIME_QUANTUM		60
 
 struct ocfs2_journal;
 struct ocfs2_slot_info;
@@ -490,6 +492,18 @@ static inline void ocfs2_set_osb_flag(struct ocfs2_super *osb,
 	spin_unlock(&osb->osb_lock);
 }
 
+
+static inline unsigned long  ocfs2_test_osb_flag(struct ocfs2_super *osb,
+						 unsigned long flag)
+{
+	unsigned long ret;
+
+	spin_lock(&osb->osb_lock);
+	ret = osb->osb_flags & flag;
+	spin_unlock(&osb->osb_lock);
+	return ret;
+}
+
 static inline void ocfs2_set_ro_flag(struct ocfs2_super *osb,
 				     int hard)
 {
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 63af2e36d834..f2893878f8ad 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1213,14 +1213,27 @@ static int ocfs2_get_sb(struct file_system_type *fs_type,
 			   mnt);
 }
 
+static void ocfs2_kill_sb(struct super_block *sb)
+{
+	struct ocfs2_super *osb = OCFS2_SB(sb);
+
+	/* Prevent further queueing of inode drop events */
+	spin_lock(&dentry_list_lock);
+	ocfs2_set_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED);
+	spin_unlock(&dentry_list_lock);
+	/* Wait for work to finish and/or remove it */
+	cancel_work_sync(&osb->dentry_lock_work);
+
+	kill_block_super(sb);
+}
+
 static struct file_system_type ocfs2_fs_type = {
 	.owner          = THIS_MODULE,
 	.name           = "ocfs2",
 	.get_sb         = ocfs2_get_sb, /* is this called when we mount
 					* the fs? */
-	.kill_sb        = kill_block_super, /* set to the generic one
-					     * right now, but do we
-					     * need to change that? */
+	.kill_sb        = ocfs2_kill_sb,
+
 	.fs_flags       = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE,
 	.next           = NULL
 };
@@ -1819,6 +1832,12 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
 
 	debugfs_remove(osb->osb_ctxt);
 
+	/*
+	 * Flush inode dropping work queue so that deletes are
+	 * performed while the filesystem is still working
+	 */
+	ocfs2_drop_all_dl_inodes(osb);
+
 	/* Orphan scan should be stopped as early as possible */
 	ocfs2_orphan_scan_stop(osb);
 
-- 
cgit v1.2.3-70-g09d2


From 1bec1aed1e7e632b3cc43b6807c2b4dcd1572e28 Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Wed, 22 Jul 2009 09:59:00 -0400
Subject: Btrfs: fix definition of struct btrfs_extent_inline_ref

use __le64 instead of u64 in on-disk structure definition.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index a404ecc53eb1..da0763135bf0 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -483,7 +483,7 @@ struct btrfs_shared_data_ref {
 
 struct btrfs_extent_inline_ref {
 	u8 type;
-	u64 offset;
+	__le64 offset;
 } __attribute__ ((__packed__));
 
 /* old style backrefs item */
-- 
cgit v1.2.3-70-g09d2


From bf1fb512a58d7aeb41aaa40d6d2d2d29e08e506a Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Wed, 22 Jul 2009 09:59:00 -0400
Subject: Btrfs: properly update space information after shrinking device.

Change 'goto done' to 'break' for the case of all device extents have
been freed, so that the code updates space information will be execute.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 3ab80e9cd767..f057730a72bb 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2007,7 +2007,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
 			goto done;
 		if (ret) {
 			ret = 0;
-			goto done;
+			break;
 		}
 
 		l = path->nodes[0];
@@ -2015,7 +2015,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
 		btrfs_item_key_to_cpu(l, &key, path->slots[0]);
 
 		if (key.objectid != device->devid)
-			goto done;
+			break;
 
 		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
 		length = btrfs_dev_extent_length(l, dev_extent);
-- 
cgit v1.2.3-70-g09d2


From e457afec60fdbd86b963d36f4a8a9285088c6043 Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Wed, 22 Jul 2009 09:59:00 -0400
Subject: Btrfs: fix double increment of path->slots[0] in btrfs_next_leaf

if 1 is returned by btrfs_search_slot, the path already points to the
first item with 'key > searching key'. So increasing path->slots[0] by
one is superfluous in that case.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 60a45f3a4e91..7bb66c65ddfd 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -4146,7 +4146,8 @@ again:
 	 * advance the path if there are now more items available.
 	 */
 	if (nritems > 0 && path->slots[0] < nritems - 1) {
-		path->slots[0]++;
+		if (ret == 0)
+			path->slots[0]++;
 		ret = 0;
 		goto done;
 	}
-- 
cgit v1.2.3-70-g09d2


From 33c66f430bfa3a033e70470e4c93f967156b696d Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Wed, 22 Jul 2009 09:59:00 -0400
Subject: Btrfs: fix locking issue in btrfs_find_next_key

When walking up the tree, btrfs_find_next_key assumes the upper level tree
block is properly locked. This isn't always true even path->keep_locks is 1.
This is because btrfs_find_next_key may advance path->slots[] several times
instead of only once.

When 'path->slots[level] >= btrfs_header_nritems(path->nodes[level])' is found,
we can't guarantee the original value of 'path->slots[level]' is
'btrfs_header_nritems(path->nodes[level]) - 1'. If it's not, the tree block at
'level + 1' isn't locked.

This patch fixes the issue by explicitly checking the locking state,
re-searching the tree if it's not locked.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c      | 95 +++++++++++++++++++++++++++++++++------------------
 fs/btrfs/relocation.c |  3 ++
 2 files changed, 65 insertions(+), 33 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 7bb66c65ddfd..fdd423a550d6 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1701,6 +1701,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
 	struct extent_buffer *b;
 	int slot;
 	int ret;
+	int err;
 	int level;
 	int lowest_unlock = 1;
 	u8 lowest_level = 0;
@@ -1737,8 +1738,6 @@ again:
 			p->locks[level] = 1;
 
 		if (cow) {
-			int wret;
-
 			/*
 			 * if we don't really need to cow this block
 			 * then we don't want to set the path blocking,
@@ -1749,12 +1748,12 @@ again:
 
 			btrfs_set_path_blocking(p);
 
-			wret = btrfs_cow_block(trans, root, b,
-					       p->nodes[level + 1],
-					       p->slots[level + 1], &b);
-			if (wret) {
+			err = btrfs_cow_block(trans, root, b,
+					      p->nodes[level + 1],
+					      p->slots[level + 1], &b);
+			if (err) {
 				free_extent_buffer(b);
-				ret = wret;
+				ret = err;
 				goto done;
 			}
 		}
@@ -1793,41 +1792,45 @@ cow_done:
 		ret = bin_search(b, key, level, &slot);
 
 		if (level != 0) {
-			if (ret && slot > 0)
+			int dec = 0;
+			if (ret && slot > 0) {
+				dec = 1;
 				slot -= 1;
+			}
 			p->slots[level] = slot;
-			ret = setup_nodes_for_search(trans, root, p, b, level,
+			err = setup_nodes_for_search(trans, root, p, b, level,
 						     ins_len);
-			if (ret == -EAGAIN)
+			if (err == -EAGAIN)
 				goto again;
-			else if (ret)
+			if (err) {
+				ret = err;
 				goto done;
+			}
 			b = p->nodes[level];
 			slot = p->slots[level];
 
 			unlock_up(p, level, lowest_unlock);
 
-			/* this is only true while dropping a snapshot */
 			if (level == lowest_level) {
-				ret = 0;
+				if (dec)
+					p->slots[level]++;
 				goto done;
 			}
 
-			ret = read_block_for_search(trans, root, p,
+			err = read_block_for_search(trans, root, p,
 						    &b, level, slot, key);
-			if (ret == -EAGAIN)
+			if (err == -EAGAIN)
 				goto again;
-
-			if (ret == -EIO)
+			if (err) {
+				ret = err;
 				goto done;
+			}
 
 			if (!p->skip_locking) {
-				int lret;
-
 				btrfs_clear_path_blocking(p, NULL);
-				lret = btrfs_try_spin_lock(b);
+				err = btrfs_try_spin_lock(b);
 
-				if (!lret) {
+				if (!err) {
 					btrfs_set_path_blocking(p);
 					btrfs_tree_lock(b);
 					btrfs_clear_path_blocking(p, b);
@@ -1837,16 +1840,14 @@ cow_done:
 			p->slots[level] = slot;
 			if (ins_len > 0 &&
 			    btrfs_leaf_free_space(root, b) < ins_len) {
-				int sret;
-
 				btrfs_set_path_blocking(p);
-				sret = split_leaf(trans, root, key,
-						      p, ins_len, ret == 0);
+				err = split_leaf(trans, root, key,
+						 p, ins_len, ret == 0);
 				btrfs_clear_path_blocking(p, NULL);
 
-				BUG_ON(sret > 0);
-				if (sret) {
-					ret = sret;
+				BUG_ON(err > 0);
+				if (err) {
+					ret = err;
 					goto done;
 				}
 			}
@@ -4042,10 +4043,9 @@ out:
  * calling this function.
  */
 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
-			struct btrfs_key *key, int lowest_level,
+			struct btrfs_key *key, int level,
 			int cache_only, u64 min_trans)
 {
-	int level = lowest_level;
 	int slot;
 	struct extent_buffer *c;
 
@@ -4058,11 +4058,40 @@ int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
 		c = path->nodes[level];
 next:
 		if (slot >= btrfs_header_nritems(c)) {
-			level++;
-			if (level == BTRFS_MAX_LEVEL)
+			int ret;
+			int orig_lowest;
+			struct btrfs_key cur_key;
+			if (level + 1 >= BTRFS_MAX_LEVEL ||
+			    !path->nodes[level + 1])
 				return 1;
-			continue;
+
+			if (path->locks[level + 1]) {
+				level++;
+				continue;
+			}
+
+			slot = btrfs_header_nritems(c) - 1;
+			if (level == 0)
+				btrfs_item_key_to_cpu(c, &cur_key, slot);
+			else
+				btrfs_node_key_to_cpu(c, &cur_key, slot);
+
+			orig_lowest = path->lowest_level;
+			btrfs_release_path(root, path);
+			path->lowest_level = level;
+			ret = btrfs_search_slot(NULL, root, &cur_key, path,
+						0, 0);
+			path->lowest_level = orig_lowest;
+			if (ret < 0)
+				return ret;
+
+			c = path->nodes[level];
+			slot = path->slots[level];
+			if (ret == 0)
+				slot++;
+			goto next;
 		}
+
 		if (level == 0)
 			btrfs_item_key_to_cpu(c, key, slot);
 		else {
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 008397934778..e71264d1c2c9 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -670,6 +670,8 @@ again:
 			err = ret;
 			goto out;
 		}
+		if (ret > 0 && path2->slots[level] > 0)
+			path2->slots[level]--;
 
 		eb = path2->nodes[level];
 		WARN_ON(btrfs_node_blockptr(eb, path2->slots[level]) !=
@@ -1609,6 +1611,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
 		BUG_ON(level == 0);
 		path->lowest_level = level;
 		ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0);
+		path->lowest_level = 0;
 		if (ret < 0) {
 			btrfs_free_path(path);
 			return ret;
-- 
cgit v1.2.3-70-g09d2


From 4a8c9a62d7f7f058eed4b8a6f2c890a887778093 Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Wed, 22 Jul 2009 10:07:05 -0400
Subject: Btrfs: make sure all dirty blocks are written at commit time

Write dirty block groups may allocate new block, and so may add new delayed
back ref. btrfs_run_delayed_refs may make some block groups dirty.

commit_cowonly_roots does not handle the recursion properly, and some dirty
blocks can be left unwritten at commit time. This patch moves
btrfs_run_delayed_refs into the loop that writes dirty block groups, and makes
the code not break out of the loop until there are no dirty block groups or
delayed back refs.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 70 +++++++++++++++++++++++++++++---------------------
 fs/btrfs/transaction.c |  9 +------
 2 files changed, 42 insertions(+), 37 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index a5aca3997d42..62a332d34fdb 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2387,13 +2387,29 @@ fail:
 
 }
 
+static struct btrfs_block_group_cache *
+next_block_group(struct btrfs_root *root,
+		 struct btrfs_block_group_cache *cache)
+{
+	struct rb_node *node;
+	spin_lock(&root->fs_info->block_group_cache_lock);
+	node = rb_next(&cache->cache_node);
+	btrfs_put_block_group(cache);
+	if (node) {
+		cache = rb_entry(node, struct btrfs_block_group_cache,
+				 cache_node);
+		atomic_inc(&cache->count);
+	} else
+		cache = NULL;
+	spin_unlock(&root->fs_info->block_group_cache_lock);
+	return cache;
+}
+
 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root)
 {
-	struct btrfs_block_group_cache *cache, *entry;
-	struct rb_node *n;
+	struct btrfs_block_group_cache *cache;
 	int err = 0;
-	int werr = 0;
 	struct btrfs_path *path;
 	u64 last = 0;
 
@@ -2402,39 +2418,35 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 		return -ENOMEM;
 
 	while (1) {
-		cache = NULL;
-		spin_lock(&root->fs_info->block_group_cache_lock);
-		for (n = rb_first(&root->fs_info->block_group_cache_tree);
-		     n; n = rb_next(n)) {
-			entry = rb_entry(n, struct btrfs_block_group_cache,
-					 cache_node);
-			if (entry->dirty) {
-				cache = entry;
-				break;
-			}
+		if (last == 0) {
+			err = btrfs_run_delayed_refs(trans, root,
+						     (unsigned long)-1);
+			BUG_ON(err);
 		}
-		spin_unlock(&root->fs_info->block_group_cache_lock);
 
-		if (!cache)
-			break;
+		cache = btrfs_lookup_first_block_group(root->fs_info, last);
+		while (cache) {
+			if (cache->dirty)
+				break;
+			cache = next_block_group(root, cache);
+		}
+		if (!cache) {
+			if (last == 0)
+				break;
+			last = 0;
+			continue;
+		}
 
 		cache->dirty = 0;
-		last += cache->key.offset;
+		last = cache->key.objectid + cache->key.offset;
 
-		err = write_one_cache_group(trans, root,
-					    path, cache);
-		/*
-		 * if we fail to write the cache group, we want
-		 * to keep it marked dirty in hopes that a later
-		 * write will work
-		 */
-		if (err) {
-			werr = err;
-			continue;
-		}
+		err = write_one_cache_group(trans, root, path, cache);
+		BUG_ON(err);
+		btrfs_put_block_group(cache);
 	}
+
 	btrfs_free_path(path);
-	return werr;
+	return 0;
 }
 
 int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 2dbf1c1f56ee..81f7124c3051 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -444,9 +444,6 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
 
 	btrfs_write_dirty_block_groups(trans, root);
 
-	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
-	BUG_ON(ret);
-
 	while (1) {
 		old_root_bytenr = btrfs_root_bytenr(&root->root_item);
 		if (old_root_bytenr == root->node->start)
@@ -457,9 +454,8 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
 					&root->root_key,
 					&root->root_item);
 		BUG_ON(ret);
-		btrfs_write_dirty_block_groups(trans, root);
 
-		ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+		ret = btrfs_write_dirty_block_groups(trans, root);
 		BUG_ON(ret);
 	}
 	free_extent_buffer(root->commit_root);
@@ -495,9 +491,6 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
 		root = list_entry(next, struct btrfs_root, dirty_list);
 
 		update_cowonly_root(trans, root);
-
-		ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
-		BUG_ON(ret);
 	}
 	return 0;
 }
-- 
cgit v1.2.3-70-g09d2


From 29c5e8ce01f9dad7e24b99c21e4f836d6b0289e0 Mon Sep 17 00:00:00 2001
From: Julia Lawall <julia@diku.dk>
Date: Wed, 22 Jul 2009 16:49:00 -0400
Subject: Btrfs: convert nested spin_lock_irqsave to spin_lock

If spin_lock_irqsave is called twice in a row with the same second
argument, the interrupt state at the point of the second call overwrites
the value saved by the first call.  Indeed, the second call does not need
to save the interrupt state, so it is changed to a simple spin_lock.

Signed-off-by: Julia Lawall <julia@diku.dk>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/async-thread.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 6e4f6c50a120..019e8af449ab 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -424,11 +424,11 @@ int btrfs_requeue_work(struct btrfs_work *work)
 	 * list
 	 */
 	if (worker->idle) {
-		spin_lock_irqsave(&worker->workers->lock, flags);
+		spin_lock(&worker->workers->lock);
 		worker->idle = 0;
 		list_move_tail(&worker->worker_list,
 			       &worker->workers->worker_list);
-		spin_unlock_irqrestore(&worker->workers->lock, flags);
+		spin_unlock(&worker->workers->lock);
 	}
 	if (!worker->working) {
 		wake = 1;
-- 
cgit v1.2.3-70-g09d2


From 3acada49c2794c5aac21849e2ea05790c6dd2faa Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Wed, 22 Jul 2009 16:49:01 -0400
Subject: Btrfs: Remove broken sanity check from btrfs_rmap_block()

It was never actually doing anything anyway (see the loop condition),
and it would be difficult to make it work for RAID[56].

Even if it was actually working, it's checking for the wrong thing
anyway. Instead of checking whether we list a block which _doesn't_ land
at the relevant physical location, it should be checking that we _have_
listed all the logical blocks which refer to the required physical
location on all devices.

This function is only called from remove_sb_from_cache() to ensure that
we reserve the logical blocks which would reside at the same physical
location as the superblock copies. So listing more blocks than we need
is actually OK.

With RAID[56] we're going to throw away an entire stripe for each block
we have to ignore, so we _are_ going to list blocks other than the
ones which actually contain the superblock.

Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 20 --------------------
 1 file changed, 20 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f057730a72bb..55c37276a29f 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2795,26 +2795,6 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
 		}
 	}
 
-	for (i = 0; i > nr; i++) {
-		struct btrfs_multi_bio *multi;
-		struct btrfs_bio_stripe *stripe;
-		int ret;
-
-		length = 1;
-		ret = btrfs_map_block(map_tree, WRITE, buf[i],
-				      &length, &multi, 0);
-		BUG_ON(ret);
-
-		stripe = multi->stripes;
-		for (j = 0; j < multi->num_stripes; j++) {
-			if (stripe->physical >= physical &&
-			    physical < stripe->physical + length)
-				break;
-		}
-		BUG_ON(j >= multi->num_stripes);
-		kfree(multi);
-	}
-
 	*logical = buf;
 	*naddrs = nr;
 	*stripe_len = map->stripe_len;
-- 
cgit v1.2.3-70-g09d2


From 33c17ad5717c887568c1de61f15e5d58ed66d189 Mon Sep 17 00:00:00 2001
From: Julia Lawall <julia@diku.dk>
Date: Wed, 22 Jul 2009 16:49:01 -0400
Subject: Btrfs: adjust NULL test

Move the call to BUG_ON to before the dereference of the tested value.

Signed-off-by: Julia Lawall <julia@diku.dk>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a48c084f6d3a..3ea827ddf0fe 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2607,8 +2607,8 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 	if (root->ref_cows)
 		btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0);
 	path = btrfs_alloc_path();
-	path->reada = -1;
 	BUG_ON(!path);
+	path->reada = -1;
 
 	/* FIXME, add redo link to tree so we don't leak on crash */
 	key.objectid = inode->i_ino;
-- 
cgit v1.2.3-70-g09d2


From c271b492419a18908ba19ee02b231fb305a27023 Mon Sep 17 00:00:00 2001
From: Daniel Cadete <danielcadete10@gmail.com>
Date: Wed, 22 Jul 2009 16:52:13 -0400
Subject: Btrfs: remove of redundant btrfs_header_level

This removes the continues call's of btrfs_header_level. One call of
btrfs_header_level(c) its enough.

Signed-off-by Daniel Cadete <danielncadete10@gmail.com>

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/print-tree.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 6d6523da0a30..0d126be22b63 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -309,7 +309,7 @@ void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c)
 	}
 	printk(KERN_INFO "node %llu level %d total ptrs %d free spc %u\n",
 	       (unsigned long long)btrfs_header_bytenr(c),
-	       btrfs_header_level(c), nr,
+	      level, nr,
 	       (u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr);
 	for (i = 0; i < nr; i++) {
 		btrfs_node_key_to_cpu(c, &key, i);
@@ -326,10 +326,10 @@ void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c)
 					btrfs_level_size(root, level - 1),
 					btrfs_node_ptr_generation(c, i));
 		if (btrfs_is_leaf(next) &&
-		    btrfs_header_level(c) != 1)
+		   level != 1)
 			BUG();
 		if (btrfs_header_level(next) !=
-			btrfs_header_level(c) - 1)
+		       level - 1)
 			BUG();
 		btrfs_print_tree(root, next);
 		free_extent_buffer(next);
-- 
cgit v1.2.3-70-g09d2


From 83121942b28daffc9526b14b7843d8cdbd3db641 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Wed, 22 Jul 2009 16:52:13 -0400
Subject: Btrfs: Fix crash on read failures at mount

If the tree roots hit read errors during mount, btrfs is not properly
erroring out.  We need to check the uptodate bits after
reading in the tree root node.

Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 0d50d49d990a..55d9d188e693 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1783,6 +1783,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 					   btrfs_super_chunk_root(disk_super),
 					   blocksize, generation);
 	BUG_ON(!chunk_root->node);
+	if (!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {
+		printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n",
+		       sb->s_id);
+		goto fail_chunk_root;
+	}
 	btrfs_set_root_node(&chunk_root->root_item, chunk_root->node);
 	chunk_root->commit_root = btrfs_root_node(chunk_root);
 
@@ -1810,6 +1815,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 					  blocksize, generation);
 	if (!tree_root->node)
 		goto fail_chunk_root;
+	if (!test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
+		printk(KERN_WARNING "btrfs: failed to read tree root on %s\n",
+		       sb->s_id);
+		goto fail_tree_root;
+	}
 	btrfs_set_root_node(&tree_root->root_item, tree_root->node);
 	tree_root->commit_root = btrfs_root_node(tree_root);
 
-- 
cgit v1.2.3-70-g09d2


From ce6e7fcd43aab1f77e56aa36936dd7d2d05a1ffa Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Wed, 22 Jul 2009 15:08:58 -0400
Subject: cifs: disable serverino if server doesn't support it

A recent regression when dealing with older servers. This bug was
introduced when we made serverino the default...

When the server can't provide inode numbers, disable it for the mount.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/inode.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 18afe57b2461..b6a47b32f21e 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -513,9 +513,12 @@ int cifs_get_inode_info(struct inode **pinode,
 					cifs_sb->mnt_cifs_flags &
 						CIFS_MOUNT_MAP_SPECIAL_CHR);
 			if (rc1) {
-				/* BB EOPNOSUPP disable SERVER_INUM? */
 				cFYI(1, ("GetSrvInodeNum rc %d", rc1));
 				fattr.cf_uniqueid = iunique(sb, ROOT_I);
+				/* disable serverino if call not supported */
+				if (rc1 == -EINVAL)
+					cifs_sb->mnt_cifs_flags &=
+							~CIFS_MOUNT_SERVER_INUM;
 			}
 		} else {
 			fattr.cf_uniqueid = iunique(sb, ROOT_I);
-- 
cgit v1.2.3-70-g09d2


From 03aa3a49ad3592a9e4e1ab19c6da3e852288caf1 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Tue, 21 Jul 2009 19:42:03 -0400
Subject: cifs: fix sb->s_maxbytes so that it casts properly to a signed value

This off-by-one bug causes sendfile() to not work properly. When a task
calls sendfile() on a file on a CIFS filesystem, the syscall returns -1
and sets errno to EOVERFLOW.

do_sendfile uses s_maxbytes to verify the returned offset of the file.
The problem there is that this value is cast to a signed value (loff_t).
When this is done on the s_maxbytes value that cifs uses, it becomes
negative and the comparisons against it fail.

Even though s_maxbytes is an unsigned value, it seems that it's not OK
to set it in such a way that it'll end up negative when it's cast to a
signed value. These casts happen in other codepaths besides sendfile
too, but the VFS is a little hard to follow in this area and I can't
be sure if there are other bugs that this will fix.

It's not clear to me why s_maxbytes isn't just declared as loff_t in the
first place, but either way we still need to fix these values to make
sendfile work properly. This is also an opportunity to replace the magic
bit-shift values here with the standard #defines for this.

This fixes the reproducer program I have that does a sendfile and
will probably also fix the situation where apache is serving from a
CIFS share.

Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/connect.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 9bb5c8750736..fc44d316d0bb 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -2452,10 +2452,10 @@ try_mount_again:
 		tcon->local_lease = volume_info->local_lease;
 	}
 	if (pSesInfo) {
-		if (pSesInfo->capabilities & CAP_LARGE_FILES) {
-			sb->s_maxbytes = (u64) 1 << 63;
-		} else
-			sb->s_maxbytes = (u64) 1 << 31;	/* 2 GB */
+		if (pSesInfo->capabilities & CAP_LARGE_FILES)
+			sb->s_maxbytes = MAX_LFS_FILESIZE;
+		else
+			sb->s_maxbytes = MAX_NON_LFS;
 	}
 
 	/* BB FIXME fix time_gran to be larger for LANMAN sessions */
-- 
cgit v1.2.3-70-g09d2


From f1230c97978f52268d8c66e6f88e54c3d2092a75 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Wed, 22 Jul 2009 23:13:01 +0000
Subject: [CIFS] fix sparse warning

Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/inode.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index b6a47b32f21e..82d83839655e 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -212,7 +212,7 @@ cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, FILE_UNIX_BASIC_INFO *info,
  * junction to the new submount (ie to setup the fake directory
  * which represents a DFS referral).
  */
-void
+static void
 cifs_create_dfs_fattr(struct cifs_fattr *fattr, struct super_block *sb)
 {
 	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
@@ -388,7 +388,7 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path,
 }
 
 /* Fill a cifs_fattr struct with info from FILE_ALL_INFO */
-void
+static void
 cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
 		       struct cifs_sb_info *cifs_sb, bool adjust_tz)
 {
-- 
cgit v1.2.3-70-g09d2


From 4a19fb11a90fdbbcb3bc02effa036230d035ca28 Mon Sep 17 00:00:00 2001
From: Stefan Bader <stefan.bader@canonical.com>
Date: Thu, 23 Jul 2009 11:26:05 +0200
Subject: jfs: Fix early release of acl in jfs_get_acl

BugLink: http://bugs.launchpad.net/ubuntu/+bug/396780

Commit 073aaa1b142461d91f83da66db1184d7c1b1edea "helpers for acl
caching + switch to those" introduced new helper functions for
acl handling but seems to have introduced a regression for jfs as
the acl is released before returning it to the caller, instead of
leaving this for the caller to do.
This causes the acl object to be used after freeing it, leading
to kernel panics in completely different places.

Thanks to Christophe Dumez for reporting and bisecting into this.

Reported-by: Christophe Dumez <dchris@gmail.com>
Tested-by: Christophe Dumez <dchris@gmail.com>
Signed-off-by: Stefan Bader <stefan.bader@canonical.com>
Acked-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
---
 fs/jfs/acl.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index 91fa3ad6e8c2..a29c7c3e3fb8 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -67,10 +67,8 @@ static struct posix_acl *jfs_get_acl(struct inode *inode, int type)
 		acl = posix_acl_from_xattr(value, size);
 	}
 	kfree(value);
-	if (!IS_ERR(acl)) {
+	if (!IS_ERR(acl))
 		set_cached_acl(inode, type, acl);
-		posix_acl_release(acl);
-	}
 	return acl;
 }
 
-- 
cgit v1.2.3-70-g09d2


From 82e12644cf5227dab15201fbcaf0ca6330ebd70f Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Thu, 23 Jul 2009 08:12:58 +0800
Subject: ocfs2: Use ocfs2_rec_clusters in ocfs2_adjust_adjacent_records.

In ocfs2_adjust_adjacent_records, we will adjust adjacent records
according to the extent_list in the lower level. But actually
the lower level tree will either be a leaf or a branch. If we only
use ocfs2_is_empty_extent we will meet with some problem if the lower
tree is a branch (tree_depth > 1). So use !ocfs2_rec_clusters instead.
And actually only the leaf record can have holes. So add a BUG_ON
for non-leaf branch.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/alloc.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 11085af71247..f9a3e8942669 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -1914,7 +1914,8 @@ static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
 	 * immediately to their right.
 	 */
 	left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos);
-	if (ocfs2_is_empty_extent(&right_child_el->l_recs[0])) {
+	if (!ocfs2_rec_clusters(right_child_el, &right_child_el->l_recs[0])) {
+		BUG_ON(right_child_el->l_tree_depth);
 		BUG_ON(le16_to_cpu(right_child_el->l_next_free_rec) <= 1);
 		left_clusters = le32_to_cpu(right_child_el->l_recs[1].e_cpos);
 	}
-- 
cgit v1.2.3-70-g09d2


From b57ac2c43e66cb4aa76c9d2d0e9e0e04f19cc5a6 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 22 Jul 2009 13:17:15 +0200
Subject: ocfs2: Make global quota files blocksize aligned

Change i_size of global quota files so that it always remains aligned to block
size. This is mainly because the end of quota block may contain checksum (if
checksumming is enabled) and it's a bit awkward for it to be "outside" of quota
file (and it makes life harder for ocfs2-tools).

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/quota_global.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index edfa60cd155c..a66cb82fd6e6 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -211,14 +211,17 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type,
 
 	mutex_lock_nested(&gqinode->i_mutex, I_MUTEX_QUOTA);
 	if (gqinode->i_size < off + len) {
+		loff_t rounded_end =
+				ocfs2_align_bytes_to_blocks(sb, off + len);
+
 		down_write(&OCFS2_I(gqinode)->ip_alloc_sem);
-		err = ocfs2_extend_no_holes(gqinode, off + len, off);
+		err = ocfs2_extend_no_holes(gqinode, rounded_end, off);
 		up_write(&OCFS2_I(gqinode)->ip_alloc_sem);
 		if (err < 0)
 			goto out;
 		err = ocfs2_simple_size_update(gqinode,
 					       oinfo->dqi_gqi_bh,
-					       off + len);
+					       rounded_end);
 		if (err < 0)
 			goto out;
 		new = 1;
-- 
cgit v1.2.3-70-g09d2


From 4b3fa1904c0d192461ebba692e4940a334b74353 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 22 Jul 2009 13:17:16 +0200
Subject: ocfs2: Mark buffer uptodate before calling ocfs2_journal_access_dq()

In a code path extending local quota files we marked new header
buffer uptodate only after calling ocfs2_journal_access_dq() which
triggers a bug. Fix it and also call ocfs2 variant of the function
marking buffer uptodate.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/quota_local.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index 5a460fa82553..b624f9bc9281 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -20,6 +20,7 @@
 #include "sysfile.h"
 #include "dlmglue.h"
 #include "quota.h"
+#include "uptodate.h"
 
 /* Number of local quota structures per block */
 static inline unsigned int ol_quota_entries_per_block(struct super_block *sb)
@@ -979,6 +980,8 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
 		mlog_errno(status);
 		goto out;
 	}
+	ocfs2_set_new_buffer_uptodate(lqinode, bh);
+
 	dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data;
 
 	handle = ocfs2_start_trans(OCFS2_SB(sb), 2);
@@ -999,7 +1002,6 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
 	memset(dchunk->dqc_bitmap, 0,
 	       sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) -
 	       OCFS2_QBLK_RESERVED_SPACE);
-	set_buffer_uptodate(bh);
 	unlock_buffer(bh);
 	status = ocfs2_journal_dirty(handle, bh);
 	if (status < 0) {
-- 
cgit v1.2.3-70-g09d2


From 0e7f387bf386c99e8ee322649fe4fc23b9cccc6c Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 22 Jul 2009 13:17:17 +0200
Subject: ocfs2: Initialize blocks allocated to local quota file

When we extend local quota file, we should initialize data
in newly allocated block. Firstly because on recovery we could
parse bogus data, secondly so that block checksums are properly
computed.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/quota_local.c | 98 ++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 83 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index b624f9bc9281..9d2993de1082 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -941,7 +941,7 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
 	struct ocfs2_local_disk_chunk *dchunk;
 	int status;
 	handle_t *handle;
-	struct buffer_head *bh = NULL;
+	struct buffer_head *bh = NULL, *dbh = NULL;
 	u64 p_blkno;
 
 	/* We are protected by dqio_sem so no locking needed */
@@ -965,34 +965,32 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
 		mlog_errno(status);
 		goto out;
 	}
+	handle = ocfs2_start_trans(OCFS2_SB(sb), 3);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		mlog_errno(status);
+		goto out;
+	}
 
+	/* Initialize chunk header */
 	down_read(&OCFS2_I(lqinode)->ip_alloc_sem);
 	status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks,
 					     &p_blkno, NULL, NULL);
 	up_read(&OCFS2_I(lqinode)->ip_alloc_sem);
 	if (status < 0) {
 		mlog_errno(status);
-		goto out;
+		goto out_trans;
 	}
 	bh = sb_getblk(sb, p_blkno);
 	if (!bh) {
 		status = -ENOMEM;
 		mlog_errno(status);
-		goto out;
+		goto out_trans;
 	}
-	ocfs2_set_new_buffer_uptodate(lqinode, bh);
-
 	dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data;
-
-	handle = ocfs2_start_trans(OCFS2_SB(sb), 2);
-	if (IS_ERR(handle)) {
-		status = PTR_ERR(handle);
-		mlog_errno(status);
-		goto out;
-	}
-
+	ocfs2_set_new_buffer_uptodate(lqinode, bh);
 	status = ocfs2_journal_access_dq(handle, lqinode, bh,
-					 OCFS2_JOURNAL_ACCESS_WRITE);
+					 OCFS2_JOURNAL_ACCESS_CREATE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto out_trans;
@@ -1009,6 +1007,38 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
 		goto out_trans;
 	}
 
+	/* Initialize new block with structures */
+	down_read(&OCFS2_I(lqinode)->ip_alloc_sem);
+	status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks + 1,
+					     &p_blkno, NULL, NULL);
+	up_read(&OCFS2_I(lqinode)->ip_alloc_sem);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_trans;
+	}
+	dbh = sb_getblk(sb, p_blkno);
+	if (!dbh) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto out_trans;
+	}
+	ocfs2_set_new_buffer_uptodate(lqinode, dbh);
+	status = ocfs2_journal_access_dq(handle, lqinode, dbh,
+					 OCFS2_JOURNAL_ACCESS_CREATE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_trans;
+	}
+	lock_buffer(dbh);
+	memset(dbh->b_data, 0, sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE);
+	unlock_buffer(dbh);
+	status = ocfs2_journal_dirty(handle, dbh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_trans;
+	}
+
+	/* Update local quotafile info */
 	oinfo->dqi_blocks += 2;
 	oinfo->dqi_chunks++;
 	status = ocfs2_local_write_info(sb, type);
@@ -1033,6 +1063,7 @@ out_trans:
 	ocfs2_commit_trans(OCFS2_SB(sb), handle);
 out:
 	brelse(bh);
+	brelse(dbh);
 	kmem_cache_free(ocfs2_qf_chunk_cachep, chunk);
 	return ERR_PTR(status);
 }
@@ -1050,6 +1081,8 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
 	struct ocfs2_local_disk_chunk *dchunk;
 	int epb = ol_quota_entries_per_block(sb);
 	unsigned int chunk_blocks;
+	struct buffer_head *bh;
+	u64 p_blkno;
 	int status;
 	handle_t *handle;
 
@@ -1077,12 +1110,46 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
 		mlog_errno(status);
 		goto out;
 	}
-	handle = ocfs2_start_trans(OCFS2_SB(sb), 2);
+
+	/* Get buffer from the just added block */
+	down_read(&OCFS2_I(lqinode)->ip_alloc_sem);
+	status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks,
+					     &p_blkno, NULL, NULL);
+	up_read(&OCFS2_I(lqinode)->ip_alloc_sem);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+	bh = sb_getblk(sb, p_blkno);
+	if (!bh) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto out;
+	}
+	ocfs2_set_new_buffer_uptodate(lqinode, bh);
+
+	handle = ocfs2_start_trans(OCFS2_SB(sb), 3);
 	if (IS_ERR(handle)) {
 		status = PTR_ERR(handle);
 		mlog_errno(status);
 		goto out;
 	}
+	/* Zero created block */
+	status = ocfs2_journal_access_dq(handle, lqinode, bh,
+				 OCFS2_JOURNAL_ACCESS_CREATE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_trans;
+	}
+	lock_buffer(bh);
+	memset(bh->b_data, 0, sb->s_blocksize);
+	unlock_buffer(bh);
+	status = ocfs2_journal_dirty(handle, bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_trans;
+	}
+	/* Update chunk header */
 	status = ocfs2_journal_access_dq(handle, lqinode, chunk->qc_headerbh,
 				 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
@@ -1099,6 +1166,7 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
 		mlog_errno(status);
 		goto out_trans;
 	}
+	/* Update file header */
 	oinfo->dqi_blocks++;
 	status = ocfs2_local_write_info(sb, type);
 	if (status < 0) {
-- 
cgit v1.2.3-70-g09d2


From 7669f54c55df225cbb2db939a6c9f111445ffc1c Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 22 Jul 2009 13:17:18 +0200
Subject: ocfs2: Zero out padding of on disk dquot structure

Padding fields of on-disk dquot structure were not zeroed. Zero them
so that it's easier to use them later.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/quota_global.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index a66cb82fd6e6..7248db681351 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -69,6 +69,7 @@ static void ocfs2_global_mem2diskdqb(void *dp, struct dquot *dquot)
 	d->dqb_curspace = cpu_to_le64(m->dqb_curspace);
 	d->dqb_btime = cpu_to_le64(m->dqb_btime);
 	d->dqb_itime = cpu_to_le64(m->dqb_itime);
+	d->dqb_pad1 = d->dqb_pad2 = 0;
 }
 
 static int ocfs2_global_is_id(void *dp, struct dquot *dquot)
-- 
cgit v1.2.3-70-g09d2


From 1c1d9793ff6720531c0125a28d321f283716e32f Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 22 Jul 2009 13:17:19 +0200
Subject: ocfs2: Fix initialization of blockcheck stats

We just set blockcheck stats to zeros but we should also
properly initialize the spinlock there.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/super.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index f2893878f8ad..b0ee0fdf799a 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -777,6 +777,7 @@ static int ocfs2_sb_probe(struct super_block *sb,
 		}
 		di = (struct ocfs2_dinode *) (*bh)->b_data;
 		memset(stats, 0, sizeof(struct ocfs2_blockcheck_stats));
+		spin_lock_init(&stats->b_lock);
 		status = ocfs2_verify_volume(di, *bh, blksize, stats);
 		if (status >= 0)
 			goto bail;
-- 
cgit v1.2.3-70-g09d2


From 4539f1df25bcd0fdf0d8a5e2c92de6bece83c7a0 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 22 Jul 2009 13:17:20 +0200
Subject: ocfs2: Remove syncjiff field from quota info

syncjiff is just a converted value of syncms. Some places which
are updating syncms forgot to update syncjiff as well. Since the
conversion is just a simple division / multiplication and it does
not happen frequently, just remove the syncjiff field to avoid
forgotten conversions.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/quota.h        | 1 -
 fs/ocfs2/quota_global.c | 5 ++---
 2 files changed, 2 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h
index 7365e2e08706..3fb96fcd4c81 100644
--- a/fs/ocfs2/quota.h
+++ b/fs/ocfs2/quota.h
@@ -50,7 +50,6 @@ struct ocfs2_mem_dqinfo {
 	unsigned int dqi_chunks;	/* Number of chunks in local quota file */
 	unsigned int dqi_blocks;	/* Number of blocks allocated for local quota file */
 	unsigned int dqi_syncms;	/* How often should we sync with other nodes */
-	unsigned int dqi_syncjiff;	/* Precomputed dqi_syncms in jiffies */
 	struct list_head dqi_chunk;	/* List of chunks */
 	struct inode *dqi_gqinode;	/* Global quota file inode */
 	struct ocfs2_lock_res dqi_gqlock;	/* Lock protecting quota information structure */
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index 7248db681351..dab45467b5fd 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -346,7 +346,6 @@ int ocfs2_global_read_info(struct super_block *sb, int type)
 	info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace);
 	info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace);
 	oinfo->dqi_syncms = le32_to_cpu(dinfo.dqi_syncms);
-	oinfo->dqi_syncjiff = msecs_to_jiffies(oinfo->dqi_syncms);
 	oinfo->dqi_gi.dqi_blocks = le32_to_cpu(dinfo.dqi_blocks);
 	oinfo->dqi_gi.dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk);
 	oinfo->dqi_gi.dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry);
@@ -356,7 +355,7 @@ int ocfs2_global_read_info(struct super_block *sb, int type)
 	oinfo->dqi_gi.dqi_qtree_depth = qtree_depth(&oinfo->dqi_gi);
 	INIT_DELAYED_WORK(&oinfo->dqi_sync_work, qsync_work_fn);
 	queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work,
-			   oinfo->dqi_syncjiff);
+			   msecs_to_jiffies(oinfo->dqi_syncms));
 
 out_err:
 	mlog_exit(status);
@@ -611,7 +610,7 @@ static void qsync_work_fn(struct work_struct *work)
 
 	dquot_scan_active(sb, ocfs2_sync_dquot_helper, oinfo->dqi_type);
 	queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work,
-			   oinfo->dqi_syncjiff);
+			   msecs_to_jiffies(oinfo->dqi_syncms));
 }
 
 /*
-- 
cgit v1.2.3-70-g09d2


From 0584974a77796581eb3a64b6c5005edac4a95128 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 22 Jul 2009 13:17:21 +0200
Subject: ocfs2: Define credit counts for quota operations

Numbers of needed credits for some quota operations were written
as raw numbers. Create appropriate defines instead.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/journal.h      | 13 ++++++++++---
 fs/ocfs2/quota_global.c | 18 +++++++++++++-----
 fs/ocfs2/quota_local.c  | 16 ++++++++++++----
 3 files changed, 35 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 81e8abcd246e..4a4d3b55fd22 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -330,20 +330,27 @@ int                  ocfs2_journal_dirty(handle_t *handle,
 /* extended attribute block update */
 #define OCFS2_XATTR_BLOCK_UPDATE_CREDITS 1
 
+/* Update of a single quota block */
+#define OCFS2_QUOTA_BLOCK_UPDATE_CREDITS 1
+
 /* global quotafile inode update, data block */
-#define OCFS2_QINFO_WRITE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
+#define OCFS2_QINFO_WRITE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + \
+				   OCFS2_QUOTA_BLOCK_UPDATE_CREDITS)
 
+#define OCFS2_LOCAL_QINFO_WRITE_CREDITS OCFS2_QUOTA_BLOCK_UPDATE_CREDITS
 /*
  * The two writes below can accidentally see global info dirty due
  * to set_info() quotactl so make them prepared for the writes.
  */
 /* quota data block, global info */
 /* Write to local quota file */
-#define OCFS2_QWRITE_CREDITS (OCFS2_QINFO_WRITE_CREDITS + 1)
+#define OCFS2_QWRITE_CREDITS (OCFS2_QINFO_WRITE_CREDITS + \
+			      OCFS2_QUOTA_BLOCK_UPDATE_CREDITS)
 
 /* global quota data block, local quota data block, global quota inode,
  * global quota info */
-#define OCFS2_QSYNC_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 3)
+#define OCFS2_QSYNC_CREDITS (OCFS2_QINFO_WRITE_CREDITS + \
+			     2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS)
 
 static inline int ocfs2_quota_trans_credits(struct super_block *sb)
 {
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index dab45467b5fd..cc8785cf8f61 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -648,10 +648,15 @@ int ocfs2_calc_qdel_credits(struct super_block *sb, int type)
 		return 0;
 
 	oinfo = sb_dqinfo(sb, type)->dqi_priv;
-	/* We modify tree, leaf block, global info, local chunk header,
-	 * global and local inode */
-	return oinfo->dqi_gi.dqi_qtree_depth + 2 + 1 +
-	       2 * OCFS2_INODE_UPDATE_CREDITS;
+	/*
+	 * We modify tree, leaf block, global info, local chunk header,
+	 * global and local inode; OCFS2_QINFO_WRITE_CREDITS already
+	 * accounts for inode update
+	 */
+	return oinfo->dqi_gi.dqi_qtree_depth +
+	       OCFS2_QINFO_WRITE_CREDITS +
+	       2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS +
+	       OCFS2_INODE_UPDATE_CREDITS;
 }
 
 static int ocfs2_release_dquot(struct dquot *dquot)
@@ -701,7 +706,10 @@ int ocfs2_calc_qinit_credits(struct super_block *sb, int type)
 	 * global file we can modify info, tree and leaf block */
 	return ocfs2_calc_extend_credits(sb, &lfe->id2.i_list, 0) +
 	       ocfs2_calc_extend_credits(sb, &gfe->id2.i_list, 0) +
-	       3 + oinfo->dqi_gi.dqi_qtree_depth + 2;
+	       OCFS2_LOCAL_QINFO_WRITE_CREDITS +
+	       2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS +
+	       oinfo->dqi_gi.dqi_qtree_depth +
+	       2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS;
 }
 
 static int ocfs2_acquire_dquot(struct dquot *dquot)
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index 9d2993de1082..bdb09cb6e1fe 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -101,7 +101,8 @@ static int ocfs2_modify_bh(struct inode *inode, struct buffer_head *bh,
 	handle_t *handle;
 	int status;
 
-	handle = ocfs2_start_trans(OCFS2_SB(sb), 1);
+	handle = ocfs2_start_trans(OCFS2_SB(sb),
+				   OCFS2_QUOTA_BLOCK_UPDATE_CREDITS);
 	if (IS_ERR(handle)) {
 		status = PTR_ERR(handle);
 		mlog_errno(status);
@@ -611,7 +612,8 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
 			goto out_bh;
 		/* Mark quota file as clean if we are recovering quota file of
 		 * some other node. */
-		handle = ocfs2_start_trans(osb, 1);
+		handle = ocfs2_start_trans(osb,
+					   OCFS2_LOCAL_QINFO_WRITE_CREDITS);
 		if (IS_ERR(handle)) {
 			status = PTR_ERR(handle);
 			mlog_errno(status);
@@ -965,7 +967,10 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
 		mlog_errno(status);
 		goto out;
 	}
-	handle = ocfs2_start_trans(OCFS2_SB(sb), 3);
+	/* Local quota info and two new blocks we initialize */
+	handle = ocfs2_start_trans(OCFS2_SB(sb),
+			OCFS2_LOCAL_QINFO_WRITE_CREDITS +
+			2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS);
 	if (IS_ERR(handle)) {
 		status = PTR_ERR(handle);
 		mlog_errno(status);
@@ -1128,7 +1133,10 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
 	}
 	ocfs2_set_new_buffer_uptodate(lqinode, bh);
 
-	handle = ocfs2_start_trans(OCFS2_SB(sb), 3);
+	/* Local quota info, chunk header and the new block we initialize */
+	handle = ocfs2_start_trans(OCFS2_SB(sb),
+			OCFS2_LOCAL_QINFO_WRITE_CREDITS +
+			2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS);
 	if (IS_ERR(handle)) {
 		status = PTR_ERR(handle);
 		mlog_errno(status);
-- 
cgit v1.2.3-70-g09d2


From 963030817060e4f109be1993b9ae8f81dbf5e11a Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Mon, 13 Jul 2009 21:29:25 -0400
Subject: Btrfs: use hybrid extents+bitmap rb tree for free space

Currently btrfs has a problem where it can use a ridiculous amount of RAM simply
tracking free space.  As free space gets fragmented, we end up with thousands of
entries on an rb-tree per block group, which usually spans 1 gig of area.  Since
we currently don't ever flush free space cache back to disk this gets to be a
bit unweildly on large fs's with lots of fragmentation.

This patch solves this problem by using PAGE_SIZE bitmaps for parts of the free
space cache.  Initially we calculate a threshold of extent entries we can
handle, which is however many extent entries we can cram into 16k of ram.  The
maximum amount of RAM that should ever be used to track 1 gigabyte of diskspace
will be 32k of RAM, which scales much better than we did before.

Once we pass the extent threshold, we start adding bitmaps and using those
instead for tracking the free space.  This patch also makes it so that any free
space thats less than 4 * sectorsize we go ahead and put into a bitmap.  This is
nice since we try and allocate out of the front of a block group, so if the
front of a block group is heavily fragmented and then has a huge chunk of free
space at the end, we go ahead and add the fragmented areas to bitmaps and use a
normal extent entry to track the big chunk at the back of the block group.

I've also taken the opportunity to revamp how we search for free space.
Previously we indexed free space via an offset indexed rb tree and a bytes
indexed rb tree.  I've dropped the bytes indexed rb tree and use only the offset
indexed rb tree.  This cuts the number of tree operations we were doing
previously down by half, and gives us a little bit of a better allocation
pattern since we will always start from a specific offset and search forward
from there, instead of searching for the size we need and try and get it as
close as possible to the offset we want.

I've given this a healthy amount of testing pre-new format stuff, as well as
post-new format stuff.  I've booted up my fedora box which is installed on btrfs
with this patch and ran with it for a few days without issues.  I've not seen
any performance regressions in any of my tests.

Since the last patch Yan Zheng fixed a problem where we could have overlapping
entries, so updating their offset inline would cause problems.  Thanks,

Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h            |    8 +-
 fs/btrfs/extent-tree.c      |   25 +-
 fs/btrfs/free-space-cache.c | 1001 ++++++++++++++++++++++++++++++++++---------
 fs/btrfs/free-space-cache.h |    8 +
 4 files changed, 826 insertions(+), 216 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index da0763135bf0..0cbf3491bb7c 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -709,6 +709,9 @@ struct btrfs_free_cluster {
 	/* first extent starting offset */
 	u64 window_start;
 
+	/* if this cluster simply points at a bitmap in the block group */
+	bool points_to_bitmap;
+
 	struct btrfs_block_group_cache *block_group;
 	/*
 	 * when a cluster is allocated from a block group, we put the
@@ -726,6 +729,10 @@ struct btrfs_block_group_cache {
 	u64 pinned;
 	u64 reserved;
 	u64 flags;
+	u64 sectorsize;
+	int extents_thresh;
+	int free_extents;
+	int total_bitmaps;
 	int cached;
 	int ro;
 	int dirty;
@@ -734,7 +741,6 @@ struct btrfs_block_group_cache {
 
 	/* free space cache stuff */
 	spinlock_t tree_lock;
-	struct rb_root free_space_bytes;
 	struct rb_root free_space_offset;
 
 	/* block group cache stuff */
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 62a332d34fdb..98697be6bdde 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3649,7 +3649,6 @@ refill_cluster:
 			goto loop;
 checks:
 		search_start = stripe_align(root, offset);
-
 		/* move on to the next group */
 		if (search_start + num_bytes >= search_end) {
 			btrfs_add_free_space(block_group, offset, num_bytes);
@@ -7040,6 +7039,16 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 		mutex_init(&cache->cache_mutex);
 		INIT_LIST_HEAD(&cache->list);
 		INIT_LIST_HEAD(&cache->cluster_list);
+		cache->sectorsize = root->sectorsize;
+
+		/*
+		 * we only want to have 32k of ram per block group for keeping
+		 * track of free space, and if we pass 1/2 of that we want to
+		 * start converting things over to using bitmaps
+		 */
+		cache->extents_thresh = ((1024 * 32) / 2) /
+			sizeof(struct btrfs_free_space);
+
 		read_extent_buffer(leaf, &cache->item,
 				   btrfs_item_ptr_offset(leaf, path->slots[0]),
 				   sizeof(cache->item));
@@ -7091,6 +7100,15 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 	cache->key.objectid = chunk_offset;
 	cache->key.offset = size;
 	cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
+	cache->sectorsize = root->sectorsize;
+
+	/*
+	 * we only want to have 32k of ram per block group for keeping track
+	 * of free space, and if we pass 1/2 of that we want to start
+	 * converting things over to using bitmaps
+	 */
+	cache->extents_thresh = ((1024 * 32) / 2) /
+		sizeof(struct btrfs_free_space);
 	atomic_set(&cache->count, 1);
 	spin_lock_init(&cache->lock);
 	spin_lock_init(&cache->tree_lock);
@@ -7103,6 +7121,11 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 	cache->flags = type;
 	btrfs_set_block_group_flags(&cache->item, type);
 
+	cache->cached = 1;
+	ret = btrfs_add_free_space(cache, chunk_offset, size);
+	BUG_ON(ret);
+	remove_sb_from_cache(root, cache);
+
 	ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
 				&cache->space_info);
 	BUG_ON(ret);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 4538e48581a5..ab8cad8b46c9 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -16,45 +16,46 @@
  * Boston, MA 021110-1307, USA.
  */
 
+#include <linux/pagemap.h>
 #include <linux/sched.h>
+#include <linux/math64.h>
 #include "ctree.h"
 #include "free-space-cache.h"
 #include "transaction.h"
 
-struct btrfs_free_space {
-	struct rb_node bytes_index;
-	struct rb_node offset_index;
-	u64 offset;
-	u64 bytes;
-};
+#define BITS_PER_BITMAP		(PAGE_CACHE_SIZE * 8)
+#define MAX_CACHE_BYTES_PER_GIG	(32 * 1024)
 
-static int tree_insert_offset(struct rb_root *root, u64 offset,
-			      struct rb_node *node)
+static inline unsigned long offset_to_bit(u64 bitmap_start, u64 sectorsize,
+					  u64 offset)
 {
-	struct rb_node **p = &root->rb_node;
-	struct rb_node *parent = NULL;
-	struct btrfs_free_space *info;
+	BUG_ON(offset < bitmap_start);
+	offset -= bitmap_start;
+	return (unsigned long)(div64_u64(offset, sectorsize));
+}
 
-	while (*p) {
-		parent = *p;
-		info = rb_entry(parent, struct btrfs_free_space, offset_index);
+static inline unsigned long bytes_to_bits(u64 bytes, u64 sectorsize)
+{
+	return (unsigned long)(div64_u64(bytes, sectorsize));
+}
 
-		if (offset < info->offset)
-			p = &(*p)->rb_left;
-		else if (offset > info->offset)
-			p = &(*p)->rb_right;
-		else
-			return -EEXIST;
-	}
+static inline u64 offset_to_bitmap(struct btrfs_block_group_cache *block_group,
+				   u64 offset)
+{
+	u64 bitmap_start;
+	u64 bytes_per_bitmap;
 
-	rb_link_node(node, parent, p);
-	rb_insert_color(node, root);
+	bytes_per_bitmap = BITS_PER_BITMAP * block_group->sectorsize;
+	bitmap_start = offset - block_group->key.objectid;
+	bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap);
+	bitmap_start *= bytes_per_bitmap;
+	bitmap_start += block_group->key.objectid;
 
-	return 0;
+	return bitmap_start;
 }
 
-static int tree_insert_bytes(struct rb_root *root, u64 bytes,
-			     struct rb_node *node)
+static int tree_insert_offset(struct rb_root *root, u64 offset,
+			      struct rb_node *node, int bitmap)
 {
 	struct rb_node **p = &root->rb_node;
 	struct rb_node *parent = NULL;
@@ -62,12 +63,34 @@ static int tree_insert_bytes(struct rb_root *root, u64 bytes,
 
 	while (*p) {
 		parent = *p;
-		info = rb_entry(parent, struct btrfs_free_space, bytes_index);
+		info = rb_entry(parent, struct btrfs_free_space, offset_index);
 
-		if (bytes < info->bytes)
+		if (offset < info->offset) {
 			p = &(*p)->rb_left;
-		else
+		} else if (offset > info->offset) {
 			p = &(*p)->rb_right;
+		} else {
+			/*
+			 * we could have a bitmap entry and an extent entry
+			 * share the same offset.  If this is the case, we want
+			 * the extent entry to always be found first if we do a
+			 * linear search through the tree, since we want to have
+			 * the quickest allocation time, and allocating from an
+			 * extent is faster than allocating from a bitmap.  So
+			 * if we're inserting a bitmap and we find an entry at
+			 * this offset, we want to go right, or after this entry
+			 * logically.  If we are inserting an extent and we've
+			 * found a bitmap, we want to go left, or before
+			 * logically.
+			 */
+			if (bitmap) {
+				WARN_ON(info->bitmap);
+				p = &(*p)->rb_right;
+			} else {
+				WARN_ON(!info->bitmap);
+				p = &(*p)->rb_left;
+			}
+		}
 	}
 
 	rb_link_node(node, parent, p);
@@ -79,110 +102,142 @@ static int tree_insert_bytes(struct rb_root *root, u64 bytes,
 /*
  * searches the tree for the given offset.
  *
- * fuzzy == 1: this is used for allocations where we are given a hint of where
- * to look for free space.  Because the hint may not be completely on an offset
- * mark, or the hint may no longer point to free space we need to fudge our
- * results a bit.  So we look for free space starting at or after offset with at
- * least bytes size.  We prefer to find as close to the given offset as we can.
- * Also if the offset is within a free space range, then we will return the free
- * space that contains the given offset, which means we can return a free space
- * chunk with an offset before the provided offset.
- *
- * fuzzy == 0: this is just a normal tree search.  Give us the free space that
- * starts at the given offset which is at least bytes size, and if its not there
- * return NULL.
+ * fuzzy - If this is set, then we are trying to make an allocation, and we just
+ * want a section that has at least bytes size and comes at or after the given
+ * offset.
  */
-static struct btrfs_free_space *tree_search_offset(struct rb_root *root,
-						   u64 offset, u64 bytes,
-						   int fuzzy)
+static struct btrfs_free_space *
+tree_search_offset(struct btrfs_block_group_cache *block_group,
+		   u64 offset, int bitmap_only, int fuzzy)
 {
-	struct rb_node *n = root->rb_node;
-	struct btrfs_free_space *entry, *ret = NULL;
+	struct rb_node *n = block_group->free_space_offset.rb_node;
+	struct btrfs_free_space *entry, *prev = NULL;
+
+	/* find entry that is closest to the 'offset' */
+	while (1) {
+		if (!n) {
+			entry = NULL;
+			break;
+		}
 
-	while (n) {
 		entry = rb_entry(n, struct btrfs_free_space, offset_index);
+		prev = entry;
 
-		if (offset < entry->offset) {
-			if (fuzzy &&
-			    (!ret || entry->offset < ret->offset) &&
-			    (bytes <= entry->bytes))
-				ret = entry;
+		if (offset < entry->offset)
 			n = n->rb_left;
-		} else if (offset > entry->offset) {
-			if (fuzzy &&
-			    (entry->offset + entry->bytes - 1) >= offset &&
-			    bytes <= entry->bytes) {
-				ret = entry;
-				break;
-			}
+		else if (offset > entry->offset)
 			n = n->rb_right;
-		} else {
-			if (bytes > entry->bytes) {
-				n = n->rb_right;
-				continue;
-			}
-			ret = entry;
+		else
 			break;
-		}
 	}
 
-	return ret;
-}
-
-/*
- * return a chunk at least bytes size, as close to offset that we can get.
- */
-static struct btrfs_free_space *tree_search_bytes(struct rb_root *root,
-						  u64 offset, u64 bytes)
-{
-	struct rb_node *n = root->rb_node;
-	struct btrfs_free_space *entry, *ret = NULL;
+	if (bitmap_only) {
+		if (!entry)
+			return NULL;
+		if (entry->bitmap)
+			return entry;
 
-	while (n) {
-		entry = rb_entry(n, struct btrfs_free_space, bytes_index);
+		/*
+		 * bitmap entry and extent entry may share same offset,
+		 * in that case, bitmap entry comes after extent entry.
+		 */
+		n = rb_next(n);
+		if (!n)
+			return NULL;
+		entry = rb_entry(n, struct btrfs_free_space, offset_index);
+		if (entry->offset != offset)
+			return NULL;
 
-		if (bytes < entry->bytes) {
+		WARN_ON(!entry->bitmap);
+		return entry;
+	} else if (entry) {
+		if (entry->bitmap) {
 			/*
-			 * We prefer to get a hole size as close to the size we
-			 * are asking for so we don't take small slivers out of
-			 * huge holes, but we also want to get as close to the
-			 * offset as possible so we don't have a whole lot of
-			 * fragmentation.
+			 * if previous extent entry covers the offset,
+			 * we should return it instead of the bitmap entry
 			 */
-			if (offset <= entry->offset) {
-				if (!ret)
-					ret = entry;
-				else if (entry->bytes < ret->bytes)
-					ret = entry;
-				else if (entry->offset < ret->offset)
-					ret = entry;
+			n = &entry->offset_index;
+			while (1) {
+				n = rb_prev(n);
+				if (!n)
+					break;
+				prev = rb_entry(n, struct btrfs_free_space,
+						offset_index);
+				if (!prev->bitmap) {
+					if (prev->offset + prev->bytes > offset)
+						entry = prev;
+					break;
+				}
 			}
-			n = n->rb_left;
-		} else if (bytes > entry->bytes) {
-			n = n->rb_right;
+		}
+		return entry;
+	}
+
+	if (!prev)
+		return NULL;
+
+	/* find last entry before the 'offset' */
+	entry = prev;
+	if (entry->offset > offset) {
+		n = rb_prev(&entry->offset_index);
+		if (n) {
+			entry = rb_entry(n, struct btrfs_free_space,
+					offset_index);
+			BUG_ON(entry->offset > offset);
 		} else {
-			/*
-			 * Ok we may have multiple chunks of the wanted size,
-			 * so we don't want to take the first one we find, we
-			 * want to take the one closest to our given offset, so
-			 * keep searching just in case theres a better match.
-			 */
-			n = n->rb_right;
-			if (offset > entry->offset)
-				continue;
-			else if (!ret || entry->offset < ret->offset)
-				ret = entry;
+			if (fuzzy)
+				return entry;
+			else
+				return NULL;
 		}
 	}
 
-	return ret;
+	if (entry->bitmap) {
+		n = &entry->offset_index;
+		while (1) {
+			n = rb_prev(n);
+			if (!n)
+				break;
+			prev = rb_entry(n, struct btrfs_free_space,
+					offset_index);
+			if (!prev->bitmap) {
+				if (prev->offset + prev->bytes > offset)
+					return prev;
+				break;
+			}
+		}
+		if (entry->offset + BITS_PER_BITMAP *
+		    block_group->sectorsize > offset)
+			return entry;
+	} else if (entry->offset + entry->bytes > offset)
+		return entry;
+
+	if (!fuzzy)
+		return NULL;
+
+	while (1) {
+		if (entry->bitmap) {
+			if (entry->offset + BITS_PER_BITMAP *
+			    block_group->sectorsize > offset)
+				break;
+		} else {
+			if (entry->offset + entry->bytes > offset)
+				break;
+		}
+
+		n = rb_next(&entry->offset_index);
+		if (!n)
+			return NULL;
+		entry = rb_entry(n, struct btrfs_free_space, offset_index);
+	}
+	return entry;
 }
 
 static void unlink_free_space(struct btrfs_block_group_cache *block_group,
 			      struct btrfs_free_space *info)
 {
 	rb_erase(&info->offset_index, &block_group->free_space_offset);
-	rb_erase(&info->bytes_index, &block_group->free_space_bytes);
+	block_group->free_extents--;
 }
 
 static int link_free_space(struct btrfs_block_group_cache *block_group,
@@ -190,17 +245,311 @@ static int link_free_space(struct btrfs_block_group_cache *block_group,
 {
 	int ret = 0;
 
-
-	BUG_ON(!info->bytes);
+	BUG_ON(!info->bitmap && !info->bytes);
 	ret = tree_insert_offset(&block_group->free_space_offset, info->offset,
-				 &info->offset_index);
+				 &info->offset_index, (info->bitmap != NULL));
 	if (ret)
 		return ret;
 
-	ret = tree_insert_bytes(&block_group->free_space_bytes, info->bytes,
-				&info->bytes_index);
-	if (ret)
-		return ret;
+	block_group->free_extents++;
+	return ret;
+}
+
+static void recalculate_thresholds(struct btrfs_block_group_cache *block_group)
+{
+	u64 max_bytes, possible_bytes;
+
+	/*
+	 * The goal is to keep the total amount of memory used per 1gb of space
+	 * at or below 32k, so we need to adjust how much memory we allow to be
+	 * used by extent based free space tracking
+	 */
+	max_bytes = MAX_CACHE_BYTES_PER_GIG *
+		(div64_u64(block_group->key.offset, 1024 * 1024 * 1024));
+
+	possible_bytes = (block_group->total_bitmaps * PAGE_CACHE_SIZE) +
+		(sizeof(struct btrfs_free_space) *
+		 block_group->extents_thresh);
+
+	if (possible_bytes > max_bytes) {
+		int extent_bytes = max_bytes -
+			(block_group->total_bitmaps * PAGE_CACHE_SIZE);
+
+		if (extent_bytes <= 0) {
+			block_group->extents_thresh = 0;
+			return;
+		}
+
+		block_group->extents_thresh = extent_bytes /
+			(sizeof(struct btrfs_free_space));
+	}
+}
+
+static void bitmap_clear_bits(struct btrfs_free_space *info, u64 offset, u64 bytes,
+			      u64 sectorsize)
+{
+	unsigned long start, end;
+	unsigned long i;
+
+	start = offset_to_bit(info->offset, sectorsize, offset);
+	end = start + bytes_to_bits(bytes, sectorsize);
+	BUG_ON(end > BITS_PER_BITMAP);
+
+	for (i = start; i < end; i++)
+		clear_bit(i, info->bitmap);
+
+	info->bytes -= bytes;
+}
+
+static void bitmap_set_bits(struct btrfs_free_space *info, u64 offset, u64 bytes,
+			    u64 sectorsize)
+{
+	unsigned long start, end;
+	unsigned long i;
+
+	start = offset_to_bit(info->offset, sectorsize, offset);
+	end = start + bytes_to_bits(bytes, sectorsize);
+	BUG_ON(end > BITS_PER_BITMAP);
+
+	for (i = start; i < end; i++)
+		set_bit(i, info->bitmap);
+
+	info->bytes += bytes;
+}
+
+static int search_bitmap(struct btrfs_block_group_cache *block_group,
+			 struct btrfs_free_space *bitmap_info, u64 *offset,
+			 u64 *bytes)
+{
+	unsigned long found_bits = 0;
+	unsigned long bits, i;
+	unsigned long next_zero;
+
+	i = offset_to_bit(bitmap_info->offset, block_group->sectorsize,
+			  max_t(u64, *offset, bitmap_info->offset));
+	bits = bytes_to_bits(*bytes, block_group->sectorsize);
+
+	for (i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i);
+	     i < BITS_PER_BITMAP;
+	     i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i + 1)) {
+		next_zero = find_next_zero_bit(bitmap_info->bitmap,
+					       BITS_PER_BITMAP, i);
+		if ((next_zero - i) >= bits) {
+			found_bits = next_zero - i;
+			break;
+		}
+		i = next_zero;
+	}
+
+	if (found_bits) {
+		*offset = (u64)(i * block_group->sectorsize) +
+			bitmap_info->offset;
+		*bytes = (u64)(found_bits) * block_group->sectorsize;
+		return 0;
+	}
+
+	return -1;
+}
+
+static struct btrfs_free_space *find_free_space(struct btrfs_block_group_cache
+						*block_group, u64 *offset,
+						u64 *bytes, int debug)
+{
+	struct btrfs_free_space *entry;
+	struct rb_node *node;
+	int ret;
+
+	if (!block_group->free_space_offset.rb_node)
+		return NULL;
+
+	entry = tree_search_offset(block_group,
+				   offset_to_bitmap(block_group, *offset),
+				   0, 1);
+	if (!entry)
+		return NULL;
+
+	for (node = &entry->offset_index; node; node = rb_next(node)) {
+		entry = rb_entry(node, struct btrfs_free_space, offset_index);
+		if (entry->bytes < *bytes)
+			continue;
+
+		if (entry->bitmap) {
+			ret = search_bitmap(block_group, entry, offset, bytes);
+			if (!ret)
+				return entry;
+			continue;
+		}
+
+		*offset = entry->offset;
+		*bytes = entry->bytes;
+		return entry;
+	}
+
+	return NULL;
+}
+
+static void add_new_bitmap(struct btrfs_block_group_cache *block_group,
+			   struct btrfs_free_space *info, u64 offset)
+{
+	u64 bytes_per_bg = BITS_PER_BITMAP * block_group->sectorsize;
+	int max_bitmaps = (int)div64_u64(block_group->key.offset +
+					 bytes_per_bg - 1, bytes_per_bg);
+	BUG_ON(block_group->total_bitmaps >= max_bitmaps);
+
+	info->offset = offset_to_bitmap(block_group, offset);
+	link_free_space(block_group, info);
+	block_group->total_bitmaps++;
+
+	recalculate_thresholds(block_group);
+}
+
+static noinline int remove_from_bitmap(struct btrfs_block_group_cache *block_group,
+			      struct btrfs_free_space *bitmap_info,
+			      u64 *offset, u64 *bytes)
+{
+	u64 end;
+
+again:
+	end = bitmap_info->offset +
+		(u64)(BITS_PER_BITMAP * block_group->sectorsize) - 1;
+
+	if (*offset > bitmap_info->offset && *offset + *bytes > end) {
+		bitmap_clear_bits(bitmap_info, *offset,
+				  end - *offset + 1, block_group->sectorsize);
+		*bytes -= end - *offset + 1;
+		*offset = end + 1;
+	} else if (*offset >= bitmap_info->offset && *offset + *bytes <= end) {
+		bitmap_clear_bits(bitmap_info, *offset,
+				  *bytes, block_group->sectorsize);
+		*bytes = 0;
+	}
+
+	if (*bytes) {
+		if (!bitmap_info->bytes) {
+			unlink_free_space(block_group, bitmap_info);
+			kfree(bitmap_info->bitmap);
+			kfree(bitmap_info);
+			block_group->total_bitmaps--;
+			recalculate_thresholds(block_group);
+		}
+
+		bitmap_info = tree_search_offset(block_group,
+						 offset_to_bitmap(block_group,
+								  *offset),
+						 1, 0);
+		if (!bitmap_info)
+			return -EINVAL;
+
+		if (!bitmap_info->bitmap)
+			return -EAGAIN;
+
+		goto again;
+	} else if (!bitmap_info->bytes) {
+		unlink_free_space(block_group, bitmap_info);
+		kfree(bitmap_info->bitmap);
+		kfree(bitmap_info);
+		block_group->total_bitmaps--;
+		recalculate_thresholds(block_group);
+	}
+
+	return 0;
+}
+
+static int insert_into_bitmap(struct btrfs_block_group_cache *block_group,
+			      struct btrfs_free_space *info)
+{
+	struct btrfs_free_space *bitmap_info;
+	int added = 0;
+	u64 bytes, offset, end;
+	int ret;
+
+	/*
+	 * If we are below the extents threshold then we can add this as an
+	 * extent, and don't have to deal with the bitmap
+	 */
+	if (block_group->free_extents < block_group->extents_thresh &&
+	    info->bytes > block_group->sectorsize * 4)
+		return 0;
+
+	/*
+	 * some block groups are so tiny they can't be enveloped by a bitmap, so
+	 * don't even bother to create a bitmap for this
+	 */
+	if (BITS_PER_BITMAP * block_group->sectorsize >
+	    block_group->key.offset)
+		return 0;
+
+	bytes = info->bytes;
+	offset = info->offset;
+
+again:
+	bitmap_info = tree_search_offset(block_group,
+					 offset_to_bitmap(block_group, offset),
+					 1, 0);
+	if (!bitmap_info) {
+		BUG_ON(added);
+		goto new_bitmap;
+	}
+
+	end = bitmap_info->offset +
+		(u64)(BITS_PER_BITMAP * block_group->sectorsize);
+
+	if (offset >= bitmap_info->offset && offset + bytes > end) {
+		bitmap_set_bits(bitmap_info, offset, end - offset,
+				block_group->sectorsize);
+		bytes -= end - offset;
+		offset = end;
+		added = 0;
+	} else if (offset >= bitmap_info->offset && offset + bytes <= end) {
+		bitmap_set_bits(bitmap_info, offset, bytes,
+				block_group->sectorsize);
+		bytes = 0;
+	} else {
+		BUG();
+	}
+
+	if (!bytes) {
+		ret = 1;
+		goto out;
+	} else
+		goto again;
+
+new_bitmap:
+	if (info && info->bitmap) {
+		add_new_bitmap(block_group, info, offset);
+		added = 1;
+		info = NULL;
+		goto again;
+	} else {
+		spin_unlock(&block_group->tree_lock);
+
+		/* no pre-allocated info, allocate a new one */
+		if (!info) {
+			info = kzalloc(sizeof(struct btrfs_free_space),
+				       GFP_NOFS);
+			if (!info) {
+				spin_lock(&block_group->tree_lock);
+				ret = -ENOMEM;
+				goto out;
+			}
+		}
+
+		/* allocate the bitmap */
+		info->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
+		spin_lock(&block_group->tree_lock);
+		if (!info->bitmap) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		goto again;
+	}
+
+out:
+	if (info) {
+		if (info->bitmap)
+			kfree(info->bitmap);
+		kfree(info);
+	}
 
 	return ret;
 }
@@ -208,8 +557,8 @@ static int link_free_space(struct btrfs_block_group_cache *block_group,
 int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
 			 u64 offset, u64 bytes)
 {
-	struct btrfs_free_space *right_info;
-	struct btrfs_free_space *left_info;
+	struct btrfs_free_space *right_info = NULL;
+	struct btrfs_free_space *left_info = NULL;
 	struct btrfs_free_space *info = NULL;
 	int ret = 0;
 
@@ -227,18 +576,38 @@ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
 	 * are adding, if there is remove that struct and add a new one to
 	 * cover the entire range
 	 */
-	right_info = tree_search_offset(&block_group->free_space_offset,
-					offset+bytes, 0, 0);
-	left_info = tree_search_offset(&block_group->free_space_offset,
-				       offset-1, 0, 1);
+	right_info = tree_search_offset(block_group, offset + bytes, 0, 0);
+	if (right_info && rb_prev(&right_info->offset_index))
+		left_info = rb_entry(rb_prev(&right_info->offset_index),
+				     struct btrfs_free_space, offset_index);
+	else
+		left_info = tree_search_offset(block_group, offset - 1, 0, 0);
 
-	if (right_info) {
+	/*
+	 * If there was no extent directly to the left or right of this new
+	 * extent then we know we're going to have to allocate a new extent, so
+	 * before we do that see if we need to drop this into a bitmap
+	 */
+	if ((!left_info || left_info->bitmap) &&
+	    (!right_info || right_info->bitmap)) {
+		ret = insert_into_bitmap(block_group, info);
+
+		if (ret < 0) {
+			goto out;
+		} else if (ret) {
+			ret = 0;
+			goto out;
+		}
+	}
+
+	if (right_info && !right_info->bitmap) {
 		unlink_free_space(block_group, right_info);
 		info->bytes += right_info->bytes;
 		kfree(right_info);
 	}
 
-	if (left_info && left_info->offset + left_info->bytes == offset) {
+	if (left_info && !left_info->bitmap &&
+	    left_info->offset + left_info->bytes == offset) {
 		unlink_free_space(block_group, left_info);
 		info->offset = left_info->offset;
 		info->bytes += left_info->bytes;
@@ -248,11 +617,11 @@ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
 	ret = link_free_space(block_group, info);
 	if (ret)
 		kfree(info);
-
+out:
 	spin_unlock(&block_group->tree_lock);
 
 	if (ret) {
-		printk(KERN_ERR "btrfs: unable to add free space :%d\n", ret);
+		printk(KERN_CRIT "btrfs: unable to add free space :%d\n", ret);
 		BUG_ON(ret == -EEXIST);
 	}
 
@@ -263,40 +632,65 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
 			    u64 offset, u64 bytes)
 {
 	struct btrfs_free_space *info;
+	struct btrfs_free_space *next_info = NULL;
 	int ret = 0;
 
 	spin_lock(&block_group->tree_lock);
 
-	info = tree_search_offset(&block_group->free_space_offset, offset, 0,
-				  1);
-	if (info && info->offset == offset) {
-		if (info->bytes < bytes) {
-			printk(KERN_ERR "Found free space at %llu, size %llu,"
-			       "trying to use %llu\n",
-			       (unsigned long long)info->offset,
-			       (unsigned long long)info->bytes,
-			       (unsigned long long)bytes);
+again:
+	info = tree_search_offset(block_group, offset, 0, 0);
+	if (!info) {
+		WARN_ON(1);
+		goto out_lock;
+	}
+
+	if (info->bytes < bytes && rb_next(&info->offset_index)) {
+		u64 end;
+		next_info = rb_entry(rb_next(&info->offset_index),
+					     struct btrfs_free_space,
+					     offset_index);
+
+		if (next_info->bitmap)
+			end = next_info->offset + BITS_PER_BITMAP *
+				block_group->sectorsize - 1;
+		else
+			end = next_info->offset + next_info->bytes;
+
+		if (next_info->bytes < bytes ||
+		    next_info->offset > offset || offset > end) {
+			printk(KERN_CRIT "Found free space at %llu, size %llu,"
+			      " trying to use %llu\n",
+			      (unsigned long long)info->offset,
+			      (unsigned long long)info->bytes,
+			      (unsigned long long)bytes);
 			WARN_ON(1);
 			ret = -EINVAL;
-			spin_unlock(&block_group->tree_lock);
-			goto out;
+			goto out_lock;
 		}
-		unlink_free_space(block_group, info);
 
-		if (info->bytes == bytes) {
-			kfree(info);
-			spin_unlock(&block_group->tree_lock);
-			goto out;
+		info = next_info;
+	}
+
+	if (info->bytes == bytes) {
+		unlink_free_space(block_group, info);
+		if (info->bitmap) {
+			kfree(info->bitmap);
+			block_group->total_bitmaps--;
 		}
+		kfree(info);
+		goto out_lock;
+	}
 
+	if (!info->bitmap && info->offset == offset) {
+		unlink_free_space(block_group, info);
 		info->offset += bytes;
 		info->bytes -= bytes;
+		link_free_space(block_group, info);
+		goto out_lock;
+	}
 
-		ret = link_free_space(block_group, info);
-		spin_unlock(&block_group->tree_lock);
-		BUG_ON(ret);
-	} else if (info && info->offset < offset &&
-		   info->offset + info->bytes >= offset + bytes) {
+	if (!info->bitmap && info->offset <= offset &&
+	    info->offset + info->bytes >= offset + bytes) {
 		u64 old_start = info->offset;
 		/*
 		 * we're freeing space in the middle of the info,
@@ -312,7 +706,9 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
 			info->offset = offset + bytes;
 			info->bytes = old_end - info->offset;
 			ret = link_free_space(block_group, info);
-			BUG_ON(ret);
+			WARN_ON(ret);
+			if (ret)
+				goto out_lock;
 		} else {
 			/* the hole we're creating ends at the end
 			 * of the info struct, just free the info
@@ -320,32 +716,22 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
 			kfree(info);
 		}
 		spin_unlock(&block_group->tree_lock);
-		/* step two, insert a new info struct to cover anything
-		 * before the hole
+
+		/* step two, insert a new info struct to cover
+		 * anything before the hole
 		 */
 		ret = btrfs_add_free_space(block_group, old_start,
 					   offset - old_start);
-		BUG_ON(ret);
-	} else {
-		spin_unlock(&block_group->tree_lock);
-		if (!info) {
-			printk(KERN_ERR "couldn't find space %llu to free\n",
-			       (unsigned long long)offset);
-			printk(KERN_ERR "cached is %d, offset %llu bytes %llu\n",
-			       block_group->cached,
-			       (unsigned long long)block_group->key.objectid,
-			       (unsigned long long)block_group->key.offset);
-			btrfs_dump_free_space(block_group, bytes);
-		} else if (info) {
-			printk(KERN_ERR "hmm, found offset=%llu bytes=%llu, "
-			       "but wanted offset=%llu bytes=%llu\n",
-			       (unsigned long long)info->offset,
-			       (unsigned long long)info->bytes,
-			       (unsigned long long)offset,
-			       (unsigned long long)bytes);
-		}
-		WARN_ON(1);
+		WARN_ON(ret);
+		goto out;
 	}
+
+	ret = remove_from_bitmap(block_group, info, &offset, &bytes);
+	if (ret == -EAGAIN)
+		goto again;
+	BUG_ON(ret);
+out_lock:
+	spin_unlock(&block_group->tree_lock);
 out:
 	return ret;
 }
@@ -361,10 +747,13 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
 		info = rb_entry(n, struct btrfs_free_space, offset_index);
 		if (info->bytes >= bytes)
 			count++;
-		printk(KERN_ERR "entry offset %llu, bytes %llu\n",
+		printk(KERN_CRIT "entry offset %llu, bytes %llu, bitmap %s\n",
 		       (unsigned long long)info->offset,
-		       (unsigned long long)info->bytes);
+		       (unsigned long long)info->bytes,
+		       (info->bitmap) ? "yes" : "no");
 	}
+	printk(KERN_INFO "block group has cluster?: %s\n",
+	       list_empty(&block_group->cluster_list) ? "no" : "yes");
 	printk(KERN_INFO "%d blocks of free space at or bigger than bytes is"
 	       "\n", count);
 }
@@ -397,26 +786,35 @@ __btrfs_return_cluster_to_free_space(
 {
 	struct btrfs_free_space *entry;
 	struct rb_node *node;
+	bool bitmap;
 
 	spin_lock(&cluster->lock);
 	if (cluster->block_group != block_group)
 		goto out;
 
+	bitmap = cluster->points_to_bitmap;
+	cluster->block_group = NULL;
 	cluster->window_start = 0;
+	list_del_init(&cluster->block_group_list);
+	cluster->points_to_bitmap = false;
+
+	if (bitmap)
+		goto out;
+
 	node = rb_first(&cluster->root);
-	while(node) {
+	while (node) {
 		entry = rb_entry(node, struct btrfs_free_space, offset_index);
 		node = rb_next(&entry->offset_index);
 		rb_erase(&entry->offset_index, &cluster->root);
-		link_free_space(block_group, entry);
+		BUG_ON(entry->bitmap);
+		tree_insert_offset(&block_group->free_space_offset,
+				   entry->offset, &entry->offset_index, 0);
 	}
-	list_del_init(&cluster->block_group_list);
-
-	btrfs_put_block_group(cluster->block_group);
-	cluster->block_group = NULL;
 	cluster->root.rb_node = NULL;
+
 out:
 	spin_unlock(&cluster->lock);
+	btrfs_put_block_group(block_group);
 	return 0;
 }
 
@@ -425,20 +823,28 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
 	struct btrfs_free_space *info;
 	struct rb_node *node;
 	struct btrfs_free_cluster *cluster;
-	struct btrfs_free_cluster *safe;
+	struct list_head *head;
 
 	spin_lock(&block_group->tree_lock);
-
-	list_for_each_entry_safe(cluster, safe, &block_group->cluster_list,
-				 block_group_list) {
+	while ((head = block_group->cluster_list.next) !=
+	       &block_group->cluster_list) {
+		cluster = list_entry(head, struct btrfs_free_cluster,
+				     block_group_list);
 
 		WARN_ON(cluster->block_group != block_group);
 		__btrfs_return_cluster_to_free_space(block_group, cluster);
+		if (need_resched()) {
+			spin_unlock(&block_group->tree_lock);
+			cond_resched();
+			spin_lock(&block_group->tree_lock);
+		}
 	}
 
-	while ((node = rb_last(&block_group->free_space_bytes)) != NULL) {
-		info = rb_entry(node, struct btrfs_free_space, bytes_index);
+	while ((node = rb_last(&block_group->free_space_offset)) != NULL) {
+		info = rb_entry(node, struct btrfs_free_space, offset_index);
 		unlink_free_space(block_group, info);
+		if (info->bitmap)
+			kfree(info->bitmap);
 		kfree(info);
 		if (need_resched()) {
 			spin_unlock(&block_group->tree_lock);
@@ -446,6 +852,7 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
 			spin_lock(&block_group->tree_lock);
 		}
 	}
+
 	spin_unlock(&block_group->tree_lock);
 }
 
@@ -453,27 +860,37 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
 			       u64 offset, u64 bytes, u64 empty_size)
 {
 	struct btrfs_free_space *entry = NULL;
+	u64 bytes_search = bytes + empty_size;
 	u64 ret = 0;
 
 	spin_lock(&block_group->tree_lock);
-	entry = tree_search_offset(&block_group->free_space_offset, offset,
-				   bytes + empty_size, 1);
+	entry = find_free_space(block_group, &offset, &bytes_search, 0);
 	if (!entry)
-		entry = tree_search_bytes(&block_group->free_space_bytes,
-					  offset, bytes + empty_size);
-	if (entry) {
+		goto out;
+
+	ret = offset;
+	if (entry->bitmap) {
+		bitmap_clear_bits(entry, offset, bytes,
+				  block_group->sectorsize);
+		if (!entry->bytes) {
+			unlink_free_space(block_group, entry);
+			kfree(entry->bitmap);
+			kfree(entry);
+			block_group->total_bitmaps--;
+			recalculate_thresholds(block_group);
+		}
+	} else {
 		unlink_free_space(block_group, entry);
-		ret = entry->offset;
 		entry->offset += bytes;
 		entry->bytes -= bytes;
-
 		if (!entry->bytes)
 			kfree(entry);
 		else
 			link_free_space(block_group, entry);
 	}
-	spin_unlock(&block_group->tree_lock);
 
+out:
+	spin_unlock(&block_group->tree_lock);
 	return ret;
 }
 
@@ -517,6 +934,47 @@ int btrfs_return_cluster_to_free_space(
 	return ret;
 }
 
+static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
+				   struct btrfs_free_cluster *cluster,
+				   u64 bytes, u64 min_start)
+{
+	struct btrfs_free_space *entry;
+	int err;
+	u64 search_start = cluster->window_start;
+	u64 search_bytes = bytes;
+	u64 ret = 0;
+
+	spin_lock(&block_group->tree_lock);
+	spin_lock(&cluster->lock);
+
+	if (!cluster->points_to_bitmap)
+		goto out;
+
+	if (cluster->block_group != block_group)
+		goto out;
+
+	entry = tree_search_offset(block_group, search_start, 0, 0);
+
+	if (!entry || !entry->bitmap)
+		goto out;
+
+	search_start = min_start;
+	search_bytes = bytes;
+
+	err = search_bitmap(block_group, entry, &search_start,
+			    &search_bytes);
+	if (err)
+		goto out;
+
+	ret = search_start;
+	bitmap_clear_bits(entry, ret, bytes, block_group->sectorsize);
+out:
+	spin_unlock(&cluster->lock);
+	spin_unlock(&block_group->tree_lock);
+
+	return ret;
+}
+
 /*
  * given a cluster, try to allocate 'bytes' from it, returns 0
  * if it couldn't find anything suitably large, or a logical disk offset
@@ -530,6 +988,10 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
 	struct rb_node *node;
 	u64 ret = 0;
 
+	if (cluster->points_to_bitmap)
+		return btrfs_alloc_from_bitmap(block_group, cluster, bytes,
+					       min_start);
+
 	spin_lock(&cluster->lock);
 	if (bytes > cluster->max_size)
 		goto out;
@@ -567,9 +1029,73 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
 	}
 out:
 	spin_unlock(&cluster->lock);
+
 	return ret;
 }
 
+static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
+				struct btrfs_free_space *entry,
+				struct btrfs_free_cluster *cluster,
+				u64 offset, u64 bytes, u64 min_bytes)
+{
+	unsigned long next_zero;
+	unsigned long i;
+	unsigned long search_bits;
+	unsigned long total_bits;
+	unsigned long found_bits;
+	unsigned long start = 0;
+	unsigned long total_found = 0;
+	bool found = false;
+
+	i = offset_to_bit(entry->offset, block_group->sectorsize,
+			  max_t(u64, offset, entry->offset));
+	search_bits = bytes_to_bits(min_bytes, block_group->sectorsize);
+	total_bits = bytes_to_bits(bytes, block_group->sectorsize);
+
+again:
+	found_bits = 0;
+	for (i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i);
+	     i < BITS_PER_BITMAP;
+	     i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) {
+		next_zero = find_next_zero_bit(entry->bitmap,
+					       BITS_PER_BITMAP, i);
+		if (next_zero - i >= search_bits) {
+			found_bits = next_zero - i;
+			break;
+		}
+		i = next_zero;
+	}
+
+	if (!found_bits)
+		return -1;
+
+	if (!found) {
+		start = i;
+		found = true;
+	}
+
+	total_found += found_bits;
+
+	if (cluster->max_size < found_bits * block_group->sectorsize)
+		cluster->max_size = found_bits * block_group->sectorsize;
+
+	if (total_found < total_bits) {
+		i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, next_zero);
+		if (i - start > total_bits * 2) {
+			total_found = 0;
+			cluster->max_size = 0;
+			found = false;
+		}
+		goto again;
+	}
+
+	cluster->window_start = start * block_group->sectorsize +
+		entry->offset;
+	cluster->points_to_bitmap = true;
+
+	return 0;
+}
+
 /*
  * here we try to find a cluster of blocks in a block group.  The goal
  * is to find at least bytes free and up to empty_size + bytes free.
@@ -587,12 +1113,12 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
 	struct btrfs_free_space *entry = NULL;
 	struct rb_node *node;
 	struct btrfs_free_space *next;
-	struct btrfs_free_space *last;
+	struct btrfs_free_space *last = NULL;
 	u64 min_bytes;
 	u64 window_start;
 	u64 window_free;
 	u64 max_extent = 0;
-	int total_retries = 0;
+	bool found_bitmap = false;
 	int ret;
 
 	/* for metadata, allow allocates with more holes */
@@ -620,30 +1146,79 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
 		goto out;
 	}
 again:
-	min_bytes = min(min_bytes, bytes + empty_size);
-	entry = tree_search_bytes(&block_group->free_space_bytes,
-				  offset, min_bytes);
+	entry = tree_search_offset(block_group, offset, found_bitmap, 1);
 	if (!entry) {
 		ret = -ENOSPC;
 		goto out;
 	}
+
+	/*
+	 * If found_bitmap is true, we exhausted our search for extent entries,
+	 * and we just want to search all of the bitmaps that we can find, and
+	 * ignore any extent entries we find.
+	 */
+	while (entry->bitmap || found_bitmap ||
+	       (!entry->bitmap && entry->bytes < min_bytes)) {
+		struct rb_node *node = rb_next(&entry->offset_index);
+
+		if (entry->bitmap && entry->bytes > bytes + empty_size) {
+			ret = btrfs_bitmap_cluster(block_group, entry, cluster,
+						   offset, bytes + empty_size,
+						   min_bytes);
+			if (!ret)
+				goto got_it;
+		}
+
+		if (!node) {
+			ret = -ENOSPC;
+			goto out;
+		}
+		entry = rb_entry(node, struct btrfs_free_space, offset_index);
+	}
+
+	/*
+	 * We already searched all the extent entries from the passed in offset
+	 * to the end and didn't find enough space for the cluster, and we also
+	 * didn't find any bitmaps that met our criteria, just go ahead and exit
+	 */
+	if (found_bitmap) {
+		ret = -ENOSPC;
+		goto out;
+	}
+
+	cluster->points_to_bitmap = false;
 	window_start = entry->offset;
 	window_free = entry->bytes;
 	last = entry;
 	max_extent = entry->bytes;
 
-	while(1) {
+	while (1) {
 		/* out window is just right, lets fill it */
 		if (window_free >= bytes + empty_size)
 			break;
 
 		node = rb_next(&last->offset_index);
 		if (!node) {
+			if (found_bitmap)
+				goto again;
 			ret = -ENOSPC;
 			goto out;
 		}
 		next = rb_entry(node, struct btrfs_free_space, offset_index);
 
+		/*
+		 * we found a bitmap, so if this search doesn't result in a
+		 * cluster, we know to go and search again for the bitmaps and
+		 * start looking for space there
+		 */
+		if (next->bitmap) {
+			if (!found_bitmap)
+				offset = next->offset;
+			found_bitmap = true;
+			last = next;
+			continue;
+		}
+
 		/*
 		 * we haven't filled the empty size and the window is
 		 * very large.  reset and try again
@@ -655,19 +1230,6 @@ again:
 			window_free = entry->bytes;
 			last = entry;
 			max_extent = 0;
-			total_retries++;
-			if (total_retries % 64 == 0) {
-				if (min_bytes >= (bytes + empty_size)) {
-					ret = -ENOSPC;
-					goto out;
-				}
-				/*
-				 * grow our allocation a bit, we're not having
-				 * much luck
-				 */
-				min_bytes *= 2;
-				goto again;
-			}
 		} else {
 			last = next;
 			window_free += next->bytes;
@@ -685,11 +1247,19 @@ again:
 	 * The cluster includes an rbtree, but only uses the offset index
 	 * of each free space cache entry.
 	 */
-	while(1) {
+	while (1) {
 		node = rb_next(&entry->offset_index);
-		unlink_free_space(block_group, entry);
+		if (entry->bitmap && node) {
+			entry = rb_entry(node, struct btrfs_free_space,
+					 offset_index);
+			continue;
+		} else if (entry->bitmap && !node) {
+			break;
+		}
+
+		rb_erase(&entry->offset_index, &block_group->free_space_offset);
 		ret = tree_insert_offset(&cluster->root, entry->offset,
-					 &entry->offset_index);
+					 &entry->offset_index, 0);
 		BUG_ON(ret);
 
 		if (!node || entry == last)
@@ -697,8 +1267,10 @@ again:
 
 		entry = rb_entry(node, struct btrfs_free_space, offset_index);
 	}
-	ret = 0;
+
 	cluster->max_size = max_extent;
+got_it:
+	ret = 0;
 	atomic_inc(&block_group->count);
 	list_add_tail(&cluster->block_group_list, &block_group->cluster_list);
 	cluster->block_group = block_group;
@@ -718,6 +1290,7 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
 	spin_lock_init(&cluster->refill_lock);
 	cluster->root.rb_node = NULL;
 	cluster->max_size = 0;
+	cluster->points_to_bitmap = false;
 	INIT_LIST_HEAD(&cluster->block_group_list);
 	cluster->block_group = NULL;
 }
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index 266fb8764054..890a8e79011b 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -19,6 +19,14 @@
 #ifndef __BTRFS_FREE_SPACE_CACHE
 #define __BTRFS_FREE_SPACE_CACHE
 
+struct btrfs_free_space {
+	struct rb_node offset_index;
+	u64 offset;
+	u64 bytes;
+	unsigned long *bitmap;
+	struct list_head list;
+};
+
 int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
 			 u64 bytenr, u64 size);
 int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
-- 
cgit v1.2.3-70-g09d2


From 817d52f8dba26d0295c26035531c30ce5f1e3c3e Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Mon, 13 Jul 2009 21:29:25 -0400
Subject: Btrfs: async block group caching

This patch moves the caching of the block group off to a kthread in order to
allow people to allocate sooner.  Instead of blocking up behind the caching
mutex, we instead kick of the caching kthread, and then attempt to make an
allocation.  If we cannot, we wait on the block groups caching waitqueue, which
the caching kthread will wake the waiting threads up everytime it finds 2 meg
worth of space, and then again when its finished caching.  This is how I tested
the speedup from this

mkfs the disk
mount the disk
fill the disk up with fs_mark
unmount the disk
mount the disk
time touch /mnt/foo

Without my changes this took 11 seconds on my box, with these changes it now
takes 1 second.

Another change thats been put in place is we lock the super mirror's in the
pinned extent map in order to keep us from adding that stuff as free space when
caching the block group.  This doesn't really change anything else as far as the
pinned extent map is concerned, since for actual pinned extents we use
EXTENT_DIRTY, but it does mean that when we unmount we have to go in and unlock
those extents to keep from leaking memory.

I've also added a check where when we are reading block groups from disk, if the
amount of space used == the size of the block group, we go ahead and mark the
block group as cached.  This drastically reduces the amount of time it takes to
cache the block groups.  Using the same test as above, except doing a dd to a
file and then unmounting, it used to take 33 seconds to umount, now it takes 3
seconds.

This version uses the commit_root in the caching kthread, and then keeps track
of how many async caching threads are running at any given time so if one of the
async threads is still running as we cross transactions we can wait until its
finished before handling the pinned extents.  Thank you,

Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h            |  22 ++-
 fs/btrfs/disk-io.c          |   3 +
 fs/btrfs/extent-tree.c      | 471 +++++++++++++++++++++++++++++++++++---------
 fs/btrfs/free-space-cache.c |  42 ++--
 fs/btrfs/transaction.c      |  23 ++-
 fs/btrfs/tree-log.c         |   2 +-
 6 files changed, 439 insertions(+), 124 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 0cbf3491bb7c..42b03c4ee494 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -691,6 +691,7 @@ struct btrfs_space_info {
 	struct list_head block_groups;
 	spinlock_t lock;
 	struct rw_semaphore groups_sem;
+	atomic_t caching_threads;
 };
 
 /*
@@ -721,11 +722,17 @@ struct btrfs_free_cluster {
 	struct list_head block_group_list;
 };
 
+enum btrfs_caching_type {
+	BTRFS_CACHE_NO		= 0,
+	BTRFS_CACHE_STARTED	= 1,
+	BTRFS_CACHE_FINISHED	= 2,
+};
+
 struct btrfs_block_group_cache {
 	struct btrfs_key key;
 	struct btrfs_block_group_item item;
+	struct btrfs_fs_info *fs_info;
 	spinlock_t lock;
-	struct mutex cache_mutex;
 	u64 pinned;
 	u64 reserved;
 	u64 flags;
@@ -733,15 +740,19 @@ struct btrfs_block_group_cache {
 	int extents_thresh;
 	int free_extents;
 	int total_bitmaps;
-	int cached;
 	int ro;
 	int dirty;
 
+	/* cache tracking stuff */
+	wait_queue_head_t caching_q;
+	int cached;
+
 	struct btrfs_space_info *space_info;
 
 	/* free space cache stuff */
 	spinlock_t tree_lock;
 	struct rb_root free_space_offset;
+	u64 free_space;
 
 	/* block group cache stuff */
 	struct rb_node cache_node;
@@ -834,6 +845,7 @@ struct btrfs_fs_info {
 	atomic_t async_submit_draining;
 	atomic_t nr_async_bios;
 	atomic_t async_delalloc_pages;
+	atomic_t async_caching_threads;
 
 	/*
 	 * this is used by the balancing code to wait for all the pending
@@ -950,6 +962,9 @@ struct btrfs_root {
 	/* the node lock is held while changing the node pointer */
 	spinlock_t node_lock;
 
+	/* taken when updating the commit root */
+	struct rw_semaphore commit_root_sem;
+
 	struct extent_buffer *commit_root;
 	struct btrfs_root *log_root;
 	struct btrfs_root *reloc_root;
@@ -1911,7 +1926,7 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root, unsigned long count);
 int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
 int btrfs_update_pinned_extents(struct btrfs_root *root,
-				u64 bytenr, u64 num, int pin);
+				u64 bytenr, u64 num, int pin, int mark_free);
 int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root, struct extent_buffer *leaf);
 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
@@ -1996,6 +2011,7 @@ void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
 				 u64 bytes);
 void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
 			      u64 bytes);
+void btrfs_free_super_mirror_extents(struct btrfs_fs_info *info);
 /* ctree.c */
 int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
 		     int level, int *slot);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 55d9d188e693..ec2c915f7f4a 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -907,6 +907,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 	spin_lock_init(&root->inode_lock);
 	mutex_init(&root->objectid_mutex);
 	mutex_init(&root->log_mutex);
+	init_rwsem(&root->commit_root_sem);
 	init_waitqueue_head(&root->log_writer_wait);
 	init_waitqueue_head(&root->log_commit_wait[0]);
 	init_waitqueue_head(&root->log_commit_wait[1]);
@@ -1566,6 +1567,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	atomic_set(&fs_info->async_delalloc_pages, 0);
 	atomic_set(&fs_info->async_submit_draining, 0);
 	atomic_set(&fs_info->nr_async_bios, 0);
+	atomic_set(&fs_info->async_caching_threads, 0);
 	fs_info->sb = sb;
 	fs_info->max_extent = (u64)-1;
 	fs_info->max_inline = 8192 * 1024;
@@ -2337,6 +2339,7 @@ int close_ctree(struct btrfs_root *root)
 	free_extent_buffer(root->fs_info->csum_root->commit_root);
 
 	btrfs_free_block_groups(root->fs_info);
+	btrfs_free_super_mirror_extents(root->fs_info);
 
 	del_fs_roots(fs_info);
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 98697be6bdde..9a489cc89fd3 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -21,6 +21,7 @@
 #include <linux/blkdev.h>
 #include <linux/sort.h>
 #include <linux/rcupdate.h>
+#include <linux/kthread.h>
 #include "compat.h"
 #include "hash.h"
 #include "ctree.h"
@@ -61,6 +62,13 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *extent_root, u64 alloc_bytes,
 			  u64 flags, int force);
 
+static noinline int
+block_group_cache_done(struct btrfs_block_group_cache *cache)
+{
+	smp_mb();
+	return cache->cached == BTRFS_CACHE_FINISHED;
+}
+
 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
 {
 	return (cache->flags & bits) == bits;
@@ -145,21 +153,64 @@ block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
 	return ret;
 }
 
+void btrfs_free_super_mirror_extents(struct btrfs_fs_info *info)
+{
+	u64 start, end, last = 0;
+	int ret;
+
+	while (1) {
+		ret = find_first_extent_bit(&info->pinned_extents, last,
+					    &start, &end, EXTENT_LOCKED);
+		if (ret)
+			break;
+
+		unlock_extent(&info->pinned_extents, start, end, GFP_NOFS);
+		last = end+1;
+	}
+}
+
+static int remove_sb_from_cache(struct btrfs_root *root,
+				struct btrfs_block_group_cache *cache)
+{
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	u64 bytenr;
+	u64 *logical;
+	int stripe_len;
+	int i, nr, ret;
+
+	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
+		bytenr = btrfs_sb_offset(i);
+		ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
+				       cache->key.objectid, bytenr,
+				       0, &logical, &nr, &stripe_len);
+		BUG_ON(ret);
+		while (nr--) {
+			try_lock_extent(&fs_info->pinned_extents,
+					logical[nr],
+					logical[nr] + stripe_len - 1, GFP_NOFS);
+		}
+		kfree(logical);
+	}
+
+	return 0;
+}
+
 /*
  * this is only called by cache_block_group, since we could have freed extents
  * we need to check the pinned_extents for any extents that can't be used yet
  * since their free space will be released as soon as the transaction commits.
  */
-static int add_new_free_space(struct btrfs_block_group_cache *block_group,
+static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
 			      struct btrfs_fs_info *info, u64 start, u64 end)
 {
-	u64 extent_start, extent_end, size;
+	u64 extent_start, extent_end, size, total_added = 0;
 	int ret;
 
 	while (start < end) {
 		ret = find_first_extent_bit(&info->pinned_extents, start,
 					    &extent_start, &extent_end,
-					    EXTENT_DIRTY);
+					    EXTENT_DIRTY|EXTENT_LOCKED|
+					    EXTENT_DELALLOC);
 		if (ret)
 			break;
 
@@ -167,6 +218,7 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group,
 			start = extent_end + 1;
 		} else if (extent_start > start && extent_start < end) {
 			size = extent_start - start;
+			total_added += size;
 			ret = btrfs_add_free_space(block_group, start,
 						   size);
 			BUG_ON(ret);
@@ -178,84 +230,139 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group,
 
 	if (start < end) {
 		size = end - start;
+		total_added += size;
 		ret = btrfs_add_free_space(block_group, start, size);
 		BUG_ON(ret);
 	}
 
-	return 0;
+	return total_added;
 }
 
-static int remove_sb_from_cache(struct btrfs_root *root,
-				struct btrfs_block_group_cache *cache)
+DEFINE_MUTEX(discard_mutex);
+
+/*
+ * if async kthreads are running when we cross transactions, we mark any pinned
+ * extents with EXTENT_DELALLOC and then let the caching kthreads clean up those
+ * extents when they are done.  Also we run this from btrfs_finish_extent_commit
+ * in case there were some pinned extents that were missed because we had
+ * already cached that block group.
+ */
+static void btrfs_discard_pinned_extents(struct btrfs_fs_info *fs_info,
+					 struct btrfs_block_group_cache *cache)
 {
-	u64 bytenr;
-	u64 *logical;
-	int stripe_len;
-	int i, nr, ret;
+	u64 start, end, last;
+	int ret;
 
-	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
-		bytenr = btrfs_sb_offset(i);
-		ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
-				       cache->key.objectid, bytenr, 0,
-				       &logical, &nr, &stripe_len);
-		BUG_ON(ret);
-		while (nr--) {
-			btrfs_remove_free_space(cache, logical[nr],
-						stripe_len);
+	if (!cache)
+		last = 0;
+	else
+		last = cache->key.objectid;
+
+	mutex_lock(&discard_mutex);
+	while (1) {
+		ret = find_first_extent_bit(&fs_info->pinned_extents, last,
+					    &start, &end, EXTENT_DELALLOC);
+		if (ret)
+			break;
+
+		if (cache && start >= cache->key.objectid + cache->key.offset)
+			break;
+
+
+		if (!cache) {
+			cache = btrfs_lookup_block_group(fs_info, start);
+			BUG_ON(!cache);
+
+			start = max(start, cache->key.objectid);
+			end = min(end, cache->key.objectid + cache->key.offset - 1);
+
+			if (block_group_cache_done(cache))
+				btrfs_add_free_space(cache, start,
+						     end - start + 1);
+			cache = NULL;
+		} else {
+			start = max(start, cache->key.objectid);
+			end = min(end, cache->key.objectid + cache->key.offset - 1);
+			btrfs_add_free_space(cache, start, end - start + 1);
+		}
+
+		clear_extent_bits(&fs_info->pinned_extents, start, end,
+				  EXTENT_DELALLOC, GFP_NOFS);
+		last = end + 1;
+
+		if (need_resched()) {
+			mutex_unlock(&discard_mutex);
+			cond_resched();
+			mutex_lock(&discard_mutex);
 		}
-		kfree(logical);
 	}
-	return 0;
+	mutex_unlock(&discard_mutex);
 }
 
-static int cache_block_group(struct btrfs_root *root,
-			     struct btrfs_block_group_cache *block_group)
+static int caching_kthread(void *data)
 {
+	struct btrfs_block_group_cache *block_group = data;
+	struct btrfs_fs_info *fs_info = block_group->fs_info;
+	u64 last = 0;
 	struct btrfs_path *path;
 	int ret = 0;
 	struct btrfs_key key;
 	struct extent_buffer *leaf;
 	int slot;
-	u64 last;
-
-	if (!block_group)
-		return 0;
+	u64 total_found = 0;
 
-	root = root->fs_info->extent_root;
-
-	if (block_group->cached)
-		return 0;
+	BUG_ON(!fs_info);
 
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
 
-	path->reada = 2;
+	atomic_inc(&fs_info->async_caching_threads);
+	atomic_inc(&block_group->space_info->caching_threads);
+	last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
+again:
+	/* need to make sure the commit_root doesn't disappear */
+	down_read(&fs_info->extent_root->commit_root_sem);
+
 	/*
-	 * we get into deadlocks with paths held by callers of this function.
-	 * since the alloc_mutex is protecting things right now, just
-	 * skip the locking here
+	 * We don't want to deadlock with somebody trying to allocate a new
+	 * extent for the extent root while also trying to search the extent
+	 * root to add free space.  So we skip locking and search the commit
+	 * root, since its read-only
 	 */
 	path->skip_locking = 1;
-	last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
+	path->search_commit_root = 1;
+	path->reada = 2;
+
 	key.objectid = last;
 	key.offset = 0;
 	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
-	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
 	if (ret < 0)
 		goto err;
 
 	while (1) {
+		smp_mb();
+		if (block_group->fs_info->closing)
+			break;
+
 		leaf = path->nodes[0];
 		slot = path->slots[0];
 		if (slot >= btrfs_header_nritems(leaf)) {
-			ret = btrfs_next_leaf(root, path);
+			ret = btrfs_next_leaf(fs_info->extent_root, path);
 			if (ret < 0)
 				goto err;
-			if (ret == 0)
-				continue;
-			else
+			else if (ret)
 				break;
+
+			if (need_resched()) {
+				btrfs_release_path(fs_info->extent_root, path);
+				up_read(&fs_info->extent_root->commit_root_sem);
+				cond_resched();
+				goto again;
+			}
+
+			continue;
 		}
 		btrfs_item_key_to_cpu(leaf, &key, slot);
 		if (key.objectid < block_group->key.objectid)
@@ -266,24 +373,63 @@ static int cache_block_group(struct btrfs_root *root,
 			break;
 
 		if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) {
-			add_new_free_space(block_group, root->fs_info, last,
-					   key.objectid);
-
+			total_found += add_new_free_space(block_group,
+							  fs_info, last,
+							  key.objectid);
 			last = key.objectid + key.offset;
 		}
+
+		if (total_found > (1024 * 1024 * 2)) {
+			total_found = 0;
+			wake_up(&block_group->caching_q);
+		}
 next:
 		path->slots[0]++;
 	}
+	ret = 0;
 
-	add_new_free_space(block_group, root->fs_info, last,
-			   block_group->key.objectid +
-			   block_group->key.offset);
+	total_found += add_new_free_space(block_group, fs_info, last,
+					  block_group->key.objectid +
+					  block_group->key.offset);
+
+	spin_lock(&block_group->lock);
+	block_group->cached = BTRFS_CACHE_FINISHED;
+	spin_unlock(&block_group->lock);
 
-	block_group->cached = 1;
-	remove_sb_from_cache(root, block_group);
-	ret = 0;
 err:
 	btrfs_free_path(path);
+	up_read(&fs_info->extent_root->commit_root_sem);
+	atomic_dec(&fs_info->async_caching_threads);
+	atomic_dec(&block_group->space_info->caching_threads);
+	wake_up(&block_group->caching_q);
+
+	if (!ret)
+		btrfs_discard_pinned_extents(fs_info, block_group);
+
+	return 0;
+}
+
+static int cache_block_group(struct btrfs_block_group_cache *cache)
+{
+	struct task_struct *tsk;
+	int ret = 0;
+
+	spin_lock(&cache->lock);
+	if (cache->cached != BTRFS_CACHE_NO) {
+		spin_unlock(&cache->lock);
+		return ret;
+	}
+	cache->cached = BTRFS_CACHE_STARTED;
+	spin_unlock(&cache->lock);
+
+	tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu\n",
+			  cache->key.objectid);
+	if (IS_ERR(tsk)) {
+		ret = PTR_ERR(tsk);
+		printk(KERN_ERR "error running thread %d\n", ret);
+		BUG();
+	}
+
 	return ret;
 }
 
@@ -1721,7 +1867,7 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
 				BUG_ON(ret);
 			}
 			btrfs_update_pinned_extents(root, node->bytenr,
-						    node->num_bytes, 1);
+						    node->num_bytes, 1, 0);
 			update_reserved_extents(root, node->bytenr,
 						node->num_bytes, 0);
 		}
@@ -2496,6 +2642,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 	found->force_alloc = 0;
 	*space_info = found;
 	list_add_rcu(&found->list, &info->space_info);
+	atomic_set(&found->caching_threads, 0);
 	return 0;
 }
 
@@ -2953,7 +3100,7 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
 }
 
 int btrfs_update_pinned_extents(struct btrfs_root *root,
-				u64 bytenr, u64 num, int pin)
+				u64 bytenr, u64 num, int pin, int mark_free)
 {
 	u64 len;
 	struct btrfs_block_group_cache *cache;
@@ -2988,7 +3135,7 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
 			spin_unlock(&cache->lock);
 			spin_unlock(&cache->space_info->lock);
 			fs_info->total_pinned -= len;
-			if (cache->cached)
+			if (block_group_cache_done(cache) && mark_free)
 				btrfs_add_free_space(cache, bytenr, len);
 		}
 		btrfs_put_block_group(cache);
@@ -3034,14 +3181,27 @@ int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
 	u64 last = 0;
 	u64 start;
 	u64 end;
+	bool caching_kthreads = false;
 	struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents;
 	int ret;
 
+	if (atomic_read(&root->fs_info->async_caching_threads))
+		caching_kthreads = true;
+
 	while (1) {
 		ret = find_first_extent_bit(pinned_extents, last,
 					    &start, &end, EXTENT_DIRTY);
 		if (ret)
 			break;
+
+		/*
+		 * we need to make sure that the pinned extents don't go away
+		 * while we are caching block groups
+		 */
+		if (unlikely(caching_kthreads))
+			set_extent_delalloc(pinned_extents, start, end,
+					    GFP_NOFS);
+
 		set_extent_dirty(copy, start, end, GFP_NOFS);
 		last = end + 1;
 	}
@@ -3055,6 +3215,12 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 	u64 start;
 	u64 end;
 	int ret;
+	int mark_free = 1;
+
+	ret = find_first_extent_bit(&root->fs_info->pinned_extents, 0,
+				    &start, &end, EXTENT_DELALLOC);
+	if (!ret)
+		mark_free = 0;
 
 	while (1) {
 		ret = find_first_extent_bit(unpin, 0, &start, &end,
@@ -3065,11 +3231,16 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 		ret = btrfs_discard_extent(root, start, end + 1 - start);
 
 		/* unlocks the pinned mutex */
-		btrfs_update_pinned_extents(root, start, end + 1 - start, 0);
+		btrfs_update_pinned_extents(root, start, end + 1 - start, 0,
+					    mark_free);
 		clear_extent_dirty(unpin, start, end, GFP_NOFS);
 
 		cond_resched();
 	}
+
+	if (unlikely(!mark_free))
+		btrfs_discard_pinned_extents(root->fs_info, NULL);
+
 	return ret;
 }
 
@@ -3110,7 +3281,7 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans,
 pinit:
 	btrfs_set_path_blocking(path);
 	/* unlocks the pinned mutex */
-	btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
+	btrfs_update_pinned_extents(root, bytenr, num_bytes, 1, 0);
 
 	BUG_ON(err < 0);
 	return 0;
@@ -3421,7 +3592,7 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
 	if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
 		WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
 		/* unlocks the pinned mutex */
-		btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
+		btrfs_update_pinned_extents(root, bytenr, num_bytes, 1, 0);
 		update_reserved_extents(root, bytenr, num_bytes, 0);
 		ret = 0;
 	} else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
@@ -3447,6 +3618,45 @@ static u64 stripe_align(struct btrfs_root *root, u64 val)
 	return ret;
 }
 
+/*
+ * when we wait for progress in the block group caching, its because
+ * our allocation attempt failed at least once.  So, we must sleep
+ * and let some progress happen before we try again.
+ *
+ * This function will sleep at least once waiting for new free space to
+ * show up, and then it will check the block group free space numbers
+ * for our min num_bytes.  Another option is to have it go ahead
+ * and look in the rbtree for a free extent of a given size, but this
+ * is a good start.
+ */
+static noinline int
+wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
+				u64 num_bytes)
+{
+	DEFINE_WAIT(wait);
+
+	prepare_to_wait(&cache->caching_q, &wait, TASK_UNINTERRUPTIBLE);
+
+	if (block_group_cache_done(cache)) {
+		finish_wait(&cache->caching_q, &wait);
+		return 0;
+	}
+	schedule();
+	finish_wait(&cache->caching_q, &wait);
+
+	wait_event(cache->caching_q, block_group_cache_done(cache) ||
+		   (cache->free_space >= num_bytes));
+	return 0;
+}
+
+enum btrfs_loop_type {
+	LOOP_CACHED_ONLY = 0,
+	LOOP_CACHING_NOWAIT = 1,
+	LOOP_CACHING_WAIT = 2,
+	LOOP_ALLOC_CHUNK = 3,
+	LOOP_NO_EMPTY_SIZE = 4,
+};
+
 /*
  * walks the btree of allocated extents and find a hole of a given size.
  * The key ins is changed to record the hole:
@@ -3472,6 +3682,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
 	struct btrfs_space_info *space_info;
 	int last_ptr_loop = 0;
 	int loop = 0;
+	bool found_uncached_bg = false;
 
 	WARN_ON(num_bytes < root->sectorsize);
 	btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
@@ -3503,15 +3714,18 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
 	search_start = max(search_start, first_logical_byte(root, 0));
 	search_start = max(search_start, hint_byte);
 
-	if (!last_ptr) {
+	if (!last_ptr)
 		empty_cluster = 0;
-		loop = 1;
-	}
 
 	if (search_start == hint_byte) {
 		block_group = btrfs_lookup_block_group(root->fs_info,
 						       search_start);
-		if (block_group && block_group_bits(block_group, data)) {
+		/*
+		 * we don't want to use the block group if it doesn't match our
+		 * allocation bits, or if its not cached.
+		 */
+		if (block_group && block_group_bits(block_group, data) &&
+		    block_group_cache_done(block_group)) {
 			down_read(&space_info->groups_sem);
 			if (list_empty(&block_group->list) ||
 			    block_group->ro) {
@@ -3534,21 +3748,35 @@ search:
 	down_read(&space_info->groups_sem);
 	list_for_each_entry(block_group, &space_info->block_groups, list) {
 		u64 offset;
+		int cached;
 
 		atomic_inc(&block_group->count);
 		search_start = block_group->key.objectid;
 
 have_block_group:
-		if (unlikely(!block_group->cached)) {
-			mutex_lock(&block_group->cache_mutex);
-			ret = cache_block_group(root, block_group);
-			mutex_unlock(&block_group->cache_mutex);
-			if (ret) {
-				btrfs_put_block_group(block_group);
-				break;
+		if (unlikely(block_group->cached == BTRFS_CACHE_NO)) {
+			/*
+			 * we want to start caching kthreads, but not too many
+			 * right off the bat so we don't overwhelm the system,
+			 * so only start them if there are less than 2 and we're
+			 * in the initial allocation phase.
+			 */
+			if (loop > LOOP_CACHING_NOWAIT ||
+			    atomic_read(&space_info->caching_threads) < 2) {
+				ret = cache_block_group(block_group);
+				BUG_ON(ret);
 			}
 		}
 
+		cached = block_group_cache_done(block_group);
+		if (unlikely(!cached)) {
+			found_uncached_bg = true;
+
+			/* if we only want cached bgs, loop */
+			if (loop == LOOP_CACHED_ONLY)
+				goto loop;
+		}
+
 		if (unlikely(block_group->ro))
 			goto loop;
 
@@ -3627,14 +3855,21 @@ refill_cluster:
 					spin_unlock(&last_ptr->refill_lock);
 					goto checks;
 				}
+			} else if (!cached && loop > LOOP_CACHING_NOWAIT) {
+				spin_unlock(&last_ptr->refill_lock);
+
+				wait_block_group_cache_progress(block_group,
+				       num_bytes + empty_cluster + empty_size);
+				goto have_block_group;
 			}
+
 			/*
 			 * at this point we either didn't find a cluster
 			 * or we weren't able to allocate a block from our
 			 * cluster.  Free the cluster we've been trying
 			 * to use, and go to the next block group
 			 */
-			if (loop < 2) {
+			if (loop < LOOP_NO_EMPTY_SIZE) {
 				btrfs_return_cluster_to_free_space(NULL,
 								   last_ptr);
 				spin_unlock(&last_ptr->refill_lock);
@@ -3645,8 +3880,15 @@ refill_cluster:
 
 		offset = btrfs_find_space_for_alloc(block_group, search_start,
 						    num_bytes, empty_size);
-		if (!offset)
+		if (!offset && (cached || (!cached &&
+					   loop == LOOP_CACHING_NOWAIT))) {
 			goto loop;
+		} else if (!offset && (!cached &&
+				       loop > LOOP_CACHING_NOWAIT)) {
+			wait_block_group_cache_progress(block_group,
+					num_bytes + empty_size);
+			goto have_block_group;
+		}
 checks:
 		search_start = stripe_align(root, offset);
 		/* move on to the next group */
@@ -3694,13 +3936,26 @@ loop:
 	}
 	up_read(&space_info->groups_sem);
 
-	/* loop == 0, try to find a clustered alloc in every block group
-	 * loop == 1, try again after forcing a chunk allocation
-	 * loop == 2, set empty_size and empty_cluster to 0 and try again
+	/* LOOP_CACHED_ONLY, only search fully cached block groups
+	 * LOOP_CACHING_NOWAIT, search partially cached block groups, but
+	 *			dont wait foR them to finish caching
+	 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
+	 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
+	 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
+	 *			again
 	 */
-	if (!ins->objectid && loop < 3 &&
-	    (empty_size || empty_cluster || allowed_chunk_alloc)) {
-		if (loop >= 2) {
+	if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE &&
+	    (found_uncached_bg || empty_size || empty_cluster ||
+	     allowed_chunk_alloc)) {
+		if (found_uncached_bg) {
+			found_uncached_bg = false;
+			if (loop < LOOP_CACHING_WAIT) {
+				loop++;
+				goto search;
+			}
+		}
+
+		if (loop == LOOP_ALLOC_CHUNK) {
 			empty_size = 0;
 			empty_cluster = 0;
 		}
@@ -3713,7 +3968,7 @@ loop:
 			space_info->force_alloc = 1;
 		}
 
-		if (loop < 3) {
+		if (loop < LOOP_NO_EMPTY_SIZE) {
 			loop++;
 			goto search;
 		}
@@ -3809,7 +4064,7 @@ again:
 			       num_bytes, data, 1);
 		goto again;
 	}
-	if (ret) {
+	if (ret == -ENOSPC) {
 		struct btrfs_space_info *sinfo;
 
 		sinfo = __find_space_info(root->fs_info, data);
@@ -3817,7 +4072,6 @@ again:
 		       "wanted %llu\n", (unsigned long long)data,
 		       (unsigned long long)num_bytes);
 		dump_space_info(sinfo, num_bytes);
-		BUG();
 	}
 
 	return ret;
@@ -3855,7 +4109,9 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
 	ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size,
 				     empty_size, hint_byte, search_end, ins,
 				     data);
-	update_reserved_extents(root, ins->objectid, ins->offset, 1);
+	if (!ret)
+		update_reserved_extents(root, ins->objectid, ins->offset, 1);
+
 	return ret;
 }
 
@@ -4017,9 +4273,9 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
 	struct btrfs_block_group_cache *block_group;
 
 	block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
-	mutex_lock(&block_group->cache_mutex);
-	cache_block_group(root, block_group);
-	mutex_unlock(&block_group->cache_mutex);
+	cache_block_group(block_group);
+	wait_event(block_group->caching_q,
+		   block_group_cache_done(block_group));
 
 	ret = btrfs_remove_free_space(block_group, ins->objectid,
 				      ins->offset);
@@ -4050,7 +4306,8 @@ static int alloc_tree_block(struct btrfs_trans_handle *trans,
 	ret = __btrfs_reserve_extent(trans, root, num_bytes, num_bytes,
 				     empty_size, hint_byte, search_end,
 				     ins, 0);
-	BUG_ON(ret);
+	if (ret)
+		return ret;
 
 	if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
 		if (parent == 0)
@@ -6966,11 +7223,16 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
 			 &info->block_group_cache_tree);
 		spin_unlock(&info->block_group_cache_lock);
 
-		btrfs_remove_free_space_cache(block_group);
 		down_write(&block_group->space_info->groups_sem);
 		list_del(&block_group->list);
 		up_write(&block_group->space_info->groups_sem);
 
+		if (block_group->cached == BTRFS_CACHE_STARTED)
+			wait_event(block_group->caching_q,
+				   block_group_cache_done(block_group));
+
+		btrfs_remove_free_space_cache(block_group);
+
 		WARN_ON(atomic_read(&block_group->count) != 1);
 		kfree(block_group);
 
@@ -7036,10 +7298,10 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 		atomic_set(&cache->count, 1);
 		spin_lock_init(&cache->lock);
 		spin_lock_init(&cache->tree_lock);
-		mutex_init(&cache->cache_mutex);
+		cache->fs_info = info;
+		init_waitqueue_head(&cache->caching_q);
 		INIT_LIST_HEAD(&cache->list);
 		INIT_LIST_HEAD(&cache->cluster_list);
-		cache->sectorsize = root->sectorsize;
 
 		/*
 		 * we only want to have 32k of ram per block group for keeping
@@ -7057,6 +7319,26 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 		key.objectid = found_key.objectid + found_key.offset;
 		btrfs_release_path(root, path);
 		cache->flags = btrfs_block_group_flags(&cache->item);
+		cache->sectorsize = root->sectorsize;
+
+		remove_sb_from_cache(root, cache);
+
+		/*
+		 * check for two cases, either we are full, and therefore
+		 * don't need to bother with the caching work since we won't
+		 * find any space, or we are empty, and we can just add all
+		 * the space in and be done with it.  This saves us _alot_ of
+		 * time, particularly in the full case.
+		 */
+		if (found_key.offset == btrfs_block_group_used(&cache->item)) {
+			cache->cached = BTRFS_CACHE_FINISHED;
+		} else if (btrfs_block_group_used(&cache->item) == 0) {
+			cache->cached = BTRFS_CACHE_FINISHED;
+			add_new_free_space(cache, root->fs_info,
+					   found_key.objectid,
+					   found_key.objectid +
+					   found_key.offset);
+		}
 
 		ret = update_space_info(info, cache->flags, found_key.offset,
 					btrfs_block_group_used(&cache->item),
@@ -7112,7 +7394,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 	atomic_set(&cache->count, 1);
 	spin_lock_init(&cache->lock);
 	spin_lock_init(&cache->tree_lock);
-	mutex_init(&cache->cache_mutex);
+	init_waitqueue_head(&cache->caching_q);
 	INIT_LIST_HEAD(&cache->list);
 	INIT_LIST_HEAD(&cache->cluster_list);
 
@@ -7121,11 +7403,12 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 	cache->flags = type;
 	btrfs_set_block_group_flags(&cache->item, type);
 
-	cache->cached = 1;
-	ret = btrfs_add_free_space(cache, chunk_offset, size);
-	BUG_ON(ret);
+	cache->cached = BTRFS_CACHE_FINISHED;
 	remove_sb_from_cache(root, cache);
 
+	add_new_free_space(cache, root->fs_info, chunk_offset,
+			   chunk_offset + size);
+
 	ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
 				&cache->space_info);
 	BUG_ON(ret);
@@ -7184,7 +7467,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 	rb_erase(&block_group->cache_node,
 		 &root->fs_info->block_group_cache_tree);
 	spin_unlock(&root->fs_info->block_group_cache_lock);
-	btrfs_remove_free_space_cache(block_group);
+
 	down_write(&block_group->space_info->groups_sem);
 	/*
 	 * we must use list_del_init so people can check to see if they
@@ -7193,6 +7476,12 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 	list_del_init(&block_group->list);
 	up_write(&block_group->space_info->groups_sem);
 
+	if (block_group->cached == BTRFS_CACHE_STARTED)
+		wait_event(block_group->caching_q,
+			   block_group_cache_done(block_group));
+
+	btrfs_remove_free_space_cache(block_group);
+
 	spin_lock(&block_group->space_info->lock);
 	block_group->space_info->total_bytes -= block_group->key.offset;
 	block_group->space_info->bytes_readonly -= block_group->key.offset;
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index ab8cad8b46c9..af99b78b288e 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -238,6 +238,7 @@ static void unlink_free_space(struct btrfs_block_group_cache *block_group,
 {
 	rb_erase(&info->offset_index, &block_group->free_space_offset);
 	block_group->free_extents--;
+	block_group->free_space -= info->bytes;
 }
 
 static int link_free_space(struct btrfs_block_group_cache *block_group,
@@ -251,6 +252,7 @@ static int link_free_space(struct btrfs_block_group_cache *block_group,
 	if (ret)
 		return ret;
 
+	block_group->free_space += info->bytes;
 	block_group->free_extents++;
 	return ret;
 }
@@ -285,36 +287,40 @@ static void recalculate_thresholds(struct btrfs_block_group_cache *block_group)
 	}
 }
 
-static void bitmap_clear_bits(struct btrfs_free_space *info, u64 offset, u64 bytes,
-			      u64 sectorsize)
+static void bitmap_clear_bits(struct btrfs_block_group_cache *block_group,
+			      struct btrfs_free_space *info, u64 offset,
+			      u64 bytes)
 {
 	unsigned long start, end;
 	unsigned long i;
 
-	start = offset_to_bit(info->offset, sectorsize, offset);
-	end = start + bytes_to_bits(bytes, sectorsize);
+	start = offset_to_bit(info->offset, block_group->sectorsize, offset);
+	end = start + bytes_to_bits(bytes, block_group->sectorsize);
 	BUG_ON(end > BITS_PER_BITMAP);
 
 	for (i = start; i < end; i++)
 		clear_bit(i, info->bitmap);
 
 	info->bytes -= bytes;
+	block_group->free_space -= bytes;
 }
 
-static void bitmap_set_bits(struct btrfs_free_space *info, u64 offset, u64 bytes,
-			    u64 sectorsize)
+static void bitmap_set_bits(struct btrfs_block_group_cache *block_group,
+			    struct btrfs_free_space *info, u64 offset,
+			    u64 bytes)
 {
 	unsigned long start, end;
 	unsigned long i;
 
-	start = offset_to_bit(info->offset, sectorsize, offset);
-	end = start + bytes_to_bits(bytes, sectorsize);
+	start = offset_to_bit(info->offset, block_group->sectorsize, offset);
+	end = start + bytes_to_bits(bytes, block_group->sectorsize);
 	BUG_ON(end > BITS_PER_BITMAP);
 
 	for (i = start; i < end; i++)
 		set_bit(i, info->bitmap);
 
 	info->bytes += bytes;
+	block_group->free_space += bytes;
 }
 
 static int search_bitmap(struct btrfs_block_group_cache *block_group,
@@ -414,13 +420,12 @@ again:
 		(u64)(BITS_PER_BITMAP * block_group->sectorsize) - 1;
 
 	if (*offset > bitmap_info->offset && *offset + *bytes > end) {
-		bitmap_clear_bits(bitmap_info, *offset,
-				  end - *offset + 1, block_group->sectorsize);
+		bitmap_clear_bits(block_group, bitmap_info, *offset,
+				  end - *offset + 1);
 		*bytes -= end - *offset + 1;
 		*offset = end + 1;
 	} else if (*offset >= bitmap_info->offset && *offset + *bytes <= end) {
-		bitmap_clear_bits(bitmap_info, *offset,
-				  *bytes, block_group->sectorsize);
+		bitmap_clear_bits(block_group, bitmap_info, *offset, *bytes);
 		*bytes = 0;
 	}
 
@@ -495,14 +500,13 @@ again:
 		(u64)(BITS_PER_BITMAP * block_group->sectorsize);
 
 	if (offset >= bitmap_info->offset && offset + bytes > end) {
-		bitmap_set_bits(bitmap_info, offset, end - offset,
-				block_group->sectorsize);
+		bitmap_set_bits(block_group, bitmap_info, offset,
+				end - offset);
 		bytes -= end - offset;
 		offset = end;
 		added = 0;
 	} else if (offset >= bitmap_info->offset && offset + bytes <= end) {
-		bitmap_set_bits(bitmap_info, offset, bytes,
-				block_group->sectorsize);
+		bitmap_set_bits(block_group, bitmap_info, offset, bytes);
 		bytes = 0;
 	} else {
 		BUG();
@@ -870,8 +874,7 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
 
 	ret = offset;
 	if (entry->bitmap) {
-		bitmap_clear_bits(entry, offset, bytes,
-				  block_group->sectorsize);
+		bitmap_clear_bits(block_group, entry, offset, bytes);
 		if (!entry->bytes) {
 			unlink_free_space(block_group, entry);
 			kfree(entry->bitmap);
@@ -891,6 +894,7 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
 
 out:
 	spin_unlock(&block_group->tree_lock);
+
 	return ret;
 }
 
@@ -967,7 +971,7 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
 		goto out;
 
 	ret = search_start;
-	bitmap_clear_bits(entry, ret, bytes, block_group->sectorsize);
+	bitmap_clear_bits(block_group, entry, ret, bytes);
 out:
 	spin_unlock(&cluster->lock);
 	spin_unlock(&block_group->tree_lock);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 81f7124c3051..32454d1c566f 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -40,6 +40,14 @@ static noinline void put_transaction(struct btrfs_transaction *transaction)
 	}
 }
 
+static noinline void switch_commit_root(struct btrfs_root *root)
+{
+	down_write(&root->commit_root_sem);
+	free_extent_buffer(root->commit_root);
+	root->commit_root = btrfs_root_node(root);
+	up_write(&root->commit_root_sem);
+}
+
 /*
  * either allocate a new transaction or hop into the existing one
  */
@@ -458,8 +466,7 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
 		ret = btrfs_write_dirty_block_groups(trans, root);
 		BUG_ON(ret);
 	}
-	free_extent_buffer(root->commit_root);
-	root->commit_root = btrfs_root_node(root);
+	switch_commit_root(root);
 	return 0;
 }
 
@@ -537,8 +544,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
 			btrfs_update_reloc_root(trans, root);
 
 			if (root->commit_root != root->node) {
-				free_extent_buffer(root->commit_root);
-				root->commit_root = btrfs_root_node(root);
+				switch_commit_root(root);
 				btrfs_set_root_node(&root->root_item,
 						    root->node);
 			}
@@ -1002,15 +1008,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
 	btrfs_set_root_node(&root->fs_info->tree_root->root_item,
 			    root->fs_info->tree_root->node);
-	free_extent_buffer(root->fs_info->tree_root->commit_root);
-	root->fs_info->tree_root->commit_root =
-				btrfs_root_node(root->fs_info->tree_root);
+	switch_commit_root(root->fs_info->tree_root);
 
 	btrfs_set_root_node(&root->fs_info->chunk_root->root_item,
 			    root->fs_info->chunk_root->node);
-	free_extent_buffer(root->fs_info->chunk_root->commit_root);
-	root->fs_info->chunk_root->commit_root =
-				btrfs_root_node(root->fs_info->chunk_root);
+	switch_commit_root(root->fs_info->chunk_root);
 
 	update_super_roots(root);
 
@@ -1050,6 +1052,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	cur_trans->commit_done = 1;
 
 	root->fs_info->last_trans_committed = cur_trans->transid;
+
 	wake_up(&cur_trans->commit_wait);
 
 	put_transaction(cur_trans);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index c13922206d1b..195606862618 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -264,7 +264,7 @@ static int process_one_buffer(struct btrfs_root *log,
 {
 	if (wc->pin)
 		btrfs_update_pinned_extents(log->fs_info->extent_root,
-					    eb->start, eb->len, 1);
+					    eb->start, eb->len, 1, 0);
 
 	if (btrfs_buffer_uptodate(eb, gen)) {
 		if (wc->write)
-- 
cgit v1.2.3-70-g09d2


From 20736abaa361bea488df6a1f66f6b37fb01107b9 Mon Sep 17 00:00:00 2001
From: Diego Calleja <diegocg@gmail.com>
Date: Fri, 24 Jul 2009 11:06:52 -0400
Subject: Btrfs: Remove code duplication in comp_keys

comp_keys is duplicating what is done in btrfs_comp_cpu_keys, so just
call it.

Signed-off-by: Diego Calleja <diegocg@gmail.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c | 14 +-------------
 1 file changed, 1 insertion(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index fdd423a550d6..91572091c0a4 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -557,19 +557,7 @@ static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
 
 	btrfs_disk_key_to_cpu(&k1, disk);
 
-	if (k1.objectid > k2->objectid)
-		return 1;
-	if (k1.objectid < k2->objectid)
-		return -1;
-	if (k1.type > k2->type)
-		return 1;
-	if (k1.type < k2->type)
-		return -1;
-	if (k1.offset > k2->offset)
-		return 1;
-	if (k1.offset < k2->offset)
-		return -1;
-	return 0;
+	return btrfs_comp_cpu_keys(&k1, k2);
 }
 
 /*
-- 
cgit v1.2.3-70-g09d2


From 1fcbac581be375ca0a686f72ee2b7fd1dbf386e7 Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Fri, 24 Jul 2009 11:06:53 -0400
Subject: Btrfs: find_free_dev_extent doesn't handle holes at the start of the
 device

find_free_dev_extent does not properly handle the case where
the device is not complete free, and there is a free extent
at the beginning of the device.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 55c37276a29f..074c1c56d8c4 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -758,9 +758,13 @@ static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans,
 	ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
 	if (ret < 0)
 		goto error;
-	ret = btrfs_previous_item(root, path, 0, key.type);
-	if (ret < 0)
-		goto error;
+	if (ret > 0) {
+		ret = btrfs_previous_item(root, path, key.objectid, key.type);
+		if (ret < 0)
+			goto error;
+		if (ret > 0)
+			start_found = 1;
+	}
 	l = path->nodes[0];
 	btrfs_item_key_to_cpu(l, &key, path->slots[0]);
 	while (1) {
-- 
cgit v1.2.3-70-g09d2


From 0a4eefbb745ec0e8a5b694ae3f40cc34082d8f61 Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Fri, 24 Jul 2009 11:06:53 -0400
Subject: Btrfs: Fix ordering of key field checks in btrfs_previous_item

Check objectid of item before checking the item type, otherwise we may return
zero for a key that is actually too low.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 91572091c0a4..978449af4ccd 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -4296,10 +4296,10 @@ int btrfs_previous_item(struct btrfs_root *root,
 			path->slots[0]--;
 
 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-		if (found_key.type == type)
-			return 0;
 		if (found_key.objectid < min_objectid)
 			break;
+		if (found_key.type == type)
+			return 0;
 		if (found_key.objectid == min_objectid &&
 		    found_key.type < type)
 			break;
-- 
cgit v1.2.3-70-g09d2


From d717aa1d31c36cb56059e97966cb76f0be021969 Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Fri, 24 Jul 2009 12:42:46 -0400
Subject: Btrfs: Avoid delayed reference update looping

btrfs_split_leaf and btrfs_del_items can end up in a loop
where one is constantly spliting a given leaf and the other
is constantly merging it back with the adjacent nodes.

There is a better fix for this, but in the interest of something
small, this patch just changes btrfs_del_items back to balancing less
often.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 978449af4ccd..3fdcc0512d3a 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1040,9 +1040,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 	    BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
 		return 0;
 
-	if (btrfs_header_nritems(mid) > 2)
-		return 0;
-
 	if (btrfs_header_nritems(mid) < 2)
 		err_on_enospc = 1;
 
@@ -3796,7 +3793,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		}
 
 		/* delete the leaf if it is mostly empty */
-		if (used < BTRFS_LEAF_DATA_SIZE(root) / 2) {
+		if (used < BTRFS_LEAF_DATA_SIZE(root) / 3) {
 			/* push_leaf_left fixes the path.
 			 * make sure the path still points to our leaf
 			 * for possible call to del_ptr below
-- 
cgit v1.2.3-70-g09d2


From ebecd3d9d2adba144c15f1d35c78e0c26ead1bfd Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 24 Jul 2009 13:17:44 -0400
Subject: Btrfs: make flushoncommit mount option correctly wait on
 ordered_extents

The commit_transaction call to wait_ordered_extents when snap_pending
passes nocow_only=1 to process only NOCOW or PREALLOC extents.  This isn't
correct for the 'flushoncommit' mode, as it skips extents we just started
IO on in start_delalloc_inodes.

So, in the flushoncommit case, wait on all ordered extents.  Otherwise,
only pass the nocow_only flag to wait_ordered_extents if snap_pending.

Signed-off-by: Sage Weil <sage@newdream.net>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/transaction.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 32454d1c566f..e51d2bc532f8 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -942,9 +942,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
 		mutex_unlock(&root->fs_info->trans_mutex);
 
-		if (flush_on_commit || snap_pending) {
-			if (flush_on_commit)
-				btrfs_start_delalloc_inodes(root);
+		if (flush_on_commit) {
+			btrfs_start_delalloc_inodes(root);
+			ret = btrfs_wait_ordered_extents(root, 0);
+			BUG_ON(ret);
+		} else if (snap_pending) {
 			ret = btrfs_wait_ordered_extents(root, 1);
 			BUG_ON(ret);
 		}
-- 
cgit v1.2.3-70-g09d2


From 283bb1979fa8580c4037d8df251449368c292a3b Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 24 Jul 2009 16:30:55 -0400
Subject: Btrfs: clear all space_info->full after removing a block group

Btrfs allocates individual extents from block groups, and each
block group has a specific type.  It may hold metadata, data
mirrored or striped etc.

When we balance space (btrfs-vol -b) or remove a drive (btrfs-vol -r)
we free block groups.  Once a block group is freed, the space it was
using on the device may be available for use by new block groups.

btrfs_remove_block_group was clearing the flag that said
'our devices are full, don't even try to allocate new block groups',
but it was only clearing that flag for a specific type of block group.

This commit clears the full flag for all of the types of block groups,
making it much more likely that we'll be able to balance space when
the drive is close to full.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 9a489cc89fd3..508df5f7d2ea 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -7486,7 +7486,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 	block_group->space_info->total_bytes -= block_group->key.offset;
 	block_group->space_info->bytes_readonly -= block_group->key.offset;
 	spin_unlock(&block_group->space_info->lock);
-	block_group->space_info->full = 0;
+
+	btrfs_clear_space_info_full(root->fs_info);
 
 	btrfs_put_block_group(block_group);
 	btrfs_put_block_group(block_group);
-- 
cgit v1.2.3-70-g09d2


From 9779b72f0584fd53e0de53f62f205bf0dc0db553 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 24 Jul 2009 16:41:41 -0400
Subject: Btrfs: find smallest available device extent during chunk allocation

Allocating new block group is easy when the disk has plenty of space.
But things get difficult as the disk fills up, especially if
the FS has been run through btrfs-vol -b.  The balance operation
is likely to make the total bytes available on the device greater
than the largest extent we'll actually be able to allocate.

But the device extent allocation code incorrectly assumes that a device
with 5G free will be able to allocate a 5G extent.  It isn't normally a
problem because device extents don't get freed unless btrfs-vol -b
is run.

This fixes the device extent allocator to remember the largest free
extent it can find, and then uses that value as a fallback.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 074c1c56d8c4..5dbefd11b4af 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -721,7 +721,8 @@ error:
  */
 static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans,
 					 struct btrfs_device *device,
-					 u64 num_bytes, u64 *start)
+					 u64 num_bytes, u64 *start,
+					 u64 *max_avail)
 {
 	struct btrfs_key key;
 	struct btrfs_root *root = device->dev_root;
@@ -807,6 +808,10 @@ no_more_items:
 			if (last_byte < search_start)
 				last_byte = search_start;
 			hole_size = key.offset - last_byte;
+
+			if (hole_size > *max_avail)
+				*max_avail = hole_size;
+
 			if (key.offset > last_byte &&
 			    hole_size >= num_bytes) {
 				*start = last_byte;
@@ -1625,6 +1630,7 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
 	device->fs_devices->total_rw_bytes += diff;
 
 	device->total_bytes = new_size;
+	device->disk_total_bytes = new_size;
 	btrfs_clear_space_info_full(device->dev_root->fs_info);
 
 	return btrfs_update_device(trans, device);
@@ -2175,6 +2181,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 			     max_chunk_size);
 
 again:
+	max_avail = 0;
 	if (!map || map->num_stripes != num_stripes) {
 		kfree(map);
 		map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
@@ -2223,7 +2230,8 @@ again:
 
 		if (device->in_fs_metadata && avail >= min_free) {
 			ret = find_free_dev_extent(trans, device,
-						   min_free, &dev_offset);
+						   min_free, &dev_offset,
+						   &max_avail);
 			if (ret == 0) {
 				list_move_tail(&device->dev_alloc_list,
 					       &private_devs);
-- 
cgit v1.2.3-70-g09d2


From 631c07c8d12bcc6ce4a0fbfbd64ea843d78e2b10 Mon Sep 17 00:00:00 2001
From: Julia Lawall <julia@diku.dk>
Date: Mon, 27 Jul 2009 13:57:00 -0400
Subject: Btrfs: Correct redundant test in add_inode_ref

dir has already been tested.  It seems that this test should be on the
recently returned value inode.

A simplified version of the semantic match that finds this problem is as
follows: (http://www.emn.fr/x-info/coccinelle/)

Signed-off-by: Julia Lawall <julia@diku.dk>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/tree-log.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 195606862618..11d0787c6188 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -797,7 +797,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
 		return -ENOENT;
 
 	inode = read_one_inode(root, key->objectid);
-	BUG_ON(!dir);
+	BUG_ON(!inode);
 
 	ref_ptr = btrfs_item_ptr_offset(eb, slot);
 	ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
-- 
cgit v1.2.3-70-g09d2


From 68b38550ddbea13d296184bf69edff387618b1d3 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Mon, 27 Jul 2009 13:57:01 -0400
Subject: Btrfs: change how we unpin extents

We are racy with async block caching and unpinning extents.  This patch makes
things much less complicated by only unpinning the extent if the block group is
cached.  We check the block_group->cached var under the block_group->lock spin
lock.  If it is set to BTRFS_CACHE_FINISHED then we update the pinned counters,
and unpin the extent and add the free space back.  If it is not set to this, we
start the caching of the block group so the next time we unpin extents we can
unpin the extent.  This keeps us from racing with the async caching threads,
lets us kill the fs wide async thread counter, and keeps us from having to set
DELALLOC bits for every extent we hit if there are caching kthreads going.

One thing that needed to be changed was btrfs_free_super_mirror_extents.  Now
instead of just looking for LOCKED extents, we also look for DIRTY extents,
since we could have left some extents pinned in the previous transaction that
will never get freed now that we are unmounting, which would cause us to leak
memory.  So btrfs_free_super_mirror_extents has been changed to
btrfs_free_pinned_extents, and it will clear the extents locked for the super
mirror, and any remaining pinned extents that may be present.  Thank you,

Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       |   5 +-
 fs/btrfs/disk-io.c     |   3 +-
 fs/btrfs/extent-tree.c | 149 ++++++++++++++-----------------------------------
 fs/btrfs/tree-log.c    |   2 +-
 4 files changed, 46 insertions(+), 113 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 42b03c4ee494..17ad92c29cfd 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -845,7 +845,6 @@ struct btrfs_fs_info {
 	atomic_t async_submit_draining;
 	atomic_t nr_async_bios;
 	atomic_t async_delalloc_pages;
-	atomic_t async_caching_threads;
 
 	/*
 	 * this is used by the balancing code to wait for all the pending
@@ -1926,7 +1925,7 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root, unsigned long count);
 int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
 int btrfs_update_pinned_extents(struct btrfs_root *root,
-				u64 bytenr, u64 num, int pin, int mark_free);
+				u64 bytenr, u64 num, int pin);
 int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root, struct extent_buffer *leaf);
 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
@@ -2011,7 +2010,7 @@ void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
 				 u64 bytes);
 void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
 			      u64 bytes);
-void btrfs_free_super_mirror_extents(struct btrfs_fs_info *info);
+void btrfs_free_pinned_extents(struct btrfs_fs_info *info);
 /* ctree.c */
 int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
 		     int level, int *slot);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index ec2c915f7f4a..c658397c7473 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1567,7 +1567,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	atomic_set(&fs_info->async_delalloc_pages, 0);
 	atomic_set(&fs_info->async_submit_draining, 0);
 	atomic_set(&fs_info->nr_async_bios, 0);
-	atomic_set(&fs_info->async_caching_threads, 0);
 	fs_info->sb = sb;
 	fs_info->max_extent = (u64)-1;
 	fs_info->max_inline = 8192 * 1024;
@@ -2339,7 +2338,7 @@ int close_ctree(struct btrfs_root *root)
 	free_extent_buffer(root->fs_info->csum_root->commit_root);
 
 	btrfs_free_block_groups(root->fs_info);
-	btrfs_free_super_mirror_extents(root->fs_info);
+	btrfs_free_pinned_extents(root->fs_info);
 
 	del_fs_roots(fs_info);
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 508df5f7d2ea..08188f1615d9 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -153,18 +153,26 @@ block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
 	return ret;
 }
 
-void btrfs_free_super_mirror_extents(struct btrfs_fs_info *info)
+/*
+ * We always set EXTENT_LOCKED for the super mirror extents so we don't
+ * overwrite them, so those bits need to be unset.  Also, if we are unmounting
+ * with pinned extents still sitting there because we had a block group caching,
+ * we need to clear those now, since we are done.
+ */
+void btrfs_free_pinned_extents(struct btrfs_fs_info *info)
 {
 	u64 start, end, last = 0;
 	int ret;
 
 	while (1) {
 		ret = find_first_extent_bit(&info->pinned_extents, last,
-					    &start, &end, EXTENT_LOCKED);
+					    &start, &end,
+					    EXTENT_LOCKED|EXTENT_DIRTY);
 		if (ret)
 			break;
 
-		unlock_extent(&info->pinned_extents, start, end, GFP_NOFS);
+		clear_extent_bits(&info->pinned_extents, start, end,
+				  EXTENT_LOCKED|EXTENT_DIRTY, GFP_NOFS);
 		last = end+1;
 	}
 }
@@ -209,8 +217,7 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
 	while (start < end) {
 		ret = find_first_extent_bit(&info->pinned_extents, start,
 					    &extent_start, &extent_end,
-					    EXTENT_DIRTY|EXTENT_LOCKED|
-					    EXTENT_DELALLOC);
+					    EXTENT_DIRTY|EXTENT_LOCKED);
 		if (ret)
 			break;
 
@@ -238,67 +245,6 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
 	return total_added;
 }
 
-DEFINE_MUTEX(discard_mutex);
-
-/*
- * if async kthreads are running when we cross transactions, we mark any pinned
- * extents with EXTENT_DELALLOC and then let the caching kthreads clean up those
- * extents when they are done.  Also we run this from btrfs_finish_extent_commit
- * in case there were some pinned extents that were missed because we had
- * already cached that block group.
- */
-static void btrfs_discard_pinned_extents(struct btrfs_fs_info *fs_info,
-					 struct btrfs_block_group_cache *cache)
-{
-	u64 start, end, last;
-	int ret;
-
-	if (!cache)
-		last = 0;
-	else
-		last = cache->key.objectid;
-
-	mutex_lock(&discard_mutex);
-	while (1) {
-		ret = find_first_extent_bit(&fs_info->pinned_extents, last,
-					    &start, &end, EXTENT_DELALLOC);
-		if (ret)
-			break;
-
-		if (cache && start >= cache->key.objectid + cache->key.offset)
-			break;
-
-
-		if (!cache) {
-			cache = btrfs_lookup_block_group(fs_info, start);
-			BUG_ON(!cache);
-
-			start = max(start, cache->key.objectid);
-			end = min(end, cache->key.objectid + cache->key.offset - 1);
-
-			if (block_group_cache_done(cache))
-				btrfs_add_free_space(cache, start,
-						     end - start + 1);
-			cache = NULL;
-		} else {
-			start = max(start, cache->key.objectid);
-			end = min(end, cache->key.objectid + cache->key.offset - 1);
-			btrfs_add_free_space(cache, start, end - start + 1);
-		}
-
-		clear_extent_bits(&fs_info->pinned_extents, start, end,
-				  EXTENT_DELALLOC, GFP_NOFS);
-		last = end + 1;
-
-		if (need_resched()) {
-			mutex_unlock(&discard_mutex);
-			cond_resched();
-			mutex_lock(&discard_mutex);
-		}
-	}
-	mutex_unlock(&discard_mutex);
-}
-
 static int caching_kthread(void *data)
 {
 	struct btrfs_block_group_cache *block_group = data;
@@ -317,7 +263,6 @@ static int caching_kthread(void *data)
 	if (!path)
 		return -ENOMEM;
 
-	atomic_inc(&fs_info->async_caching_threads);
 	atomic_inc(&block_group->space_info->caching_threads);
 	last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
 again:
@@ -399,13 +344,9 @@ next:
 err:
 	btrfs_free_path(path);
 	up_read(&fs_info->extent_root->commit_root_sem);
-	atomic_dec(&fs_info->async_caching_threads);
 	atomic_dec(&block_group->space_info->caching_threads);
 	wake_up(&block_group->caching_q);
 
-	if (!ret)
-		btrfs_discard_pinned_extents(fs_info, block_group);
-
 	return 0;
 }
 
@@ -1867,7 +1808,7 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
 				BUG_ON(ret);
 			}
 			btrfs_update_pinned_extents(root, node->bytenr,
-						    node->num_bytes, 1, 0);
+						    node->num_bytes, 1);
 			update_reserved_extents(root, node->bytenr,
 						node->num_bytes, 0);
 		}
@@ -3100,19 +3041,15 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
 }
 
 int btrfs_update_pinned_extents(struct btrfs_root *root,
-				u64 bytenr, u64 num, int pin, int mark_free)
+				u64 bytenr, u64 num, int pin)
 {
 	u64 len;
 	struct btrfs_block_group_cache *cache;
 	struct btrfs_fs_info *fs_info = root->fs_info;
 
-	if (pin) {
+	if (pin)
 		set_extent_dirty(&fs_info->pinned_extents,
 				bytenr, bytenr + num - 1, GFP_NOFS);
-	} else {
-		clear_extent_dirty(&fs_info->pinned_extents,
-				bytenr, bytenr + num - 1, GFP_NOFS);
-	}
 
 	while (num > 0) {
 		cache = btrfs_lookup_block_group(fs_info, bytenr);
@@ -3128,14 +3065,34 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
 			spin_unlock(&cache->space_info->lock);
 			fs_info->total_pinned += len;
 		} else {
+			int unpin = 0;
+
+			/*
+			 * in order to not race with the block group caching, we
+			 * only want to unpin the extent if we are cached.  If
+			 * we aren't cached, we want to start async caching this
+			 * block group so we can free the extent the next time
+			 * around.
+			 */
 			spin_lock(&cache->space_info->lock);
 			spin_lock(&cache->lock);
-			cache->pinned -= len;
-			cache->space_info->bytes_pinned -= len;
+			unpin = (cache->cached == BTRFS_CACHE_FINISHED);
+			if (likely(unpin)) {
+				cache->pinned -= len;
+				cache->space_info->bytes_pinned -= len;
+				fs_info->total_pinned -= len;
+			}
 			spin_unlock(&cache->lock);
 			spin_unlock(&cache->space_info->lock);
-			fs_info->total_pinned -= len;
-			if (block_group_cache_done(cache) && mark_free)
+
+			if (likely(unpin))
+				clear_extent_dirty(&fs_info->pinned_extents,
+						   bytenr, bytenr + len -1,
+						   GFP_NOFS);
+			else
+				cache_block_group(cache);
+
+			if (unpin)
 				btrfs_add_free_space(cache, bytenr, len);
 		}
 		btrfs_put_block_group(cache);
@@ -3181,27 +3138,15 @@ int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
 	u64 last = 0;
 	u64 start;
 	u64 end;
-	bool caching_kthreads = false;
 	struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents;
 	int ret;
 
-	if (atomic_read(&root->fs_info->async_caching_threads))
-		caching_kthreads = true;
-
 	while (1) {
 		ret = find_first_extent_bit(pinned_extents, last,
 					    &start, &end, EXTENT_DIRTY);
 		if (ret)
 			break;
 
-		/*
-		 * we need to make sure that the pinned extents don't go away
-		 * while we are caching block groups
-		 */
-		if (unlikely(caching_kthreads))
-			set_extent_delalloc(pinned_extents, start, end,
-					    GFP_NOFS);
-
 		set_extent_dirty(copy, start, end, GFP_NOFS);
 		last = end + 1;
 	}
@@ -3215,12 +3160,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 	u64 start;
 	u64 end;
 	int ret;
-	int mark_free = 1;
-
-	ret = find_first_extent_bit(&root->fs_info->pinned_extents, 0,
-				    &start, &end, EXTENT_DELALLOC);
-	if (!ret)
-		mark_free = 0;
 
 	while (1) {
 		ret = find_first_extent_bit(unpin, 0, &start, &end,
@@ -3231,16 +3170,12 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 		ret = btrfs_discard_extent(root, start, end + 1 - start);
 
 		/* unlocks the pinned mutex */
-		btrfs_update_pinned_extents(root, start, end + 1 - start, 0,
-					    mark_free);
+		btrfs_update_pinned_extents(root, start, end + 1 - start, 0);
 		clear_extent_dirty(unpin, start, end, GFP_NOFS);
 
 		cond_resched();
 	}
 
-	if (unlikely(!mark_free))
-		btrfs_discard_pinned_extents(root->fs_info, NULL);
-
 	return ret;
 }
 
@@ -3281,7 +3216,7 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans,
 pinit:
 	btrfs_set_path_blocking(path);
 	/* unlocks the pinned mutex */
-	btrfs_update_pinned_extents(root, bytenr, num_bytes, 1, 0);
+	btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
 
 	BUG_ON(err < 0);
 	return 0;
@@ -3592,7 +3527,7 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
 	if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
 		WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
 		/* unlocks the pinned mutex */
-		btrfs_update_pinned_extents(root, bytenr, num_bytes, 1, 0);
+		btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
 		update_reserved_extents(root, bytenr, num_bytes, 0);
 		ret = 0;
 	} else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 11d0787c6188..d91b0de7c502 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -264,7 +264,7 @@ static int process_one_buffer(struct btrfs_root *log,
 {
 	if (wc->pin)
 		btrfs_update_pinned_extents(log->fs_info->extent_root,
-					    eb->start, eb->len, 1, 0);
+					    eb->start, eb->len, 1);
 
 	if (btrfs_buffer_uptodate(eb, gen)) {
 		if (wc->write)
-- 
cgit v1.2.3-70-g09d2


From 7b91e2661addd8e2419cb45f6a322aa5dab9bcee Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Thu, 23 Jul 2009 15:22:30 -0400
Subject: cifs: fix error handling in mount-time DFS referral chasing code

If the referral is malformed or the hostname can't be resolved, then
the current code generates an oops. Fix it to handle these errors
gracefully.

Reported-by: Sandro Mathys <sm@sandro-mathys.ch>
Acked-by: Igor Mammedov <niallain@gmail.com>
CC: Stable <stable@kernel.org>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifs_dfs_ref.c | 12 +++++++++---
 fs/cifs/connect.c      | 13 ++++++++++---
 2 files changed, 19 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index 3bb11be8b6a8..606912d8f2a8 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -55,7 +55,7 @@ void cifs_dfs_release_automount_timer(void)
  * i.e. strips from UNC trailing path that is not part of share
  * name and fixup missing '\' in the begining of DFS node refferal
  * if neccessary.
- * Returns pointer to share name on success or NULL on error.
+ * Returns pointer to share name on success or ERR_PTR on error.
  * Caller is responsible for freeing returned string.
  */
 static char *cifs_get_share_name(const char *node_name)
@@ -68,7 +68,7 @@ static char *cifs_get_share_name(const char *node_name)
 	UNC = kmalloc(len+2 /*for term null and additional \ if it's missed */,
 			 GFP_KERNEL);
 	if (!UNC)
-		return NULL;
+		return ERR_PTR(-ENOMEM);
 
 	/* get share name and server name */
 	if (node_name[1] != '\\') {
@@ -87,7 +87,7 @@ static char *cifs_get_share_name(const char *node_name)
 		cERROR(1, ("%s: no server name end in node name: %s",
 			__func__, node_name));
 		kfree(UNC);
-		return NULL;
+		return ERR_PTR(-EINVAL);
 	}
 
 	/* find sharename end */
@@ -133,6 +133,12 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
 		return ERR_PTR(-EINVAL);
 
 	*devname = cifs_get_share_name(ref->node_name);
+	if (IS_ERR(*devname)) {
+		rc = PTR_ERR(*devname);
+		*devname = NULL;
+		goto compose_mount_options_err;
+	}
+
 	rc = dns_resolve_server_name_to_ip(*devname, &srvIP);
 	if (rc != 0) {
 		cERROR(1, ("%s: Failed to resolve server part of %s to IP: %d",
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index fc44d316d0bb..f2486889b7bb 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -2544,11 +2544,20 @@ remote_path_check:
 
 			if (mount_data != mount_data_global)
 				kfree(mount_data);
+
 			mount_data = cifs_compose_mount_options(
 					cifs_sb->mountdata, full_path + 1,
 					referrals, &fake_devname);
-			kfree(fake_devname);
+
 			free_dfs_info_array(referrals, num_referrals);
+			kfree(fake_devname);
+			kfree(full_path);
+
+			if (IS_ERR(mount_data)) {
+				rc = PTR_ERR(mount_data);
+				mount_data = NULL;
+				goto mount_fail_check;
+			}
 
 			if (tcon)
 				cifs_put_tcon(tcon);
@@ -2556,8 +2565,6 @@ remote_path_check:
 				cifs_put_smb_ses(pSesInfo);
 
 			cleanup_volume_info(&volume_info);
-			FreeXid(xid);
-			kfree(full_path);
 			referral_walks_count++;
 			goto try_mount_again;
 		}
-- 
cgit v1.2.3-70-g09d2


From 785b4b3a5a992eeb8b495a89b6f690b383010aba Mon Sep 17 00:00:00 2001
From: Peng Tao <bergwolf@gmail.com>
Date: Mon, 27 Jul 2009 21:44:40 -0400
Subject: ext4: fix build warning when EXT4FS_DEBUG is on
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When compiling with EXT4FS_DEBUG on, gcc will complain with following warnings:

linux-2.6/fs/ext4/ialloc.c: In function ‘ext4_count_free_inodes’:
linux-2.6/fs/ext4/ialloc.c:1192: warning: format ‘%lu’ expects type
‘long unsigned int’, but argument 2 has type ‘ext4_group_t’

So add a type cast to suppress it.

Signed-off-by: Peng Tao <bergwolf@gmail.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ialloc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 29e6dc7299b8..f3624ead4f6c 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1189,7 +1189,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
 
 		x = ext4_count_free(bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8);
 		printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n",
-			i, ext4_free_inodes_count(sb, gdp), x);
+			(unsigned long) i, ext4_free_inodes_count(sb, gdp), x);
 		bitmap_count += x;
 	}
 	brelse(bitmap_bh);
-- 
cgit v1.2.3-70-g09d2


From f25784b35f590c81d5fb8245a8cd45e1afb6f1b2 Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Tue, 28 Jul 2009 08:41:57 -0400
Subject: Btrfs: Fix async caching interaction with unmount

- don't stop the caching thread until btrfs_commit_super return.

- if caching is interrupted by umount, set last to (u64)-1.
  otherwise the un-scanned range of block group will be considered
  as free extent.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c     | 3 +++
 fs/btrfs/extent-tree.c | 4 +++-
 2 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index c658397c7473..3a9b88759880 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2317,6 +2317,9 @@ int close_ctree(struct btrfs_root *root)
 			printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
 	}
 
+	fs_info->closing = 2;
+	smp_mb();
+
 	if (fs_info->delalloc_bytes) {
 		printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n",
 		       (unsigned long long)fs_info->delalloc_bytes);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 08188f1615d9..fadf69a2764b 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -288,8 +288,10 @@ again:
 
 	while (1) {
 		smp_mb();
-		if (block_group->fs_info->closing)
+		if (block_group->fs_info->closing > 1) {
+			last = (u64)-1;
 			break;
+		}
 
 		leaf = path->nodes[0];
 		slot = path->slots[0];
-- 
cgit v1.2.3-70-g09d2


From 0f58b44582001c8bcdb75f36cf85ebbe5170e959 Mon Sep 17 00:00:00 2001
From: Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
Date: Tue, 14 Jul 2009 17:56:15 +0200
Subject: sysfs: fix hardlink count on device_move

Update directory hardlink count when moving kobjects to a new parent.
Fixes the following problem which occurs when several devices are
moved to the same parent and then unregistered:

> ls -laF /sys/devices/css0/defunct/
> total 0
> drwxr-xr-x 4294967295 root root    0 2009-07-14 17:02 ./
> drwxr-xr-x        114 root root    0 2009-07-14 17:02 ../
> drwxr-xr-x          2 root root    0 2009-07-14 17:01 power/
> -rw-r--r--          1 root root 4096 2009-07-14 17:01 uevent

Signed-off-by: Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
Cc: stable <stable@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/sysfs/dir.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index d88d0fac9fa5..14f2d71ea3ce 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -939,8 +939,10 @@ again:
 	/* Remove from old parent's list and insert into new parent's list. */
 	sysfs_unlink_sibling(sd);
 	sysfs_get(new_parent_sd);
+	drop_nlink(old_parent->d_inode);
 	sysfs_put(sd->s_parent);
 	sd->s_parent = new_parent_sd;
+	inc_nlink(new_parent->d_inode);
 	sysfs_link_sibling(sd);
 
  out_unlock:
-- 
cgit v1.2.3-70-g09d2


From 6352a29305373ae6196491e6d4669f301e26492e Mon Sep 17 00:00:00 2001
From: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
Date: Tue, 28 Jul 2009 13:57:01 -0500
Subject: eCryptfs: Check Tag 11 literal data buffer size

Tag 11 packets are stored in the metadata section of an eCryptfs file to
store the key signature(s) used to encrypt the file encryption key.
After extracting the packet length field to determine the key signature
length, a check is not performed to see if the length would exceed the
key signature buffer size that was passed into parse_tag_11_packet().

Thanks to Ramon de Carvalho Valle for finding this bug using fsfuzzer.

Signed-off-by: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
Cc: stable@kernel.org (2.6.27 and 30)
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ecryptfs/keystore.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs')

diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index af737bb56cb7..5414253d4c97 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -1449,6 +1449,12 @@ parse_tag_11_packet(unsigned char *data, unsigned char *contents,
 		rc = -EINVAL;
 		goto out;
 	}
+	if (unlikely((*tag_11_contents_size) > max_contents_bytes)) {
+		printk(KERN_ERR "Literal data section in tag 11 packet exceeds "
+		       "expected size\n");
+		rc = -EINVAL;
+		goto out;
+	}
 	if (data[(*packet_size)++] != 0x62) {
 		printk(KERN_WARNING "Unrecognizable packet\n");
 		rc = -EINVAL;
-- 
cgit v1.2.3-70-g09d2


From f151cd2c54ddc7714e2f740681350476cda03a28 Mon Sep 17 00:00:00 2001
From: Ramon de Carvalho Valle <ramon@risesecurity.org>
Date: Tue, 28 Jul 2009 13:58:22 -0500
Subject: eCryptfs: parse_tag_3_packet check tag 3 packet encrypted key size

The parse_tag_3_packet function does not check if the tag 3 packet contains a
encrypted key size larger than ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES.

Signed-off-by: Ramon de Carvalho Valle <ramon@risesecurity.org>
[tyhicks@linux.vnet.ibm.com: Added printk newline and changed goto to out_free]
Signed-off-by: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
Cc: stable@kernel.org (2.6.27 and 30)
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ecryptfs/keystore.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'fs')

diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index 5414253d4c97..259525c9abb8 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -1303,6 +1303,13 @@ parse_tag_3_packet(struct ecryptfs_crypt_stat *crypt_stat,
 	}
 	(*new_auth_tok)->session_key.encrypted_key_size =
 		(body_size - (ECRYPTFS_SALT_SIZE + 5));
+	if ((*new_auth_tok)->session_key.encrypted_key_size
+	    > ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES) {
+		printk(KERN_WARNING "Tag 3 packet contains key larger "
+		       "than ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES\n");
+		rc = -EINVAL;
+		goto out_free;
+	}
 	if (unlikely(data[(*packet_size)++] != 0x04)) {
 		printk(KERN_WARNING "Unknown version number [%d]\n",
 		       data[(*packet_size) - 1]);
-- 
cgit v1.2.3-70-g09d2


From dddac6a7b445de95515f64fdf82fe5dc36c02f26 Mon Sep 17 00:00:00 2001
From: Alan Jenkins <alan-jenkins@tuffmail.co.uk>
Date: Wed, 29 Jul 2009 21:07:55 +0200
Subject: PM / Hibernate: Replace bdget call with simple atomic_inc of i_count

Create bdgrab().  This function copies an existing reference to a
block_device.  It is safe to call from any context.

Hibernation code wishes to copy a reference to the active swap device.
Right now it calls bdget() under a spinlock, but this is wrong because
bdget() can sleep.  It doesn't need a full bdget() because we already
hold a reference to active swap devices (and the spinlock protects
against swapoff).

Fixes http://bugzilla.kernel.org/show_bug.cgi?id=13827

Signed-off-by: Alan Jenkins <alan-jenkins@tuffmail.co.uk>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 fs/block_dev.c     | 10 ++++++++++
 include/linux/fs.h |  1 +
 mm/swapfile.c      |  4 ++--
 3 files changed, 13 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 3a6d4fb2a329..94dfda24c06e 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -564,6 +564,16 @@ struct block_device *bdget(dev_t dev)
 
 EXPORT_SYMBOL(bdget);
 
+/**
+ * bdgrab -- Grab a reference to an already referenced block device
+ * @bdev:	Block device to grab a reference to.
+ */
+struct block_device *bdgrab(struct block_device *bdev)
+{
+	atomic_inc(&bdev->bd_inode->i_count);
+	return bdev;
+}
+
 long nr_blockdev_pages(void)
 {
 	struct block_device *bdev;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 0872372184fe..a36ffa5a77a4 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1946,6 +1946,7 @@ extern void putname(const char *name);
 extern int register_blkdev(unsigned int, const char *);
 extern void unregister_blkdev(unsigned int, const char *);
 extern struct block_device *bdget(dev_t);
+extern struct block_device *bdgrab(struct block_device *bdev);
 extern void bd_set_size(struct block_device *, loff_t size);
 extern void bd_forget(struct inode *inode);
 extern void bdput(struct block_device *);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index d1ade1a48ee7..8ffdc0d23c53 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -753,7 +753,7 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
 
 		if (!bdev) {
 			if (bdev_p)
-				*bdev_p = bdget(sis->bdev->bd_dev);
+				*bdev_p = bdgrab(sis->bdev);
 
 			spin_unlock(&swap_lock);
 			return i;
@@ -765,7 +765,7 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
 					struct swap_extent, list);
 			if (se->start_block == offset) {
 				if (bdev_p)
-					*bdev_p = bdget(sis->bdev->bd_dev);
+					*bdev_p = bdgrab(sis->bdev);
 
 				spin_unlock(&swap_lock);
 				bdput(bdev);
-- 
cgit v1.2.3-70-g09d2


From 5c8053652328693d10551131432ef3573e77ed2d Mon Sep 17 00:00:00 2001
From: Catalin Marinas <catalin.marinas@arm.com>
Date: Wed, 29 Jul 2009 15:04:11 -0700
Subject: fs/ramfs/file-nommu.c needs include/linux/sched.h

This file makes use of various macros defined in files like asm/current.h
or asm-generic/resource.h.  All these files can be included via sched.h.
The building of the !MMU ARM kernel (with additional patches) fails
without this change.

Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
Acked-by: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ramfs/file-nommu.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index ebb2c417912c..11f0c06316de 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -20,6 +20,7 @@
 #include <linux/ramfs.h>
 #include <linux/pagevec.h>
 #include <linux/mman.h>
+#include <linux/sched.h>
 
 #include <asm/uaccess.h>
 #include "internal.h"
-- 
cgit v1.2.3-70-g09d2


From 5bd9052d79daa4c8beb45436c408b6de672adb82 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Thu, 30 Jul 2009 02:26:14 +0000
Subject: [CIFS] Updates fs/cifs/CHANGES

Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/CHANGES | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs')

diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index 92888aa90749..651cefde385b 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -1,3 +1,9 @@
+Version 1.60
+-------------
+Fix memory leak in reconnect.  Fix oops in DFS mount error path.
+Set s_maxbytes to smaller (the max that vfs can handle) so that
+sendfile will now work over cifs mounts again.
+
 Version 1.59
 ------------
 Client uses server inode numbers (which are persistent) rather than
-- 
cgit v1.2.3-70-g09d2


From 2163b1e616c41c286f5ab79912671cd4bf52057c Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Thu, 25 Jun 2009 16:30:26 +0100
Subject: GFS2: Shrink the shrinker

This patch removes some of the special cases that the shrinker
was trying to deal with. As a result we leave fewer items on
the list and none at all which cannot be demoted. This makes
the list scanning more efficient and solves some issues seen
with large numbers of inodes.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/glock.c | 23 +++++------------------
 1 file changed, 5 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 297421c0427a..fdb796c4f940 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1300,7 +1300,6 @@ static int gfs2_shrink_glock_memory(int nr, gfp_t gfp_mask)
 	struct gfs2_glock *gl;
 	int may_demote;
 	int nr_skipped = 0;
-	int got_ref = 0;
 	LIST_HEAD(skipped);
 
 	if (nr == 0)
@@ -1318,7 +1317,6 @@ static int gfs2_shrink_glock_memory(int nr, gfp_t gfp_mask)
 		/* Test for being demotable */
 		if (!test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
 			gfs2_glock_hold(gl);
-			got_ref = 1;
 			spin_unlock(&lru_lock);
 			spin_lock(&gl->gl_spin);
 			may_demote = demote_ok(gl);
@@ -1327,25 +1325,14 @@ static int gfs2_shrink_glock_memory(int nr, gfp_t gfp_mask)
 			if (may_demote) {
 				handle_callback(gl, LM_ST_UNLOCKED, 0);
 				nr--;
-				if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
-					gfs2_glock_put(gl);
-				got_ref = 0;
 			}
+			if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
+				gfs2_glock_put(gl);
 			spin_lock(&lru_lock);
-			if (may_demote)
-				continue;
-		}
-		if (list_empty(&gl->gl_lru) &&
-		    (atomic_read(&gl->gl_ref) <= (2 + got_ref))) {
-			nr_skipped++;
-			list_add(&gl->gl_lru, &skipped);
-		}
-		if (got_ref) {
-			spin_unlock(&lru_lock);
-			gfs2_glock_put(gl);
-			spin_lock(&lru_lock);
-			got_ref = 0;
+			continue;
 		}
+		nr_skipped++;
+		list_add(&gl->gl_lru, &skipped);
 	}
 	list_splice(&skipped, &lru_list);
 	atomic_add(nr_skipped, &lru_count);
-- 
cgit v1.2.3-70-g09d2


From 1946f70ab5e4eb8b54a8eaaedba2293a3750ab7e Mon Sep 17 00:00:00 2001
From: Benjamin Marzinski <bmarzins@redhat.com>
Date: Thu, 25 Jun 2009 15:09:51 -0500
Subject: GFS2: keep statfs info in sync on grows

GFS2 wasn't syncing its statfs info on grows.  This causes a problem
when you grow the filesystem on multiple nodes.  GFS2 would calculate
the new space based on the resource groups (which are always current),
and then assume that the filesystem had grown the from the existing
statfs size.  If you grew the filesystem on two different nodes in a
short time, the second node wouldn't see the statfs size change from the
first node, and would assume that it was grown by a larger amount than
it was.  When all these changes were synced out, the total fileystem
size would be incorrect (the first grow would be counted twice).

This patch syncs makes GFS2 read in the statfs changes from disk before
a grow, and write them out after the grow, while the master statfs inode
is locked.

Signed-off-by: Benjamin Marzinski <bmarzins@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/aops.c  | 39 +++++++++++++++++++++++++++++++++++++++
 fs/gfs2/super.c | 39 +++++++++++++++++++++++++--------------
 fs/gfs2/super.h |  4 ++++
 3 files changed, 68 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 03ebb439ace0..7ebae9a4ecc0 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -624,6 +624,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
 {
 	struct gfs2_inode *ip = GFS2_I(mapping->host);
 	struct gfs2_sbd *sdp = GFS2_SB(mapping->host);
+	struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
 	unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
 	int alloc_required;
 	int error = 0;
@@ -637,6 +638,14 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
 	error = gfs2_glock_nq(&ip->i_gh);
 	if (unlikely(error))
 		goto out_uninit;
+	if (&ip->i_inode == sdp->sd_rindex) {
+		error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE,
+					   GL_NOCACHE, &m_ip->i_gh);
+		if (unlikely(error)) {
+			gfs2_glock_dq(&ip->i_gh);
+			goto out_uninit;
+		}
+	}
 
 	error = gfs2_write_alloc_required(ip, pos, len, &alloc_required);
 	if (error)
@@ -667,6 +676,8 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
 		rblocks += data_blocks ? data_blocks : 1;
 	if (ind_blocks || data_blocks)
 		rblocks += RES_STATFS + RES_QUOTA;
+	if (&ip->i_inode == sdp->sd_rindex)
+		rblocks += 2 * RES_STATFS;
 
 	error = gfs2_trans_begin(sdp, rblocks,
 				 PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize);
@@ -712,6 +723,10 @@ out_alloc_put:
 		gfs2_alloc_put(ip);
 	}
 out_unlock:
+	if (&ip->i_inode == sdp->sd_rindex) {
+		gfs2_glock_dq(&m_ip->i_gh);
+		gfs2_holder_uninit(&m_ip->i_gh);
+	}
 	gfs2_glock_dq(&ip->i_gh);
 out_uninit:
 	gfs2_holder_uninit(&ip->i_gh);
@@ -725,14 +740,21 @@ out_uninit:
 static void adjust_fs_space(struct inode *inode)
 {
 	struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
+	struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
+	struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode);
 	struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
 	struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
+	struct buffer_head *m_bh, *l_bh;
 	u64 fs_total, new_free;
 
 	/* Total up the file system space, according to the latest rindex. */
 	fs_total = gfs2_ri_total(sdp);
+	if (gfs2_meta_inode_buffer(m_ip, &m_bh) != 0)
+		return;
 
 	spin_lock(&sdp->sd_statfs_spin);
+	gfs2_statfs_change_in(m_sc, m_bh->b_data +
+			      sizeof(struct gfs2_dinode));
 	if (fs_total > (m_sc->sc_total + l_sc->sc_total))
 		new_free = fs_total - (m_sc->sc_total + l_sc->sc_total);
 	else
@@ -741,6 +763,13 @@ static void adjust_fs_space(struct inode *inode)
 	fs_warn(sdp, "File system extended by %llu blocks.\n",
 		(unsigned long long)new_free);
 	gfs2_statfs_change(sdp, new_free, new_free, 0);
+
+	if (gfs2_meta_inode_buffer(l_ip, &l_bh) != 0)
+		goto out;
+	update_statfs(sdp, m_bh, l_bh);
+	brelse(l_bh);
+out:
+	brelse(m_bh);
 }
 
 /**
@@ -763,6 +792,7 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
 	u64 to = pos + copied;
 	void *kaddr;
 	unsigned char *buf = dibh->b_data + sizeof(struct gfs2_dinode);
@@ -794,6 +824,10 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
 
 	brelse(dibh);
 	gfs2_trans_end(sdp);
+	if (inode == sdp->sd_rindex) {
+		gfs2_glock_dq(&m_ip->i_gh);
+		gfs2_holder_uninit(&m_ip->i_gh);
+	}
 	gfs2_glock_dq(&ip->i_gh);
 	gfs2_holder_uninit(&ip->i_gh);
 	return copied;
@@ -823,6 +857,7 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
 	struct inode *inode = page->mapping->host;
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
 	struct buffer_head *dibh;
 	struct gfs2_alloc *al = ip->i_alloc;
 	unsigned int from = pos & (PAGE_CACHE_SIZE - 1);
@@ -865,6 +900,10 @@ failed:
 		gfs2_quota_unlock(ip);
 		gfs2_alloc_put(ip);
 	}
+	if (inode == sdp->sd_rindex) {
+		gfs2_glock_dq(&m_ip->i_gh);
+		gfs2_holder_uninit(&m_ip->i_gh);
+	}
 	gfs2_glock_dq(&ip->i_gh);
 	gfs2_holder_uninit(&ip->i_gh);
 	return ret;
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 0a6801336470..552e321cee5e 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -353,7 +353,7 @@ fail:
 	return error;
 }
 
-static void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, const void *buf)
+void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, const void *buf)
 {
 	const struct gfs2_statfs_change *str = buf;
 
@@ -441,6 +441,29 @@ void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free,
 	brelse(l_bh);
 }
 
+void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh,
+		   struct buffer_head *l_bh)
+{
+	struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
+	struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode);
+	struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
+	struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
+
+	gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1);
+
+	spin_lock(&sdp->sd_statfs_spin);
+	m_sc->sc_total += l_sc->sc_total;
+	m_sc->sc_free += l_sc->sc_free;
+	m_sc->sc_dinodes += l_sc->sc_dinodes;
+	memset(l_sc, 0, sizeof(struct gfs2_statfs_change));
+	memset(l_bh->b_data + sizeof(struct gfs2_dinode),
+	       0, sizeof(struct gfs2_statfs_change));
+	spin_unlock(&sdp->sd_statfs_spin);
+
+	gfs2_trans_add_bh(m_ip->i_gl, m_bh, 1);
+	gfs2_statfs_change_out(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode));
+}
+
 int gfs2_statfs_sync(struct gfs2_sbd *sdp)
 {
 	struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
@@ -477,19 +500,7 @@ int gfs2_statfs_sync(struct gfs2_sbd *sdp)
 	if (error)
 		goto out_bh2;
 
-	gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1);
-
-	spin_lock(&sdp->sd_statfs_spin);
-	m_sc->sc_total += l_sc->sc_total;
-	m_sc->sc_free += l_sc->sc_free;
-	m_sc->sc_dinodes += l_sc->sc_dinodes;
-	memset(l_sc, 0, sizeof(struct gfs2_statfs_change));
-	memset(l_bh->b_data + sizeof(struct gfs2_dinode),
-	       0, sizeof(struct gfs2_statfs_change));
-	spin_unlock(&sdp->sd_statfs_spin);
-
-	gfs2_trans_add_bh(m_ip->i_gl, m_bh, 1);
-	gfs2_statfs_change_out(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode));
+	update_statfs(sdp, m_bh, l_bh);
 
 	gfs2_trans_end(sdp);
 
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index b56413e3e40d..22e0417ed996 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -40,6 +40,10 @@ extern int gfs2_make_fs_rw(struct gfs2_sbd *sdp);
 extern int gfs2_statfs_init(struct gfs2_sbd *sdp);
 extern void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free,
 			       s64 dinodes);
+extern void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc,
+				  const void *buf);
+extern void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh,
+			  struct buffer_head *l_bh);
 extern int gfs2_statfs_sync(struct gfs2_sbd *sdp);
 
 extern int gfs2_freeze_fs(struct gfs2_sbd *sdp);
-- 
cgit v1.2.3-70-g09d2


From a51b56fff3f04fc5aa66b21a2a6d693ee9862d66 Mon Sep 17 00:00:00 2001
From: Benjamin Marzinski <bmarzins@redhat.com>
Date: Tue, 30 Jun 2009 13:51:11 -0500
Subject: GFS2: Fix panic in glock memory shrinker

It is possible for gfs2_shrink_glock_memory() to check a glock for
demotion
that's in the process of being freed by gfs2_glock_put().  In this case,
gfs2_shrink_glock_memory() will acquire a new reference to this glock,
and
then try to free the glock itself when it drops the refernce.  To solve
this, gfs2_shrink_glock_memory() just needs to check if the glock is in
the process of being freed, and if so skip it without ever unlocking the
lru_lock.

Signed-off-by: Benjamin Marzinski <bmarzins@redhat.com>
Acked-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/glock.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index fdb796c4f940..827136ee794c 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1314,6 +1314,10 @@ static int gfs2_shrink_glock_memory(int nr, gfp_t gfp_mask)
 		list_del_init(&gl->gl_lru);
 		atomic_dec(&lru_count);
 
+		/* Check if glock is about to be freed */
+		if (atomic_read(&gl->gl_ref) == 0)
+			continue;
+
 		/* Test for being demotable */
 		if (!test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
 			gfs2_glock_hold(gl);
-- 
cgit v1.2.3-70-g09d2


From 1e19a19584b332eb92a573b66b7342fb97e67507 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Fri, 10 Jul 2009 21:13:38 +0100
Subject: GFS2: Don't try and dealloc own inode

When searching for unlinked, but still allocated inodes during block
allocation, avoid the block relating to the inode that is doing the
allocation. This fixes a hang caused when an unlinked, but still
open, inode tries to allocate some more blocks and lands up
finding itself during the search for deallocatable inodes.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/rgrp.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index daa4ae341a29..5e5074176daa 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -961,7 +961,8 @@ static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al)
  * Returns: The inode, if one has been found
  */
 
-static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked)
+static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked,
+				     u64 skip)
 {
 	struct inode *inode;
 	u32 goal = 0, block;
@@ -985,6 +986,8 @@ static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked)
 		goal++;
 		if (*last_unlinked != NO_BLOCK && no_addr <= *last_unlinked)
 			continue;
+		if (no_addr == skip)
+			continue;
 		*last_unlinked = no_addr;
 		inode = gfs2_inode_lookup(rgd->rd_sbd->sd_vfs, DT_UNKNOWN,
 					  no_addr, -1, 1);
@@ -1104,7 +1107,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
 			if (try_rgrp_fit(rgd, al))
 				goto out;
 			if (rgd->rd_flags & GFS2_RDF_CHECK)
-				inode = try_rgrp_unlink(rgd, last_unlinked);
+				inode = try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr);
 			if (!rg_locked)
 				gfs2_glock_dq_uninit(&al->al_rgd_gh);
 			if (inode)
@@ -1138,7 +1141,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
 			if (try_rgrp_fit(rgd, al))
 				goto out;
 			if (rgd->rd_flags & GFS2_RDF_CHECK)
-				inode = try_rgrp_unlink(rgd, last_unlinked);
+				inode = try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr);
 			if (!rg_locked)
 				gfs2_glock_dq_uninit(&al->al_rgd_gh);
 			if (inode)
-- 
cgit v1.2.3-70-g09d2


From 8ff22a6f9bdaac87c0eeb1d56c736181f11b4221 Mon Sep 17 00:00:00 2001
From: Benjamin Marzinski <bmarzins@redhat.com>
Date: Fri, 10 Jul 2009 18:04:24 -0500
Subject: GFS2: Don't put unlikely reclaim candidates on the reclaim list.

GFS2 was placing far too many glocks on the reclaim list that were not good
candidates for freeing up from cache.  These locks would sit there and
repeatedly get scanned to see if they could be reclaimed, wasting a lot
of time when there was memory pressure. This fix does more checks on the
locks to see if they are actually likely to be removable from cache.

Signed-off-by: Benjamin Marzinski <bmarzins@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/glock.c | 72 ++++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 46 insertions(+), 26 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 827136ee794c..f041a89e1ab8 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -173,6 +173,26 @@ static void gfs2_glock_hold(struct gfs2_glock *gl)
 	atomic_inc(&gl->gl_ref);
 }
 
+/**
+ * demote_ok - Check to see if it's ok to unlock a glock
+ * @gl: the glock
+ *
+ * Returns: 1 if it's ok
+ */
+
+static int demote_ok(const struct gfs2_glock *gl)
+{
+	const struct gfs2_glock_operations *glops = gl->gl_ops;
+
+	if (gl->gl_state == LM_ST_UNLOCKED)
+		return 0;
+	if (!list_empty(&gl->gl_holders))
+		return 0;
+	if (glops->go_demote_ok)
+		return glops->go_demote_ok(gl);
+	return 1;
+}
+
 /**
  * gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list
  * @gl: the glock
@@ -181,14 +201,34 @@ static void gfs2_glock_hold(struct gfs2_glock *gl)
 
 static void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
 {
+	int may_reclaim;
+	may_reclaim = (demote_ok(gl) &&
+		       (atomic_read(&gl->gl_ref) == 1 ||
+			(gl->gl_name.ln_type == LM_TYPE_INODE &&
+			 atomic_read(&gl->gl_ref) <= 2)));
 	spin_lock(&lru_lock);
-	if (list_empty(&gl->gl_lru) && gl->gl_state != LM_ST_UNLOCKED) {
+	if (list_empty(&gl->gl_lru) && may_reclaim) {
 		list_add_tail(&gl->gl_lru, &lru_list);
 		atomic_inc(&lru_count);
 	}
 	spin_unlock(&lru_lock);
 }
 
+/**
+ * gfs2_glock_put_nolock() - Decrement reference count on glock
+ * @gl: The glock to put
+ *
+ * This function should only be used if the caller has its own reference
+ * to the glock, in addition to the one it is dropping.
+ */
+
+static void gfs2_glock_put_nolock(struct gfs2_glock *gl)
+{
+	if (atomic_dec_and_test(&gl->gl_ref))
+		GLOCK_BUG_ON(gl, 1);
+	gfs2_glock_schedule_for_reclaim(gl);
+}
+
 /**
  * gfs2_glock_put() - Decrement reference count on glock
  * @gl: The glock to put
@@ -214,9 +254,9 @@ int gfs2_glock_put(struct gfs2_glock *gl)
 		rv = 1;
 		goto out;
 	}
-	/* 1 for being hashed, 1 for having state != LM_ST_UNLOCKED */
-	if (atomic_read(&gl->gl_ref) == 2)
-		gfs2_glock_schedule_for_reclaim(gl);
+	spin_lock(&gl->gl_spin);
+	gfs2_glock_schedule_for_reclaim(gl);
+	spin_unlock(&gl->gl_spin);
 	write_unlock(gl_lock_addr(gl->gl_hash));
 out:
 	return rv;
@@ -398,7 +438,7 @@ static void state_change(struct gfs2_glock *gl, unsigned int new_state)
 		if (held2)
 			gfs2_glock_hold(gl);
 		else
-			gfs2_glock_put(gl);
+			gfs2_glock_put_nolock(gl);
 	}
 
 	gl->gl_state = new_state;
@@ -633,7 +673,7 @@ out:
 out_sched:
 	gfs2_glock_hold(gl);
 	if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
-		gfs2_glock_put(gl);
+		gfs2_glock_put_nolock(gl);
 out_unlock:
 	clear_bit(GLF_LOCK, &gl->gl_flags);
 	goto out;
@@ -1274,26 +1314,6 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
 		gfs2_glock_put(gl);
 }
 
-/**
- * demote_ok - Check to see if it's ok to unlock a glock
- * @gl: the glock
- *
- * Returns: 1 if it's ok
- */
-
-static int demote_ok(const struct gfs2_glock *gl)
-{
-	const struct gfs2_glock_operations *glops = gl->gl_ops;
-
-	if (gl->gl_state == LM_ST_UNLOCKED)
-		return 0;
-	if (!list_empty(&gl->gl_holders))
-		return 0;
-	if (glops->go_demote_ok)
-		return glops->go_demote_ok(gl);
-	return 1;
-}
-
 
 static int gfs2_shrink_glock_memory(int nr, gfp_t gfp_mask)
 {
-- 
cgit v1.2.3-70-g09d2


From 6b94617024bd6810cde1d0d491202c30d5a38d91 Mon Sep 17 00:00:00 2001
From: Benjamin Marzinski <bmarzins@redhat.com>
Date: Fri, 10 Jul 2009 18:13:26 -0500
Subject: GFS2: Fix incorrent statfs consistency check

Since both linked and unlinked inodes are counted by rgd->rd_dinodes, It
makes no sense to count them with the used data blocks (first check that
I changed), it makes sense to count them with the linked inodes (second
check), and it makes no sense to care if there are more unlinked inodes
than linked ones. This fixes these errors.

Signed-off-by: Benjamin Marzinski <bmarzins@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/rgrp.c | 14 +++-----------
 1 file changed, 3 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 5e5074176daa..fba795798d3a 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -285,27 +285,19 @@ void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd)
 	}
 
 	tmp = rgd->rd_data - rgd->rd_free - rgd->rd_dinodes;
-	if (count[1] + count[2] != tmp) {
+	if (count[1] != tmp) {
 		if (gfs2_consist_rgrpd(rgd))
 			fs_err(sdp, "used data mismatch:  %u != %u\n",
 			       count[1], tmp);
 		return;
 	}
 
-	if (count[3] != rgd->rd_dinodes) {
+	if (count[2] + count[3] != rgd->rd_dinodes) {
 		if (gfs2_consist_rgrpd(rgd))
 			fs_err(sdp, "used metadata mismatch:  %u != %u\n",
-			       count[3], rgd->rd_dinodes);
+			       count[2] + count[3], rgd->rd_dinodes);
 		return;
 	}
-
-	if (count[2] > count[3]) {
-		if (gfs2_consist_rgrpd(rgd))
-			fs_err(sdp, "unlinked inodes > inodes:  %u\n",
-			       count[2]);
-		return;
-	}
-
 }
 
 static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block)
-- 
cgit v1.2.3-70-g09d2


From b94a170e96dc416828af9d350ae2e34b70ae7347 Mon Sep 17 00:00:00 2001
From: Benjamin Marzinski <bmarzins@redhat.com>
Date: Thu, 23 Jul 2009 18:52:34 -0500
Subject: GFS2: remove dcache entries for remote deleted inodes

When a file is deleted from a gfs2 filesystem on one node, a dcache
entry for it may still exist on other nodes in the cluster. If this
happens, gfs2 will be unable to free this file on disk. Because of this,
it's possible to have a gfs2 filesystem with no files on it and no free
space. With this patch, when a node receives a callback notifying it
that the file is being deleted on another node, it schedules a new
workqueue thread to remove the file's dcache entry.

Signed-off-by: Benjamin Marzinski <bmarzins@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/glock.c  | 43 ++++++++++++++++++++++++++++++++++++++-----
 fs/gfs2/glock.h  |  3 +++
 fs/gfs2/glops.c  | 21 +++++++++++++++++++++
 fs/gfs2/incore.h |  2 ++
 fs/gfs2/super.c  |  1 +
 5 files changed, 65 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index f041a89e1ab8..8b674b1f3a55 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -63,6 +63,7 @@ static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int
 static DECLARE_RWSEM(gfs2_umount_flush_sem);
 static struct dentry *gfs2_root;
 static struct workqueue_struct *glock_workqueue;
+struct workqueue_struct *gfs2_delete_workqueue;
 static LIST_HEAD(lru_list);
 static atomic_t lru_count = ATOMIC_INIT(0);
 static DEFINE_SPINLOCK(lru_lock);
@@ -167,7 +168,7 @@ static void glock_free(struct gfs2_glock *gl)
  *
  */
 
-static void gfs2_glock_hold(struct gfs2_glock *gl)
+void gfs2_glock_hold(struct gfs2_glock *gl)
 {
 	GLOCK_BUG_ON(gl, atomic_read(&gl->gl_ref) == 0);
 	atomic_inc(&gl->gl_ref);
@@ -222,7 +223,7 @@ static void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
  * to the glock, in addition to the one it is dropping.
  */
 
-static void gfs2_glock_put_nolock(struct gfs2_glock *gl)
+void gfs2_glock_put_nolock(struct gfs2_glock *gl)
 {
 	if (atomic_dec_and_test(&gl->gl_ref))
 		GLOCK_BUG_ON(gl, 1);
@@ -679,6 +680,29 @@ out_unlock:
 	goto out;
 }
 
+static void delete_work_func(struct work_struct *work)
+{
+	struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_delete);
+	struct gfs2_sbd *sdp = gl->gl_sbd;
+	struct gfs2_inode *ip = NULL;
+	struct inode *inode;
+	u64 no_addr = 0;
+
+	spin_lock(&gl->gl_spin);
+	ip = (struct gfs2_inode *)gl->gl_object;
+	if (ip)
+		no_addr = ip->i_no_addr;
+	spin_unlock(&gl->gl_spin);
+	if (ip) {
+		inode = gfs2_ilookup(sdp->sd_vfs, no_addr);
+		if (inode) {
+			d_prune_aliases(inode);
+			iput(inode);
+		}
+	}
+	gfs2_glock_put(gl);
+}
+
 static void glock_work_func(struct work_struct *work)
 {
 	unsigned long delay = 0;
@@ -757,6 +781,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
 	gl->gl_sbd = sdp;
 	gl->gl_aspace = NULL;
 	INIT_DELAYED_WORK(&gl->gl_work, glock_work_func);
+	INIT_WORK(&gl->gl_delete, delete_work_func);
 
 	/* If this glock protects actual on-disk data or metadata blocks,
 	   create a VFS inode to manage the pages/buffers holding them. */
@@ -898,6 +923,8 @@ static void handle_callback(struct gfs2_glock *gl, unsigned int state,
 			gl->gl_demote_state != state) {
 		gl->gl_demote_state = LM_ST_UNLOCKED;
 	}
+	if (gl->gl_ops->go_callback)
+		gl->gl_ops->go_callback(gl);
 	trace_gfs2_demote_rq(gl);
 }
 
@@ -1344,14 +1371,14 @@ static int gfs2_shrink_glock_memory(int nr, gfp_t gfp_mask)
 			spin_unlock(&lru_lock);
 			spin_lock(&gl->gl_spin);
 			may_demote = demote_ok(gl);
-			spin_unlock(&gl->gl_spin);
-			clear_bit(GLF_LOCK, &gl->gl_flags);
 			if (may_demote) {
 				handle_callback(gl, LM_ST_UNLOCKED, 0);
 				nr--;
 			}
 			if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
-				gfs2_glock_put(gl);
+				gfs2_glock_put_nolock(gl);
+			spin_unlock(&gl->gl_spin);
+			clear_bit(GLF_LOCK, &gl->gl_flags);
 			spin_lock(&lru_lock);
 			continue;
 		}
@@ -1738,6 +1765,11 @@ int __init gfs2_glock_init(void)
 	glock_workqueue = create_workqueue("glock_workqueue");
 	if (IS_ERR(glock_workqueue))
 		return PTR_ERR(glock_workqueue);
+	gfs2_delete_workqueue = create_workqueue("delete_workqueue");
+	if (IS_ERR(gfs2_delete_workqueue)) {
+		destroy_workqueue(glock_workqueue);
+		return PTR_ERR(gfs2_delete_workqueue);
+	}
 
 	register_shrinker(&glock_shrinker);
 
@@ -1748,6 +1780,7 @@ void gfs2_glock_exit(void)
 {
 	unregister_shrinker(&glock_shrinker);
 	destroy_workqueue(glock_workqueue);
+	destroy_workqueue(gfs2_delete_workqueue);
 }
 
 static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi)
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index a602a28f6f08..c609894ec0d0 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -143,6 +143,7 @@ struct lm_lockops {
 
 #define GLR_TRYFAILED		13
 
+extern struct workqueue_struct *gfs2_delete_workqueue;
 static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *gl)
 {
 	struct gfs2_holder *gh;
@@ -191,6 +192,8 @@ static inline int gfs2_glock_is_blocking(struct gfs2_glock *gl)
 int gfs2_glock_get(struct gfs2_sbd *sdp,
 		   u64 number, const struct gfs2_glock_operations *glops,
 		   int create, struct gfs2_glock **glp);
+void gfs2_glock_hold(struct gfs2_glock *gl);
+void gfs2_glock_put_nolock(struct gfs2_glock *gl);
 int gfs2_glock_put(struct gfs2_glock *gl);
 void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags,
 		      struct gfs2_holder *gh);
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index d5e4ab155ca0..6985eef06c39 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -323,6 +323,7 @@ static void trans_go_sync(struct gfs2_glock *gl)
 
 	if (gl->gl_state != LM_ST_UNLOCKED &&
 	    test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
+		flush_workqueue(gfs2_delete_workqueue);
 		gfs2_meta_syncfs(sdp);
 		gfs2_log_shutdown(sdp);
 	}
@@ -372,6 +373,25 @@ static int trans_go_demote_ok(const struct gfs2_glock *gl)
 	return 0;
 }
 
+/**
+ * iopen_go_callback - schedule the dcache entry for the inode to be deleted
+ * @gl: the glock
+ *
+ * gl_spin lock is held while calling this
+ */
+static void iopen_go_callback(struct gfs2_glock *gl)
+{
+	struct gfs2_inode *ip = (struct gfs2_inode *)gl->gl_object;
+
+	if (gl->gl_demote_state == LM_ST_UNLOCKED &&
+	    gl->gl_state == LM_ST_SHARED &&
+	    ip && test_bit(GIF_USER, &ip->i_flags)) {
+		gfs2_glock_hold(gl);
+		if (queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0)
+			gfs2_glock_put_nolock(gl);
+	}
+}
+
 const struct gfs2_glock_operations gfs2_meta_glops = {
 	.go_type = LM_TYPE_META,
 };
@@ -406,6 +426,7 @@ const struct gfs2_glock_operations gfs2_trans_glops = {
 
 const struct gfs2_glock_operations gfs2_iopen_glops = {
 	.go_type = LM_TYPE_IOPEN,
+	.go_callback = iopen_go_callback,
 };
 
 const struct gfs2_glock_operations gfs2_flock_glops = {
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 225347fbff3c..61801ada36f0 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -159,6 +159,7 @@ struct gfs2_glock_operations {
 	int (*go_lock) (struct gfs2_holder *gh);
 	void (*go_unlock) (struct gfs2_holder *gh);
 	int (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl);
+	void (*go_callback) (struct gfs2_glock *gl);
 	const int go_type;
 	const unsigned long go_min_hold_time;
 };
@@ -228,6 +229,7 @@ struct gfs2_glock {
 	struct list_head gl_ail_list;
 	atomic_t gl_ail_count;
 	struct delayed_work gl_work;
+	struct work_struct gl_delete;
 };
 
 #define GFS2_MIN_LVB_SIZE 32	/* Min size of LVB that gfs2 supports */
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 552e321cee5e..f522bb017973 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -691,6 +691,7 @@ static int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
 	struct gfs2_holder t_gh;
 	int error;
 
+	flush_workqueue(gfs2_delete_workqueue);
 	gfs2_quota_sync(sdp);
 	gfs2_statfs_sync(sdp);
 
-- 
cgit v1.2.3-70-g09d2


From 276e680d192a67d222fcea51af37b056feffb665 Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Thu, 30 Jul 2009 09:40:40 -0400
Subject: Btrfs: preserve commit_root for async caching

The async block group caching code uses the commit_root pointer
to get a stable version of the extent allocation tree for scanning.
This copy of the tree root isn't going to change and it significantly
reduces the complexity of the scanning code.

During a commit, we have a loop where we update the extent allocation
tree root.  We need to loop because updating the root pointer in
the tree of tree roots may allocate blocks which may change the
extent allocation tree.

Right now the commit_root pointer is changed inside this loop.  It
is more correct to change the commit_root pointer only after all the
looping is done.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       |  4 +---
 fs/btrfs/disk-io.c     |  2 +-
 fs/btrfs/extent-tree.c |  6 +++---
 fs/btrfs/transaction.c | 12 +++++++++---
 4 files changed, 14 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 17ad92c29cfd..38eeb6c49c8a 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -827,6 +827,7 @@ struct btrfs_fs_info {
 	struct mutex drop_mutex;
 	struct mutex volume_mutex;
 	struct mutex tree_reloc_mutex;
+	struct rw_semaphore extent_commit_sem;
 
 	/*
 	 * this protects the ordered operations list only while we are
@@ -961,9 +962,6 @@ struct btrfs_root {
 	/* the node lock is held while changing the node pointer */
 	spinlock_t node_lock;
 
-	/* taken when updating the commit root */
-	struct rw_semaphore commit_root_sem;
-
 	struct extent_buffer *commit_root;
 	struct btrfs_root *log_root;
 	struct btrfs_root *reloc_root;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 3a9b88759880..3cf4cfa575c8 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -907,7 +907,6 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 	spin_lock_init(&root->inode_lock);
 	mutex_init(&root->objectid_mutex);
 	mutex_init(&root->log_mutex);
-	init_rwsem(&root->commit_root_sem);
 	init_waitqueue_head(&root->log_writer_wait);
 	init_waitqueue_head(&root->log_commit_wait[0]);
 	init_waitqueue_head(&root->log_commit_wait[1]);
@@ -1624,6 +1623,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	mutex_init(&fs_info->cleaner_mutex);
 	mutex_init(&fs_info->volume_mutex);
 	mutex_init(&fs_info->tree_reloc_mutex);
+	init_rwsem(&fs_info->extent_commit_sem);
 
 	btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
 	btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index fadf69a2764b..2fe21fa74913 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -267,7 +267,7 @@ static int caching_kthread(void *data)
 	last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
 again:
 	/* need to make sure the commit_root doesn't disappear */
-	down_read(&fs_info->extent_root->commit_root_sem);
+	down_read(&fs_info->extent_commit_sem);
 
 	/*
 	 * We don't want to deadlock with somebody trying to allocate a new
@@ -304,7 +304,7 @@ again:
 
 			if (need_resched()) {
 				btrfs_release_path(fs_info->extent_root, path);
-				up_read(&fs_info->extent_root->commit_root_sem);
+				up_read(&fs_info->extent_commit_sem);
 				cond_resched();
 				goto again;
 			}
@@ -345,7 +345,7 @@ next:
 
 err:
 	btrfs_free_path(path);
-	up_read(&fs_info->extent_root->commit_root_sem);
+	up_read(&fs_info->extent_commit_sem);
 	atomic_dec(&block_group->space_info->caching_threads);
 	wake_up(&block_group->caching_q);
 
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index e51d2bc532f8..de48e4ec808c 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -42,10 +42,8 @@ static noinline void put_transaction(struct btrfs_transaction *transaction)
 
 static noinline void switch_commit_root(struct btrfs_root *root)
 {
-	down_write(&root->commit_root_sem);
 	free_extent_buffer(root->commit_root);
 	root->commit_root = btrfs_root_node(root);
-	up_write(&root->commit_root_sem);
 }
 
 /*
@@ -466,7 +464,10 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
 		ret = btrfs_write_dirty_block_groups(trans, root);
 		BUG_ON(ret);
 	}
-	switch_commit_root(root);
+
+	if (root != root->fs_info->extent_root)
+		switch_commit_root(root);
+
 	return 0;
 }
 
@@ -499,6 +500,11 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
 
 		update_cowonly_root(trans, root);
 	}
+
+	down_write(&fs_info->extent_commit_sem);
+	switch_commit_root(fs_info->extent_root);
+	up_write(&fs_info->extent_commit_sem);
+
 	return 0;
 }
 
-- 
cgit v1.2.3-70-g09d2


From f36f3042eae238bdaefe7c79310afe573bfc3622 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 30 Jul 2009 10:04:48 -0400
Subject: Btrfs: be more polite in the async caching threads

The semaphore used by the async caching threads can prevent a
transaction commit, which can make the FS appear to stall.  This
releases the semaphore more often when a transaction commit is
in progress.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c |  5 +++--
 fs/btrfs/transaction.c | 10 ++++++++++
 fs/btrfs/transaction.h |  1 +
 3 files changed, 14 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 2fe21fa74913..dc84daee6bc4 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -302,10 +302,11 @@ again:
 			else if (ret)
 				break;
 
-			if (need_resched()) {
+			if (need_resched() ||
+			    btrfs_transaction_in_commit(fs_info)) {
 				btrfs_release_path(fs_info->extent_root, path);
 				up_read(&fs_info->extent_commit_sem);
-				cond_resched();
+				schedule_timeout(1);
 				goto again;
 			}
 
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index de48e4ec808c..cdbb5022da52 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -857,6 +857,16 @@ static void update_super_roots(struct btrfs_root *root)
 	super->root_level = root_item->level;
 }
 
+int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
+{
+	int ret = 0;
+	spin_lock(&info->new_trans_lock);
+	if (info->running_transaction)
+		ret = info->running_transaction->in_commit;
+	spin_unlock(&info->new_trans_lock);
+	return ret;
+}
+
 int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root)
 {
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 961c3ee5a2e1..663c67404918 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -107,4 +107,5 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root);
 int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
 					struct extent_io_tree *dirty_pages);
+int btrfs_transaction_in_commit(struct btrfs_fs_info *info);
 #endif
-- 
cgit v1.2.3-70-g09d2


From 4bf17af0dbfe4cf20cb750e22e8e926273e7a7a4 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 14 Jul 2009 19:30:23 +0200
Subject: udf: Fix loading of VAT inode when drive wrongly reports number of
 recorded blocks

VAT inode is located in the last block recorded block of the medium. When the
drive errorneously reports number of recorded blocks, we failed to load the VAT
inode and thus mount the medium. This patch makes kernel try to read VAT inode
from the last block of the device if it is different from the last recorded
block.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/udf/super.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'fs')

diff --git a/fs/udf/super.c b/fs/udf/super.c
index 6832135159b6..9d1b8c2e6c45 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -1087,11 +1087,23 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index)
 	struct udf_inode_info *vati;
 	uint32_t pos;
 	struct virtualAllocationTable20 *vat20;
+	sector_t blocks = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits;
 
 	/* VAT file entry is in the last recorded block */
 	ino.partitionReferenceNum = type1_index;
 	ino.logicalBlockNum = sbi->s_last_block - map->s_partition_root;
 	sbi->s_vat_inode = udf_iget(sb, &ino);
+	if (!sbi->s_vat_inode &&
+	    sbi->s_last_block != blocks - 1) {
+		printk(KERN_NOTICE "UDF-fs: Failed to read VAT inode from the"
+		       " last recorded block (%lu), retrying with the last "
+		       "block of the device (%lu).\n",
+		       (unsigned long)sbi->s_last_block,
+		       (unsigned long)blocks - 1);
+		ino.partitionReferenceNum = type1_index;
+		ino.logicalBlockNum = blocks - 1 - map->s_partition_root;
+		sbi->s_vat_inode = udf_iget(sb, &ino);
+	}
 	if (!sbi->s_vat_inode)
 		return 1;
 
-- 
cgit v1.2.3-70-g09d2


From dee865656f2d8b866f8ac22c60d6363b914e9f12 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 22 Jul 2009 18:12:17 +0200
Subject: quota: Silence lockdep on quota_on

Commit d01730d74d2b0155da50d44555001706294014f7 didn't completely fix
the problem since we still take dqio_mutex and i_mutex in the wrong
order. Move taking of i_mutex further down (luckily it's needed only
for updating inode flags) below where dqio_mutex is taken.

Tested-by: Valdis Kletnieks <valdis.kletnieks@vt.edu>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/quota/dquot.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 70f36c043d62..38f7bd559f35 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -2043,7 +2043,6 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
 		invalidate_bdev(sb->s_bdev);
 	}
 	mutex_lock(&dqopt->dqonoff_mutex);
-	mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
 	if (sb_has_quota_loaded(sb, type)) {
 		error = -EBUSY;
 		goto out_lock;
@@ -2054,9 +2053,11 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
 		 * possible) Also nobody should write to the file - we use
 		 * special IO operations which ignore the immutable bit. */
 		down_write(&dqopt->dqptr_sem);
+		mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
 		oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE |
 					     S_NOQUOTA);
 		inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE;
+		mutex_unlock(&inode->i_mutex);
 		up_write(&dqopt->dqptr_sem);
 		sb->dq_op->drop(inode);
 	}
@@ -2080,7 +2081,6 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
 		goto out_file_init;
 	}
 	mutex_unlock(&dqopt->dqio_mutex);
-	mutex_unlock(&inode->i_mutex);
 	spin_lock(&dq_state_lock);
 	dqopt->flags |= dquot_state_flag(flags, type);
 	spin_unlock(&dq_state_lock);
@@ -2096,13 +2096,14 @@ out_file_init:
 out_lock:
 	if (oldflags != -1) {
 		down_write(&dqopt->dqptr_sem);
+		mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
 		/* Set the flags back (in the case of accidental quotaon()
 		 * on a wrong file we don't want to mess up the flags) */
 		inode->i_flags &= ~(S_NOATIME | S_NOQUOTA | S_IMMUTABLE);
 		inode->i_flags |= oldflags;
+		mutex_unlock(&inode->i_mutex);
 		up_write(&dqopt->dqptr_sem);
 	}
-	mutex_unlock(&inode->i_mutex);
 	mutex_unlock(&dqopt->dqonoff_mutex);
 out_fmt:
 	put_quota_format(fmt);
-- 
cgit v1.2.3-70-g09d2


From e9956fae7dbce6ac770e7a00436c496ddbbd3215 Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Thu, 30 Jul 2009 16:07:10 +0800
Subject: ocfs2/quota: Release lock for error in ocfs2_quota_write.

ocfs2_quota_write needs to release the lock if it fails to
read quota block. So use "goto out" instead of "return err".

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Acked-by: Jan Kara <jack@suse.cz>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/quota_global.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index cc8785cf8f61..d604a6aa0a22 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -238,7 +238,7 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type,
 	}
 	if (err) {
 		mlog_errno(err);
-		return err;
+		goto out;
 	}
 	lock_buffer(bh);
 	if (new)
-- 
cgit v1.2.3-70-g09d2


From 97db39a1f6f69e906e98118392400de5217aa33a Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@sandeen.net>
Date: Sun, 26 Jul 2009 21:52:01 -0500
Subject: xfs: reduce bmv_count in xfs_vn_fiemap

commit 6321e3ed2acf3ee9643cdd403e1c88605d7944ba caused
the full bmv_count's worth of getbmapx structures to get
allocated; telling it to do MAXEXTNUM was a bit insane,
resulting in ENOMEM every time.

Chop it down to something reasonable, the number of slots
in the caller's input buffer.  If this is too large the
caller may get ENOMEM but the reason should not be a
mystery, and they can try again with something smaller.

We add 1 to the value because in the normal getbmap
world, bmv_count includes the header and xfs_getbmap does:

        nex = bmv->bmv_count - 1;
        if (nex <= 0)
                return XFS_ERROR(EINVAL);

Signed-off-by: Eric Sandeen <sandeen@sandeen.net>
Reviewed-by: Olaf Weber <olaf@sgi.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Felix Blyakher <felixb@sgi.com>
---
 fs/xfs/linux-2.6/xfs_iops.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 58973bb46038..8070b34cc287 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -680,8 +680,8 @@ xfs_vn_fiemap(
 	else
 		bm.bmv_length = BTOBB(length);
 
-	/* our formatter will tell xfs_getbmap when to stop. */
-	bm.bmv_count = MAXEXTNUM;
+	/* We add one because in getbmap world count includes the header */
+	bm.bmv_count = fieinfo->fi_extents_max + 1;
 	bm.bmv_iflags = BMV_IF_PREALLOC;
 	if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR)
 		bm.bmv_iflags |= BMV_IF_ATTRFORK;
-- 
cgit v1.2.3-70-g09d2


From c8a4051c3731b6db224482218cfd535ab9393ff8 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@sandeen.net>
Date: Fri, 31 Jul 2009 00:02:17 -0500
Subject: xfs: bump up nr_to_write in xfs_vm_writepage

VM calculation for nr_to_write seems off.  Bump it way
up, this gets simple streaming writes zippy again.
To be reviewed again after Jens' writeback changes.

Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Eric Sandeen <sandeen@sandeen.net>
Cc: Chris Mason <chris.mason@oracle.com>
Reviewed-by: Felix Blyakher <felixb@sgi.com>
Signed-off-by: Felix Blyakher <felixb@sgi.com>
---
 fs/xfs/linux-2.6/xfs_aops.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 7ec89fc05b2b..aecf2519db76 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -1268,6 +1268,14 @@ xfs_vm_writepage(
 	if (!page_has_buffers(page))
 		create_empty_buffers(page, 1 << inode->i_blkbits, 0);
 
+
+	/*
+	 *  VM calculation for nr_to_write seems off.  Bump it way
+	 *  up, this gets simple streaming writes zippy again.
+	 *  To be reviewed again after Jens' writeback changes.
+	 */
+	wbc->nr_to_write *= 4;
+
 	/*
 	 * Convert delayed allocate, unwritten or unmapped space
 	 * to real space and flush out to disk.
-- 
cgit v1.2.3-70-g09d2


From 6606bb97e146a387932efee263745b7240a11193 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Fri, 31 Jul 2009 11:03:58 -0400
Subject: Btrfs: fix btrfs_remove_from_free_space corner case

Yan Zheng hit a problem where we tried to remove some free space but failed
because we couldn't find the free space entry.  This is because the free space
was held within a bitmap that had a starting offset well before the actual
offset of the free space, and there were free space extents that were in the
same range as that offset, so tree_search_offset returned with NULL because we
couldn't find a free space extent that had that offset.  This is fixed by
making sure that if we fail to find the entry, we re-search again with
bitmap_only set to 1 and do an offset_to_bitmap so we can get the appropriate
bitmap.  A similar problem happens in btrfs_alloc_from_bitmap for the
clustering code, but that is not as bad since we will just go and redo our
cluster allocation.

Also this adds some debugging checks to make sure that the free space we are
trying to remove from the bitmap is in fact there.  This can probably go away
after a while, but since this code is only used by the tree-logging stuff it
would be nice to run with it for a while to make sure there are no problems.

Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/free-space-cache.c | 73 +++++++++++++++++++++++++++++++++++++++------
 1 file changed, 64 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index af99b78b288e..5edcee3a617f 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -414,11 +414,29 @@ static noinline int remove_from_bitmap(struct btrfs_block_group_cache *block_gro
 			      u64 *offset, u64 *bytes)
 {
 	u64 end;
+	u64 search_start, search_bytes;
+	int ret;
 
 again:
 	end = bitmap_info->offset +
 		(u64)(BITS_PER_BITMAP * block_group->sectorsize) - 1;
 
+	/*
+	 * XXX - this can go away after a few releases.
+	 *
+	 * since the only user of btrfs_remove_free_space is the tree logging
+	 * stuff, and the only way to test that is under crash conditions, we
+	 * want to have this debug stuff here just in case somethings not
+	 * working.  Search the bitmap for the space we are trying to use to
+	 * make sure its actually there.  If its not there then we need to stop
+	 * because something has gone wrong.
+	 */
+	search_start = *offset;
+	search_bytes = *bytes;
+	ret = search_bitmap(block_group, bitmap_info, &search_start,
+			    &search_bytes);
+	BUG_ON(ret < 0 || search_start != *offset);
+
 	if (*offset > bitmap_info->offset && *offset + *bytes > end) {
 		bitmap_clear_bits(block_group, bitmap_info, *offset,
 				  end - *offset + 1);
@@ -430,6 +448,7 @@ again:
 	}
 
 	if (*bytes) {
+		struct rb_node *next = rb_next(&bitmap_info->offset_index);
 		if (!bitmap_info->bytes) {
 			unlink_free_space(block_group, bitmap_info);
 			kfree(bitmap_info->bitmap);
@@ -438,16 +457,36 @@ again:
 			recalculate_thresholds(block_group);
 		}
 
-		bitmap_info = tree_search_offset(block_group,
-						 offset_to_bitmap(block_group,
-								  *offset),
-						 1, 0);
-		if (!bitmap_info)
+		/*
+		 * no entry after this bitmap, but we still have bytes to
+		 * remove, so something has gone wrong.
+		 */
+		if (!next)
 			return -EINVAL;
 
+		bitmap_info = rb_entry(next, struct btrfs_free_space,
+				       offset_index);
+
+		/*
+		 * if the next entry isn't a bitmap we need to return to let the
+		 * extent stuff do its work.
+		 */
 		if (!bitmap_info->bitmap)
 			return -EAGAIN;
 
+		/*
+		 * Ok the next item is a bitmap, but it may not actually hold
+		 * the information for the rest of this free space stuff, so
+		 * look for it, and if we don't find it return so we can try
+		 * everything over again.
+		 */
+		search_start = *offset;
+		search_bytes = *bytes;
+		ret = search_bitmap(block_group, bitmap_info, &search_start,
+				    &search_bytes);
+		if (ret < 0 || search_start != *offset)
+			return -EAGAIN;
+
 		goto again;
 	} else if (!bitmap_info->bytes) {
 		unlink_free_space(block_group, bitmap_info);
@@ -644,8 +683,17 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
 again:
 	info = tree_search_offset(block_group, offset, 0, 0);
 	if (!info) {
-		WARN_ON(1);
-		goto out_lock;
+		/*
+		 * oops didn't find an extent that matched the space we wanted
+		 * to remove, look for a bitmap instead
+		 */
+		info = tree_search_offset(block_group,
+					  offset_to_bitmap(block_group, offset),
+					  1, 0);
+		if (!info) {
+			WARN_ON(1);
+			goto out_lock;
+		}
 	}
 
 	if (info->bytes < bytes && rb_next(&info->offset_index)) {
@@ -957,8 +1005,15 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
 	if (cluster->block_group != block_group)
 		goto out;
 
-	entry = tree_search_offset(block_group, search_start, 0, 0);
-
+	/*
+	 * search_start is the beginning of the bitmap, but at some point it may
+	 * be a good idea to point to the actual start of the free area in the
+	 * bitmap, so do the offset_to_bitmap trick anyway, and set bitmap_only
+	 * to 1 to make sure we get the bitmap entry
+	 */
+	entry = tree_search_offset(block_group,
+				   offset_to_bitmap(block_group, search_start),
+				   1, 0);
 	if (!entry || !entry->bitmap)
 		goto out;
 
-- 
cgit v1.2.3-70-g09d2


From 013f1b12f4fc479f697acae2f31bad220162cd03 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 31 Jul 2009 14:57:55 -0400
Subject: Btrfs: make sure the async caching thread advances the key

The async caching thread can end up looping forever if a given
search puts it at the last key in a leaf.  It will end up calling
btrfs_next_leaf and then checking if it needs to politely drop
the read semaphore.

Most of the time this looping isn't noticed because it is able to
make progress the next time around.  But, during log replay,
we wait on the async caching thread to finish, and the async thread
is waiting on the commit, and no progress is really made.

The fix used here is to copy the key out of the next leaf,
that way our search lands there properly.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index dc84daee6bc4..72a2b9c28e9f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -265,10 +265,6 @@ static int caching_kthread(void *data)
 
 	atomic_inc(&block_group->space_info->caching_threads);
 	last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
-again:
-	/* need to make sure the commit_root doesn't disappear */
-	down_read(&fs_info->extent_commit_sem);
-
 	/*
 	 * We don't want to deadlock with somebody trying to allocate a new
 	 * extent for the extent root while also trying to search the extent
@@ -282,6 +278,10 @@ again:
 	key.objectid = last;
 	key.offset = 0;
 	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
+again:
+	/* need to make sure the commit_root doesn't disappear */
+	down_read(&fs_info->extent_commit_sem);
+
 	ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
 	if (ret < 0)
 		goto err;
@@ -304,6 +304,19 @@ again:
 
 			if (need_resched() ||
 			    btrfs_transaction_in_commit(fs_info)) {
+				leaf = path->nodes[0];
+
+				/* this shouldn't happen, but if the
+				 * leaf is empty just move on.
+				 */
+				if (btrfs_header_nritems(leaf) == 0)
+					break;
+				/*
+				 * we need to copy the key out so that
+				 * we are sure the next search advances
+				 * us forward in the btree.
+				 */
+				btrfs_item_key_to_cpu(leaf, &key, 0);
 				btrfs_release_path(fs_info->extent_root, path);
 				up_read(&fs_info->extent_commit_sem);
 				schedule_timeout(1);
-- 
cgit v1.2.3-70-g09d2


From ab57a40827d99e2d8e59066a56b93bf6c844c916 Mon Sep 17 00:00:00 2001
From: Goldwyn Rodrigues <rgoldwyn@gmail.com>
Date: Fri, 31 Jul 2009 14:28:02 -0500
Subject: ocfs2: Remove redundant BUG_ON in __dlm_queue_ast()

We BUG_ON() the same thing twice.

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.de>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/dlm/dlmast.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index d07ddbe4b283..81eff8e58322 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -103,7 +103,6 @@ static void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
 		     lock->ast_pending, lock->ml.type);
 		BUG();
 	}
-	BUG_ON(!list_empty(&lock->ast_list));
 	if (lock->ast_pending)
 		mlog(0, "lock has an ast getting flushed right now\n");
 
-- 
cgit v1.2.3-70-g09d2


From a97778457f22181e8c38c4cd7d7e528378738a98 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Tue, 28 Jul 2009 17:55:29 +0900
Subject: nilfs2: fix oops due to inconsistent state in page with discrete
 b-tree nodes

Andrea Gelmini gave me a report that a kernel oops hit on a nilfs
filesystem with a 1KB block size when doing rsync.

This turned out to be caused by an inconsistency of dirty state
between a page and its buffers storing b-tree node blocks.

If the page had multiple buffers split over multiple logs, and if the
logs were written at a time, a dirty flag remained in the page even
every dirty flag in the buffers was cleared.

This will fix the failure by dropping the dirty flag properly for
pages with the discrete multiple b-tree nodes.

Reported-by: Andrea Gelmini <andrea.gelmini@gmail.com>
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Tested-by: Andrea Gelmini <andrea.gelmini@gmail.com>
Cc: stable@kernel.org
---
 fs/nilfs2/segment.c | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 8b5e4778cf28..51ff3d0a4ee2 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -1859,12 +1859,26 @@ static void nilfs_end_page_io(struct page *page, int err)
 	if (!page)
 		return;
 
-	if (buffer_nilfs_node(page_buffers(page)) && !PageWriteback(page))
+	if (buffer_nilfs_node(page_buffers(page)) && !PageWriteback(page)) {
 		/*
 		 * For b-tree node pages, this function may be called twice
 		 * or more because they might be split in a segment.
 		 */
+		if (PageDirty(page)) {
+			/*
+			 * For pages holding split b-tree node buffers, dirty
+			 * flag on the buffers may be cleared discretely.
+			 * In that case, the page is once redirtied for
+			 * remaining buffers, and it must be cancelled if
+			 * all the buffers get cleaned later.
+			 */
+			lock_page(page);
+			if (nilfs_page_buffers_clean(page))
+				__nilfs_clear_page_dirty(page);
+			unlock_page(page);
+		}
 		return;
+	}
 
 	__nilfs_end_page_io(page, err);
 }
-- 
cgit v1.2.3-70-g09d2


From 9b9d6b2434fe942895c341b9a982f158529788ec Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 31 Jul 2009 06:56:09 -0400
Subject: cifs: reinstate original behavior when uid=/gid= options are
 specified

This patch fixes the regression reported here:

http://bugzilla.kernel.org/show_bug.cgi?id=13861

commit 4ae1507f6d266d0cc3dd36e474d83aad70fec9e4 changed the default
behavior when the uid= or gid= option was specified for a mount. The
existing behavior was to always clobber the ownership information
provided by the server when these options were specified. The above
commit changed this behavior so that these options simply provided
defaults when the server did not provide this information (unless
"forceuid" or "forcegid" were specified)

This patch reverts this change so that the default behavior is restored.
It also adds "noforceuid" and "noforcegid" options to make it so that
ownership information from the server is preserved, even when the mount
has uid= or gid= options specified.

It also adds a couple of printk notices that pop up when forceuid or
forcegid options are specified without a uid= or gid= option.

Reported-by: Tom Chiverton <bugzilla.kernel.org@falkensweb.com>
Reviewed-by: Shirish Pargaonkar <shirishp@us.ibm.com>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/connect.c | 42 ++++++++++++++++++++++++++++++------------
 1 file changed, 30 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index f2486889b7bb..1f3345d7fa79 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -803,6 +803,10 @@ cifs_parse_mount_options(char *options, const char *devname,
 	char *data;
 	unsigned int  temp_len, i, j;
 	char separator[2];
+	short int override_uid = -1;
+	short int override_gid = -1;
+	bool uid_specified = false;
+	bool gid_specified = false;
 
 	separator[0] = ',';
 	separator[1] = 0;
@@ -1093,18 +1097,20 @@ cifs_parse_mount_options(char *options, const char *devname,
 						    "too long.\n");
 				return 1;
 			}
-		} else if (strnicmp(data, "uid", 3) == 0) {
-			if (value && *value)
-				vol->linux_uid =
-					simple_strtoul(value, &value, 0);
-		} else if (strnicmp(data, "forceuid", 8) == 0) {
-				vol->override_uid = 1;
-		} else if (strnicmp(data, "gid", 3) == 0) {
-			if (value && *value)
-				vol->linux_gid =
-					simple_strtoul(value, &value, 0);
-		} else if (strnicmp(data, "forcegid", 8) == 0) {
-				vol->override_gid = 1;
+		} else if (!strnicmp(data, "uid", 3) && value && *value) {
+			vol->linux_uid = simple_strtoul(value, &value, 0);
+			uid_specified = true;
+		} else if (!strnicmp(data, "forceuid", 8)) {
+			override_uid = 1;
+		} else if (!strnicmp(data, "noforceuid", 10)) {
+			override_uid = 0;
+		} else if (!strnicmp(data, "gid", 3) && value && *value) {
+			vol->linux_gid = simple_strtoul(value, &value, 0);
+			gid_specified = true;
+		} else if (!strnicmp(data, "forcegid", 8)) {
+			override_gid = 1;
+		} else if (!strnicmp(data, "noforcegid", 10)) {
+			override_gid = 0;
 		} else if (strnicmp(data, "file_mode", 4) == 0) {
 			if (value && *value) {
 				vol->file_mode =
@@ -1355,6 +1361,18 @@ cifs_parse_mount_options(char *options, const char *devname,
 	if (vol->UNCip == NULL)
 		vol->UNCip = &vol->UNC[2];
 
+	if (uid_specified)
+		vol->override_uid = override_uid;
+	else if (override_uid == 1)
+		printk(KERN_NOTICE "CIFS: ignoring forceuid mount option "
+				   "specified with no uid= option.\n");
+
+	if (gid_specified)
+		vol->override_gid = override_gid;
+	else if (override_gid == 1)
+		printk(KERN_NOTICE "CIFS: ignoring forcegid mount option "
+				   "specified with no gid= option.\n");
+
 	return 0;
 }
 
-- 
cgit v1.2.3-70-g09d2


From 01a261e09a21e0ba342d3907a79cf5c78ee3f37a Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Sun, 2 Aug 2009 17:45:55 +0900
Subject: nilfs2: fix missing unlock in error path of nilfs_mdt_write_page

This adds a missing unlock of nilfs->ns_writer_mutex in
nilfs_mdt_write_page() function.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/mdt.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 3d3ddb3f5177..2dfd47714ae5 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -412,8 +412,10 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc)
 		return 0; /* Do not request flush for shadow page cache */
 	if (!sb) {
 		writer = nilfs_get_writer(NILFS_MDT(inode)->mi_nilfs);
-		if (!writer)
+		if (!writer) {
+			nilfs_put_writer(NILFS_MDT(inode)->mi_nilfs);
 			return -EROFS;
+		}
 		sb = writer->s_super;
 	}
 
-- 
cgit v1.2.3-70-g09d2


From 4486d6ede16b362f89b29845af6fe1a26ae78a54 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Mon, 3 Aug 2009 12:45:10 -0400
Subject: cifs: show noforceuid/noforcegid mount options (try #2)

Since forceuid is the default, we now need to show when it's disabled.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsfs.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 44f30504b82d..84b75253b05a 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -376,10 +376,14 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m)
 	seq_printf(s, ",uid=%d", cifs_sb->mnt_uid);
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID)
 		seq_printf(s, ",forceuid");
+	else
+		seq_printf(s, ",noforceuid");
 
 	seq_printf(s, ",gid=%d", cifs_sb->mnt_gid);
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID)
 		seq_printf(s, ",forcegid");
+	else
+		seq_printf(s, ",noforcegid");
 
 	cifs_show_address(s, tcon->ses->server);
 
-- 
cgit v1.2.3-70-g09d2


From 24e2fb615fd6b624c320cec9ea9d91a75dad902e Mon Sep 17 00:00:00 2001
From: Roel Kluin <roel.kluin@gmail.com>
Date: Sun, 2 Aug 2009 13:00:18 +0200
Subject: cifs: Read buffer overflow

Check whether index is within bounds before testing the element.

Acked-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Roel Kluin <roel.kluin@gmail.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifs_unicode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index 60e3c4253de0..714a542cbafc 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -44,7 +44,7 @@ cifs_ucs2_bytes(const __le16 *from, int maxbytes,
 	int maxwords = maxbytes / 2;
 	char tmp[NLS_MAX_CHARSET_SIZE];
 
-	for (i = 0; from[i] && i < maxwords; i++) {
+	for (i = 0; i < maxwords && from[i]; i++) {
 		charlen = codepage->uni2char(le16_to_cpu(from[i]), tmp,
 					     NLS_MAX_CHARSET_SIZE);
 		if (charlen > 0)
-- 
cgit v1.2.3-70-g09d2


From d098564f3b2b5d555e51bca765a6a9e0dda8f2cd Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Tue, 4 Aug 2009 03:53:28 +0000
Subject: [CIFS] Update readme to reflect forceuid mount parms

Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/CHANGES |  3 ++-
 fs/cifs/README  | 25 ++++++++++++-------------
 2 files changed, 14 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index 651cefde385b..e85b1e4389e0 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -2,7 +2,8 @@ Version 1.60
 -------------
 Fix memory leak in reconnect.  Fix oops in DFS mount error path.
 Set s_maxbytes to smaller (the max that vfs can handle) so that
-sendfile will now work over cifs mounts again.
+sendfile will now work over cifs mounts again.  Add noforcegid
+and noforceuid mount parameters.
 
 Version 1.59
 ------------
diff --git a/fs/cifs/README b/fs/cifs/README
index ad92921dbde4..79c1a93400be 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -262,11 +262,11 @@ A partial list of the supported mount options follows:
 		mount.	
   domain	Set the SMB/CIFS workgroup name prepended to the
 		username during CIFS session establishment
-  forceuid	Set the default uid for inodes based on the uid
-		passed in. For mounts to servers
+  forceuid	Set the default uid for inodes to the uid
+		passed in on mount. For mounts to servers
 		which do support the CIFS Unix extensions, such as a
 		properly configured Samba server, the server provides
-		the uid, gid and mode so this parameter should  not be
+		the uid, gid and mode so this parameter should not be
 		specified unless the server and clients uid and gid
 		numbering differ.  If the server and client are in the
 		same domain (e.g. running winbind or nss_ldap) and
@@ -278,11 +278,7 @@ A partial list of the supported mount options follows:
 		of existing files will be the uid (gid) of the person
 		who executed the mount (root, except when mount.cifs
 		is configured setuid for user mounts) unless the "uid=" 
-		(gid) mount option is specified.  For the uid (gid) of newly
-		created files and directories, ie files created since 
-		the last mount of the server share, the expected uid 
-		(gid) is cached as long as the inode remains in 
-		memory on the client.   Also note that permission
+		(gid) mount option is specified. Also note that permission
 		checks (authorization checks) on accesses to a file occur
 		at the server, but there are cases in which an administrator
 		may want to restrict at the client as well.  For those
@@ -290,12 +286,15 @@ A partial list of the supported mount options follows:
 		(such as Windows), permissions can also be checked at the
 		client, and a crude form of client side permission checking 
 		can be enabled by specifying file_mode and dir_mode on 
-		the client.  Note that the mount.cifs helper must be
-		at version 1.10 or higher to support specifying the uid
-		(or gid) in non-numeric form.
-  forcegid	(similar to above but for the groupid instead of uid)
+		the client.  (default)
+  forcegid	(similar to above but for the groupid instead of uid) (default)
+  noforceuid	Fill in file owner information (uid) by requesting it from
+		the server if possible. With this option, the value given in
+		the uid= option (on mount) will only be used if the server
+		can not support returning uids on inodes.
+  noforcegid	(similar to above but for the group owner, gid, instead of uid)
   uid		Set the default uid for inodes, and indicate to the
-		cifs kernel driver which local user mounted . If the server
+		cifs kernel driver which local user mounted. If the server
 		supports the unix extensions the default uid is
 		not used to fill in the owner fields of inodes (files)
 		unless the "forceuid" parameter is specified.
-- 
cgit v1.2.3-70-g09d2


From 57ca7deb062abf56168d15f000c16e25f88a9cf3 Mon Sep 17 00:00:00 2001
From: Anders Grafström <grfstrm@users.sourceforge.net>
Date: Tue, 4 Aug 2009 13:11:47 +0200
Subject: jffs2: Fix return value from jffs2_do_readpage_nolock()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This fixes "kernel BUG at fs/jffs2/file.c:251!".
This pseudocode hopefully illustrates the scenario that triggers it:

jffs2_write_begin {
     jffs2_do_readpage_nolock {
         jffs2_read_inode_range {
             jffs2_read_dnode {
                 Data CRC 33c102e9 != calculated CRC 0ef77e7b for node at 005d42e4
                 return -EIO;
             }
         }
         ClearPageUptodate(pg);
         return 0;
     }
}

jffs2_write_end {
     BUG_ON(!PageUptodate(pg));
}

Signed-off-by: Anders Grafström <grfstrm@users.sourceforge.net>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 fs/jffs2/file.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index 5edc2bf20581..23c947539864 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -99,7 +99,7 @@ static int jffs2_do_readpage_nolock (struct inode *inode, struct page *pg)
 	kunmap(pg);
 
 	D2(printk(KERN_DEBUG "readpage finished\n"));
-	return 0;
+	return ret;
 }
 
 int jffs2_do_readpage_unlock(struct inode *inode, struct page *pg)
-- 
cgit v1.2.3-70-g09d2


From 54e346215e4fe2ca8c94c54e546cc61902060510 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 7 Aug 2009 14:38:25 -0300
Subject: vfs: fix inode_init_always calling convention

Currently inode_init_always calls into ->destroy_inode if the additional
initialization fails.  That's not only counter-intuitive because
inode_init_always did not allocate the inode structure, but in case of
XFS it's actively harmful as ->destroy_inode might delete the inode from
a radix-tree that has never been added.  This in turn might end up
deleting the inode for the same inum that has been instanciated by
another process and cause lots of cause subtile problems.

Also in the case of re-initializing a reclaimable inode in XFS it would
free an inode we still want to keep alive.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Eric Sandeen <sandeen@sandeen.net>
---
 fs/inode.c         | 30 +++++++++++++++++-------------
 fs/xfs/xfs_iget.c  | 17 +++++------------
 include/linux/fs.h |  2 +-
 3 files changed, 23 insertions(+), 26 deletions(-)

(limited to 'fs')

diff --git a/fs/inode.c b/fs/inode.c
index 901bad1e5f12..af2c05235cc8 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -120,12 +120,11 @@ static void wake_up_inode(struct inode *inode)
  * These are initializations that need to be done on every inode
  * allocation as the fields are not initialised by slab allocation.
  */
-struct inode *inode_init_always(struct super_block *sb, struct inode *inode)
+int inode_init_always(struct super_block *sb, struct inode *inode)
 {
 	static const struct address_space_operations empty_aops;
 	static struct inode_operations empty_iops;
 	static const struct file_operations empty_fops;
-
 	struct address_space *const mapping = &inode->i_data;
 
 	inode->i_sb = sb;
@@ -152,7 +151,7 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode)
 	inode->dirtied_when = 0;
 
 	if (security_inode_alloc(inode))
-		goto out_free_inode;
+		goto out;
 
 	/* allocate and initialize an i_integrity */
 	if (ima_inode_alloc(inode))
@@ -198,16 +197,12 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode)
 	inode->i_fsnotify_mask = 0;
 #endif
 
-	return inode;
+	return 0;
 
 out_free_security:
 	security_inode_free(inode);
-out_free_inode:
-	if (inode->i_sb->s_op->destroy_inode)
-		inode->i_sb->s_op->destroy_inode(inode);
-	else
-		kmem_cache_free(inode_cachep, (inode));
-	return NULL;
+out:
+	return -ENOMEM;
 }
 EXPORT_SYMBOL(inode_init_always);
 
@@ -220,9 +215,18 @@ static struct inode *alloc_inode(struct super_block *sb)
 	else
 		inode = kmem_cache_alloc(inode_cachep, GFP_KERNEL);
 
-	if (inode)
-		return inode_init_always(sb, inode);
-	return NULL;
+	if (!inode)
+		return NULL;
+
+	if (unlikely(inode_init_always(sb, inode))) {
+		if (inode->i_sb->s_op->destroy_inode)
+			inode->i_sb->s_op->destroy_inode(inode);
+		else
+			kmem_cache_free(inode_cachep, inode);
+		return NULL;
+	}
+
+	return inode;
 }
 
 void destroy_inode(struct inode *inode)
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 5fcec6f020a7..719c85b155f4 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -64,6 +64,10 @@ xfs_inode_alloc(
 	ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP);
 	if (!ip)
 		return NULL;
+	if (inode_init_always(mp->m_super, VFS_I(ip))) {
+		kmem_zone_free(xfs_inode_zone, ip);
+		return NULL;
+	}
 
 	ASSERT(atomic_read(&ip->i_iocount) == 0);
 	ASSERT(atomic_read(&ip->i_pincount) == 0);
@@ -105,17 +109,6 @@ xfs_inode_alloc(
 #ifdef XFS_DIR2_TRACE
 	ip->i_dir_trace = ktrace_alloc(XFS_DIR2_KTRACE_SIZE, KM_NOFS);
 #endif
-	/*
-	* Now initialise the VFS inode. We do this after the xfs_inode
-	* initialisation as internal failures will result in ->destroy_inode
-	* being called and that will pass down through the reclaim path and
-	* free the XFS inode. This path requires the XFS inode to already be
-	* initialised. Hence if this call fails, the xfs_inode has already
-	* been freed and we should not reference it at all in the error
-	* handling.
-	*/
-	if (!inode_init_always(mp->m_super, VFS_I(ip)))
-		return NULL;
 
 	/* prevent anyone from using this yet */
 	VFS_I(ip)->i_state = I_NEW|I_LOCK;
@@ -167,7 +160,7 @@ xfs_iget_cache_hit(
 		 * errors cleanly, then tag it so it can be set up correctly
 		 * later.
 		 */
-		if (!inode_init_always(mp->m_super, VFS_I(ip))) {
+		if (inode_init_always(mp->m_super, VFS_I(ip))) {
 			error = ENOMEM;
 			goto out_error;
 		}
diff --git a/include/linux/fs.h b/include/linux/fs.h
index a36ffa5a77a4..0c3b5e58a986 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2137,7 +2137,7 @@ extern loff_t default_llseek(struct file *file, loff_t offset, int origin);
 
 extern loff_t vfs_llseek(struct file *file, loff_t offset, int origin);
 
-extern struct inode * inode_init_always(struct super_block *, struct inode *);
+extern int inode_init_always(struct super_block *, struct inode *);
 extern void inode_init_once(struct inode *);
 extern void inode_add_to_lists(struct super_block *, struct inode *);
 extern void iput(struct inode *);
-- 
cgit v1.2.3-70-g09d2


From 2e00c97e2c1d2ffc9e26252ca26b237678b0b772 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 7 Aug 2009 14:38:29 -0300
Subject: vfs: add __destroy_inode

When we want to tear down an inode that lost the add to the cache race
in XFS we must not call into ->destroy_inode because that would delete
the inode that won the race from the inode cache radix tree.

This patch provides the __destroy_inode helper needed to fix this,
the actual fix will be in th next patch.  As XFS was the only reason
destroy_inode was exported we shift the export to the new __destroy_inode.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Eric Sandeen <sandeen@sandeen.net>
---
 fs/inode.c         | 10 +++++++---
 include/linux/fs.h |  1 +
 2 files changed, 8 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/inode.c b/fs/inode.c
index af2c05235cc8..ae7b67e48661 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -229,7 +229,7 @@ static struct inode *alloc_inode(struct super_block *sb)
 	return inode;
 }
 
-void destroy_inode(struct inode *inode)
+void __destroy_inode(struct inode *inode)
 {
 	BUG_ON(inode_has_buffers(inode));
 	ima_inode_free(inode);
@@ -241,13 +241,17 @@ void destroy_inode(struct inode *inode)
 	if (inode->i_default_acl && inode->i_default_acl != ACL_NOT_CACHED)
 		posix_acl_release(inode->i_default_acl);
 #endif
+}
+EXPORT_SYMBOL(__destroy_inode);
+
+void destroy_inode(struct inode *inode)
+{
+	__destroy_inode(inode);
 	if (inode->i_sb->s_op->destroy_inode)
 		inode->i_sb->s_op->destroy_inode(inode);
 	else
 		kmem_cache_free(inode_cachep, (inode));
 }
-EXPORT_SYMBOL(destroy_inode);
-
 
 /*
  * These are initializations that only need to be done
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 0c3b5e58a986..67888a9e0655 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2164,6 +2164,7 @@ extern void __iget(struct inode * inode);
 extern void iget_failed(struct inode *);
 extern void clear_inode(struct inode *);
 extern void destroy_inode(struct inode *);
+extern void __destroy_inode(struct inode *);
 extern struct inode *new_inode(struct super_block *);
 extern int should_remove_suid(struct dentry *);
 extern int file_remove_suid(struct file *);
-- 
cgit v1.2.3-70-g09d2


From b36ec0428a06fcbdb67d61e9e664154e5dd9a8c7 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 7 Aug 2009 14:38:34 -0300
Subject: xfs: fix freeing of inodes not yet added to the inode cache

When freeing an inode that lost race getting added to the inode cache we
must not call into ->destroy_inode, because that would delete the inode
that won the race from the inode cache radix tree.

This patch uses splits a new xfs_inode_free helper out of xfs_ireclaim
and uses that plus __destroy_inode to make sure we really only free
the memory allocted for the inode that lost the race, and not mess with
the inode cache state.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Eric Sandeen <sandeen@sandeen.net>
Reported-by: Alex Samad <alex@samad.com.au>
Reported-by: Andrew Randrianasulu <randrik@mail.ru>
Reported-by: Stephane <sharnois@max-t.com>
Reported-by: Tommy <tommy@news-service.com>
Reported-by: Miah Gregory <mace@darksilence.net>
Reported-by: Gabriel Barazer <gabriel@oxeva.fr>
Reported-by: Leandro Lucarella <llucax@gmail.com>
Reported-by: Daniel Burr <dburr@fami.com.au>
Reported-by: Nickolay <newmail@spaces.ru>
Reported-by: Michael Guntsche <mike@it-loops.com>
Reported-by: Dan Carley <dan.carley+linuxkern-bugs@gmail.com>
Reported-by: Michael Ole Olsen <gnu@gmx.net>
Reported-by: Michael Weissenbacher <mw@dermichi.com>
Reported-by: Martin Spott <Martin.Spott@mgras.net>
Reported-by: Christian Kujau <lists@nerdbynature.de>
Tested-by: Michael Guntsche <mike@it-loops.com>
Tested-by: Dan Carley <dan.carley+linuxkern-bugs@gmail.com>
Tested-by: Christian Kujau <lists@nerdbynature.de>
---
 fs/xfs/xfs_iget.c  | 125 +++++++++++++++++++++++++++++------------------------
 fs/xfs/xfs_inode.h |  17 --------
 2 files changed, 68 insertions(+), 74 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 719c85b155f4..34ec86923f7e 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -116,6 +116,71 @@ xfs_inode_alloc(
 	return ip;
 }
 
+STATIC void
+xfs_inode_free(
+	struct xfs_inode	*ip)
+{
+	switch (ip->i_d.di_mode & S_IFMT) {
+	case S_IFREG:
+	case S_IFDIR:
+	case S_IFLNK:
+		xfs_idestroy_fork(ip, XFS_DATA_FORK);
+		break;
+	}
+
+	if (ip->i_afp)
+		xfs_idestroy_fork(ip, XFS_ATTR_FORK);
+
+#ifdef XFS_INODE_TRACE
+	ktrace_free(ip->i_trace);
+#endif
+#ifdef XFS_BMAP_TRACE
+	ktrace_free(ip->i_xtrace);
+#endif
+#ifdef XFS_BTREE_TRACE
+	ktrace_free(ip->i_btrace);
+#endif
+#ifdef XFS_RW_TRACE
+	ktrace_free(ip->i_rwtrace);
+#endif
+#ifdef XFS_ILOCK_TRACE
+	ktrace_free(ip->i_lock_trace);
+#endif
+#ifdef XFS_DIR2_TRACE
+	ktrace_free(ip->i_dir_trace);
+#endif
+
+	if (ip->i_itemp) {
+		/*
+		 * Only if we are shutting down the fs will we see an
+		 * inode still in the AIL. If it is there, we should remove
+		 * it to prevent a use-after-free from occurring.
+		 */
+		xfs_log_item_t	*lip = &ip->i_itemp->ili_item;
+		struct xfs_ail	*ailp = lip->li_ailp;
+
+		ASSERT(((lip->li_flags & XFS_LI_IN_AIL) == 0) ||
+				       XFS_FORCED_SHUTDOWN(ip->i_mount));
+		if (lip->li_flags & XFS_LI_IN_AIL) {
+			spin_lock(&ailp->xa_lock);
+			if (lip->li_flags & XFS_LI_IN_AIL)
+				xfs_trans_ail_delete(ailp, lip);
+			else
+				spin_unlock(&ailp->xa_lock);
+		}
+		xfs_inode_item_destroy(ip);
+		ip->i_itemp = NULL;
+	}
+
+	/* asserts to verify all state is correct here */
+	ASSERT(atomic_read(&ip->i_iocount) == 0);
+	ASSERT(atomic_read(&ip->i_pincount) == 0);
+	ASSERT(!spin_is_locked(&ip->i_flags_lock));
+	ASSERT(completion_done(&ip->i_flush));
+
+	kmem_zone_free(xfs_inode_zone, ip);
+}
+
 /*
  * Check the validity of the inode we just found it the cache
  */
@@ -292,7 +357,8 @@ out_preload_end:
 	if (lock_flags)
 		xfs_iunlock(ip, lock_flags);
 out_destroy:
-	xfs_destroy_inode(ip);
+	__destroy_inode(VFS_I(ip));
+	xfs_inode_free(ip);
 	return error;
 }
 
@@ -497,62 +563,7 @@ xfs_ireclaim(
 	xfs_qm_dqdetach(ip);
 	xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
 
-	switch (ip->i_d.di_mode & S_IFMT) {
-	case S_IFREG:
-	case S_IFDIR:
-	case S_IFLNK:
-		xfs_idestroy_fork(ip, XFS_DATA_FORK);
-		break;
-	}
-
-	if (ip->i_afp)
-		xfs_idestroy_fork(ip, XFS_ATTR_FORK);
-
-#ifdef XFS_INODE_TRACE
-	ktrace_free(ip->i_trace);
-#endif
-#ifdef XFS_BMAP_TRACE
-	ktrace_free(ip->i_xtrace);
-#endif
-#ifdef XFS_BTREE_TRACE
-	ktrace_free(ip->i_btrace);
-#endif
-#ifdef XFS_RW_TRACE
-	ktrace_free(ip->i_rwtrace);
-#endif
-#ifdef XFS_ILOCK_TRACE
-	ktrace_free(ip->i_lock_trace);
-#endif
-#ifdef XFS_DIR2_TRACE
-	ktrace_free(ip->i_dir_trace);
-#endif
-	if (ip->i_itemp) {
-		/*
-		 * Only if we are shutting down the fs will we see an
-		 * inode still in the AIL. If it is there, we should remove
-		 * it to prevent a use-after-free from occurring.
-		 */
-		xfs_log_item_t	*lip = &ip->i_itemp->ili_item;
-		struct xfs_ail	*ailp = lip->li_ailp;
-
-		ASSERT(((lip->li_flags & XFS_LI_IN_AIL) == 0) ||
-				       XFS_FORCED_SHUTDOWN(ip->i_mount));
-		if (lip->li_flags & XFS_LI_IN_AIL) {
-			spin_lock(&ailp->xa_lock);
-			if (lip->li_flags & XFS_LI_IN_AIL)
-				xfs_trans_ail_delete(ailp, lip);
-			else
-				spin_unlock(&ailp->xa_lock);
-		}
-		xfs_inode_item_destroy(ip);
-		ip->i_itemp = NULL;
-	}
-	/* asserts to verify all state is correct here */
-	ASSERT(atomic_read(&ip->i_iocount) == 0);
-	ASSERT(atomic_read(&ip->i_pincount) == 0);
-	ASSERT(!spin_is_locked(&ip->i_flags_lock));
-	ASSERT(completion_done(&ip->i_flush));
-	kmem_zone_free(xfs_inode_zone, ip);
+	xfs_inode_free(ip);
 }
 
 /*
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 1804f866a71d..65f24a3cc992 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -309,23 +309,6 @@ static inline struct inode *VFS_I(struct xfs_inode *ip)
 	return &ip->i_vnode;
 }
 
-/*
- * Get rid of a partially initialized inode.
- *
- * We have to go through destroy_inode to make sure allocations
- * from init_inode_always like the security data are undone.
- *
- * We mark the inode bad so that it takes the short cut in
- * the reclaim path instead of going through the flush path
- * which doesn't make sense for an inode that has never seen the
- * light of day.
- */
-static inline void xfs_destroy_inode(struct xfs_inode *ip)
-{
-	make_bad_inode(VFS_I(ip));
-	return destroy_inode(VFS_I(ip));
-}
-
 /*
  * i_flags helper functions
  */
-- 
cgit v1.2.3-70-g09d2


From 69130c7cf96ea853dc5be599dd6a4b98907d39cc Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Thu, 6 Aug 2009 15:07:37 -0700
Subject: compat_ioctl: hook up compat handler for FIEMAP ioctl

The FIEMAP_IOC_FIEMAP mapping ioctl was missing a 32-bit compat handler,
which means that 32-bit suerspace on 64-bit kernels cannot use this ioctl
command.

The structure is nicely aligned, padded, and sized, so it is just this
simple.

Tested w/ 32-bit ioctl tester (from Josef) on a 64-bit kernel on ext4.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Cc: <linux-ext4@vger.kernel.org>
Cc: Mark Lord <lkml@rtr.ca>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Josef Bacik <josef@redhat.com>
Cc: Jan Kara <jack@suse.cz>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/compat_ioctl.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index f28f070a60fc..f91fd51b32e3 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -1905,6 +1905,7 @@ COMPATIBLE_IOCTL(FIONCLEX)
 COMPATIBLE_IOCTL(FIOASYNC)
 COMPATIBLE_IOCTL(FIONBIO)
 COMPATIBLE_IOCTL(FIONREAD)  /* This is also TIOCINQ */
+COMPATIBLE_IOCTL(FS_IOC_FIEMAP)
 /* 0x00 */
 COMPATIBLE_IOCTL(FIBMAP)
 COMPATIBLE_IOCTL(FIGETBSZ)
-- 
cgit v1.2.3-70-g09d2


From 2d8dd38a5aa0cc2490bbad9b75e77fa154e1d145 Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Thu, 6 Aug 2009 15:07:39 -0700
Subject: vfs: mnt_want_write_file(): fix special file handling

I suspect that mnt_want_write_file() may have wrong assumption.  I think
mnt_want_write_file() is assuming it increments ->mnt_writers if
(file->f_mode & FMODE_WRITE).  But, if it's special_file(), it is false?

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Acked-by: Dave Hansen <dave@linux.vnet.ibm.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/namespace.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/namespace.c b/fs/namespace.c
index 277c28a63ead..7230787d18b0 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -316,7 +316,8 @@ EXPORT_SYMBOL_GPL(mnt_clone_write);
  */
 int mnt_want_write_file(struct file *file)
 {
-	if (!(file->f_mode & FMODE_WRITE))
+	struct inode *inode = file->f_dentry->d_inode;
+	if (!(file->f_mode & FMODE_WRITE) || special_file(inode->i_mode))
 		return mnt_want_write(file->f_path.mnt);
 	else
 		return mnt_clone_write(file->f_path.mnt);
-- 
cgit v1.2.3-70-g09d2


From 3440625d78711bee41a84cf29c3d8c579b522666 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Thu, 6 Aug 2009 15:09:34 -0700
Subject: flat: fix uninitialized ptr with shared libs

The new credentials code broke load_flat_shared_library() as it now uses
an uninitialized cred pointer.

Reported-by: Bernd Schmidt <bernds_cb1@t-online.de>
Tested-by: Bernd Schmidt <bernds_cb1@t-online.de>
Cc: Mike Frysinger <vapier@gentoo.org>
Cc: David Howells <dhowells@redhat.com>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/binfmt_flat.c | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 697f6b5f1313..e92f229e3c6e 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -828,15 +828,22 @@ static int load_flat_shared_library(int id, struct lib_info *libs)
 	if (IS_ERR(bprm.file))
 		return res;
 
+	bprm.cred = prepare_exec_creds();
+	res = -ENOMEM;
+	if (!bprm.cred)
+		goto out;
+
 	res = prepare_binprm(&bprm);
 
 	if (res <= (unsigned long)-4096)
 		res = load_flat_file(&bprm, libs, id, NULL);
-	if (bprm.file) {
-		allow_write_access(bprm.file);
-		fput(bprm.file);
-		bprm.file = NULL;
-	}
+
+	abort_creds(bprm.cred);
+
+out:
+	allow_write_access(bprm.file);
+	fput(bprm.file);
+
 	return(res);
 }
 
-- 
cgit v1.2.3-70-g09d2


From 4baf8c9201e88546918cbfa61ea8062c38bc1644 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Fri, 7 Aug 2009 13:47:08 -0400
Subject: Btrfs: remove superfluous NULL pointer check in btrfs_rename()

This takes care of the following entry from Dan's list:

fs/btrfs/inode.c +4788 btrfs_rename(36) warning: variable derefenced before check 'old_inode'

Reported-by: Dan Carpenter <error27@gmail.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Eugene Teo <eteo@redhat.com>
Cc: Julia Lawall <julia@diku.dk>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3ea827ddf0fe..04b53b5ebe59 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4806,8 +4806,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	 * and the replacement file is large.  Start IO on it now so
 	 * we don't add too much work to the end of the transaction
 	 */
-	if (new_inode && old_inode && S_ISREG(old_inode->i_mode) &&
-	    new_inode->i_size &&
+	if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size &&
 	    old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
 		filemap_flush(old_inode->i_mapping);
 
-- 
cgit v1.2.3-70-g09d2


From 60f2e8f8a07331097a57ec4abcdc680405579377 Mon Sep 17 00:00:00 2001
From: Julia Lawall <julia@diku.dk>
Date: Fri, 7 Aug 2009 13:51:33 -0400
Subject: Btrfs: correct error-handling zlib error handling

find_zlib_workspace returns an ERR_PTR value in an error case instead of NULL.

A simplified version of the semantic match that finds this problem is as
follows: (http://coccinelle.lip6.fr/)

// <smpl>
@match exists@
expression x, E;
statement S1, S2;
@@

x = find_zlib_workspace(...)
... when != x = E
(
*  if (x == NULL || ...) S1 else S2
|
*  if (x == NULL && ...) S1 else S2
)
// </smpl>

Signed-off-by: Julia Lawall <julia@diku.dk>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/zlib.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index ecfbce836d32..3e2b90eaa239 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -208,7 +208,7 @@ int btrfs_zlib_compress_pages(struct address_space *mapping,
 	*total_in = 0;
 
 	workspace = find_zlib_workspace();
-	if (!workspace)
+	if (IS_ERR(workspace))
 		return -1;
 
 	if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) {
@@ -366,7 +366,7 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in,
 	char *kaddr;
 
 	workspace = find_zlib_workspace();
-	if (!workspace)
+	if (IS_ERR(workspace))
 		return -ENOMEM;
 
 	data_in = kmap(pages_in[page_in_index]);
@@ -547,7 +547,7 @@ int btrfs_zlib_decompress(unsigned char *data_in,
 		return -ENOMEM;
 
 	workspace = find_zlib_workspace();
-	if (!workspace)
+	if (IS_ERR(workspace))
 		return -ENOMEM;
 
 	workspace->inf_strm.next_in = data_in;
-- 
cgit v1.2.3-70-g09d2


From ceab36edd3d3ad3ffd01d41d6d1e05ac1ff8357e Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Fri, 7 Aug 2009 13:51:33 -0400
Subject: Btrfs: fix balancing oops when invalidate_inode_pages2 returns EBUSY

invalidate_inode_pages2_range may return -EBUSY occasionally
which results Oops. This patch fixes the issue by moving
invalidate_inode_pages2_range into a loop and keeping calling
it until the return value is not -EBUSY.

The EBUSY return is temporary, and can happen when the btrfs release page
function is unable to release a page because the EXTENT_LOCK
bit is set.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/relocation.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index e71264d1c2c9..c04f7f212602 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2553,8 +2553,13 @@ int relocate_inode_pages(struct inode *inode, u64 start, u64 len)
 	last_index = (start + len - 1) >> PAGE_CACHE_SHIFT;
 
 	/* make sure the dirty trick played by the caller work */
-	ret = invalidate_inode_pages2_range(inode->i_mapping,
-					    first_index, last_index);
+	while (1) {
+		ret = invalidate_inode_pages2_range(inode->i_mapping,
+						    first_index, last_index);
+		if (ret != -EBUSY)
+			break;
+		schedule_timeout(HZ/10);
+	}
 	if (ret)
 		goto out_unlock;
 
-- 
cgit v1.2.3-70-g09d2


From e7432675f8ca868a4af365759a8d4c3779a3d922 Mon Sep 17 00:00:00 2001
From: Sunil Mushran <sunil.mushran@oracle.com>
Date: Thu, 6 Aug 2009 16:12:58 -0700
Subject: ocfs2: Initialize the cluster we're writing to in a non-sparse extend

In a non-sparse extend, we correctly allocate (and zero) the clusters between
the old_i_size and pos, but we don't zero the portions of the cluster we're
writing to outside of pos<->len.

It handles clustersize > pagesize and blocksize < pagesize.

[Cleaned up by Joel Becker.]

Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/aops.c | 66 ++++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 47 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index e511df101451..b401654011a2 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -895,18 +895,17 @@ struct ocfs2_write_cluster_desc {
 	 */
 	unsigned	c_new;
 	unsigned	c_unwritten;
+	unsigned	c_needs_zero;
 };
 
-static inline int ocfs2_should_zero_cluster(struct ocfs2_write_cluster_desc *d)
-{
-	return d->c_new || d->c_unwritten;
-}
-
 struct ocfs2_write_ctxt {
 	/* Logical cluster position / len of write */
 	u32				w_cpos;
 	u32				w_clen;
 
+	/* First cluster allocated in a nonsparse extend */
+	u32				w_first_new_cpos;
+
 	struct ocfs2_write_cluster_desc	w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE];
 
 	/*
@@ -984,6 +983,7 @@ static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
 		return -ENOMEM;
 
 	wc->w_cpos = pos >> osb->s_clustersize_bits;
+	wc->w_first_new_cpos = UINT_MAX;
 	cend = (pos + len - 1) >> osb->s_clustersize_bits;
 	wc->w_clen = cend - wc->w_cpos + 1;
 	get_bh(di_bh);
@@ -1218,20 +1218,18 @@ out:
  */
 static int ocfs2_write_cluster(struct address_space *mapping,
 			       u32 phys, unsigned int unwritten,
+			       unsigned int should_zero,
 			       struct ocfs2_alloc_context *data_ac,
 			       struct ocfs2_alloc_context *meta_ac,
 			       struct ocfs2_write_ctxt *wc, u32 cpos,
 			       loff_t user_pos, unsigned user_len)
 {
-	int ret, i, new, should_zero = 0;
+	int ret, i, new;
 	u64 v_blkno, p_blkno;
 	struct inode *inode = mapping->host;
 	struct ocfs2_extent_tree et;
 
 	new = phys == 0 ? 1 : 0;
-	if (new || unwritten)
-		should_zero = 1;
-
 	if (new) {
 		u32 tmp_pos;
 
@@ -1342,7 +1340,9 @@ static int ocfs2_write_cluster_by_desc(struct address_space *mapping,
 			local_len = osb->s_clustersize - cluster_off;
 
 		ret = ocfs2_write_cluster(mapping, desc->c_phys,
-					  desc->c_unwritten, data_ac, meta_ac,
+					  desc->c_unwritten,
+					  desc->c_needs_zero,
+					  data_ac, meta_ac,
 					  wc, desc->c_cpos, pos, local_len);
 		if (ret) {
 			mlog_errno(ret);
@@ -1392,14 +1392,14 @@ static void ocfs2_set_target_boundaries(struct ocfs2_super *osb,
 		 * newly allocated cluster.
 		 */
 		desc = &wc->w_desc[0];
-		if (ocfs2_should_zero_cluster(desc))
+		if (desc->c_needs_zero)
 			ocfs2_figure_cluster_boundaries(osb,
 							desc->c_cpos,
 							&wc->w_target_from,
 							NULL);
 
 		desc = &wc->w_desc[wc->w_clen - 1];
-		if (ocfs2_should_zero_cluster(desc))
+		if (desc->c_needs_zero)
 			ocfs2_figure_cluster_boundaries(osb,
 							desc->c_cpos,
 							NULL,
@@ -1467,13 +1467,28 @@ static int ocfs2_populate_write_desc(struct inode *inode,
 			phys++;
 		}
 
+		/*
+		 * If w_first_new_cpos is < UINT_MAX, we have a non-sparse
+		 * file that got extended.  w_first_new_cpos tells us
+		 * where the newly allocated clusters are so we can
+		 * zero them.
+		 */
+		if (desc->c_cpos >= wc->w_first_new_cpos) {
+			BUG_ON(phys == 0);
+			desc->c_needs_zero = 1;
+		}
+
 		desc->c_phys = phys;
 		if (phys == 0) {
 			desc->c_new = 1;
+			desc->c_needs_zero = 1;
 			*clusters_to_alloc = *clusters_to_alloc + 1;
 		}
-		if (ext_flags & OCFS2_EXT_UNWRITTEN)
+
+		if (ext_flags & OCFS2_EXT_UNWRITTEN) {
 			desc->c_unwritten = 1;
+			desc->c_needs_zero = 1;
+		}
 
 		num_clusters--;
 	}
@@ -1633,10 +1648,13 @@ static int ocfs2_expand_nonsparse_inode(struct inode *inode, loff_t pos,
 	if (newsize <= i_size_read(inode))
 		return 0;
 
-	ret = ocfs2_extend_no_holes(inode, newsize, newsize - len);
+	ret = ocfs2_extend_no_holes(inode, newsize, pos);
 	if (ret)
 		mlog_errno(ret);
 
+	wc->w_first_new_cpos =
+		ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode));
+
 	return ret;
 }
 
@@ -1645,7 +1663,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
 			     struct page **pagep, void **fsdata,
 			     struct buffer_head *di_bh, struct page *mmap_page)
 {
-	int ret, credits = OCFS2_INODE_UPDATE_CREDITS;
+	int ret, cluster_of_pages, credits = OCFS2_INODE_UPDATE_CREDITS;
 	unsigned int clusters_to_alloc, extents_to_split;
 	struct ocfs2_write_ctxt *wc;
 	struct inode *inode = mapping->host;
@@ -1723,8 +1741,19 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
 
 	}
 
-	ocfs2_set_target_boundaries(osb, wc, pos, len,
-				    clusters_to_alloc + extents_to_split);
+	/*
+	 * We have to zero sparse allocated clusters, unwritten extent clusters,
+	 * and non-sparse clusters we just extended.  For non-sparse writes,
+	 * we know zeros will only be needed in the first and/or last cluster.
+	 */
+	if (clusters_to_alloc || extents_to_split ||
+	    wc->w_desc[0].c_needs_zero ||
+	    wc->w_desc[wc->w_clen - 1].c_needs_zero)
+		cluster_of_pages = 1;
+	else
+		cluster_of_pages = 0;
+
+	ocfs2_set_target_boundaries(osb, wc, pos, len, cluster_of_pages);
 
 	handle = ocfs2_start_trans(osb, credits);
 	if (IS_ERR(handle)) {
@@ -1757,8 +1786,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
 	 * extent.
 	 */
 	ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos,
-					 clusters_to_alloc + extents_to_split,
-					 mmap_page);
+					 cluster_of_pages, mmap_page);
 	if (ret) {
 		mlog_errno(ret);
 		goto out_quota;
-- 
cgit v1.2.3-70-g09d2


From 8a57a9dda760ea7845390f1cd36f3eb2a49391d8 Mon Sep 17 00:00:00 2001
From: Roel Kluin <roel.kluin@gmail.com>
Date: Thu, 6 Aug 2009 16:07:50 -0700
Subject: ocfs2: keep index within status_map[]

Do not exceed array status_map[]

Signed-off-by: Roel Kluin <roel.kluin@gmail.com>
Cc: Mark Fasheh <mfasheh@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/stack_o2cb.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
index 3f661376a2de..e49c41050264 100644
--- a/fs/ocfs2/stack_o2cb.c
+++ b/fs/ocfs2/stack_o2cb.c
@@ -17,6 +17,7 @@
  * General Public License for more details.
  */
 
+#include <linux/kernel.h>
 #include <linux/crc32.h>
 #include <linux/module.h>
 
@@ -153,7 +154,7 @@ static int status_map[] = {
 
 static int dlm_status_to_errno(enum dlm_status status)
 {
-	BUG_ON(status > (sizeof(status_map) / sizeof(status_map[0])));
+	BUG_ON(status < 0 || status >= ARRAY_SIZE(status_map));
 
 	return status_map[status];
 }
-- 
cgit v1.2.3-70-g09d2


From dd8ac1da4190139de70da18823ff8f5992a649ae Mon Sep 17 00:00:00 2001
From: Roel Kluin <roel.kluin@gmail.com>
Date: Sun, 9 Aug 2009 15:06:19 -0400
Subject: nfs: Keep index within mnt_errtbl[]

Ensure that index i remains within array mnt_errtbl[] and mnt3_errtbl[].

Signed-off-by: Roel Kluin <roel.kluin@gmail.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/mount_clnt.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index 38ef9eaec407..8b9affc8bab2 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -258,7 +258,7 @@ static int decode_status(struct xdr_stream *xdr, struct mountres *res)
 		return -EIO;
 	status = ntohl(*p);
 
-	for (i = 0; i <= ARRAY_SIZE(mnt_errtbl); i++) {
+	for (i = 0; i < ARRAY_SIZE(mnt_errtbl); i++) {
 		if (mnt_errtbl[i].status == status) {
 			res->errno = mnt_errtbl[i].errno;
 			return 0;
@@ -309,7 +309,7 @@ static int decode_fhs_status(struct xdr_stream *xdr, struct mountres *res)
 		return -EIO;
 	status = ntohl(*p);
 
-	for (i = 0; i <= ARRAY_SIZE(mnt3_errtbl); i++) {
+	for (i = 0; i < ARRAY_SIZE(mnt3_errtbl); i++) {
 		if (mnt3_errtbl[i].status == status) {
 			res->errno = mnt3_errtbl[i].errno;
 			return 0;
-- 
cgit v1.2.3-70-g09d2


From a78cb57a106fceeba26da2907db9d8886700e1dc Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sun, 9 Aug 2009 15:06:19 -0400
Subject: NFSv4: Don't loop forever on state recovery failure...

If the server is broken, then retrying forever won't fix it. We
should just give up after a while, and return an error to the user.
We set the number of retries to 10 for now...

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 6917311f201c..d95f7f9e60c4 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -61,6 +61,8 @@
 #define NFS4_POLL_RETRY_MIN	(HZ/10)
 #define NFS4_POLL_RETRY_MAX	(15*HZ)
 
+#define NFS4_MAX_LOOP_ON_RECOVER (10)
+
 struct nfs4_opendata;
 static int _nfs4_proc_open(struct nfs4_opendata *data);
 static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
@@ -426,17 +428,19 @@ out:
 static int nfs4_recover_session(struct nfs4_session *session)
 {
 	struct nfs_client *clp = session->clp;
+	unsigned int loop;
 	int ret;
 
-	for (;;) {
+	for (loop = NFS4_MAX_LOOP_ON_RECOVER; loop != 0; loop--) {
 		ret = nfs4_wait_clnt_recover(clp);
 		if (ret != 0)
-				return ret;
+			break;
 		if (!test_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state))
 			break;
 		nfs4_schedule_state_manager(clp);
+		ret = -EIO;
 	}
-	return 0;
+	return ret;
 }
 
 static int nfs41_setup_sequence(struct nfs4_session *session,
@@ -1444,18 +1448,20 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
 static int nfs4_recover_expired_lease(struct nfs_server *server)
 {
 	struct nfs_client *clp = server->nfs_client;
+	unsigned int loop;
 	int ret;
 
-	for (;;) {
+	for (loop = NFS4_MAX_LOOP_ON_RECOVER; loop != 0; loop--) {
 		ret = nfs4_wait_clnt_recover(clp);
 		if (ret != 0)
-			return ret;
+			break;
 		if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) &&
 		    !test_bit(NFS4CLNT_CHECK_LEASE,&clp->cl_state))
 			break;
 		nfs4_schedule_state_recovery(clp);
+		ret = -EIO;
 	}
-	return 0;
+	return ret;
 }
 
 /*
-- 
cgit v1.2.3-70-g09d2


From 62ab460cf5e450e1d207a98a9c6cf2e4a6a78fd1 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sun, 9 Aug 2009 15:06:19 -0400
Subject: NFSv4: Add 'server capability' flags for NFSv4 recommended attributes

If the NFSv4 server doesn't support a POSIX attribute, the generic NFS code
needs to know that, so that it don't keep trying to poll for it.

However, by the same count, if the NFSv4 server does support that
attribute, then we should ensure that the inode metadata is appropriately
labelled as being untrusted. For instance, if we don't know the correct
value of the file's uid, we should certainly not be caching ACLs or ACCESS
results.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/client.c           |  7 ++--
 fs/nfs/inode.c            | 92 +++++++++++++++++++++++++++++++++++++++--------
 fs/nfs/nfs4proc.c         | 22 ++++++++++++
 include/linux/nfs_fs_sb.h |  9 +++++
 4 files changed, 114 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 8d25ccb2d51d..8f34fd8028fc 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -809,6 +809,9 @@ static int nfs_init_server(struct nfs_server *server,
 	/* Initialise the client representation from the mount data */
 	server->flags = data->flags;
 	server->options = data->options;
+	server->caps |= NFS_CAP_HARDLINKS|NFS_CAP_SYMLINKS|NFS_CAP_FILEID|
+		NFS_CAP_MODE|NFS_CAP_NLINK|NFS_CAP_OWNER|NFS_CAP_OWNER_GROUP|
+		NFS_CAP_ATIME|NFS_CAP_CTIME|NFS_CAP_MTIME;
 
 	if (data->rsize)
 		server->rsize = nfs_block_size(data->rsize, NULL);
@@ -1274,7 +1277,7 @@ static int nfs4_init_server(struct nfs_server *server,
 
 	/* Initialise the client representation from the mount data */
 	server->flags = data->flags;
-	server->caps |= NFS_CAP_ATOMIC_OPEN;
+	server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR;
 	server->options = data->options;
 
 	/* Get a client record */
@@ -1400,7 +1403,7 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
 
 	/* Initialise the client representation from the parent server */
 	nfs_server_copy_userdata(server, parent_server);
-	server->caps |= NFS_CAP_ATOMIC_OPEN;
+	server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR;
 
 	/* Get a client representation.
 	 * Note: NFSv4 always uses TCP, */
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index bd7938eda6a8..fe5a8b45d867 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -286,6 +286,11 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
 		/* We can't support update_atime(), since the server will reset it */
 		inode->i_flags |= S_NOATIME|S_NOCMTIME;
 		inode->i_mode = fattr->mode;
+		if ((fattr->valid & NFS_ATTR_FATTR_MODE) == 0
+				&& nfs_server_capable(inode, NFS_CAP_MODE))
+			nfsi->cache_validity |= NFS_INO_INVALID_ATTR
+				| NFS_INO_INVALID_ACCESS
+				| NFS_INO_INVALID_ACL;
 		/* Why so? Because we want revalidate for devices/FIFOs, and
 		 * that's precisely what we have in nfs_file_inode_operations.
 		 */
@@ -330,20 +335,46 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
 		nfsi->attr_gencount = fattr->gencount;
 		if (fattr->valid & NFS_ATTR_FATTR_ATIME)
 			inode->i_atime = fattr->atime;
+		else if (nfs_server_capable(inode, NFS_CAP_ATIME))
+			nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
 		if (fattr->valid & NFS_ATTR_FATTR_MTIME)
 			inode->i_mtime = fattr->mtime;
+		else if (nfs_server_capable(inode, NFS_CAP_MTIME))
+			nfsi->cache_validity |= NFS_INO_INVALID_ATTR
+				| NFS_INO_INVALID_DATA;
 		if (fattr->valid & NFS_ATTR_FATTR_CTIME)
 			inode->i_ctime = fattr->ctime;
+		else if (nfs_server_capable(inode, NFS_CAP_CTIME))
+			nfsi->cache_validity |= NFS_INO_INVALID_ATTR
+				| NFS_INO_INVALID_ACCESS
+				| NFS_INO_INVALID_ACL;
 		if (fattr->valid & NFS_ATTR_FATTR_CHANGE)
 			nfsi->change_attr = fattr->change_attr;
+		else if (nfs_server_capable(inode, NFS_CAP_CHANGE_ATTR))
+			nfsi->cache_validity |= NFS_INO_INVALID_ATTR
+				| NFS_INO_INVALID_DATA;
 		if (fattr->valid & NFS_ATTR_FATTR_SIZE)
 			inode->i_size = nfs_size_to_loff_t(fattr->size);
+		else
+			nfsi->cache_validity |= NFS_INO_INVALID_ATTR
+				| NFS_INO_INVALID_DATA
+				| NFS_INO_REVAL_PAGECACHE;
 		if (fattr->valid & NFS_ATTR_FATTR_NLINK)
 			inode->i_nlink = fattr->nlink;
+		else if (nfs_server_capable(inode, NFS_CAP_NLINK))
+			nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
 		if (fattr->valid & NFS_ATTR_FATTR_OWNER)
 			inode->i_uid = fattr->uid;
+		else if (nfs_server_capable(inode, NFS_CAP_OWNER))
+			nfsi->cache_validity |= NFS_INO_INVALID_ATTR
+				| NFS_INO_INVALID_ACCESS
+				| NFS_INO_INVALID_ACL;
 		if (fattr->valid & NFS_ATTR_FATTR_GROUP)
 			inode->i_gid = fattr->gid;
+		else if (nfs_server_capable(inode, NFS_CAP_OWNER_GROUP))
+			nfsi->cache_validity |= NFS_INO_INVALID_ATTR
+				| NFS_INO_INVALID_ACCESS
+				| NFS_INO_INVALID_ACL;
 		if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED)
 			inode->i_blocks = fattr->du.nfs2.blocks;
 		if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) {
@@ -1145,6 +1176,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 	loff_t cur_isize, new_isize;
 	unsigned long invalid = 0;
 	unsigned long now = jiffies;
+	unsigned long save_cache_validity;
 
 	dfprintk(VFS, "NFS: %s(%s/%ld ct=%d info=0x%x)\n",
 			__func__, inode->i_sb->s_id, inode->i_ino,
@@ -1171,10 +1203,11 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 	 */
 	nfsi->read_cache_jiffies = fattr->time_start;
 
-	if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) || (fattr->valid & (NFS_ATTR_FATTR_MTIME|NFS_ATTR_FATTR_CTIME)))
-	    nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR
-		    | NFS_INO_INVALID_ATIME
-		    | NFS_INO_REVAL_PAGECACHE);
+	save_cache_validity = nfsi->cache_validity;
+	nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR
+			| NFS_INO_INVALID_ATIME
+			| NFS_INO_REVAL_FORCED
+			| NFS_INO_REVAL_PAGECACHE);
 
 	/* Do atomic weak cache consistency updates */
 	nfs_wcc_update_inode(inode, fattr);
@@ -1189,7 +1222,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 				nfs_force_lookup_revalidate(inode);
 			nfsi->change_attr = fattr->change_attr;
 		}
-	}
+	} else if (server->caps & NFS_CAP_CHANGE_ATTR)
+		invalid |= save_cache_validity;
 
 	if (fattr->valid & NFS_ATTR_FATTR_MTIME) {
 		/* NFSv2/v3: Check if the mtime agrees */
@@ -1201,7 +1235,12 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 				nfs_force_lookup_revalidate(inode);
 			memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
 		}
-	}
+	} else if (server->caps & NFS_CAP_MTIME)
+		invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
+				| NFS_INO_INVALID_DATA
+				| NFS_INO_REVAL_PAGECACHE
+				| NFS_INO_REVAL_FORCED);
+
 	if (fattr->valid & NFS_ATTR_FATTR_CTIME) {
 		/* If ctime has changed we should definitely clear access+acl caches */
 		if (!timespec_equal(&inode->i_ctime, &fattr->ctime)) {
@@ -1215,7 +1254,11 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 			}
 			memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
 		}
-	}
+	} else if (server->caps & NFS_CAP_CTIME)
+		invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
+				| NFS_INO_INVALID_ACCESS
+				| NFS_INO_INVALID_ACL
+				| NFS_INO_REVAL_FORCED);
 
 	/* Check if our cached file size is stale */
 	if (fattr->valid & NFS_ATTR_FATTR_SIZE) {
@@ -1231,30 +1274,50 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 			dprintk("NFS: isize change on server for file %s/%ld\n",
 					inode->i_sb->s_id, inode->i_ino);
 		}
-	}
+	} else
+		invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
+				| NFS_INO_REVAL_PAGECACHE
+				| NFS_INO_REVAL_FORCED);
 
 
 	if (fattr->valid & NFS_ATTR_FATTR_ATIME)
 		memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime));
+	else if (server->caps & NFS_CAP_ATIME)
+		invalid |= save_cache_validity & (NFS_INO_INVALID_ATIME
+				| NFS_INO_REVAL_FORCED);
 
 	if (fattr->valid & NFS_ATTR_FATTR_MODE) {
 		if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) {
 			invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
 			inode->i_mode = fattr->mode;
 		}
-	}
+	} else if (server->caps & NFS_CAP_MODE)
+		invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
+				| NFS_INO_INVALID_ACCESS
+				| NFS_INO_INVALID_ACL
+				| NFS_INO_REVAL_FORCED);
+
 	if (fattr->valid & NFS_ATTR_FATTR_OWNER) {
 		if (inode->i_uid != fattr->uid) {
 			invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
 			inode->i_uid = fattr->uid;
 		}
-	}
+	} else if (server->caps & NFS_CAP_OWNER)
+		invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
+				| NFS_INO_INVALID_ACCESS
+				| NFS_INO_INVALID_ACL
+				| NFS_INO_REVAL_FORCED);
+
 	if (fattr->valid & NFS_ATTR_FATTR_GROUP) {
 		if (inode->i_gid != fattr->gid) {
 			invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
 			inode->i_gid = fattr->gid;
 		}
-	}
+	} else if (server->caps & NFS_CAP_OWNER_GROUP)
+		invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
+				| NFS_INO_INVALID_ACCESS
+				| NFS_INO_INVALID_ACL
+				| NFS_INO_REVAL_FORCED);
 
 	if (fattr->valid & NFS_ATTR_FATTR_NLINK) {
 		if (inode->i_nlink != fattr->nlink) {
@@ -1263,7 +1326,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 				invalid |= NFS_INO_INVALID_DATA;
 			inode->i_nlink = fattr->nlink;
 		}
-	}
+	} else if (server->caps & NFS_CAP_NLINK)
+		invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
+				| NFS_INO_REVAL_FORCED);
 
 	if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) {
 		/*
@@ -1293,9 +1358,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 				|| S_ISLNK(inode->i_mode)))
 		invalid &= ~NFS_INO_INVALID_DATA;
 	if (!nfs_have_delegation(inode, FMODE_READ) ||
-			(nfsi->cache_validity & NFS_INO_REVAL_FORCED))
+			(save_cache_validity & NFS_INO_REVAL_FORCED))
 		nfsi->cache_validity |= invalid;
-	nfsi->cache_validity &= ~NFS_INO_REVAL_FORCED;
 
 	return 0;
  out_changed:
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index d95f7f9e60c4..be6544aef41f 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2003,12 +2003,34 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
 	status = nfs4_call_sync(server, &msg, &args, &res, 0);
 	if (status == 0) {
 		memcpy(server->attr_bitmask, res.attr_bitmask, sizeof(server->attr_bitmask));
+		server->caps &= ~(NFS_CAP_ACLS|NFS_CAP_HARDLINKS|
+				NFS_CAP_SYMLINKS|NFS_CAP_FILEID|
+				NFS_CAP_MODE|NFS_CAP_NLINK|NFS_CAP_OWNER|
+				NFS_CAP_OWNER_GROUP|NFS_CAP_ATIME|
+				NFS_CAP_CTIME|NFS_CAP_MTIME);
 		if (res.attr_bitmask[0] & FATTR4_WORD0_ACL)
 			server->caps |= NFS_CAP_ACLS;
 		if (res.has_links != 0)
 			server->caps |= NFS_CAP_HARDLINKS;
 		if (res.has_symlinks != 0)
 			server->caps |= NFS_CAP_SYMLINKS;
+		if (res.attr_bitmask[0] & FATTR4_WORD0_FILEID)
+			server->caps |= NFS_CAP_FILEID;
+		if (res.attr_bitmask[1] & FATTR4_WORD1_MODE)
+			server->caps |= NFS_CAP_MODE;
+		if (res.attr_bitmask[1] & FATTR4_WORD1_NUMLINKS)
+			server->caps |= NFS_CAP_NLINK;
+		if (res.attr_bitmask[1] & FATTR4_WORD1_OWNER)
+			server->caps |= NFS_CAP_OWNER;
+		if (res.attr_bitmask[1] & FATTR4_WORD1_OWNER_GROUP)
+			server->caps |= NFS_CAP_OWNER_GROUP;
+		if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_ACCESS)
+			server->caps |= NFS_CAP_ATIME;
+		if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_METADATA)
+			server->caps |= NFS_CAP_CTIME;
+		if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_MODIFY)
+			server->caps |= NFS_CAP_MTIME;
+
 		memcpy(server->cache_consistency_bitmask, res.attr_bitmask, sizeof(server->cache_consistency_bitmask));
 		server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE;
 		server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY;
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 19fe15d12042..320569eabe3b 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -167,6 +167,15 @@ struct nfs_server {
 #define NFS_CAP_SYMLINKS	(1U << 2)
 #define NFS_CAP_ACLS		(1U << 3)
 #define NFS_CAP_ATOMIC_OPEN	(1U << 4)
+#define NFS_CAP_CHANGE_ATTR	(1U << 5)
+#define NFS_CAP_FILEID		(1U << 6)
+#define NFS_CAP_MODE		(1U << 7)
+#define NFS_CAP_NLINK		(1U << 8)
+#define NFS_CAP_OWNER		(1U << 9)
+#define NFS_CAP_OWNER_GROUP	(1U << 10)
+#define NFS_CAP_ATIME		(1U << 11)
+#define NFS_CAP_CTIME		(1U << 12)
+#define NFS_CAP_MTIME		(1U << 13)
 
 
 /* maximum number of slots to use */
-- 
cgit v1.2.3-70-g09d2


From 80e52aced138bb41b045a8595a87510f27d8d8c5 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sun, 9 Aug 2009 15:06:19 -0400
Subject: NFSv4: Don't do idmapper upcalls for asynchronous RPC calls

We don't want to cause rpciod to hang...

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4xdr.c | 86 ++++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 58 insertions(+), 28 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 617273e7d47f..e65cc2e650c8 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -3075,7 +3075,8 @@ static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t
 	return ret;
 }
 
-static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, uint32_t *uid)
+static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap,
+		struct nfs_client *clp, uint32_t *uid, int may_sleep)
 {
 	uint32_t len;
 	__be32 *p;
@@ -3088,7 +3089,9 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
 		READ_BUF(4);
 		READ32(len);
 		READ_BUF(len);
-		if (len < XDR_MAX_NETOBJ) {
+		if (!may_sleep) {
+			/* do nothing */
+		} else if (len < XDR_MAX_NETOBJ) {
 			if (nfs_map_name_to_uid(clp, (char *)p, len, uid) == 0)
 				ret = NFS_ATTR_FATTR_OWNER;
 			else
@@ -3103,7 +3106,8 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
 	return ret;
 }
 
-static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, uint32_t *gid)
+static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap,
+		struct nfs_client *clp, uint32_t *gid, int may_sleep)
 {
 	uint32_t len;
 	__be32 *p;
@@ -3116,7 +3120,9 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
 		READ_BUF(4);
 		READ32(len);
 		READ_BUF(len);
-		if (len < XDR_MAX_NETOBJ) {
+		if (!may_sleep) {
+			/* do nothing */
+		} else if (len < XDR_MAX_NETOBJ) {
 			if (nfs_map_group_to_gid(clp, (char *)p, len, gid) == 0)
 				ret = NFS_ATTR_FATTR_GROUP;
 			else
@@ -3466,7 +3472,8 @@ xdr_error:
 	return status;
 }
 
-static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, const struct nfs_server *server)
+static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
+		const struct nfs_server *server, int may_sleep)
 {
 	__be32 *savep;
 	uint32_t attrlen,
@@ -3538,12 +3545,14 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons
 		goto xdr_error;
 	fattr->valid |= status;
 
-	status = decode_attr_owner(xdr, bitmap, server->nfs_client, &fattr->uid);
+	status = decode_attr_owner(xdr, bitmap, server->nfs_client,
+			&fattr->uid, may_sleep);
 	if (status < 0)
 		goto xdr_error;
 	fattr->valid |= status;
 
-	status = decode_attr_group(xdr, bitmap, server->nfs_client, &fattr->gid);
+	status = decode_attr_group(xdr, bitmap, server->nfs_client,
+			&fattr->gid, may_sleep);
 	if (status < 0)
 		goto xdr_error;
 	fattr->valid |= status;
@@ -4370,7 +4379,8 @@ static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp, __be32 *p, struct
 	status = decode_open_downgrade(&xdr, res);
 	if (status != 0)
 		goto out;
-	decode_getfattr(&xdr, res->fattr, res->server);
+	decode_getfattr(&xdr, res->fattr, res->server,
+			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
 }
@@ -4397,7 +4407,8 @@ static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_ac
 	status = decode_access(&xdr, res);
 	if (status != 0)
 		goto out;
-	decode_getfattr(&xdr, res->fattr, res->server);
+	decode_getfattr(&xdr, res->fattr, res->server,
+			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
 }
@@ -4424,7 +4435,8 @@ static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_lo
 		goto out;
 	if ((status = decode_getfh(&xdr, res->fh)) != 0)
 		goto out;
-	status = decode_getfattr(&xdr, res->fattr, res->server);
+	status = decode_getfattr(&xdr, res->fattr, res->server
+			,!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
 }
@@ -4448,7 +4460,8 @@ static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp, __be32 *p, struct nf
 	if ((status = decode_putrootfh(&xdr)) != 0)
 		goto out;
 	if ((status = decode_getfh(&xdr, res->fh)) == 0)
-		status = decode_getfattr(&xdr, res->fattr, res->server);
+		status = decode_getfattr(&xdr, res->fattr, res->server,
+				!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
 }
@@ -4473,7 +4486,8 @@ static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, __be32 *p, struct nfs_rem
 		goto out;
 	if ((status = decode_remove(&xdr, &res->cinfo)) != 0)
 		goto out;
-	decode_getfattr(&xdr, &res->dir_attr, res->server);
+	decode_getfattr(&xdr, &res->dir_attr, res->server,
+			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
 }
@@ -4503,11 +4517,13 @@ static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_re
 	if ((status = decode_rename(&xdr, &res->old_cinfo, &res->new_cinfo)) != 0)
 		goto out;
 	/* Current FH is target directory */
-	if (decode_getfattr(&xdr, res->new_fattr, res->server) != 0)
+	if (decode_getfattr(&xdr, res->new_fattr, res->server,
+				!RPC_IS_ASYNC(rqstp->rq_task)) != 0)
 		goto out;
 	if ((status = decode_restorefh(&xdr)) != 0)
 		goto out;
-	decode_getfattr(&xdr, res->old_fattr, res->server);
+	decode_getfattr(&xdr, res->old_fattr, res->server,
+			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
 }
@@ -4540,11 +4556,13 @@ static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_link
 	 * Note order: OP_LINK leaves the directory as the current
 	 *             filehandle.
 	 */
-	if (decode_getfattr(&xdr, res->dir_attr, res->server) != 0)
+	if (decode_getfattr(&xdr, res->dir_attr, res->server,
+				!RPC_IS_ASYNC(rqstp->rq_task)) != 0)
 		goto out;
 	if ((status = decode_restorefh(&xdr)) != 0)
 		goto out;
-	decode_getfattr(&xdr, res->fattr, res->server);
+	decode_getfattr(&xdr, res->fattr, res->server,
+			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
 }
@@ -4573,11 +4591,13 @@ static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_cr
 		goto out;
 	if ((status = decode_getfh(&xdr, res->fh)) != 0)
 		goto out;
-	if (decode_getfattr(&xdr, res->fattr, res->server) != 0)
+	if (decode_getfattr(&xdr, res->fattr, res->server,
+				!RPC_IS_ASYNC(rqstp->rq_task)) != 0)
 		goto out;
 	if ((status = decode_restorefh(&xdr)) != 0)
 		goto out;
-	decode_getfattr(&xdr, res->dir_fattr, res->server);
+	decode_getfattr(&xdr, res->dir_fattr, res->server,
+			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
 }
@@ -4609,7 +4629,8 @@ static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_g
 	status = decode_putfh(&xdr);
 	if (status)
 		goto out;
-	status = decode_getfattr(&xdr, res->fattr, res->server);
+	status = decode_getfattr(&xdr, res->fattr, res->server,
+			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
 }
@@ -4716,7 +4737,8 @@ static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, __be32 *p, struct nfs_clos
 	 * 	an ESTALE error. Shouldn't be a problem,
 	 * 	though, since fattr->valid will remain unset.
 	 */
-	decode_getfattr(&xdr, res->fattr, res->server);
+	decode_getfattr(&xdr, res->fattr, res->server,
+			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
 }
@@ -4748,11 +4770,13 @@ static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, __be32 *p, struct nfs_openr
 		goto out;
 	if (decode_getfh(&xdr, &res->fh) != 0)
 		goto out;
-	if (decode_getfattr(&xdr, res->f_attr, res->server) != 0)
+	if (decode_getfattr(&xdr, res->f_attr, res->server,
+				!RPC_IS_ASYNC(rqstp->rq_task)) != 0)
 		goto out;
 	if (decode_restorefh(&xdr) != 0)
 		goto out;
-	decode_getfattr(&xdr, res->dir_attr, res->server);
+	decode_getfattr(&xdr, res->dir_attr, res->server,
+			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
 }
@@ -4800,7 +4824,8 @@ static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp, __be32 *p, struct nf
 	status = decode_open(&xdr, res);
 	if (status)
 		goto out;
-	decode_getfattr(&xdr, res->f_attr, res->server);
+	decode_getfattr(&xdr, res->f_attr, res->server,
+			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
 }
@@ -4827,7 +4852,8 @@ static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_se
 	status = decode_setattr(&xdr);
 	if (status)
 		goto out;
-	decode_getfattr(&xdr, res->fattr, res->server);
+	decode_getfattr(&xdr, res->fattr, res->server,
+			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
 }
@@ -5001,7 +5027,8 @@ static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, __be32 *p, struct nfs_writ
 	status = decode_write(&xdr, res);
 	if (status)
 		goto out;
-	decode_getfattr(&xdr, res->fattr, res->server);
+	decode_getfattr(&xdr, res->fattr, res->server,
+			!RPC_IS_ASYNC(rqstp->rq_task));
 	if (!status)
 		status = res->count;
 out:
@@ -5030,7 +5057,8 @@ static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, __be32 *p, struct nfs_wri
 	status = decode_commit(&xdr, res);
 	if (status)
 		goto out;
-	decode_getfattr(&xdr, res->fattr, res->server);
+	decode_getfattr(&xdr, res->fattr, res->server,
+			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
 }
@@ -5194,7 +5222,8 @@ static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, __be32 *p, struct nf
 	if (status != 0)
 		goto out;
 	status = decode_delegreturn(&xdr);
-	decode_getfattr(&xdr, res->fattr, res->server);
+	decode_getfattr(&xdr, res->fattr, res->server,
+			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
 }
@@ -5222,7 +5251,8 @@ static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, __be32 *p,
 		goto out;
 	xdr_enter_page(&xdr, PAGE_SIZE);
 	status = decode_getfattr(&xdr, &res->fs_locations->fattr,
-				 res->fs_locations->server);
+				 res->fs_locations->server,
+				 !RPC_IS_ASYNC(req->rq_task));
 out:
 	return status;
 }
-- 
cgit v1.2.3-70-g09d2


From c140aa91357c415c91269884518fa1d6fdebc20d Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sun, 9 Aug 2009 15:06:19 -0400
Subject: NFSv4: Clean up the nfs.callback_tcpport option

Tighten up the validity checking in param_set_port: check for NULL pointers.
Ensure that the option shows up on 'modinfo' output.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/callback.c | 26 +++++++++++++++++---------
 1 file changed, 17 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 7f604c7941fb..293fa0528a6e 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -43,21 +43,29 @@ static struct svc_program nfs4_callback_program;
 unsigned int nfs_callback_set_tcpport;
 unsigned short nfs_callback_tcpport;
 unsigned short nfs_callback_tcpport6;
-static const int nfs_set_port_min = 0;
-static const int nfs_set_port_max = 65535;
+#define NFS_CALLBACK_MAXPORTNR (65535U)
 
-static int param_set_port(const char *val, struct kernel_param *kp)
+static int param_set_portnr(const char *val, struct kernel_param *kp)
 {
-	char *endp;
-	int num = simple_strtol(val, &endp, 0);
-	if (endp == val || *endp || num < nfs_set_port_min || num > nfs_set_port_max)
+	unsigned long num;
+	int ret;
+
+	if (!val)
+		return -EINVAL;
+	ret = strict_strtoul(val, 0, &num);
+	if (ret == -EINVAL || num > NFS_CALLBACK_MAXPORTNR)
 		return -EINVAL;
-	*((int *)kp->arg) = num;
+	*((unsigned int *)kp->arg) = num;
 	return 0;
 }
 
-module_param_call(callback_tcpport, param_set_port, param_get_int,
-		 &nfs_callback_set_tcpport, 0644);
+static int param_get_portnr(char *buffer, struct kernel_param *kp)
+{
+	return param_get_uint(buffer, kp);
+}
+#define param_check_portnr(name, p) __param_check(name, p, unsigned int);
+
+module_param_named(callback_tcpport, nfs_callback_set_tcpport, portnr, 0644);
 
 /*
  * This is the NFSv4 callback kernel thread.
-- 
cgit v1.2.3-70-g09d2


From f3f4f4ed26b116f621596f74d42d2b736171e968 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Sun, 9 Aug 2009 15:09:29 -0400
Subject: NFS: Fix up new minorversion= option

The new minorversion= mount option (commit 3fd5be9e) was merged at
the same time as the recent sloppy parser fixes (commit a5a16bae),
so minorversion= still uses the old value parsing logic.

If the minorversion= option specifies a bogus value, it should fail
with "bad value" not "bad option."

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/super.c | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 0b4cbdc60abd..83a31070bc1a 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -158,7 +158,7 @@ static const match_table_t nfs_mount_option_tokens = {
 	{ Opt_mountvers, "mountvers=%s" },
 	{ Opt_nfsvers, "nfsvers=%s" },
 	{ Opt_nfsvers, "vers=%s" },
-	{ Opt_minorversion, "minorversion=%u" },
+	{ Opt_minorversion, "minorversion=%s" },
 
 	{ Opt_sec, "sec=%s" },
 	{ Opt_proto, "proto=%s" },
@@ -1001,7 +1001,6 @@ static int nfs_parse_mount_options(char *raw,
 	while ((p = strsep(&raw, ",")) != NULL) {
 		substring_t args[MAX_OPT_ARGS];
 		unsigned long option;
-		int int_option;
 		int token;
 
 		if (!*p)
@@ -1273,11 +1272,16 @@ static int nfs_parse_mount_options(char *raw,
 			}
 			break;
 		case Opt_minorversion:
-			if (match_int(args, &int_option))
-				return 0;
-			if (int_option < 0 || int_option > NFS4_MAX_MINOR_VERSION)
-				return 0;
-			mnt->minorversion = int_option;
+			string = match_strdup(args);
+			if (string == NULL)
+				goto out_nomem;
+			rc = strict_strtoul(string, 10, &option);
+			kfree(string);
+			if (rc != 0)
+				goto out_invalid_value;
+			if (option > NFS4_MAX_MINOR_VERSION)
+				goto out_invalid_value;
+			mnt->minorversion = option;
 			break;
 
 		/*
-- 
cgit v1.2.3-70-g09d2


From 0b524123c93893391ec9e6c9b04998a45235f9c8 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Sun, 9 Aug 2009 15:09:30 -0400
Subject: NFS: Add ability to send MOUNTPROC_UMNT to the kernel's mountd client

After certain failure modes of an NFS mount, an NFS client should send
a MOUNTPROC_UMNT request to remove the just-added mount entry from the
server's mount table.  While no-one should rely on the accuracy of the
server's mount table, sending a UMNT is simply being a good internet
neighbor.

Since NFS mount processing is handled in the kernel now, we will need
a function in the kernel's mountd client that can post a MOUNTRPC_UMNT
request, in order to handle these failure modes.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/internal.h   |  1 +
 fs/nfs/mount_clnt.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 80 insertions(+)

(limited to 'fs')

diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 7dd90a6769d0..8d2b71d57d29 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -102,6 +102,7 @@ struct nfs_mount_request {
 };
 
 extern int nfs_mount(struct nfs_mount_request *info);
+extern void nfs_umount(const struct nfs_mount_request *info);
 
 /* client.c */
 extern struct rpc_program nfs_program;
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index 38ef9eaec407..72dd8b6ccafa 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -209,6 +209,71 @@ out_mnt_err:
 	goto out;
 }
 
+/**
+ * nfs_umount - Notify a server that we have unmounted this export
+ * @info: pointer to umount request arguments
+ *
+ * MOUNTPROC_UMNT is advisory, so we set a short timeout, and always
+ * use UDP.
+ */
+void nfs_umount(const struct nfs_mount_request *info)
+{
+	static const struct rpc_timeout nfs_umnt_timeout = {
+		.to_initval = 1 * HZ,
+		.to_maxval = 3 * HZ,
+		.to_retries = 2,
+	};
+	struct rpc_create_args args = {
+		.protocol	= IPPROTO_UDP,
+		.address	= info->sap,
+		.addrsize	= info->salen,
+		.timeout	= &nfs_umnt_timeout,
+		.servername	= info->hostname,
+		.program	= &mnt_program,
+		.version	= info->version,
+		.authflavor	= RPC_AUTH_UNIX,
+		.flags		= RPC_CLNT_CREATE_NOPING,
+	};
+	struct mountres	result;
+	struct rpc_message msg	= {
+		.rpc_argp	= info->dirpath,
+		.rpc_resp	= &result,
+	};
+	struct rpc_clnt *clnt;
+	int status;
+
+	if (info->noresvport)
+		args.flags |= RPC_CLNT_CREATE_NONPRIVPORT;
+
+	clnt = rpc_create(&args);
+	if (unlikely(IS_ERR(clnt)))
+		goto out_clnt_err;
+
+	dprintk("NFS: sending UMNT request for %s:%s\n",
+		(info->hostname ? info->hostname : "server"), info->dirpath);
+
+	if (info->version == NFS_MNT3_VERSION)
+		msg.rpc_proc = &clnt->cl_procinfo[MOUNTPROC3_UMNT];
+	else
+		msg.rpc_proc = &clnt->cl_procinfo[MOUNTPROC_UMNT];
+
+	status = rpc_call_sync(clnt, &msg, 0);
+	rpc_shutdown_client(clnt);
+
+	if (unlikely(status < 0))
+		goto out_call_err;
+
+	return;
+
+out_clnt_err:
+	dprintk("NFS: failed to create UMNT RPC client, status=%ld\n",
+			PTR_ERR(clnt));
+	return;
+
+out_call_err:
+	dprintk("NFS: UMNT request failed, status=%d\n", status);
+}
+
 /*
  * XDR encode/decode functions for MOUNT
  */
@@ -407,6 +472,13 @@ static struct rpc_procinfo mnt_procedures[] = {
 		.p_statidx	= MOUNTPROC_MNT,
 		.p_name		= "MOUNT",
 	},
+	[MOUNTPROC_UMNT] = {
+		.p_proc		= MOUNTPROC_UMNT,
+		.p_encode	= (kxdrproc_t)mnt_enc_dirpath,
+		.p_arglen	= MNT_enc_dirpath_sz,
+		.p_statidx	= MOUNTPROC_UMNT,
+		.p_name		= "UMOUNT",
+	},
 };
 
 static struct rpc_procinfo mnt3_procedures[] = {
@@ -419,6 +491,13 @@ static struct rpc_procinfo mnt3_procedures[] = {
 		.p_statidx	= MOUNTPROC3_MNT,
 		.p_name		= "MOUNT",
 	},
+	[MOUNTPROC3_UMNT] = {
+		.p_proc		= MOUNTPROC3_UMNT,
+		.p_encode	= (kxdrproc_t)mnt_enc_dirpath,
+		.p_arglen	= MNT_enc_dirpath_sz,
+		.p_statidx	= MOUNTPROC3_UMNT,
+		.p_name		= "UMOUNT",
+	},
 };
 
 
-- 
cgit v1.2.3-70-g09d2


From 059f90b323c0f5d34656ab7e0548d7d033c2a51a Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Sun, 9 Aug 2009 15:09:31 -0400
Subject: NFS: Fix auth flavor len accounting

Previous logic in the NFS mount parsing code path assumed
auth_flavor_len was set to zero for simple authentication flavors
(like AUTH_UNIX), and 1 for compound flavors (like AUTH_GSS).

At some earlier point (maybe even before the option parsers were
merged?) specific checks for auth_flavor_len being zero were removed
from the functions that validate the mount option that sets the mount
point's authentication flavor.

Since we are populating an array for authentication flavors, the
auth_flavor_len should always be set to the number of flavors.  Let's
eliminate some cleverness here, and prepare for new logic that needs
to know the number of flavors in the auth_flavors[] array.

(auth_flavors[] is an array because at some point we want to allow a
list of acceptable authentication flavors to be specified via the sec=
mount option.  For now it remains a single element array).

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/super.c | 17 +++--------------
 1 file changed, 3 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 83a31070bc1a..a33e608713e9 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -904,8 +904,6 @@ static void nfs_set_mount_transport_protocol(struct nfs_parsed_mount_data *mnt)
 
 /*
  * Parse the value of the 'sec=' option.
- *
- * The flavor_len setting is for v4 mounts.
  */
 static int nfs_parse_security_flavors(char *value,
 				      struct nfs_parsed_mount_data *mnt)
@@ -916,53 +914,43 @@ static int nfs_parse_security_flavors(char *value,
 
 	switch (match_token(value, nfs_secflavor_tokens, args)) {
 	case Opt_sec_none:
-		mnt->auth_flavor_len = 0;
 		mnt->auth_flavors[0] = RPC_AUTH_NULL;
 		break;
 	case Opt_sec_sys:
-		mnt->auth_flavor_len = 0;
 		mnt->auth_flavors[0] = RPC_AUTH_UNIX;
 		break;
 	case Opt_sec_krb5:
-		mnt->auth_flavor_len = 1;
 		mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5;
 		break;
 	case Opt_sec_krb5i:
-		mnt->auth_flavor_len = 1;
 		mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5I;
 		break;
 	case Opt_sec_krb5p:
-		mnt->auth_flavor_len = 1;
 		mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5P;
 		break;
 	case Opt_sec_lkey:
-		mnt->auth_flavor_len = 1;
 		mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEY;
 		break;
 	case Opt_sec_lkeyi:
-		mnt->auth_flavor_len = 1;
 		mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYI;
 		break;
 	case Opt_sec_lkeyp:
-		mnt->auth_flavor_len = 1;
 		mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYP;
 		break;
 	case Opt_sec_spkm:
-		mnt->auth_flavor_len = 1;
 		mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKM;
 		break;
 	case Opt_sec_spkmi:
-		mnt->auth_flavor_len = 1;
 		mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMI;
 		break;
 	case Opt_sec_spkmp:
-		mnt->auth_flavor_len = 1;
 		mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMP;
 		break;
 	default:
 		return 0;
 	}
 
+	mnt->auth_flavor_len = 1;
 	return 1;
 }
 
@@ -1680,6 +1668,7 @@ static int nfs_validate_mount_data(void *options,
 	args->nfs_server.port	= 0;	/* autobind unless user sets port */
 	args->nfs_server.protocol = XPRT_TRANSPORT_TCP;
 	args->auth_flavors[0]	= RPC_AUTH_UNIX;
+	args->auth_flavor_len	= 1;
 
 	switch (data->version) {
 	case 1:
@@ -2343,7 +2332,7 @@ static int nfs4_validate_mount_data(void *options,
 	args->acdirmax		= NFS_DEF_ACDIRMAX;
 	args->nfs_server.port	= NFS_PORT; /* 2049 unless user set port= */
 	args->auth_flavors[0]	= RPC_AUTH_UNIX;
-	args->auth_flavor_len	= 0;
+	args->auth_flavor_len	= 1;
 	args->minorversion	= 0;
 
 	switch (data->version) {
-- 
cgit v1.2.3-70-g09d2


From ec88f28d1eb77346f19ca324ceec76e645cdd9da Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Sun, 9 Aug 2009 15:09:32 -0400
Subject: NFS: Use the authentication flavor list returned by mountd

Commit a14017db added support in the kernel's NFS mount client to
decode the authentication flavor list returned by mountd.

The NFS client can now use this list to determine whether the
authentication flavor requested by the user is actually supported
by the server.

Note we don't actually negotiate the security flavor if none was
specified by the user.  Instead, we try to use AUTH_SYS, and fail if
the server does not support it.  This prevents us from negotiating
an inappropriate security flavor (some servers list AUTH_NULL first).

If the server does not support AUTH_SYS, the user must provide an
appropriate security flavor by specifying the "sec=" mount option.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/super.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 51 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index a33e608713e9..8526008eba72 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1436,6 +1436,42 @@ out_security_failure:
 	return 0;
 }
 
+/*
+ * Match the requested auth flavors with the list returned by
+ * the server.  Returns zero and sets the mount's authentication
+ * flavor on success; returns -EACCES if server does not support
+ * the requested flavor.
+ */
+static int nfs_walk_authlist(struct nfs_parsed_mount_data *args,
+			     struct nfs_mount_request *request)
+{
+	unsigned int i, j, server_authlist_len = *(request->auth_flav_len);
+
+	/*
+	 * We avoid sophisticated negotiating here, as there are
+	 * plenty of cases where we can get it wrong, providing
+	 * either too little or too much security.
+	 *
+	 * RFC 2623, section 2.7 suggests we SHOULD prefer the
+	 * flavor listed first.  However, some servers list
+	 * AUTH_NULL first.  Our caller plants AUTH_SYS, the
+	 * preferred default, in args->auth_flavors[0] if user
+	 * didn't specify sec= mount option.
+	 */
+	for (i = 0; i < args->auth_flavor_len; i++)
+		for (j = 0; j < server_authlist_len; j++)
+			if (args->auth_flavors[i] == request->auth_flavs[j]) {
+				dfprintk(MOUNT, "NFS: using auth flavor %d\n",
+					request->auth_flavs[j]);
+				args->auth_flavors[0] = request->auth_flavs[j];
+				return 0;
+			}
+
+	dfprintk(MOUNT, "NFS: server does not support requested auth flavor\n");
+	nfs_umount(request);
+	return -EACCES;
+}
+
 /*
  * Use the remote server's MOUNT service to request the NFS file handle
  * corresponding to the provided path.
@@ -1443,7 +1479,8 @@ out_security_failure:
 static int nfs_try_mount(struct nfs_parsed_mount_data *args,
 			 struct nfs_fh *root_fh)
 {
-	unsigned int auth_flavor_len = 0;
+	rpc_authflavor_t server_authlist[NFS_MAX_SECFLAVORS];
+	unsigned int server_authlist_len = ARRAY_SIZE(server_authlist);
 	struct nfs_mount_request request = {
 		.sap		= (struct sockaddr *)
 						&args->mount_server.address,
@@ -1451,7 +1488,8 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
 		.protocol	= args->mount_server.protocol,
 		.fh		= root_fh,
 		.noresvport	= args->flags & NFS_MOUNT_NORESVPORT,
-		.auth_flav_len	= &auth_flavor_len,
+		.auth_flav_len	= &server_authlist_len,
+		.auth_flavs	= server_authlist,
 	};
 	int status;
 
@@ -1488,12 +1526,18 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
 	 * to a file handle.
 	 */
 	status = nfs_mount(&request);
-	if (status == 0)
-		return 0;
+	if (status != 0) {
+		dfprintk(MOUNT, "NFS: unable to mount server %s, error %d\n",
+				request.hostname, status);
+		return status;
+	}
 
-	dfprintk(MOUNT, "NFS: unable to mount server %s, error %d\n",
-			request.hostname, status);
-	return status;
+	/*
+	 * MNTv1 (NFSv2) does not support auth flavor negotiation.
+	 */
+	if (args->mount_server.version != NFS_MNT3_VERSION)
+		return 0;
+	return nfs_walk_authlist(args, &request);
 }
 
 static int nfs_parse_simple_hostname(const char *dev_name,
-- 
cgit v1.2.3-70-g09d2


From a02d692611348f11ee1bc37431a883c3ff2de23e Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Sun, 9 Aug 2009 15:09:34 -0400
Subject: SUNRPC: Provide functions for managing universal addresses

Introduce a set of functions in the kernel's RPC implementation for
converting between a socket address and either a standard
presentation address string or an RPC universal address.

The universal address functions will be used to encode and decode
RPCB_FOO and NFSv4 SETCLIENTID arguments.  The other functions are
part of a previous promise to deliver shared functions that can be
used by upper-layer protocols to display and manipulate IP
addresses.

The kernel's current address printf formatters were designed
specifically for kernel to user-space APIs that require a particular
string format for socket addresses, thus are somewhat limited for the
purposes of sunrpc.ko.  The formatter for IPv6 addresses, %pI6, does
not support short-handing or scope IDs.  Also, these printf formatters
are unique per address family, so a separate formatter string is
required for printing AF_INET and AF_INET6 addresses.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/internal.h           |   2 -
 include/linux/sunrpc/clnt.h |  38 +++++
 net/sunrpc/Makefile         |   2 +-
 net/sunrpc/addr.c           | 364 ++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 403 insertions(+), 3 deletions(-)
 create mode 100644 net/sunrpc/addr.c

(limited to 'fs')

diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 8d2b71d57d29..ff68397f9b19 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -370,8 +370,6 @@ unsigned int nfs_page_array_len(unsigned int base, size_t len)
 		PAGE_SIZE - 1) >> PAGE_SHIFT;
 }
 
-#define IPV6_SCOPE_DELIMITER	'%'
-
 /*
  * Set the port number in an address.  Be agnostic about the address
  * family.
diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index 37881f1a0bd7..2636e8ad551c 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -9,6 +9,10 @@
 #ifndef _LINUX_SUNRPC_CLNT_H
 #define _LINUX_SUNRPC_CLNT_H
 
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+
 #include <linux/sunrpc/msg_prot.h>
 #include <linux/sunrpc/sched.h>
 #include <linux/sunrpc/xprt.h>
@@ -151,5 +155,39 @@ void		rpc_force_rebind(struct rpc_clnt *);
 size_t		rpc_peeraddr(struct rpc_clnt *, struct sockaddr *, size_t);
 const char	*rpc_peeraddr2str(struct rpc_clnt *, enum rpc_display_format_t);
 
+size_t		rpc_ntop(const struct sockaddr *, char *, const size_t);
+size_t		rpc_pton(const char *, const size_t,
+			 struct sockaddr *, const size_t);
+char *		rpc_sockaddr2uaddr(const struct sockaddr *);
+size_t		rpc_uaddr2sockaddr(const char *, const size_t,
+				   struct sockaddr *, const size_t);
+
+static inline unsigned short rpc_get_port(const struct sockaddr *sap)
+{
+	switch (sap->sa_family) {
+	case AF_INET:
+		return ntohs(((struct sockaddr_in *)sap)->sin_port);
+	case AF_INET6:
+		return ntohs(((struct sockaddr_in6 *)sap)->sin6_port);
+	}
+	return 0;
+}
+
+static inline void rpc_set_port(struct sockaddr *sap,
+				const unsigned short port)
+{
+	switch (sap->sa_family) {
+	case AF_INET:
+		((struct sockaddr_in *)sap)->sin_port = htons(port);
+		break;
+	case AF_INET6:
+		((struct sockaddr_in6 *)sap)->sin6_port = htons(port);
+		break;
+	}
+}
+
+#define IPV6_SCOPE_DELIMITER		'%'
+#define IPV6_SCOPE_ID_LEN		sizeof("%nnnnnnnnnn")
+
 #endif /* __KERNEL__ */
 #endif /* _LINUX_SUNRPC_CLNT_H */
diff --git a/net/sunrpc/Makefile b/net/sunrpc/Makefile
index db73fd2a3f0e..9d2fca5ad14a 100644
--- a/net/sunrpc/Makefile
+++ b/net/sunrpc/Makefile
@@ -10,7 +10,7 @@ obj-$(CONFIG_SUNRPC_XPRT_RDMA) += xprtrdma/
 sunrpc-y := clnt.o xprt.o socklib.o xprtsock.o sched.o \
 	    auth.o auth_null.o auth_unix.o auth_generic.o \
 	    svc.o svcsock.o svcauth.o svcauth_unix.o \
-	    rpcb_clnt.o timer.o xdr.o \
+	    addr.o rpcb_clnt.o timer.o xdr.o \
 	    sunrpc_syms.o cache.o rpc_pipe.o \
 	    svc_xprt.o
 sunrpc-$(CONFIG_NFS_V4_1) += backchannel_rqst.o bc_svc.o
diff --git a/net/sunrpc/addr.c b/net/sunrpc/addr.c
new file mode 100644
index 000000000000..22e8fd89477f
--- /dev/null
+++ b/net/sunrpc/addr.c
@@ -0,0 +1,364 @@
+/*
+ * Copyright 2009, Oracle.  All rights reserved.
+ *
+ * Convert socket addresses to presentation addresses and universal
+ * addresses, and vice versa.
+ *
+ * Universal addresses are introduced by RFC 1833 and further refined by
+ * recent RFCs describing NFSv4.  The universal address format is part
+ * of the external (network) interface provided by rpcbind version 3
+ * and 4, and by NFSv4.  Such an address is a string containing a
+ * presentation format IP address followed by a port number in
+ * "hibyte.lobyte" format.
+ *
+ * IPv6 addresses can also include a scope ID, typically denoted by
+ * a '%' followed by a device name or a non-negative integer.  Refer to
+ * RFC 4291, Section 2.2 for details on IPv6 presentation formats.
+ */
+
+#include <net/ipv6.h>
+#include <linux/sunrpc/clnt.h>
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+
+static size_t rpc_ntop6_noscopeid(const struct sockaddr *sap,
+				  char *buf, const int buflen)
+{
+	const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
+	const struct in6_addr *addr = &sin6->sin6_addr;
+
+	/*
+	 * RFC 4291, Section 2.2.2
+	 *
+	 * Shorthanded ANY address
+	 */
+	if (ipv6_addr_any(addr))
+		return snprintf(buf, buflen, "::");
+
+	/*
+	 * RFC 4291, Section 2.2.2
+	 *
+	 * Shorthanded loopback address
+	 */
+	if (ipv6_addr_loopback(addr))
+		return snprintf(buf, buflen, "::1");
+
+	/*
+	 * RFC 4291, Section 2.2.3
+	 *
+	 * Special presentation address format for mapped v4
+	 * addresses.
+	 */
+	if (ipv6_addr_v4mapped(addr))
+		return snprintf(buf, buflen, "::ffff:%pI4",
+					&addr->s6_addr32[3]);
+
+	/*
+	 * RFC 4291, Section 2.2.1
+	 *
+	 * To keep the result as short as possible, especially
+	 * since we don't shorthand, we don't want leading zeros
+	 * in each halfword, so avoid %pI6.
+	 */
+	return snprintf(buf, buflen, "%x:%x:%x:%x:%x:%x:%x:%x",
+		ntohs(addr->s6_addr16[0]), ntohs(addr->s6_addr16[1]),
+		ntohs(addr->s6_addr16[2]), ntohs(addr->s6_addr16[3]),
+		ntohs(addr->s6_addr16[4]), ntohs(addr->s6_addr16[5]),
+		ntohs(addr->s6_addr16[6]), ntohs(addr->s6_addr16[7]));
+}
+
+static size_t rpc_ntop6(const struct sockaddr *sap,
+			char *buf, const size_t buflen)
+{
+	const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
+	char scopebuf[IPV6_SCOPE_ID_LEN];
+	size_t len;
+	int rc;
+
+	len = rpc_ntop6_noscopeid(sap, buf, buflen);
+	if (unlikely(len == 0))
+		return len;
+
+	if (!(ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL) &&
+	    !(ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_SITELOCAL))
+		return len;
+
+	rc = snprintf(scopebuf, sizeof(scopebuf), "%c%u",
+			IPV6_SCOPE_DELIMITER, sin6->sin6_scope_id);
+	if (unlikely((size_t)rc > sizeof(scopebuf)))
+		return 0;
+
+	len += rc;
+	if (unlikely(len > buflen))
+		return 0;
+
+	strcat(buf, scopebuf);
+	return len;
+}
+
+#else	/* !(defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)) */
+
+static size_t rpc_ntop6_noscopeid(const struct sockaddr *sap,
+				  char *buf, const int buflen)
+{
+	return 0;
+}
+
+static size_t rpc_ntop6(const struct sockaddr *sap,
+			char *buf, const size_t buflen)
+{
+	return 0;
+}
+
+#endif	/* !(defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)) */
+
+static int rpc_ntop4(const struct sockaddr *sap,
+		     char *buf, const size_t buflen)
+{
+	const struct sockaddr_in *sin = (struct sockaddr_in *)sap;
+
+	return snprintf(buf, buflen, "%pI4", &sin->sin_addr);
+}
+
+/**
+ * rpc_ntop - construct a presentation address in @buf
+ * @sap: socket address
+ * @buf: construction area
+ * @buflen: size of @buf, in bytes
+ *
+ * Plants a %NUL-terminated string in @buf and returns the length
+ * of the string, excluding the %NUL.  Otherwise zero is returned.
+ */
+size_t rpc_ntop(const struct sockaddr *sap, char *buf, const size_t buflen)
+{
+	switch (sap->sa_family) {
+	case AF_INET:
+		return rpc_ntop4(sap, buf, buflen);
+	case AF_INET6:
+		return rpc_ntop6(sap, buf, buflen);
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(rpc_ntop);
+
+static size_t rpc_pton4(const char *buf, const size_t buflen,
+			struct sockaddr *sap, const size_t salen)
+{
+	struct sockaddr_in *sin = (struct sockaddr_in *)sap;
+	u8 *addr = (u8 *)&sin->sin_addr.s_addr;
+
+	if (buflen > INET_ADDRSTRLEN || salen < sizeof(struct sockaddr_in))
+		return 0;
+
+	memset(sap, 0, sizeof(struct sockaddr_in));
+
+	if (in4_pton(buf, buflen, addr, '\0', NULL) == 0)
+		return 0;
+
+	sin->sin_family = AF_INET;
+	return sizeof(struct sockaddr_in);;
+}
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+static int rpc_parse_scope_id(const char *buf, const size_t buflen,
+			      const char *delim, struct sockaddr_in6 *sin6)
+{
+	char *p;
+	size_t len;
+
+	if ((buf + buflen) == delim)
+		return 1;
+
+	if (*delim != IPV6_SCOPE_DELIMITER)
+		return 0;
+
+	if (!(ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL) &&
+	    !(ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_SITELOCAL))
+		return 0;
+
+	len = (buf + buflen) - delim - 1;
+	p = kstrndup(delim + 1, len, GFP_KERNEL);
+	if (p) {
+		unsigned long scope_id = 0;
+		struct net_device *dev;
+
+		dev = dev_get_by_name(&init_net, p);
+		if (dev != NULL) {
+			scope_id = dev->ifindex;
+			dev_put(dev);
+		} else {
+			if (strict_strtoul(p, 10, &scope_id) == 0) {
+				kfree(p);
+				return 0;
+			}
+		}
+
+		kfree(p);
+
+		sin6->sin6_scope_id = scope_id;
+		return 1;
+	}
+
+	return 0;
+}
+
+static size_t rpc_pton6(const char *buf, const size_t buflen,
+			struct sockaddr *sap, const size_t salen)
+{
+	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
+	u8 *addr = (u8 *)&sin6->sin6_addr.in6_u;
+	const char *delim;
+
+	if (buflen > (INET6_ADDRSTRLEN + IPV6_SCOPE_ID_LEN) ||
+	    salen < sizeof(struct sockaddr_in6))
+		return 0;
+
+	memset(sap, 0, sizeof(struct sockaddr_in6));
+
+	if (in6_pton(buf, buflen, addr, IPV6_SCOPE_DELIMITER, &delim) == 0)
+		return 0;
+
+	if (!rpc_parse_scope_id(buf, buflen, delim, sin6))
+		return 0;
+
+	sin6->sin6_family = AF_INET6;
+	return sizeof(struct sockaddr_in6);
+}
+#else
+static size_t rpc_pton6(const char *buf, const size_t buflen,
+			struct sockaddr *sap, const size_t salen)
+{
+	return 0;
+}
+#endif
+
+/**
+ * rpc_pton - Construct a sockaddr in @sap
+ * @buf: C string containing presentation format IP address
+ * @buflen: length of presentation address in bytes
+ * @sap: buffer into which to plant socket address
+ * @salen: size of buffer in bytes
+ *
+ * Returns the size of the socket address if successful; otherwise
+ * zero is returned.
+ *
+ * Plants a socket address in @sap and returns the size of the
+ * socket address, if successful.  Returns zero if an error
+ * occurred.
+ */
+size_t rpc_pton(const char *buf, const size_t buflen,
+		struct sockaddr *sap, const size_t salen)
+{
+	unsigned int i;
+
+	for (i = 0; i < buflen; i++)
+		if (buf[i] == ':')
+			return rpc_pton6(buf, buflen, sap, salen);
+	return rpc_pton4(buf, buflen, sap, salen);
+}
+EXPORT_SYMBOL_GPL(rpc_pton);
+
+/**
+ * rpc_sockaddr2uaddr - Construct a universal address string from @sap.
+ * @sap: socket address
+ *
+ * Returns a %NUL-terminated string in dynamically allocated memory;
+ * otherwise NULL is returned if an error occurred.  Caller must
+ * free the returned string.
+ */
+char *rpc_sockaddr2uaddr(const struct sockaddr *sap)
+{
+	char portbuf[RPCBIND_MAXUADDRPLEN];
+	char addrbuf[RPCBIND_MAXUADDRLEN];
+	unsigned short port;
+
+	switch (sap->sa_family) {
+	case AF_INET:
+		if (rpc_ntop4(sap, addrbuf, sizeof(addrbuf)) == 0)
+			return NULL;
+		port = ntohs(((struct sockaddr_in *)sap)->sin_port);
+		break;
+	case AF_INET6:
+		if (rpc_ntop6_noscopeid(sap, addrbuf, sizeof(addrbuf)) == 0)
+			return NULL;
+		port = ntohs(((struct sockaddr_in6 *)sap)->sin6_port);
+		break;
+	default:
+		return NULL;
+	}
+
+	if (snprintf(portbuf, sizeof(portbuf),
+		     ".%u.%u", port >> 8, port & 0xff) > (int)sizeof(portbuf))
+		return NULL;
+
+	if (strlcat(addrbuf, portbuf, sizeof(addrbuf)) > sizeof(addrbuf))
+		return NULL;
+
+	return kstrdup(addrbuf, GFP_KERNEL);
+}
+EXPORT_SYMBOL_GPL(rpc_sockaddr2uaddr);
+
+/**
+ * rpc_uaddr2sockaddr - convert a universal address to a socket address.
+ * @uaddr: C string containing universal address to convert
+ * @uaddr_len: length of universal address string
+ * @sap: buffer into which to plant socket address
+ * @salen: size of buffer
+ *
+ * Returns the size of the socket address if successful; otherwise
+ * zero is returned.
+ */
+size_t rpc_uaddr2sockaddr(const char *uaddr, const size_t uaddr_len,
+			  struct sockaddr *sap, const size_t salen)
+{
+	char *c, buf[RPCBIND_MAXUADDRLEN];
+	unsigned long portlo, porthi;
+	unsigned short port;
+
+	if (uaddr_len > sizeof(buf))
+		return 0;
+
+	memcpy(buf, uaddr, uaddr_len);
+
+	buf[uaddr_len] = '\n';
+	buf[uaddr_len + 1] = '\0';
+
+	c = strrchr(buf, '.');
+	if (unlikely(c == NULL))
+		return 0;
+	if (unlikely(strict_strtoul(c + 1, 10, &portlo) != 0))
+		return 0;
+	if (unlikely(portlo > 255))
+		return 0;
+
+	c[0] = '\n';
+	c[1] = '\0';
+
+	c = strrchr(buf, '.');
+	if (unlikely(c == NULL))
+		return 0;
+	if (unlikely(strict_strtoul(c + 1, 10, &porthi) != 0))
+		return 0;
+	if (unlikely(porthi > 255))
+		return 0;
+
+	port = (unsigned short)((porthi << 8) | portlo);
+
+	c[0] = '\0';
+
+	if (rpc_pton(buf, strlen(buf), sap, salen) == 0)
+		return 0;
+
+	switch (sap->sa_family) {
+	case AF_INET:
+		((struct sockaddr_in *)sap)->sin_port = htons(port);
+		return sizeof(struct sockaddr_in);
+	case AF_INET6:
+		((struct sockaddr_in6 *)sap)->sin6_port = htons(port);
+		return sizeof(struct sockaddr_in6);
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(rpc_uaddr2sockaddr);
-- 
cgit v1.2.3-70-g09d2


From 53a0b9c4c99ab0085a06421f71592722e5b3fd5f Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Sun, 9 Aug 2009 15:09:36 -0400
Subject: NFS: Replace nfs_parse_ip_address() with rpc_pton()

Clean up: Use the common routine now provided in sunrpc.ko for parsing mount
addresses.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/internal.h      |   1 -
 fs/nfs/nfs4namespace.c |   6 +-
 fs/nfs/super.c         | 148 +++++++------------------------------------------
 3 files changed, 22 insertions(+), 133 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index ff68397f9b19..cf1da3e22004 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -214,7 +214,6 @@ void nfs_zap_acl_cache(struct inode *inode);
 extern int nfs_wait_bit_killable(void *word);
 
 /* super.c */
-void nfs_parse_ip_address(char *, size_t, struct sockaddr *, size_t *);
 extern struct file_system_type nfs_xdev_fs_type;
 #ifdef CONFIG_NFS_V4
 extern struct file_system_type nfs4_xdev_fs_type;
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index 2a2a0a7143ad..43c86b7556e1 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -121,9 +121,9 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
 
 		if (memchr(buf->data, IPV6_SCOPE_DELIMITER, buf->len))
 			continue;
-		nfs_parse_ip_address(buf->data, buf->len,
-				mountdata->addr, &mountdata->addrlen);
-		if (mountdata->addr->sa_family == AF_UNSPEC)
+		mountdata->addrlen = rpc_pton(buf->data, buf->len,
+				mountdata->addr, mountdata->addrlen);
+		if (mountdata->addrlen == 0)
 			continue;
 		nfs_set_port(mountdata->addr, NFS_PORT);
 
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 8526008eba72..1eeba8e53802 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -742,129 +742,10 @@ static int nfs_verify_server_address(struct sockaddr *addr)
 	}
 	}
 
+	dfprintk(MOUNT, "NFS: Invalid IP address specified\n");
 	return 0;
 }
 
-static void nfs_parse_ipv4_address(char *string, size_t str_len,
-				   struct sockaddr *sap, size_t *addr_len)
-{
-	struct sockaddr_in *sin = (struct sockaddr_in *)sap;
-	u8 *addr = (u8 *)&sin->sin_addr.s_addr;
-
-	if (str_len <= INET_ADDRSTRLEN) {
-		dfprintk(MOUNT, "NFS: parsing IPv4 address %*s\n",
-				(int)str_len, string);
-
-		sin->sin_family = AF_INET;
-		*addr_len = sizeof(*sin);
-		if (in4_pton(string, str_len, addr, '\0', NULL))
-			return;
-	}
-
-	sap->sa_family = AF_UNSPEC;
-	*addr_len = 0;
-}
-
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-static int nfs_parse_ipv6_scope_id(const char *string, const size_t str_len,
-				   const char *delim,
-				   struct sockaddr_in6 *sin6)
-{
-	char *p;
-	size_t len;
-
-	if ((string + str_len) == delim)
-		return 1;
-
-	if (*delim != IPV6_SCOPE_DELIMITER)
-		return 0;
-
-	if (!(ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL))
-		return 0;
-
-	len = (string + str_len) - delim - 1;
-	p = kstrndup(delim + 1, len, GFP_KERNEL);
-	if (p) {
-		unsigned long scope_id = 0;
-		struct net_device *dev;
-
-		dev = dev_get_by_name(&init_net, p);
-		if (dev != NULL) {
-			scope_id = dev->ifindex;
-			dev_put(dev);
-		} else {
-			if (strict_strtoul(p, 10, &scope_id) == 0) {
-				kfree(p);
-				return 0;
-			}
-		}
-
-		kfree(p);
-
-		sin6->sin6_scope_id = scope_id;
-		dfprintk(MOUNT, "NFS: IPv6 scope ID = %lu\n", scope_id);
-		return 1;
-	}
-
-	return 0;
-}
-
-static void nfs_parse_ipv6_address(char *string, size_t str_len,
-				   struct sockaddr *sap, size_t *addr_len)
-{
-	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
-	u8 *addr = (u8 *)&sin6->sin6_addr.in6_u;
-	const char *delim;
-
-	if (str_len <= INET6_ADDRSTRLEN) {
-		dfprintk(MOUNT, "NFS: parsing IPv6 address %*s\n",
-				(int)str_len, string);
-
-		sin6->sin6_family = AF_INET6;
-		*addr_len = sizeof(*sin6);
-		if (in6_pton(string, str_len, addr,
-					IPV6_SCOPE_DELIMITER, &delim) != 0) {
-			if (nfs_parse_ipv6_scope_id(string, str_len,
-							delim, sin6) != 0)
-				return;
-		}
-	}
-
-	sap->sa_family = AF_UNSPEC;
-	*addr_len = 0;
-}
-#else
-static void nfs_parse_ipv6_address(char *string, size_t str_len,
-				   struct sockaddr *sap, size_t *addr_len)
-{
-	sap->sa_family = AF_UNSPEC;
-	*addr_len = 0;
-}
-#endif
-
-/*
- * Construct a sockaddr based on the contents of a string that contains
- * an IP address in presentation format.
- *
- * If there is a problem constructing the new sockaddr, set the address
- * family to AF_UNSPEC.
- */
-void nfs_parse_ip_address(char *string, size_t str_len,
-				 struct sockaddr *sap, size_t *addr_len)
-{
-	unsigned int i, colons;
-
-	colons = 0;
-	for (i = 0; i < str_len; i++)
-		if (string[i] == ':')
-			colons++;
-
-	if (colons >= 2)
-		nfs_parse_ipv6_address(string, str_len, sap, addr_len);
-	else
-		nfs_parse_ipv4_address(string, str_len, sap, addr_len);
-}
-
 /*
  * Sanity check the NFS transport protocol.
  *
@@ -1344,11 +1225,14 @@ static int nfs_parse_mount_options(char *raw,
 			string = match_strdup(args);
 			if (string == NULL)
 				goto out_nomem;
-			nfs_parse_ip_address(string, strlen(string),
-					     (struct sockaddr *)
-						&mnt->nfs_server.address,
-					     &mnt->nfs_server.addrlen);
+			mnt->nfs_server.addrlen =
+				rpc_pton(string, strlen(string),
+					(struct sockaddr *)
+					&mnt->nfs_server.address,
+					sizeof(mnt->nfs_server.address));
 			kfree(string);
+			if (mnt->nfs_server.addrlen == 0)
+				goto out_invalid_address;
 			break;
 		case Opt_clientaddr:
 			string = match_strdup(args);
@@ -1368,11 +1252,14 @@ static int nfs_parse_mount_options(char *raw,
 			string = match_strdup(args);
 			if (string == NULL)
 				goto out_nomem;
-			nfs_parse_ip_address(string, strlen(string),
-					     (struct sockaddr *)
-						&mnt->mount_server.address,
-					     &mnt->mount_server.addrlen);
+			mnt->mount_server.addrlen =
+				rpc_pton(string, strlen(string),
+					(struct sockaddr *)
+					&mnt->mount_server.address,
+					sizeof(mnt->mount_server.address));
 			kfree(string);
+			if (mnt->mount_server.addrlen == 0)
+				goto out_invalid_address;
 			break;
 		case Opt_lookupcache:
 			string = match_strdup(args);
@@ -1424,8 +1311,11 @@ static int nfs_parse_mount_options(char *raw,
 
 	return 1;
 
+out_invalid_address:
+	printk(KERN_INFO "NFS: bad IP address specified: %s\n", p);
+	return 0;
 out_invalid_value:
-	printk(KERN_INFO "NFS: bad mount option value specified: %s \n", p);
+	printk(KERN_INFO "NFS: bad mount option value specified: %s\n", p);
 	return 0;
 out_nomem:
 	printk(KERN_INFO "NFS: not enough memory to parse option\n");
-- 
cgit v1.2.3-70-g09d2


From ec6ee61250acfccbc5578dd4014735fb2cbe53b5 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Sun, 9 Aug 2009 15:09:37 -0400
Subject: NFS: Replace nfs_set_port() with rpc_set_port()

Clean up.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/internal.h      | 19 -------------------
 fs/nfs/nfs4namespace.c |  2 +-
 fs/nfs/super.c         |  6 +++---
 3 files changed, 4 insertions(+), 23 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index cf1da3e22004..c2f171a3d70e 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -368,22 +368,3 @@ unsigned int nfs_page_array_len(unsigned int base, size_t len)
 	return ((unsigned long)len + (unsigned long)base +
 		PAGE_SIZE - 1) >> PAGE_SHIFT;
 }
-
-/*
- * Set the port number in an address.  Be agnostic about the address
- * family.
- */
-static inline void nfs_set_port(struct sockaddr *sap, unsigned short port)
-{
-	struct sockaddr_in *ap = (struct sockaddr_in *)sap;
-	struct sockaddr_in6 *ap6 = (struct sockaddr_in6 *)sap;
-
-	switch (sap->sa_family) {
-	case AF_INET:
-		ap->sin_port = htons(port);
-		break;
-	case AF_INET6:
-		ap6->sin6_port = htons(port);
-		break;
-	}
-}
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index 43c86b7556e1..ef22ee89aa77 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -125,7 +125,7 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
 				mountdata->addr, mountdata->addrlen);
 		if (mountdata->addrlen == 0)
 			continue;
-		nfs_set_port(mountdata->addr, NFS_PORT);
+		rpc_set_port(mountdata->addr, NFS_PORT);
 
 		memcpy(page2, buf->data, buf->len);
 		page2[buf->len] = '\0';
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 1eeba8e53802..9c85cdb353aa 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1409,7 +1409,7 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
 	/*
 	 * autobind will be used if mount_server.port == 0
 	 */
-	nfs_set_port(request.sap, args->mount_server.port);
+	rpc_set_port(request.sap, args->mount_server.port);
 
 	/*
 	 * Now ask the mount server to map our export path
@@ -1703,7 +1703,7 @@ static int nfs_validate_mount_data(void *options,
 						&args->nfs_server.address))
 			goto out_no_address;
 
-		nfs_set_port((struct sockaddr *)&args->nfs_server.address,
+		rpc_set_port((struct sockaddr *)&args->nfs_server.address,
 				args->nfs_server.port);
 
 		nfs_set_mount_transport_protocol(args);
@@ -2336,7 +2336,7 @@ static int nfs4_validate_mount_data(void *options,
 						&args->nfs_server.address))
 			return -EINVAL;
 
-		nfs_set_port((struct sockaddr *)&args->nfs_server.address,
+		rpc_set_port((struct sockaddr *)&args->nfs_server.address,
 				args->nfs_server.port);
 
 		nfs_validate_transport_protocol(args);
-- 
cgit v1.2.3-70-g09d2


From b97a56747ea3f6c1a27dd0719bf1424959f1ebae Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Sun, 9 Aug 2009 15:09:38 -0400
Subject: lockd: Replace nlm_clear_port()

Clean up: Use shared rpc_set_port() function instead of nlm_clear_port().

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/lockd/host.c | 14 +-------------
 1 file changed, 1 insertion(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 99d737bd4325..7cb076ac6b45 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -87,18 +87,6 @@ static unsigned int nlm_hash_address(const struct sockaddr *sap)
 	return hash & (NLM_HOST_NRHASH - 1);
 }
 
-static void nlm_clear_port(struct sockaddr *sap)
-{
-	switch (sap->sa_family) {
-	case AF_INET:
-		((struct sockaddr_in *)sap)->sin_port = 0;
-		break;
-	case AF_INET6:
-		((struct sockaddr_in6 *)sap)->sin6_port = 0;
-		break;
-	}
-}
-
 /*
  * Common host lookup routine for server & client
  */
@@ -177,7 +165,7 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni)
 	host->h_addrbuf    = nsm->sm_addrbuf;
 	memcpy(nlm_addr(host), ni->sap, ni->salen);
 	host->h_addrlen = ni->salen;
-	nlm_clear_port(nlm_addr(host));
+	rpc_set_port(nlm_addr(host), 0);
 	memcpy(nlm_srcaddr(host), ni->src_sap, ni->src_len);
 	host->h_version    = ni->version;
 	host->h_proto      = ni->protocol;
-- 
cgit v1.2.3-70-g09d2


From c15128c5e428af1e8ada1476484a74c970665edd Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Sun, 9 Aug 2009 15:09:39 -0400
Subject: lockd: Replace nsm_display_address() with rpc_ntop()

Clean up.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/lockd/mon.c | 44 +++++---------------------------------------
 1 file changed, 5 insertions(+), 39 deletions(-)

(limited to 'fs')

diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 7fce1b525849..30c933188dd7 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -61,43 +61,6 @@ static inline struct sockaddr *nsm_addr(const struct nsm_handle *nsm)
 	return (struct sockaddr *)&nsm->sm_addr;
 }
 
-static void nsm_display_ipv4_address(const struct sockaddr *sap, char *buf,
-				     const size_t len)
-{
-	const struct sockaddr_in *sin = (struct sockaddr_in *)sap;
-	snprintf(buf, len, "%pI4", &sin->sin_addr.s_addr);
-}
-
-static void nsm_display_ipv6_address(const struct sockaddr *sap, char *buf,
-				     const size_t len)
-{
-	const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
-
-	if (ipv6_addr_v4mapped(&sin6->sin6_addr))
-		snprintf(buf, len, "%pI4", &sin6->sin6_addr.s6_addr32[3]);
-	else if (sin6->sin6_scope_id != 0)
-		snprintf(buf, len, "%pI6%%%u", &sin6->sin6_addr,
-				sin6->sin6_scope_id);
-	else
-		snprintf(buf, len, "%pI6", &sin6->sin6_addr);
-}
-
-static void nsm_display_address(const struct sockaddr *sap,
-				char *buf, const size_t len)
-{
-	switch (sap->sa_family) {
-	case AF_INET:
-		nsm_display_ipv4_address(sap, buf, len);
-		break;
-	case AF_INET6:
-		nsm_display_ipv6_address(sap, buf, len);
-		break;
-	default:
-		snprintf(buf, len, "unsupported address family");
-		break;
-	}
-}
-
 static struct rpc_clnt *nsm_create(void)
 {
 	struct sockaddr_in sin = {
@@ -307,8 +270,11 @@ static struct nsm_handle *nsm_create_handle(const struct sockaddr *sap,
 	memcpy(nsm_addr(new), sap, salen);
 	new->sm_addrlen = salen;
 	nsm_init_private(new);
-	nsm_display_address((const struct sockaddr *)&new->sm_addr,
-				new->sm_addrbuf, sizeof(new->sm_addrbuf));
+
+	if (rpc_ntop(nsm_addr(new), new->sm_addrbuf,
+					sizeof(new->sm_addrbuf)) == 0)
+		(void)snprintf(new->sm_addrbuf, sizeof(new->sm_addrbuf),
+				"unsupported address family");
 	memcpy(new->sm_name, hostname, hostname_len);
 	new->sm_name[hostname_len] = '\0';
 
-- 
cgit v1.2.3-70-g09d2


From 4116092b92f859e5e9a90c99d740933e651ee8c0 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Sun, 9 Aug 2009 15:09:40 -0400
Subject: NFSD: Support IPv6 addresses in write_failover_ip()

In write_failover_ip(), replace the sscanf() with a call to the common
sunrpc.ko presentation address parser.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfsd/nfsctl.c | 21 +++++++--------------
 1 file changed, 7 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 6d0847562d87..7e906c5b7671 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -37,6 +37,7 @@
 #include <linux/nfsd/xdr.h>
 #include <linux/nfsd/syscall.h>
 #include <linux/lockd/lockd.h>
+#include <linux/sunrpc/clnt.h>
 
 #include <asm/uaccess.h>
 #include <net/ipv6.h>
@@ -490,22 +491,18 @@ static ssize_t write_getfd(struct file *file, char *buf, size_t size)
  *
  * Input:
  *			buf:	'\n'-terminated C string containing a
- *				presentation format IPv4 address
+ *				presentation format IP address
  *			size:	length of C string in @buf
  * Output:
  *	On success:	returns zero if all specified locks were released;
  *			returns one if one or more locks were not released
  *	On error:	return code is negative errno value
- *
- * Note: Only AF_INET client addresses are passed in
  */
 static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size)
 {
-	struct sockaddr_in sin = {
-		.sin_family	= AF_INET,
-	};
-	int b1, b2, b3, b4;
-	char c;
+	struct sockaddr_storage address;
+	struct sockaddr *sap = (struct sockaddr *)&address;
+	size_t salen = sizeof(address);
 	char *fo_path;
 
 	/* sanity check */
@@ -519,14 +516,10 @@ static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size)
 	if (qword_get(&buf, fo_path, size) < 0)
 		return -EINVAL;
 
-	/* get ipv4 address */
-	if (sscanf(fo_path, "%u.%u.%u.%u%c", &b1, &b2, &b3, &b4, &c) != 4)
-		return -EINVAL;
-	if (b1 > 255 || b2 > 255 || b3 > 255 || b4 > 255)
+	if (rpc_pton(fo_path, size, sap, salen) == 0)
 		return -EINVAL;
-	sin.sin_addr.s_addr = htonl((b1 << 24) | (b2 << 16) | (b3 << 8) | b4);
 
-	return nlmsvc_unlock_all_by_ip((struct sockaddr *)&sin);
+	return nlmsvc_unlock_all_by_ip(sap);
 }
 
 /**
-- 
cgit v1.2.3-70-g09d2


From b693ba4a338da15db1db4b5ebaa36e4ab9781c82 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sun, 9 Aug 2009 15:14:15 -0400
Subject: SUNRPC: Constify rpc_pipe_ops...

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/idmap.c                     | 2 +-
 include/linux/sunrpc/rpc_pipe_fs.h | 5 +++--
 net/sunrpc/auth_gss/auth_gss.c     | 8 ++++----
 net/sunrpc/rpc_pipe.c              | 7 ++++---
 4 files changed, 12 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index 86147b0ab2cf..fae0d3e52b44 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -101,7 +101,7 @@ static void idmap_pipe_destroy_msg(struct rpc_pipe_msg *);
 
 static unsigned int fnvhash32(const void *, size_t);
 
-static struct rpc_pipe_ops idmap_upcall_ops = {
+static const struct rpc_pipe_ops idmap_upcall_ops = {
 	.upcall		= idmap_pipe_upcall,
 	.downcall	= idmap_pipe_downcall,
 	.destroy_msg	= idmap_pipe_destroy_msg,
diff --git a/include/linux/sunrpc/rpc_pipe_fs.h b/include/linux/sunrpc/rpc_pipe_fs.h
index cea764c2359f..91f5b13389c5 100644
--- a/include/linux/sunrpc/rpc_pipe_fs.h
+++ b/include/linux/sunrpc/rpc_pipe_fs.h
@@ -32,8 +32,8 @@ struct rpc_inode {
 	wait_queue_head_t waitq;
 #define RPC_PIPE_WAIT_FOR_OPEN	1
 	int flags;
-	struct rpc_pipe_ops *ops;
 	struct delayed_work queue_timeout;
+	const struct rpc_pipe_ops *ops;
 };
 
 static inline struct rpc_inode *
@@ -46,7 +46,8 @@ extern int rpc_queue_upcall(struct inode *, struct rpc_pipe_msg *);
 
 extern struct dentry *rpc_mkdir(char *, struct rpc_clnt *);
 extern int rpc_rmdir(struct dentry *);
-extern struct dentry *rpc_mkpipe(struct dentry *, const char *, void *, struct rpc_pipe_ops *, int flags);
+extern struct dentry *rpc_mkpipe(struct dentry *, const char *, void *,
+				 const struct rpc_pipe_ops *, int flags);
 extern int rpc_unlink(struct dentry *);
 extern struct vfsmount *rpc_get_mount(void);
 extern void rpc_put_mount(void);
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 66d458fc6920..23eb3864ffc0 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -89,8 +89,8 @@ static struct rpc_wait_queue pipe_version_rpc_waitqueue;
 static DECLARE_WAIT_QUEUE_HEAD(pipe_version_waitqueue);
 
 static void gss_free_ctx(struct gss_cl_ctx *);
-static struct rpc_pipe_ops gss_upcall_ops_v0;
-static struct rpc_pipe_ops gss_upcall_ops_v1;
+static const struct rpc_pipe_ops gss_upcall_ops_v0;
+static const struct rpc_pipe_ops gss_upcall_ops_v1;
 
 static inline struct gss_cl_ctx *
 gss_get_ctx(struct gss_cl_ctx *ctx)
@@ -1507,7 +1507,7 @@ static const struct rpc_credops gss_nullops = {
 	.crunwrap_resp	= gss_unwrap_resp,
 };
 
-static struct rpc_pipe_ops gss_upcall_ops_v0 = {
+static const struct rpc_pipe_ops gss_upcall_ops_v0 = {
 	.upcall		= gss_pipe_upcall,
 	.downcall	= gss_pipe_downcall,
 	.destroy_msg	= gss_pipe_destroy_msg,
@@ -1515,7 +1515,7 @@ static struct rpc_pipe_ops gss_upcall_ops_v0 = {
 	.release_pipe	= gss_pipe_release,
 };
 
-static struct rpc_pipe_ops gss_upcall_ops_v1 = {
+static const struct rpc_pipe_ops gss_upcall_ops_v1 = {
 	.upcall		= gss_pipe_upcall,
 	.downcall	= gss_pipe_downcall,
 	.destroy_msg	= gss_pipe_destroy_msg,
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index 9ced0628d69c..f6f60f625e3f 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -125,7 +125,7 @@ static void
 rpc_close_pipes(struct inode *inode)
 {
 	struct rpc_inode *rpci = RPC_I(inode);
-	struct rpc_pipe_ops *ops;
+	const struct rpc_pipe_ops *ops;
 	int need_release;
 
 	mutex_lock(&inode->i_mutex);
@@ -776,8 +776,9 @@ rpc_rmdir(struct dentry *dentry)
  * The @private argument passed here will be available to all these methods
  * from the file pointer, via RPC_I(file->f_dentry->d_inode)->private.
  */
-struct dentry *
-rpc_mkpipe(struct dentry *parent, const char *name, void *private, struct rpc_pipe_ops *ops, int flags)
+struct dentry *rpc_mkpipe(struct dentry *parent, const char *name,
+			  void *private, const struct rpc_pipe_ops *ops,
+			  int flags)
 {
 	struct dentry *dentry;
 	struct inode *dir, *inode;
-- 
cgit v1.2.3-70-g09d2


From 7d217caca5d704e48aa5e59aba0b3ad4c7af4fd2 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sun, 9 Aug 2009 15:14:24 -0400
Subject: SUNRPC: Replace rpc_client->cl_dentry and cl_mnt, with a cl_path

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/idmap.c                 |  4 ++--
 include/linux/sunrpc/clnt.h    |  4 ++--
 net/sunrpc/auth_gss/auth_gss.c |  4 ++--
 net/sunrpc/clnt.c              | 24 ++++++++++++------------
 4 files changed, 18 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index fae0d3e52b44..21a84d45916f 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -119,8 +119,8 @@ nfs_idmap_new(struct nfs_client *clp)
 	if (idmap == NULL)
 		return -ENOMEM;
 
-	idmap->idmap_dentry = rpc_mkpipe(clp->cl_rpcclient->cl_dentry, "idmap",
-					 idmap, &idmap_upcall_ops, 0);
+	idmap->idmap_dentry = rpc_mkpipe(clp->cl_rpcclient->cl_path.dentry,
+			"idmap", idmap, &idmap_upcall_ops, 0);
 	if (IS_ERR(idmap->idmap_dentry)) {
 		error = PTR_ERR(idmap->idmap_dentry);
 		kfree(idmap);
diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index 37881f1a0bd7..38ad162330a2 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -17,6 +17,7 @@
 #include <linux/sunrpc/xdr.h>
 #include <linux/sunrpc/timer.h>
 #include <asm/signal.h>
+#include <linux/path.h>
 
 struct rpc_inode;
 
@@ -51,8 +52,7 @@ struct rpc_clnt {
 	int			cl_nodelen;	/* nodename length */
 	char 			cl_nodename[UNX_MAXNODENAME];
 	char			cl_pathname[30];/* Path in rpc_pipe_fs */
-	struct vfsmount *	cl_vfsmnt;
-	struct dentry *		cl_dentry;	/* inode */
+	struct path		cl_path;
 	struct rpc_clnt *	cl_parent;	/* Points to parent of clones */
 	struct rpc_rtt		cl_rtt_default;
 	struct rpc_timeout	cl_timeout_default;
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 23eb3864ffc0..fc6a43ccd950 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -777,7 +777,7 @@ gss_create(struct rpc_clnt *clnt, rpc_authflavor_t flavor)
 	 * that we supported only the old pipe.  So we instead create
 	 * the new pipe first.
 	 */
-	gss_auth->dentry[1] = rpc_mkpipe(clnt->cl_dentry,
+	gss_auth->dentry[1] = rpc_mkpipe(clnt->cl_path.dentry,
 					 "gssd",
 					 clnt, &gss_upcall_ops_v1,
 					 RPC_PIPE_WAIT_FOR_OPEN);
@@ -786,7 +786,7 @@ gss_create(struct rpc_clnt *clnt, rpc_authflavor_t flavor)
 		goto err_put_mech;
 	}
 
-	gss_auth->dentry[0] = rpc_mkpipe(clnt->cl_dentry,
+	gss_auth->dentry[0] = rpc_mkpipe(clnt->cl_path.dentry,
 					 gss_auth->mech->gm_name,
 					 clnt, &gss_upcall_ops_v0,
 					 RPC_PIPE_WAIT_FOR_OPEN);
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 6ec37701a165..b3f863346300 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -99,24 +99,24 @@ rpc_setup_pipedir(struct rpc_clnt *clnt, char *dir_name)
 	static uint32_t clntid;
 	int error;
 
-	clnt->cl_vfsmnt = ERR_PTR(-ENOENT);
-	clnt->cl_dentry = ERR_PTR(-ENOENT);
+	clnt->cl_path.mnt = ERR_PTR(-ENOENT);
+	clnt->cl_path.dentry = ERR_PTR(-ENOENT);
 	if (dir_name == NULL)
 		return 0;
 
-	clnt->cl_vfsmnt = rpc_get_mount();
-	if (IS_ERR(clnt->cl_vfsmnt))
-		return PTR_ERR(clnt->cl_vfsmnt);
+	clnt->cl_path.mnt = rpc_get_mount();
+	if (IS_ERR(clnt->cl_path.mnt))
+		return PTR_ERR(clnt->cl_path.mnt);
 
 	for (;;) {
 		snprintf(clnt->cl_pathname, sizeof(clnt->cl_pathname),
 				"%s/clnt%x", dir_name,
 				(unsigned int)clntid++);
 		clnt->cl_pathname[sizeof(clnt->cl_pathname) - 1] = '\0';
-		clnt->cl_dentry = rpc_create_client_dir(clnt->cl_pathname, clnt);
-		if (!IS_ERR(clnt->cl_dentry))
+		clnt->cl_path.dentry = rpc_create_client_dir(clnt->cl_pathname, clnt);
+		if (!IS_ERR(clnt->cl_path.dentry))
 			return 0;
-		error = PTR_ERR(clnt->cl_dentry);
+		error = PTR_ERR(clnt->cl_path.dentry);
 		if (error != -EEXIST) {
 			printk(KERN_INFO "RPC: Couldn't create pipefs entry %s, error %d\n",
 					clnt->cl_pathname, error);
@@ -231,8 +231,8 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, stru
 	return clnt;
 
 out_no_auth:
-	if (!IS_ERR(clnt->cl_dentry)) {
-		rpc_remove_client_dir(clnt->cl_dentry);
+	if (!IS_ERR(clnt->cl_path.dentry)) {
+		rpc_remove_client_dir(clnt->cl_path.dentry);
 		rpc_put_mount();
 	}
 out_no_path:
@@ -423,8 +423,8 @@ rpc_free_client(struct kref *kref)
 
 	dprintk("RPC:       destroying %s client for %s\n",
 			clnt->cl_protname, clnt->cl_server);
-	if (!IS_ERR(clnt->cl_dentry)) {
-		rpc_remove_client_dir(clnt->cl_dentry);
+	if (!IS_ERR(clnt->cl_path.dentry)) {
+		rpc_remove_client_dir(clnt->cl_path.dentry);
 		rpc_put_mount();
 	}
 	if (clnt->cl_parent != clnt) {
-- 
cgit v1.2.3-70-g09d2


From 2da8ca26c6bfad685bfddf39728eac1c83906aa9 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sun, 9 Aug 2009 15:14:26 -0400
Subject: NFSD: Clean up the idmapper warning...

What part of 'internal use' is so hard to understand?

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfsd/nfs4idmap.c          | 4 ++--
 include/linux/sunrpc/cache.h | 3 ++-
 net/sunrpc/cache.c           | 2 +-
 3 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index 5b398421b051..e9012ad36ac0 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -175,10 +175,10 @@ idtoname_show(struct seq_file *m, struct cache_detail *cd, struct cache_head *h)
 }
 
 static void
-warn_no_idmapd(struct cache_detail *detail)
+warn_no_idmapd(struct cache_detail *detail, int has_died)
 {
 	printk("nfsd: nfsv4 idmapping failing: has idmapd %s?\n",
-			detail->last_close? "died" : "not been started");
+			has_died ? "died" : "not been started");
 }
 
 
diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h
index 2d8b211b9324..3d1fad22185a 100644
--- a/include/linux/sunrpc/cache.h
+++ b/include/linux/sunrpc/cache.h
@@ -79,6 +79,8 @@ struct cache_detail {
 	int			(*cache_show)(struct seq_file *m,
 					      struct cache_detail *cd,
 					      struct cache_head *h);
+	void			(*warn_no_listener)(struct cache_detail *cd,
+					      int has_died);
 
 	struct cache_head *	(*alloc)(void);
 	int			(*match)(struct cache_head *orig, struct cache_head *new);
@@ -102,7 +104,6 @@ struct cache_detail {
 	atomic_t		readers;		/* how many time is /chennel open */
 	time_t			last_close;		/* if no readers, when did last close */
 	time_t			last_warn;		/* when we last warned about no readers */
-	void			(*warn_no_listener)(struct cache_detail *cd);
 };
 
 
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index ff0c23053d2f..8ede4a6f384b 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -1020,7 +1020,7 @@ static void warn_no_listener(struct cache_detail *detail)
 	if (detail->last_warn != detail->last_close) {
 		detail->last_warn = detail->last_close;
 		if (detail->warn_no_listener)
-			detail->warn_no_listener(detail);
+			detail->warn_no_listener(detail, detail->last_close != 0);
 	}
 }
 
-- 
cgit v1.2.3-70-g09d2


From bc74b4f5e63a09fb78e245794a0de1e5a2716bbe Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sun, 9 Aug 2009 15:14:29 -0400
Subject: SUNRPC: Allow the cache_detail to specify alternative upcall
 mechanisms

For events that are rare, such as referral DNS lookups, it makes limited
sense to have a daemon constantly listening for upcalls on a channel. An
alternative in those cases might simply be to run the app that fills the
cache using call_usermodehelper_exec() and friends.

The following patch allows the cache_detail to specify alternative upcall
mechanisms for these particular cases.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfsd/export.c                  | 14 ++++++++++++--
 fs/nfsd/nfs4idmap.c               | 16 ++++++++++++++--
 include/linux/sunrpc/cache.h      | 13 ++++++++++---
 net/sunrpc/auth_gss/svcauth_gss.c |  7 ++++++-
 net/sunrpc/cache.c                | 26 ++++++++++++++++++--------
 net/sunrpc/svcauth_unix.c         | 14 ++++++++++++--
 6 files changed, 72 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index b92a27629fb7..d9462643155c 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -85,6 +85,11 @@ static void expkey_request(struct cache_detail *cd,
 	(*bpp)[-1] = '\n';
 }
 
+static int expkey_upcall(struct cache_detail *cd, struct cache_head *h)
+{
+	return sunrpc_cache_pipe_upcall(cd, h, expkey_request);
+}
+
 static struct svc_expkey *svc_expkey_update(struct svc_expkey *new, struct svc_expkey *old);
 static struct svc_expkey *svc_expkey_lookup(struct svc_expkey *);
 static struct cache_detail svc_expkey_cache;
@@ -259,7 +264,7 @@ static struct cache_detail svc_expkey_cache = {
 	.hash_table	= expkey_table,
 	.name		= "nfsd.fh",
 	.cache_put	= expkey_put,
-	.cache_request	= expkey_request,
+	.cache_upcall	= expkey_upcall,
 	.cache_parse	= expkey_parse,
 	.cache_show	= expkey_show,
 	.match		= expkey_match,
@@ -355,6 +360,11 @@ static void svc_export_request(struct cache_detail *cd,
 	(*bpp)[-1] = '\n';
 }
 
+static int svc_export_upcall(struct cache_detail *cd, struct cache_head *h)
+{
+	return sunrpc_cache_pipe_upcall(cd, h, svc_export_request);
+}
+
 static struct svc_export *svc_export_update(struct svc_export *new,
 					    struct svc_export *old);
 static struct svc_export *svc_export_lookup(struct svc_export *);
@@ -724,7 +734,7 @@ struct cache_detail svc_export_cache = {
 	.hash_table	= export_table,
 	.name		= "nfsd.export",
 	.cache_put	= svc_export_put,
-	.cache_request	= svc_export_request,
+	.cache_upcall	= svc_export_upcall,
 	.cache_parse	= svc_export_parse,
 	.cache_show	= svc_export_show,
 	.match		= svc_export_match,
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index e9012ad36ac0..cdfa86fa1471 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -145,6 +145,12 @@ idtoname_request(struct cache_detail *cd, struct cache_head *ch, char **bpp,
 	(*bpp)[-1] = '\n';
 }
 
+static int
+idtoname_upcall(struct cache_detail *cd, struct cache_head *ch)
+{
+	return sunrpc_cache_pipe_upcall(cd, ch, idtoname_request);
+}
+
 static int
 idtoname_match(struct cache_head *ca, struct cache_head *cb)
 {
@@ -192,7 +198,7 @@ static struct cache_detail idtoname_cache = {
 	.hash_table	= idtoname_table,
 	.name		= "nfs4.idtoname",
 	.cache_put	= ent_put,
-	.cache_request	= idtoname_request,
+	.cache_upcall	= idtoname_upcall,
 	.cache_parse	= idtoname_parse,
 	.cache_show	= idtoname_show,
 	.warn_no_listener = warn_no_idmapd,
@@ -324,6 +330,12 @@ nametoid_request(struct cache_detail *cd, struct cache_head *ch, char **bpp,
 	(*bpp)[-1] = '\n';
 }
 
+static int
+nametoid_upcall(struct cache_detail *cd, struct cache_head *ch)
+{
+	return sunrpc_cache_pipe_upcall(cd, ch, nametoid_request);
+}
+
 static int
 nametoid_match(struct cache_head *ca, struct cache_head *cb)
 {
@@ -363,7 +375,7 @@ static struct cache_detail nametoid_cache = {
 	.hash_table	= nametoid_table,
 	.name		= "nfs4.nametoid",
 	.cache_put	= ent_put,
-	.cache_request	= nametoid_request,
+	.cache_upcall	= nametoid_upcall,
 	.cache_parse	= nametoid_parse,
 	.cache_show	= nametoid_show,
 	.warn_no_listener = warn_no_idmapd,
diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h
index 3d1fad22185a..23ee25981a0c 100644
--- a/include/linux/sunrpc/cache.h
+++ b/include/linux/sunrpc/cache.h
@@ -70,9 +70,9 @@ struct cache_detail {
 	char			*name;
 	void			(*cache_put)(struct kref *);
 
-	void			(*cache_request)(struct cache_detail *cd,
-						 struct cache_head *h,
-						 char **bpp, int *blen);
+	int			(*cache_upcall)(struct cache_detail *,
+						struct cache_head *);
+
 	int			(*cache_parse)(struct cache_detail *,
 					       char *buf, int len);
 
@@ -135,6 +135,13 @@ extern struct cache_head *
 sunrpc_cache_update(struct cache_detail *detail,
 		    struct cache_head *new, struct cache_head *old, int hash);
 
+extern int
+sunrpc_cache_pipe_upcall(struct cache_detail *detail, struct cache_head *h,
+		void (*cache_request)(struct cache_detail *,
+				      struct cache_head *,
+				      char **,
+				      int *));
+
 
 extern void cache_clean_deferred(void *owner);
 
diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
index 2278a50c6444..2e6a148d277c 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -181,6 +181,11 @@ static void rsi_request(struct cache_detail *cd,
 	(*bpp)[-1] = '\n';
 }
 
+static int rsi_upcall(struct cache_detail *cd, struct cache_head *h)
+{
+	return sunrpc_cache_pipe_upcall(cd, h, rsi_request);
+}
+
 
 static int rsi_parse(struct cache_detail *cd,
 		    char *mesg, int mlen)
@@ -270,7 +275,7 @@ static struct cache_detail rsi_cache = {
 	.hash_table     = rsi_table,
 	.name           = "auth.rpcsec.init",
 	.cache_put      = rsi_put,
-	.cache_request  = rsi_request,
+	.cache_upcall   = rsi_upcall,
 	.cache_parse    = rsi_parse,
 	.match		= rsi_match,
 	.init		= rsi_init,
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index c8e7d2d07260..e438352bed7b 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -176,7 +176,13 @@ struct cache_head *sunrpc_cache_update(struct cache_detail *detail,
 }
 EXPORT_SYMBOL_GPL(sunrpc_cache_update);
 
-static int cache_make_upcall(struct cache_detail *detail, struct cache_head *h);
+static int cache_make_upcall(struct cache_detail *cd, struct cache_head *h)
+{
+	if (!cd->cache_upcall)
+		return -EINVAL;
+	return cd->cache_upcall(cd, h);
+}
+
 /*
  * This is the generic cache management routine for all
  * the authentication caches.
@@ -322,7 +328,7 @@ static int create_cache_proc_entries(struct cache_detail *cd)
 	if (p == NULL)
 		goto out_nomem;
 
-	if (cd->cache_request || cd->cache_parse) {
+	if (cd->cache_upcall || cd->cache_parse) {
 		p = proc_create_data("channel", S_IFREG|S_IRUSR|S_IWUSR,
 				     cd->proc_ent, &cache_file_operations, cd);
 		cd->channel_ent = p;
@@ -1080,10 +1086,16 @@ static void warn_no_listener(struct cache_detail *detail)
 }
 
 /*
- * register an upcall request to user-space.
+ * register an upcall request to user-space and queue it up for read() by the
+ * upcall daemon.
+ *
  * Each request is at most one page long.
  */
-static int cache_make_upcall(struct cache_detail *detail, struct cache_head *h)
+int sunrpc_cache_pipe_upcall(struct cache_detail *detail, struct cache_head *h,
+		void (*cache_request)(struct cache_detail *,
+				      struct cache_head *,
+				      char **,
+				      int *))
 {
 
 	char *buf;
@@ -1091,9 +1103,6 @@ static int cache_make_upcall(struct cache_detail *detail, struct cache_head *h)
 	char *bp;
 	int len;
 
-	if (detail->cache_request == NULL)
-		return -EINVAL;
-
 	if (atomic_read(&detail->readers) == 0 &&
 	    detail->last_close < get_seconds() - 30) {
 			warn_no_listener(detail);
@@ -1112,7 +1121,7 @@ static int cache_make_upcall(struct cache_detail *detail, struct cache_head *h)
 
 	bp = buf; len = PAGE_SIZE;
 
-	detail->cache_request(detail, h, &bp, &len);
+	cache_request(detail, h, &bp, &len);
 
 	if (len < 0) {
 		kfree(buf);
@@ -1130,6 +1139,7 @@ static int cache_make_upcall(struct cache_detail *detail, struct cache_head *h)
 	wake_up(&queue_wait);
 	return 0;
 }
+EXPORT_SYMBOL_GPL(sunrpc_cache_pipe_upcall);
 
 /*
  * parse a message from user-space and pass it
diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c
index 5c865e2d299e..6caffa34ac01 100644
--- a/net/sunrpc/svcauth_unix.c
+++ b/net/sunrpc/svcauth_unix.c
@@ -171,6 +171,11 @@ static void ip_map_request(struct cache_detail *cd,
 	(*bpp)[-1] = '\n';
 }
 
+static int ip_map_upcall(struct cache_detail *cd, struct cache_head *h)
+{
+	return sunrpc_cache_pipe_upcall(cd, h, ip_map_request);
+}
+
 static struct ip_map *ip_map_lookup(char *class, struct in6_addr *addr);
 static int ip_map_update(struct ip_map *ipm, struct unix_domain *udom, time_t expiry);
 
@@ -289,7 +294,7 @@ struct cache_detail ip_map_cache = {
 	.hash_table	= ip_table,
 	.name		= "auth.unix.ip",
 	.cache_put	= ip_map_put,
-	.cache_request	= ip_map_request,
+	.cache_upcall	= ip_map_upcall,
 	.cache_parse	= ip_map_parse,
 	.cache_show	= ip_map_show,
 	.match		= ip_map_match,
@@ -523,6 +528,11 @@ static void unix_gid_request(struct cache_detail *cd,
 	(*bpp)[-1] = '\n';
 }
 
+static int unix_gid_upcall(struct cache_detail *cd, struct cache_head *h)
+{
+	return sunrpc_cache_pipe_upcall(cd, h, unix_gid_request);
+}
+
 static struct unix_gid *unix_gid_lookup(uid_t uid);
 extern struct cache_detail unix_gid_cache;
 
@@ -622,7 +632,7 @@ struct cache_detail unix_gid_cache = {
 	.hash_table	= gid_table,
 	.name		= "auth.unix.gid",
 	.cache_put	= unix_gid_put,
-	.cache_request	= unix_gid_request,
+	.cache_upcall	= unix_gid_upcall,
 	.cache_parse	= unix_gid_parse,
 	.cache_show	= unix_gid_show,
 	.match		= unix_gid_match,
-- 
cgit v1.2.3-70-g09d2


From c333e073b7bf76dc819a6b4ce6bef88ee5fa5e50 Mon Sep 17 00:00:00 2001
From: Roel Kluin <roel.kluin@gmail.com>
Date: Mon, 10 Aug 2009 22:47:22 -0400
Subject: ext4: remove redundant test on unsigned

unsigned i_block cannot be less than 0.

Signed-off-by: Roel Kluin <roel.kluin@gmail.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index deb14a728791..9a4c929b16dc 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -341,9 +341,7 @@ static int ext4_block_to_path(struct inode *inode,
 	int n = 0;
 	int final = 0;
 
-	if (i_block < 0) {
-		ext4_warning(inode->i_sb, "ext4_block_to_path", "block < 0");
-	} else if (i_block < direct_blocks) {
+	if (i_block < direct_blocks) {
 		offsets[n++] = i_block;
 		final = direct_blocks;
 	} else if ((i_block -= direct_blocks) < indirect_blocks) {
-- 
cgit v1.2.3-70-g09d2


From b1f485f20eb9b02cc7d2009556287f3939d480cc Mon Sep 17 00:00:00 2001
From: Andreas Dilger <adilger@sun.com>
Date: Mon, 10 Aug 2009 22:51:53 -0400
Subject: jbd2: round commit timer up to avoid uncommitted transaction

fix jiffie rounding in jbd commit timer setup code.  Rounding down
could cause the timer to be fired before the corresponding transaction
has expired.  That transaction can stay not committed forever if no
new transaction is created or expicit sync/umount happens.

Signed-off-by: Alex Zhuravlev (Tomas) <alex.zhuravlev@sun.com>
Signed-off-by: Andreas Dilger <adilger@sun.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/jbd2/transaction.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 6213ac728f30..4516a87ca55c 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -57,7 +57,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
 	INIT_LIST_HEAD(&transaction->t_private_list);
 
 	/* Set up the commit timer for the new transaction. */
-	journal->j_commit_timer.expires = round_jiffies(transaction->t_expires);
+	journal->j_commit_timer.expires = round_jiffies_up(transaction->t_expires);
 	add_timer(&journal->j_commit_timer);
 
 	J_ASSERT(journal->j_running_transaction == NULL);
-- 
cgit v1.2.3-70-g09d2


From 91cc219ad963731191247c5f2db4118be2bc341a Mon Sep 17 00:00:00 2001
From: Peng Tao <bergwolf@gmail.com>
Date: Mon, 10 Aug 2009 23:05:28 -0400
Subject: ext4: fix journal ref count in move_extent_par_page

move_extent_par_page calls a_ops->write_begin() to increase journal
handler's reference count. However, if either mext_replace_branches()
or ext4_get_block fails, the increased reference count isn't
decreased. This will cause a later attempt to umount of the fs to hang
forever. The patch addresses the issue by calling ext4_journal_stop()
if page is not NULL (which means a_ops->write_end() isn't invoked).

Signed-off-by: Peng Tao <bergwolf@gmail.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/move_extent.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index bbf2dd9404dc..5821e0bee917 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -871,6 +871,7 @@ out:
 		if (PageLocked(page))
 			unlock_page(page);
 		page_cache_release(page);
+		ext4_journal_stop(handle);
 	}
 out2:
 	ext4_journal_stop(handle);
-- 
cgit v1.2.3-70-g09d2


From 6ba495e9259cd9a0b40ebd6c315143535c92542f Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Fri, 18 Sep 2009 13:38:55 -0400
Subject: ext4: Add configurable run-time mballoc debugging

Allow mballoc debugging to be enabled at run-time instead of just at
compile time.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/Kconfig   |  9 +++++++
 fs/ext4/mballoc.c | 81 ++++++++++++++++++++++++++++++++++++++++---------------
 fs/ext4/mballoc.h | 16 ++++++++---
 3 files changed, 80 insertions(+), 26 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index 152304624644..d5c0ea2e8f2d 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -77,3 +77,12 @@ config EXT4_FS_SECURITY
 
 	  If you are not using a security module that requires using
 	  extended attributes for file security labels, say N.
+
+config EXT4_DEBUG
+	bool "EXT4 debugging support"
+	depends on EXT4_FS
+	help
+	  Enables run-time debugging support for the ext4 filesystem.
+
+	  If you select Y here, then you will be able to turn on debugging
+	  with a command such as "echo 1 > /sys/kernel/debug/ext4/mballoc-debug"
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 68cde598b3bf..d80b300c1779 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -22,6 +22,7 @@
  */
 
 #include "mballoc.h"
+#include <linux/debugfs.h>
 #include <trace/events/ext4.h>
 
 /*
@@ -743,7 +744,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
 	char *data;
 	char *bitmap;
 
-	mb_debug("init page %lu\n", page->index);
+	mb_debug(1, "init page %lu\n", page->index);
 
 	inode = page->mapping->host;
 	sb = inode->i_sb;
@@ -822,7 +823,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
 		set_bitmap_uptodate(bh[i]);
 		bh[i]->b_end_io = end_buffer_read_sync;
 		submit_bh(READ, bh[i]);
-		mb_debug("read bitmap for group %u\n", first_group + i);
+		mb_debug(1, "read bitmap for group %u\n", first_group + i);
 	}
 
 	/* wait for I/O completion */
@@ -862,7 +863,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
 		if ((first_block + i) & 1) {
 			/* this is block of buddy */
 			BUG_ON(incore == NULL);
-			mb_debug("put buddy for group %u in page %lu/%x\n",
+			mb_debug(1, "put buddy for group %u in page %lu/%x\n",
 				group, page->index, i * blocksize);
 			grinfo = ext4_get_group_info(sb, group);
 			grinfo->bb_fragments = 0;
@@ -878,7 +879,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
 		} else {
 			/* this is block of bitmap */
 			BUG_ON(incore != NULL);
-			mb_debug("put bitmap for group %u in page %lu/%x\n",
+			mb_debug(1, "put bitmap for group %u in page %lu/%x\n",
 				group, page->index, i * blocksize);
 
 			/* see comments in ext4_mb_put_pa() */
@@ -922,7 +923,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct inode *inode = sbi->s_buddy_cache;
 
-	mb_debug("load group %u\n", group);
+	mb_debug(1, "load group %u\n", group);
 
 	blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
 	grp = ext4_get_group_info(sb, group);
@@ -1851,7 +1852,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
 	struct inode *inode = sbi->s_buddy_cache;
 	struct page *page = NULL, *bitmap_page = NULL;
 
-	mb_debug("init group %u\n", group);
+	mb_debug(1, "init group %u\n", group);
 	blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
 	this_grp = ext4_get_group_info(sb, group);
 	/*
@@ -2739,7 +2740,7 @@ static void ext4_mb_cleanup_pa(struct ext4_group_info *grp)
 		kmem_cache_free(ext4_pspace_cachep, pa);
 	}
 	if (count)
-		mb_debug("mballoc: %u PAs left\n", count);
+		mb_debug(1, "mballoc: %u PAs left\n", count);
 
 }
 
@@ -2820,7 +2821,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
 	list_for_each_safe(l, ltmp, &txn->t_private_list) {
 		entry = list_entry(l, struct ext4_free_data, list);
 
-		mb_debug("gonna free %u blocks in group %u (0x%p):",
+		mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
 			 entry->count, entry->group, entry);
 
 		err = ext4_mb_load_buddy(sb, entry->group, &e4b);
@@ -2855,9 +2856,43 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
 		ext4_mb_release_desc(&e4b);
 	}
 
-	mb_debug("freed %u blocks in %u structures\n", count, count2);
+	mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
 }
 
+#ifdef CONFIG_EXT4_DEBUG
+u8 mb_enable_debug __read_mostly;
+
+static struct dentry *debugfs_dir;
+static struct dentry *debugfs_debug;
+
+static void __init ext4_create_debugfs_entry(void)
+{
+	debugfs_dir = debugfs_create_dir("ext4", NULL);
+	if (debugfs_dir)
+		debugfs_debug = debugfs_create_u8("mballoc-debug",
+						  S_IRUGO | S_IWUSR,
+						  debugfs_dir,
+						  &mb_enable_debug);
+}
+
+static void ext4_remove_debugfs_entry(void)
+{
+	debugfs_remove(debugfs_debug);
+	debugfs_remove(debugfs_dir);
+}
+
+#else
+
+static void __init ext4_create_debugfs_entry(void)
+{
+}
+
+static void ext4_remove_debugfs_entry(void)
+{
+}
+
+#endif
+
 int __init init_ext4_mballoc(void)
 {
 	ext4_pspace_cachep =
@@ -2885,6 +2920,7 @@ int __init init_ext4_mballoc(void)
 		kmem_cache_destroy(ext4_ac_cachep);
 		return -ENOMEM;
 	}
+	ext4_create_debugfs_entry();
 	return 0;
 }
 
@@ -2898,6 +2934,7 @@ void exit_ext4_mballoc(void)
 	kmem_cache_destroy(ext4_pspace_cachep);
 	kmem_cache_destroy(ext4_ac_cachep);
 	kmem_cache_destroy(ext4_free_ext_cachep);
+	ext4_remove_debugfs_entry();
 }
 
 
@@ -3042,7 +3079,7 @@ static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac)
 		ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_stripe;
 	else
 		ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc;
-	mb_debug("#%u: goal %u blocks for locality group\n",
+	mb_debug(1, "#%u: goal %u blocks for locality group\n",
 		current->pid, ac->ac_g_ex.fe_len);
 }
 
@@ -3232,7 +3269,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
 		ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
 	}
 
-	mb_debug("goal: %u(was %u) blocks at %u\n", (unsigned) size,
+	mb_debug(1, "goal: %u(was %u) blocks at %u\n", (unsigned) size,
 		(unsigned) orig_size, (unsigned) start);
 }
 
@@ -3281,7 +3318,7 @@ static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
 	BUG_ON(pa->pa_free < len);
 	pa->pa_free -= len;
 
-	mb_debug("use %llu/%u from inode pa %p\n", start, len, pa);
+	mb_debug(1, "use %llu/%u from inode pa %p\n", start, len, pa);
 }
 
 /*
@@ -3305,7 +3342,7 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
 	 * in on-disk bitmap -- see ext4_mb_release_context()
 	 * Other CPUs are prevented from allocating from this pa by lg_mutex
 	 */
-	mb_debug("use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa);
+	mb_debug(1, "use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa);
 }
 
 /*
@@ -3484,7 +3521,7 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
 		preallocated += len;
 		count++;
 	}
-	mb_debug("prellocated %u for group %u\n", preallocated, group);
+	mb_debug(1, "prellocated %u for group %u\n", preallocated, group);
 }
 
 static void ext4_mb_pa_callback(struct rcu_head *head)
@@ -3619,7 +3656,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
 	pa->pa_deleted = 0;
 	pa->pa_type = MB_INODE_PA;
 
-	mb_debug("new inode pa %p: %llu/%u for %u\n", pa,
+	mb_debug(1, "new inode pa %p: %llu/%u for %u\n", pa,
 			pa->pa_pstart, pa->pa_len, pa->pa_lstart);
 	trace_ext4_mb_new_inode_pa(ac, pa);
 
@@ -3679,7 +3716,7 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
 	pa->pa_deleted = 0;
 	pa->pa_type = MB_GROUP_PA;
 
-	mb_debug("new group pa %p: %llu/%u for %u\n", pa,
+	mb_debug(1, "new group pa %p: %llu/%u for %u\n", pa,
 			pa->pa_pstart, pa->pa_len, pa->pa_lstart);
 	trace_ext4_mb_new_group_pa(ac, pa);
 
@@ -3758,7 +3795,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
 		next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
 		start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit +
 				le32_to_cpu(sbi->s_es->s_first_data_block);
-		mb_debug("    free preallocated %u/%u in group %u\n",
+		mb_debug(1, "    free preallocated %u/%u in group %u\n",
 				(unsigned) start, (unsigned) next - bit,
 				(unsigned) group);
 		free += next - bit;
@@ -3849,7 +3886,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
 	int busy = 0;
 	int free = 0;
 
-	mb_debug("discard preallocation for group %u\n", group);
+	mb_debug(1, "discard preallocation for group %u\n", group);
 
 	if (list_empty(&grp->bb_prealloc_list))
 		return 0;
@@ -3973,7 +4010,7 @@ void ext4_discard_preallocations(struct inode *inode)
 		return;
 	}
 
-	mb_debug("discard preallocation for inode %lu\n", inode->i_ino);
+	mb_debug(1, "discard preallocation for inode %lu\n", inode->i_ino);
 	trace_ext4_discard_preallocations(inode);
 
 	INIT_LIST_HEAD(&list);
@@ -4078,7 +4115,7 @@ static void ext4_mb_return_to_preallocation(struct inode *inode,
 {
 	BUG_ON(!list_empty(&EXT4_I(inode)->i_prealloc_list));
 }
-#ifdef MB_DEBUG
+#ifdef CONFIG_EXT4_DEBUG
 static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
 {
 	struct super_block *sb = ac->ac_sb;
@@ -4227,7 +4264,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
 	 * locality group. this is a policy, actually */
 	ext4_mb_group_or_file(ac);
 
-	mb_debug("init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, "
+	mb_debug(1, "init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, "
 			"left: %u/%u, right %u/%u to %swritable\n",
 			(unsigned) ar->len, (unsigned) ar->logical,
 			(unsigned) ar->goal, ac->ac_flags, ac->ac_2order,
@@ -4249,7 +4286,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
 	struct ext4_prealloc_space *pa, *tmp;
 	struct ext4_allocation_context *ac;
 
-	mb_debug("discard locality group preallocation\n");
+	mb_debug(1, "discard locality group preallocation\n");
 
 	INIT_LIST_HEAD(&discard_list);
 	ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index c96bb19f58f9..9db890d4d275 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -37,11 +37,19 @@
 
 /*
  */
-#define MB_DEBUG__
-#ifdef MB_DEBUG
-#define mb_debug(fmt, a...)	printk(fmt, ##a)
+#ifdef CONFIG_EXT4_DEBUG
+extern u8 mb_enable_debug;
+
+#define mb_debug(n, fmt, a...)	                                        \
+	do {								\
+		if ((n) <= mb_enable_debug) {		        	\
+			printk(KERN_DEBUG "(%s, %d): %s: ",		\
+			       __FILE__, __LINE__, __func__);		\
+			printk(fmt, ## a);				\
+		}							\
+	} while (0)
 #else
-#define mb_debug(fmt, a...)
+#define mb_debug(n, fmt, a...)
 #endif
 
 /*
-- 
cgit v1.2.3-70-g09d2


From 0ef90db93a4ddfc300af288c2a1bfc1e6c79da64 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sun, 9 Aug 2009 16:46:13 -0400
Subject: ext4: Display the mballoc flags in mb_history in hex instead of
 decimal

Displaying the flags in base 16 makes it easier to see which flags
have been set.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h    | 22 +++++++++++-----------
 fs/ext4/mballoc.c |  4 ++--
 2 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 9714db393efe..e267727cc62d 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -67,27 +67,27 @@ typedef unsigned int ext4_group_t;
 
 
 /* prefer goal again. length */
-#define EXT4_MB_HINT_MERGE		1
+#define EXT4_MB_HINT_MERGE		0x0001
 /* blocks already reserved */
-#define EXT4_MB_HINT_RESERVED		2
+#define EXT4_MB_HINT_RESERVED		0x0002
 /* metadata is being allocated */
-#define EXT4_MB_HINT_METADATA		4
+#define EXT4_MB_HINT_METADATA		0x0004
 /* first blocks in the file */
-#define EXT4_MB_HINT_FIRST		8
+#define EXT4_MB_HINT_FIRST		0x0008
 /* search for the best chunk */
-#define EXT4_MB_HINT_BEST		16
+#define EXT4_MB_HINT_BEST		0x0010
 /* data is being allocated */
-#define EXT4_MB_HINT_DATA		32
+#define EXT4_MB_HINT_DATA		0x0020
 /* don't preallocate (for tails) */
-#define EXT4_MB_HINT_NOPREALLOC		64
+#define EXT4_MB_HINT_NOPREALLOC		0x0040
 /* allocate for locality group */
-#define EXT4_MB_HINT_GROUP_ALLOC	128
+#define EXT4_MB_HINT_GROUP_ALLOC	0x0080
 /* allocate goal blocks or none */
-#define EXT4_MB_HINT_GOAL_ONLY		256
+#define EXT4_MB_HINT_GOAL_ONLY		0x0100
 /* goal is meaningful */
-#define EXT4_MB_HINT_TRY_GOAL		512
+#define EXT4_MB_HINT_TRY_GOAL		0x0200
 /* blocks already pre-reserved by delayed allocation */
-#define EXT4_MB_DELALLOC_RESERVED      1024
+#define EXT4_MB_DELALLOC_RESERVED	0x0400
 
 
 struct ext4_allocation_request {
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index d80b300c1779..3434c603432d 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2157,7 +2157,7 @@ static int ext4_mb_seq_history_show(struct seq_file *seq, void *v)
 
 	if (v == SEQ_START_TOKEN) {
 		seq_printf(seq, "%-5s %-8s %-23s %-23s %-23s %-5s "
-				"%-5s %-2s %-5s %-5s %-5s %-6s\n",
+				"%-5s %-2s %-6s %-5s %-5s %-6s\n",
 			  "pid", "inode", "original", "goal", "result", "found",
 			   "grps", "cr", "flags", "merge", "tail", "broken");
 		return 0;
@@ -2165,7 +2165,7 @@ static int ext4_mb_seq_history_show(struct seq_file *seq, void *v)
 
 	if (hs->op == EXT4_MB_HISTORY_ALLOC) {
 		fmt = "%-5u %-8u %-23s %-23s %-23s %-5u %-5u %-2u "
-			"%-5u %-5s %-5u %-6u\n";
+			"0x%04x %-5s %-5u %-6u\n";
 		sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group,
 			hs->result.fe_start, hs->result.fe_len,
 			hs->result.fe_logical);
-- 
cgit v1.2.3-70-g09d2


From 4ba74d00a20256e22f159cb288ff34b587608917 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sun, 9 Aug 2009 22:01:13 -0400
Subject: ext4: Fix bugs in mballoc's stream allocation mode

The logic around sbi->s_mb_last_group and sbi->s_mb_last_start was all
screwed up.  These fields were getting unconditionally all the time,
set even when stream allocation had not taken place, and if they were
being used when the file was smaller than s_mb_stream_request, which
is when the allocation should _not_ be doing stream allocation.

Fix this by determining whether or not we stream allocation should
take place once, in ext4_mb_group_or_file(), and setting a flag which
gets used in ext4_mb_regular_allocator() and ext4_mb_use_best_found().
This simplifies the code and assures that we are consistently using
(or not using) the stream allocation logic.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h    |  2 ++
 fs/ext4/mballoc.c | 23 ++++++++++-------------
 2 files changed, 12 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index e267727cc62d..70aa951ecb3c 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -88,6 +88,8 @@ typedef unsigned int ext4_group_t;
 #define EXT4_MB_HINT_TRY_GOAL		0x0200
 /* blocks already pre-reserved by delayed allocation */
 #define EXT4_MB_DELALLOC_RESERVED	0x0400
+/* We are doing stream allocation */
+#define EXT4_MB_STREAM_ALLOC		0x0800
 
 
 struct ext4_allocation_request {
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 3434c603432d..90a30ce822fc 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1361,7 +1361,7 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
 	ac->alloc_semp =  e4b->alloc_semp;
 	e4b->alloc_semp = NULL;
 	/* store last allocated for subsequent stream allocation */
-	if ((ac->ac_flags & EXT4_MB_HINT_DATA)) {
+	if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
 		spin_lock(&sbi->s_md_lock);
 		sbi->s_mb_last_group = ac->ac_f_ex.fe_group;
 		sbi->s_mb_last_start = ac->ac_f_ex.fe_start;
@@ -1939,7 +1939,6 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 	struct ext4_sb_info *sbi;
 	struct super_block *sb;
 	struct ext4_buddy e4b;
-	loff_t size, isize;
 
 	sb = ac->ac_sb;
 	sbi = EXT4_SB(sb);
@@ -1975,20 +1974,16 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 	}
 
 	bsbits = ac->ac_sb->s_blocksize_bits;
-	/* if stream allocation is enabled, use global goal */
-	size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
-	isize = i_size_read(ac->ac_inode) >> bsbits;
-	if (size < isize)
-		size = isize;
 
-	if (size < sbi->s_mb_stream_request &&
-			(ac->ac_flags & EXT4_MB_HINT_DATA)) {
+	/* if stream allocation is enabled, use global goal */
+	if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
 		/* TBD: may be hot point */
 		spin_lock(&sbi->s_md_lock);
 		ac->ac_g_ex.fe_group = sbi->s_mb_last_group;
 		ac->ac_g_ex.fe_start = sbi->s_mb_last_start;
 		spin_unlock(&sbi->s_md_lock);
 	}
+
 	/* Let's just scan groups to find more-less suitable blocks */
 	cr = ac->ac_2order ? 0 : 1;
 	/*
@@ -4192,16 +4187,18 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
 	if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
 		return;
 
+	if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
+		return;
+
 	size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
 	isize = i_size_read(ac->ac_inode) >> bsbits;
 	size = max(size, isize);
 
 	/* don't use group allocation for large files */
-	if (size >= sbi->s_mb_stream_request)
-		return;
-
-	if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
+	if (size >= sbi->s_mb_stream_request) {
+		ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
 		return;
+	}
 
 	BUG_ON(ac->ac_lg != NULL);
 	/*
-- 
cgit v1.2.3-70-g09d2


From d3c8660233d3f2801e14eaf722937ff4ed49bfb7 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Fri, 10 Jul 2009 03:27:38 +0200
Subject: mm_for_maps: shift down_read(mmap_sem) to the caller

mm_for_maps() takes ->mmap_sem after security checks, this looks
strange and obfuscates the locking rules. Move this lock to its
single caller, m_start().

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 fs/proc/base.c       | 8 +++-----
 fs/proc/task_mmu.c   | 1 +
 fs/proc/task_nommu.c | 1 +
 3 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 917f338a6739..f3c2e4085fed 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -235,9 +235,8 @@ static int check_mem_permission(struct task_struct *task)
 struct mm_struct *mm_for_maps(struct task_struct *task)
 {
 	struct mm_struct *mm = get_task_mm(task);
-	if (!mm)
-		return NULL;
-	if (mm != current->mm) {
+
+	if (mm && mm != current->mm) {
 		/*
 		 * task->mm can be changed before security check,
 		 * in that case we must notice the change after.
@@ -245,10 +244,9 @@ struct mm_struct *mm_for_maps(struct task_struct *task)
 		if (!ptrace_may_access(task, PTRACE_MODE_READ) ||
 		    mm != task->mm) {
 			mmput(mm);
-			return NULL;
+			mm = NULL;
 		}
 	}
-	down_read(&mm->mmap_sem);
 	return mm;
 }
 
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 6f61b7cc32e0..9bd8be1d235c 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -119,6 +119,7 @@ static void *m_start(struct seq_file *m, loff_t *pos)
 	mm = mm_for_maps(priv->task);
 	if (!mm)
 		return NULL;
+	down_read(&mm->mmap_sem);
 
 	tail_vma = get_gate_vma(priv->task);
 	priv->tail_vma = tail_vma;
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 64a72e2e7650..8f5c05d3dbd3 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -189,6 +189,7 @@ static void *m_start(struct seq_file *m, loff_t *pos)
 		priv->task = NULL;
 		return NULL;
 	}
+	down_read(&mm->mmap_sem);
 
 	/* start from the Nth VMA */
 	for (p = rb_first(&mm->mm_rb); p; p = rb_next(p))
-- 
cgit v1.2.3-70-g09d2


From 896a6de40ef3814525632609799af909338f50c3 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Fri, 10 Jul 2009 03:27:40 +0200
Subject: mm_for_maps: take ->cred_guard_mutex to fix the race with exec

The problem is minor, but without ->cred_guard_mutex held we can race
with exec() and get the new ->mm but check old creds.

Now we do not need to re-check task->mm after ptrace_may_access(), it
can't be changed to the new mm under us.

Strictly speaking, this also fixes another very minor problem. Unless
security check fails or the task exits mm_for_maps() should never
return NULL, the caller should get either old or new ->mm.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 fs/proc/base.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index f3c2e4085fed..175db258942f 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -234,19 +234,19 @@ static int check_mem_permission(struct task_struct *task)
 
 struct mm_struct *mm_for_maps(struct task_struct *task)
 {
-	struct mm_struct *mm = get_task_mm(task);
+	struct mm_struct *mm;
 
-	if (mm && mm != current->mm) {
-		/*
-		 * task->mm can be changed before security check,
-		 * in that case we must notice the change after.
-		 */
-		if (!ptrace_may_access(task, PTRACE_MODE_READ) ||
-		    mm != task->mm) {
-			mmput(mm);
-			mm = NULL;
-		}
+	if (mutex_lock_killable(&task->cred_guard_mutex))
+		return NULL;
+
+	mm = get_task_mm(task);
+	if (mm && mm != current->mm &&
+			!ptrace_may_access(task, PTRACE_MODE_READ)) {
+		mmput(mm);
+		mm = NULL;
 	}
+	mutex_unlock(&task->cred_guard_mutex);
+
 	return mm;
 }
 
-- 
cgit v1.2.3-70-g09d2


From 13f0feafa6b8aead57a2a328e2fca6a5828bf286 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Tue, 23 Jun 2009 21:25:32 +0200
Subject: mm_for_maps: simplify, use ptrace_may_access()

It would be nice to kill __ptrace_may_access(). It requires task_lock(),
but this lock is only needed to read mm->flags in the middle.

Convert mm_for_maps() to use ptrace_may_access(), this also simplifies
the code a little bit.

Also, we do not need to take ->mmap_sem in advance. In fact I think
mm_for_maps() should not play with ->mmap_sem at all, the caller should
take this lock.

With or without this patch, without ->cred_guard_mutex held we can race
with exec() and get the new ->mm but check old creds.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 fs/proc/base.c | 23 +++++++++++------------
 1 file changed, 11 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 3ce5ae9e3d2d..917f338a6739 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -237,20 +237,19 @@ struct mm_struct *mm_for_maps(struct task_struct *task)
 	struct mm_struct *mm = get_task_mm(task);
 	if (!mm)
 		return NULL;
+	if (mm != current->mm) {
+		/*
+		 * task->mm can be changed before security check,
+		 * in that case we must notice the change after.
+		 */
+		if (!ptrace_may_access(task, PTRACE_MODE_READ) ||
+		    mm != task->mm) {
+			mmput(mm);
+			return NULL;
+		}
+	}
 	down_read(&mm->mmap_sem);
-	task_lock(task);
-	if (task->mm != mm)
-		goto out;
-	if (task->mm != current->mm &&
-	    __ptrace_may_access(task, PTRACE_MODE_READ) < 0)
-		goto out;
-	task_unlock(task);
 	return mm;
-out:
-	task_unlock(task);
-	up_read(&mm->mmap_sem);
-	mmput(mm);
-	return NULL;
 }
 
 static int proc_pid_cmdline(struct task_struct *task, char * buffer)
-- 
cgit v1.2.3-70-g09d2


From 00f89d218523b9bf6b522349c039d5ac80aa536d Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Fri, 10 Jul 2009 03:27:38 +0200
Subject: mm_for_maps: shift down_read(mmap_sem) to the caller

mm_for_maps() takes ->mmap_sem after security checks, this looks
strange and obfuscates the locking rules. Move this lock to its
single caller, m_start().

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 fs/proc/base.c       | 8 +++-----
 fs/proc/task_mmu.c   | 1 +
 fs/proc/task_nommu.c | 1 +
 3 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 917f338a6739..f3c2e4085fed 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -235,9 +235,8 @@ static int check_mem_permission(struct task_struct *task)
 struct mm_struct *mm_for_maps(struct task_struct *task)
 {
 	struct mm_struct *mm = get_task_mm(task);
-	if (!mm)
-		return NULL;
-	if (mm != current->mm) {
+
+	if (mm && mm != current->mm) {
 		/*
 		 * task->mm can be changed before security check,
 		 * in that case we must notice the change after.
@@ -245,10 +244,9 @@ struct mm_struct *mm_for_maps(struct task_struct *task)
 		if (!ptrace_may_access(task, PTRACE_MODE_READ) ||
 		    mm != task->mm) {
 			mmput(mm);
-			return NULL;
+			mm = NULL;
 		}
 	}
-	down_read(&mm->mmap_sem);
 	return mm;
 }
 
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 6f61b7cc32e0..9bd8be1d235c 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -119,6 +119,7 @@ static void *m_start(struct seq_file *m, loff_t *pos)
 	mm = mm_for_maps(priv->task);
 	if (!mm)
 		return NULL;
+	down_read(&mm->mmap_sem);
 
 	tail_vma = get_gate_vma(priv->task);
 	priv->tail_vma = tail_vma;
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 64a72e2e7650..8f5c05d3dbd3 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -189,6 +189,7 @@ static void *m_start(struct seq_file *m, loff_t *pos)
 		priv->task = NULL;
 		return NULL;
 	}
+	down_read(&mm->mmap_sem);
 
 	/* start from the Nth VMA */
 	for (p = rb_first(&mm->mm_rb); p; p = rb_next(p))
-- 
cgit v1.2.3-70-g09d2


From 704b836cbf19e885f8366bccb2e4b0474346c02d Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Fri, 10 Jul 2009 03:27:40 +0200
Subject: mm_for_maps: take ->cred_guard_mutex to fix the race with exec

The problem is minor, but without ->cred_guard_mutex held we can race
with exec() and get the new ->mm but check old creds.

Now we do not need to re-check task->mm after ptrace_may_access(), it
can't be changed to the new mm under us.

Strictly speaking, this also fixes another very minor problem. Unless
security check fails or the task exits mm_for_maps() should never
return NULL, the caller should get either old or new ->mm.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 fs/proc/base.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index f3c2e4085fed..175db258942f 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -234,19 +234,19 @@ static int check_mem_permission(struct task_struct *task)
 
 struct mm_struct *mm_for_maps(struct task_struct *task)
 {
-	struct mm_struct *mm = get_task_mm(task);
+	struct mm_struct *mm;
 
-	if (mm && mm != current->mm) {
-		/*
-		 * task->mm can be changed before security check,
-		 * in that case we must notice the change after.
-		 */
-		if (!ptrace_may_access(task, PTRACE_MODE_READ) ||
-		    mm != task->mm) {
-			mmput(mm);
-			mm = NULL;
-		}
+	if (mutex_lock_killable(&task->cred_guard_mutex))
+		return NULL;
+
+	mm = get_task_mm(task);
+	if (mm && mm != current->mm &&
+			!ptrace_may_access(task, PTRACE_MODE_READ)) {
+		mmput(mm);
+		mm = NULL;
 	}
+	mutex_unlock(&task->cred_guard_mutex);
+
 	return mm;
 }
 
-- 
cgit v1.2.3-70-g09d2


From 1905b1bfc0de6f69a61dc03cac0d86a04b3216bd Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 6 Aug 2009 18:13:23 +0900
Subject: chrdev: implement __[un]register_chrdev()

[un]register_chrdev() assume minor range 0-255.  This patch adds __
prefixed versions which take @minorbase and @count explicitly.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Greg Kroah-Hartman <gregkh@suse.de>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 fs/char_dev.c      | 39 ++++++++++++++++++++++++++-------------
 include/linux/fs.h | 19 ++++++++++++++++---
 2 files changed, 42 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/char_dev.c b/fs/char_dev.c
index a173551e19d7..2f18c1e4e301 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -237,8 +237,10 @@ int alloc_chrdev_region(dev_t *dev, unsigned baseminor, unsigned count,
 }
 
 /**
- * register_chrdev() - Register a major number for character devices.
+ * __register_chrdev() - create and register a cdev occupying a range of minors
  * @major: major device number or 0 for dynamic allocation
+ * @baseminor: first of the requested range of minor numbers
+ * @count: the number of minor numbers required
  * @name: name of this range of devices
  * @fops: file operations associated with this devices
  *
@@ -254,19 +256,17 @@ int alloc_chrdev_region(dev_t *dev, unsigned baseminor, unsigned count,
  * /dev. It only helps to keep track of the different owners of devices. If
  * your module name has only one type of devices it's ok to use e.g. the name
  * of the module here.
- *
- * This function registers a range of 256 minor numbers. The first minor number
- * is 0.
  */
-int register_chrdev(unsigned int major, const char *name,
-		    const struct file_operations *fops)
+int __register_chrdev(unsigned int major, unsigned int baseminor,
+		      unsigned int count, const char *name,
+		      const struct file_operations *fops)
 {
 	struct char_device_struct *cd;
 	struct cdev *cdev;
 	char *s;
 	int err = -ENOMEM;
 
-	cd = __register_chrdev_region(major, 0, 256, name);
+	cd = __register_chrdev_region(major, baseminor, count, name);
 	if (IS_ERR(cd))
 		return PTR_ERR(cd);
 	
@@ -280,7 +280,7 @@ int register_chrdev(unsigned int major, const char *name,
 	for (s = strchr(kobject_name(&cdev->kobj),'/'); s; s = strchr(s, '/'))
 		*s = '!';
 		
-	err = cdev_add(cdev, MKDEV(cd->major, 0), 256);
+	err = cdev_add(cdev, MKDEV(cd->major, baseminor), count);
 	if (err)
 		goto out;
 
@@ -290,7 +290,7 @@ int register_chrdev(unsigned int major, const char *name,
 out:
 	kobject_put(&cdev->kobj);
 out2:
-	kfree(__unregister_chrdev_region(cd->major, 0, 256));
+	kfree(__unregister_chrdev_region(cd->major, baseminor, count));
 	return err;
 }
 
@@ -316,10 +316,23 @@ void unregister_chrdev_region(dev_t from, unsigned count)
 	}
 }
 
-void unregister_chrdev(unsigned int major, const char *name)
+/**
+ * __unregister_chrdev - unregister and destroy a cdev
+ * @major: major device number
+ * @baseminor: first of the range of minor numbers
+ * @count: the number of minor numbers this cdev is occupying
+ * @name: name of this range of devices
+ *
+ * Unregister and destroy the cdev occupying the region described by
+ * @major, @baseminor and @count.  This function undoes what
+ * __register_chrdev() did.
+ */
+void __unregister_chrdev(unsigned int major, unsigned int baseminor,
+			 unsigned int count, const char *name)
 {
 	struct char_device_struct *cd;
-	cd = __unregister_chrdev_region(major, 0, 256);
+
+	cd = __unregister_chrdev_region(major, baseminor, count);
 	if (cd && cd->cdev)
 		cdev_del(cd->cdev);
 	kfree(cd);
@@ -568,6 +581,6 @@ EXPORT_SYMBOL(cdev_alloc);
 EXPORT_SYMBOL(cdev_del);
 EXPORT_SYMBOL(cdev_add);
 EXPORT_SYMBOL(cdev_index);
-EXPORT_SYMBOL(register_chrdev);
-EXPORT_SYMBOL(unregister_chrdev);
+EXPORT_SYMBOL(__register_chrdev);
+EXPORT_SYMBOL(__unregister_chrdev);
 EXPORT_SYMBOL(directly_mappable_cdev_bdi);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index a36ffa5a77a4..6c36ab788854 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1998,12 +1998,25 @@ extern void bd_release_from_disk(struct block_device *, struct gendisk *);
 #define CHRDEV_MAJOR_HASH_SIZE	255
 extern int alloc_chrdev_region(dev_t *, unsigned, unsigned, const char *);
 extern int register_chrdev_region(dev_t, unsigned, const char *);
-extern int register_chrdev(unsigned int, const char *,
-			   const struct file_operations *);
-extern void unregister_chrdev(unsigned int, const char *);
+extern int __register_chrdev(unsigned int major, unsigned int baseminor,
+			     unsigned int count, const char *name,
+			     const struct file_operations *fops);
+extern void __unregister_chrdev(unsigned int major, unsigned int baseminor,
+				unsigned int count, const char *name);
 extern void unregister_chrdev_region(dev_t, unsigned);
 extern void chrdev_show(struct seq_file *,off_t);
 
+static inline int register_chrdev(unsigned int major, const char *name,
+				  const struct file_operations *fops)
+{
+	return __register_chrdev(major, 0, 256, name, fops);
+}
+
+static inline void unregister_chrdev(unsigned int major, const char *name)
+{
+	__unregister_chrdev(major, 0, 256, name);
+}
+
 /* fs/block_dev.c */
 #define BDEVNAME_SIZE	32	/* Largest string for a blockdev identifier */
 #define BDEVT_SIZE	10	/* Largest string for MAJ:MIN for blkdev */
-- 
cgit v1.2.3-70-g09d2


From 074cc1deec5dee63fcd5d966b36fa4f3765b50fc Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 10 Aug 2009 08:54:13 -0400
Subject: NFS: Add a ->migratepage() aop for NFS

Make NFS a bit more friendly to NUMA and memory hot removal...

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/file.c     |  1 +
 fs/nfs/internal.h |  6 ++++
 fs/nfs/write.c    | 91 +++++++++++++++++++++++++++++++++++++++++--------------
 3 files changed, 76 insertions(+), 22 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 05062329b678..dfc89671dc94 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -479,6 +479,7 @@ const struct address_space_operations nfs_file_aops = {
 	.invalidatepage = nfs_invalidate_page,
 	.releasepage = nfs_release_page,
 	.direct_IO = nfs_direct_IO,
+	.migratepage = nfs_migrate_page,
 	.launder_page = nfs_launder_page,
 };
 
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 7dd90a6769d0..e2ccb4a4398a 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -248,6 +248,12 @@ extern void nfs_read_prepare(struct rpc_task *task, void *calldata);
 
 /* write.c */
 extern void nfs_write_prepare(struct rpc_task *task, void *calldata);
+#ifdef CONFIG_MIGRATION
+extern int nfs_migrate_page(struct address_space *,
+		struct page *, struct page *);
+#else
+#define nfs_migrate_page NULL
+#endif
 
 /* nfs4proc.c */
 extern int _nfs4_call_sync(struct nfs_server *server,
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 0a0a2ff767c3..6240e644f249 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -13,6 +13,7 @@
 #include <linux/file.h>
 #include <linux/writeback.h>
 #include <linux/swap.h>
+#include <linux/migrate.h>
 
 #include <linux/sunrpc/clnt.h>
 #include <linux/nfs_fs.h>
@@ -26,6 +27,7 @@
 #include "internal.h"
 #include "iostat.h"
 #include "nfs4_fs.h"
+#include "fscache.h"
 
 #define NFSDBG_FACILITY		NFSDBG_PAGECACHE
 
@@ -220,24 +222,17 @@ static void nfs_end_page_writeback(struct page *page)
 		clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC);
 }
 
-/*
- * Find an associated nfs write request, and prepare to flush it out
- * May return an error if the user signalled nfs_wait_on_request().
- */
-static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
-				struct page *page)
+static struct nfs_page *nfs_find_and_lock_request(struct page *page)
 {
 	struct inode *inode = page->mapping->host;
 	struct nfs_page *req;
 	int ret;
 
 	spin_lock(&inode->i_lock);
-	for(;;) {
+	for (;;) {
 		req = nfs_page_find_request_locked(page);
-		if (req == NULL) {
-			spin_unlock(&inode->i_lock);
-			return 0;
-		}
+		if (req == NULL)
+			break;
 		if (nfs_set_page_tag_locked(req))
 			break;
 		/* Note: If we hold the page lock, as is the case in nfs_writepage,
@@ -249,23 +244,40 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
 		ret = nfs_wait_on_request(req);
 		nfs_release_request(req);
 		if (ret != 0)
-			return ret;
+			return ERR_PTR(ret);
 		spin_lock(&inode->i_lock);
 	}
-	if (test_bit(PG_CLEAN, &req->wb_flags)) {
-		spin_unlock(&inode->i_lock);
-		BUG();
-	}
-	if (nfs_set_page_writeback(page) != 0) {
-		spin_unlock(&inode->i_lock);
-		BUG();
-	}
 	spin_unlock(&inode->i_lock);
+	return req;
+}
+
+/*
+ * Find an associated nfs write request, and prepare to flush it out
+ * May return an error if the user signalled nfs_wait_on_request().
+ */
+static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
+				struct page *page)
+{
+	struct nfs_page *req;
+	int ret = 0;
+
+	req = nfs_find_and_lock_request(page);
+	if (!req)
+		goto out;
+	ret = PTR_ERR(req);
+	if (IS_ERR(req))
+		goto out;
+
+	ret = nfs_set_page_writeback(page);
+	BUG_ON(ret != 0);
+	BUG_ON(test_bit(PG_CLEAN, &req->wb_flags));
+
 	if (!nfs_pageio_add_request(pgio, req)) {
 		nfs_redirty_request(req);
-		return pgio->pg_error;
+		ret = pgio->pg_error;
 	}
-	return 0;
+out:
+	return ret;
 }
 
 static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio)
@@ -1582,6 +1594,41 @@ int nfs_wb_page(struct inode *inode, struct page* page)
 	return nfs_wb_page_priority(inode, page, FLUSH_STABLE);
 }
 
+#ifdef CONFIG_MIGRATION
+int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
+		struct page *page)
+{
+	struct nfs_page *req;
+	int ret;
+
+	if (PageFsCache(page))
+		nfs_fscache_release_page(page, GFP_KERNEL);
+
+	req = nfs_find_and_lock_request(page);
+	ret = PTR_ERR(req);
+	if (IS_ERR(req))
+		goto out;
+
+	ret = migrate_page(mapping, newpage, page);
+	if (!req)
+		goto out;
+	if (ret)
+		goto out_unlock;
+	page_cache_get(newpage);
+	req->wb_page = newpage;
+	SetPagePrivate(newpage);
+	set_page_private(newpage, page_private(page));
+	ClearPagePrivate(page);
+	set_page_private(page, 0);
+	page_cache_release(page);
+out_unlock:
+	nfs_clear_page_tag_locked(req);
+	nfs_release_request(req);
+out:
+	return ret;
+}
+#endif
+
 int __init nfs_init_writepagecache(void)
 {
 	nfs_wdata_cachep = kmem_cache_create("nfs_write_data",
-- 
cgit v1.2.3-70-g09d2


From 38c73044f5f4da2ef4339319b170e5e19f8dec87 Mon Sep 17 00:00:00 2001
From: Peter Staubach <staubach@redhat.com>
Date: Mon, 10 Aug 2009 08:54:16 -0400
Subject: NFS: read-modify-write page updating

Hi.

I have a proposal for possibly resolving this issue.

I believe that this situation occurs due to the way that the
Linux NFS client handles writes which modify partial pages.

The Linux NFS client handles partial page modifications by
allocating a page from the page cache, copying the data from
the user level into the page, and then keeping track of the
offset and length of the modified portions of the page.  The
page is not marked as up to date because there are portions
of the page which do not contain valid file contents.

When a read call comes in for a portion of the page, the
contents of the page must be read in the from the server.
However, since the page may already contain some modified
data, that modified data must be written to the server
before the file contents can be read back in the from server.
And, since the writing and reading can not be done atomically,
the data must be written and committed to stable storage on
the server for safety purposes.  This means either a
FILE_SYNC WRITE or a UNSTABLE WRITE followed by a COMMIT.
This has been discussed at length previously.

This algorithm could be described as modify-write-read.  It
is most efficient when the application only updates pages
and does not read them.

My proposed solution is to add a heuristic to decide whether
to do this modify-write-read algorithm or switch to a read-
modify-write algorithm when initially allocating the page
in the write system call path.  The heuristic uses the modes
that the file was opened with, the offset in the page to
read from, and the size of the region to read.

If the file was opened for reading in addition to writing
and the page would not be filled completely with data from
the user level, then read in the old contents of the page
and mark it as Uptodate before copying in the new data.  If
the page would be completely filled with data from the user
level, then there would be no reason to read in the old
contents because they would just be copied over.

This would optimize for applications which randomly access
and update portions of files.  The linkage editor for the
C compiler is an example of such a thing.

I tested the attached patch by using rpmbuild to build the
current Fedora rawhide kernel.  The kernel without the
patch generated about 269,500 WRITE requests.  The modified
kernel containing the patch generated about 261,000 WRITE
requests.  Thus, about 8,500 fewer WRITE requests were
generated.  I suspect that many of these additional
WRITE requests were probably FILE_SYNC requests to WRITE
a single page, but I didn't test this theory.

The difference between this patch and the previous one was
to remove the unneeded PageDirty() test.  I then retested to
ensure that the resulting system continued to behave as
desired.

	Thanx...

		ps

Signed-off-by: Peter Staubach <staubach@redhat.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/file.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 46 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index dfc89671dc94..5021b75d2d1e 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -327,6 +327,42 @@ nfs_file_fsync(struct file *file, struct dentry *dentry, int datasync)
 	return nfs_do_fsync(ctx, inode);
 }
 
+/*
+ * Decide whether a read/modify/write cycle may be more efficient
+ * then a modify/write/read cycle when writing to a page in the
+ * page cache.
+ *
+ * The modify/write/read cycle may occur if a page is read before
+ * being completely filled by the writer.  In this situation, the
+ * page must be completely written to stable storage on the server
+ * before it can be refilled by reading in the page from the server.
+ * This can lead to expensive, small, FILE_SYNC mode writes being
+ * done.
+ *
+ * It may be more efficient to read the page first if the file is
+ * open for reading in addition to writing, the page is not marked
+ * as Uptodate, it is not dirty or waiting to be committed,
+ * indicating that it was previously allocated and then modified,
+ * that there were valid bytes of data in that range of the file,
+ * and that the new data won't completely replace the old data in
+ * that range of the file.
+ */
+static int nfs_want_read_modify_write(struct file *file, struct page *page,
+			loff_t pos, unsigned len)
+{
+	unsigned int pglen = nfs_page_length(page);
+	unsigned int offset = pos & (PAGE_CACHE_SIZE - 1);
+	unsigned int end = offset + len;
+
+	if ((file->f_mode & FMODE_READ) &&	/* open for read? */
+	    !PageUptodate(page) &&		/* Uptodate? */
+	    !PagePrivate(page) &&		/* i/o request already? */
+	    pglen &&				/* valid bytes of file? */
+	    (end < pglen || offset))		/* replace all valid bytes? */
+		return 1;
+	return 0;
+}
+
 /*
  * This does the "real" work of the write. We must allocate and lock the
  * page to be sent back to the generic routine, which then copies the
@@ -340,15 +376,16 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
 			struct page **pagep, void **fsdata)
 {
 	int ret;
-	pgoff_t index;
+	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
 	struct page *page;
-	index = pos >> PAGE_CACHE_SHIFT;
+	int once_thru = 0;
 
 	dfprintk(PAGECACHE, "NFS: write_begin(%s/%s(%ld), %u@%lld)\n",
 		file->f_path.dentry->d_parent->d_name.name,
 		file->f_path.dentry->d_name.name,
 		mapping->host->i_ino, len, (long long) pos);
 
+start:
 	/*
 	 * Prevent starvation issues if someone is doing a consistency
 	 * sync-to-disk
@@ -367,6 +404,13 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
 	if (ret) {
 		unlock_page(page);
 		page_cache_release(page);
+	} else if (!once_thru &&
+		   nfs_want_read_modify_write(file, page, pos, len)) {
+		once_thru = 1;
+		ret = nfs_readpage(file, page);
+		page_cache_release(page);
+		if (!ret)
+			goto start;
 	}
 	return ret;
 }
-- 
cgit v1.2.3-70-g09d2


From e576e05a73bc1a00cdf56630942dbada1bf280a1 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Mon, 10 Aug 2009 08:54:16 -0400
Subject: nfs: remove superfluous BUG_ON()s Subject: [PATCH] nfs: remove
 superfluous BUG_ON()s

Remove duplicated BUG_ON()s from nfs[4]_create_server()
(we make the same checks earlier in both functions).

This takes care of the following entries from Dan's list:

fs/nfs/client.c +1078 nfs_create_server(47) warning: variable derefenced before check 'server->nfs_client'
fs/nfs/client.c +1079 nfs_create_server(48) warning: variable derefenced before check 'server->nfs_client->rpc_ops'
fs/nfs/client.c +1363 nfs4_create_server(43) warning: variable derefenced before check 'server->nfs_client'
fs/nfs/client.c +1364 nfs4_create_server(44) warning: variable derefenced before check 'server->nfs_

Reported-by: Dan Carpenter <error27@gmail.com>
Cc: corbet@lwn.net
Cc: eteo@redhat.com
Cc: Julia Lawall <julia@diku.dk>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/client.c | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 8f34fd8028fc..d36925f9b952 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -1077,10 +1077,6 @@ struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data,
 		(unsigned long long) server->fsid.major,
 		(unsigned long long) server->fsid.minor);
 
-	BUG_ON(!server->nfs_client);
-	BUG_ON(!server->nfs_client->rpc_ops);
-	BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);
-
 	spin_lock(&nfs_client_lock);
 	list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks);
 	list_add_tail(&server->master_link, &nfs_volume_list);
@@ -1362,10 +1358,6 @@ struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data,
 	if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN)
 		server->namelen = NFS4_MAXNAMLEN;
 
-	BUG_ON(!server->nfs_client);
-	BUG_ON(!server->nfs_client->rpc_ops);
-	BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);
-
 	spin_lock(&nfs_client_lock);
 	list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks);
 	list_add_tail(&server->master_link, &nfs_volume_list);
-- 
cgit v1.2.3-70-g09d2


From b409d7a0ab46fe530efe52734984b4ed5d46c3eb Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 6 Aug 2009 23:29:34 +0200
Subject: ocfs2: Fix possible deadlock when extending quota file

In OCFS2, allocator locks rank above transaction start. Thus we
cannot extend quota file from inside a transaction less we could
deadlock.

We solve the problem by starting transaction not already in
ocfs2_acquire_dquot() but only in ocfs2_local_read_dquot() and
ocfs2_global_read_dquot() and we allocate blocks to quota files before starting
the transaction.  In case we crash, quota files will just have a few blocks
more but that's no problem since we just use them next time we extend the
quota file.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/journal.h      |   5 ---
 fs/ocfs2/quota_global.c | 115 ++++++++++++++++++++++++------------------------
 2 files changed, 57 insertions(+), 63 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 4a4d3b55fd22..2c3222aec622 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -363,11 +363,6 @@ static inline int ocfs2_quota_trans_credits(struct super_block *sb)
 	return credits;
 }
 
-/* Number of credits needed for removing quota structure from file */
-int ocfs2_calc_qdel_credits(struct super_block *sb, int type);
-/* Number of credits needed for initialization of new quota structure */
-int ocfs2_calc_qinit_credits(struct super_block *sb, int type);
-
 /* group extend. inode update and last group update. */
 #define OCFS2_GROUP_EXTEND_CREDITS	(OCFS2_INODE_UPDATE_CREDITS + 1)
 
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index d604a6aa0a22..bf7742d0ee3b 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -215,11 +215,7 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type,
 		loff_t rounded_end =
 				ocfs2_align_bytes_to_blocks(sb, off + len);
 
-		down_write(&OCFS2_I(gqinode)->ip_alloc_sem);
-		err = ocfs2_extend_no_holes(gqinode, rounded_end, off);
-		up_write(&OCFS2_I(gqinode)->ip_alloc_sem);
-		if (err < 0)
-			goto out;
+		/* Space is already allocated in ocfs2_global_read_dquot() */
 		err = ocfs2_simple_size_update(gqinode,
 					       oinfo->dqi_gqi_bh,
 					       rounded_end);
@@ -405,13 +401,36 @@ int ocfs2_global_write_info(struct super_block *sb, int type)
 	return err;
 }
 
+static int ocfs2_global_qinit_alloc(struct super_block *sb, int type)
+{
+	struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
+
+	/*
+	 * We may need to allocate tree blocks and a leaf block but not the
+	 * root block
+	 */
+	return oinfo->dqi_gi.dqi_qtree_depth;
+}
+
+static int ocfs2_calc_global_qinit_credits(struct super_block *sb, int type)
+{
+	/* We modify all the allocated blocks, tree root, and info block */
+	return (ocfs2_global_qinit_alloc(sb, type) + 2) *
+			OCFS2_QUOTA_BLOCK_UPDATE_CREDITS;
+}
+
 /* Read in information from global quota file and acquire a reference to it.
  * dquot_acquire() has already started the transaction and locked quota file */
 int ocfs2_global_read_dquot(struct dquot *dquot)
 {
 	int err, err2, ex = 0;
-	struct ocfs2_mem_dqinfo *info =
-			sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
+	struct super_block *sb = dquot->dq_sb;
+	int type = dquot->dq_type;
+	struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv;
+	struct ocfs2_super *osb = OCFS2_SB(sb);
+	struct inode *gqinode = info->dqi_gqinode;
+	int need_alloc = ocfs2_global_qinit_alloc(sb, type);
+	handle_t *handle = NULL;
 
 	err = ocfs2_qinfo_lock(info, 0);
 	if (err < 0)
@@ -422,14 +441,33 @@ int ocfs2_global_read_dquot(struct dquot *dquot)
 	OCFS2_DQUOT(dquot)->dq_use_count++;
 	OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace;
 	OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes;
+	ocfs2_qinfo_unlock(info, 0);
+
 	if (!dquot->dq_off) {	/* No real quota entry? */
-		/* Upgrade to exclusive lock for allocation */
-		ocfs2_qinfo_unlock(info, 0);
-		err = ocfs2_qinfo_lock(info, 1);
-		if (err < 0)
-			goto out_qlock;
 		ex = 1;
+		/*
+		 * Add blocks to quota file before we start a transaction since
+		 * locking allocators ranks above a transaction start
+		 */
+		WARN_ON(journal_current_handle());
+		down_write(&OCFS2_I(gqinode)->ip_alloc_sem);
+		err = ocfs2_extend_no_holes(gqinode,
+			gqinode->i_size + (need_alloc << sb->s_blocksize_bits),
+			gqinode->i_size);
+		up_write(&OCFS2_I(gqinode)->ip_alloc_sem);
+		if (err < 0)
+			goto out;
+	}
+
+	handle = ocfs2_start_trans(osb,
+				   ocfs2_calc_global_qinit_credits(sb, type));
+	if (IS_ERR(handle)) {
+		err = PTR_ERR(handle);
+		goto out;
 	}
+	err = ocfs2_qinfo_lock(info, ex);
+	if (err < 0)
+		goto out_trans;
 	err = qtree_write_dquot(&info->dqi_gi, dquot);
 	if (ex && info_dirty(sb_dqinfo(dquot->dq_sb, dquot->dq_type))) {
 		err2 = __ocfs2_global_write_info(dquot->dq_sb, dquot->dq_type);
@@ -441,6 +479,9 @@ out_qlock:
 		ocfs2_qinfo_unlock(info, 1);
 	else
 		ocfs2_qinfo_unlock(info, 0);
+out_trans:
+	if (handle)
+		ocfs2_commit_trans(osb, handle);
 out:
 	if (err < 0)
 		mlog_errno(err);
@@ -638,24 +679,17 @@ out:
 	return status;
 }
 
-int ocfs2_calc_qdel_credits(struct super_block *sb, int type)
+static int ocfs2_calc_qdel_credits(struct super_block *sb, int type)
 {
-	struct ocfs2_mem_dqinfo *oinfo;
-	int features[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
-				    OCFS2_FEATURE_RO_COMPAT_GRPQUOTA };
-
-	if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, features[type]))
-		return 0;
-
-	oinfo = sb_dqinfo(sb, type)->dqi_priv;
+	struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
 	/*
 	 * We modify tree, leaf block, global info, local chunk header,
 	 * global and local inode; OCFS2_QINFO_WRITE_CREDITS already
 	 * accounts for inode update
 	 */
-	return oinfo->dqi_gi.dqi_qtree_depth +
+	return (oinfo->dqi_gi.dqi_qtree_depth + 2) *
+	       OCFS2_QUOTA_BLOCK_UPDATE_CREDITS +
 	       OCFS2_QINFO_WRITE_CREDITS +
-	       2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS +
 	       OCFS2_INODE_UPDATE_CREDITS;
 }
 
@@ -688,36 +722,10 @@ out:
 	return status;
 }
 
-int ocfs2_calc_qinit_credits(struct super_block *sb, int type)
-{
-	struct ocfs2_mem_dqinfo *oinfo;
-	int features[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
-				    OCFS2_FEATURE_RO_COMPAT_GRPQUOTA };
-	struct ocfs2_dinode *lfe, *gfe;
-
-	if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, features[type]))
-		return 0;
-
-	oinfo = sb_dqinfo(sb, type)->dqi_priv;
-	gfe = (struct ocfs2_dinode *)oinfo->dqi_gqi_bh->b_data;
-	lfe = (struct ocfs2_dinode *)oinfo->dqi_lqi_bh->b_data;
-	/* We can extend local file + global file. In local file we
-	 * can modify info, chunk header block and dquot block. In
-	 * global file we can modify info, tree and leaf block */
-	return ocfs2_calc_extend_credits(sb, &lfe->id2.i_list, 0) +
-	       ocfs2_calc_extend_credits(sb, &gfe->id2.i_list, 0) +
-	       OCFS2_LOCAL_QINFO_WRITE_CREDITS +
-	       2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS +
-	       oinfo->dqi_gi.dqi_qtree_depth +
-	       2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS;
-}
-
 static int ocfs2_acquire_dquot(struct dquot *dquot)
 {
-	handle_t *handle;
 	struct ocfs2_mem_dqinfo *oinfo =
 			sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
-	struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb);
 	int status = 0;
 
 	mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type);
@@ -726,16 +734,7 @@ static int ocfs2_acquire_dquot(struct dquot *dquot)
 	status = ocfs2_lock_global_qf(oinfo, 1);
 	if (status < 0)
 		goto out;
-	handle = ocfs2_start_trans(osb,
-		ocfs2_calc_qinit_credits(dquot->dq_sb, dquot->dq_type));
-	if (IS_ERR(handle)) {
-		status = PTR_ERR(handle);
-		mlog_errno(status);
-		goto out_ilock;
-	}
 	status = dquot_acquire(dquot);
-	ocfs2_commit_trans(osb, handle);
-out_ilock:
 	ocfs2_unlock_global_qf(oinfo, 1);
 out:
 	mlog_exit(status);
-- 
cgit v1.2.3-70-g09d2


From 0cc6eee130b0c062feec8446d9cecdb17d2cfad3 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Sat, 18 Jul 2009 18:14:53 -0400
Subject: xfs: avoid memory allocation under m_peraglock in growfs code

Allocate the memory for the larger m_perag array before taking the
per-AG lock as the per-AG lock can be taken under the i_lock which
can be taken from reclaim context.

Reported by the new reclaim context tracing in lockdep.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Felix Blyakher <felixb@sgi.com>
Signed-off-by: Felix Blyakher <felixb@sgi.com>
---
 fs/xfs/xfs_fsops.c | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index cbd451bb4848..2d0b3e1da9e6 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -167,17 +167,25 @@ xfs_growfs_data_private(
 	new = nb - mp->m_sb.sb_dblocks;
 	oagcount = mp->m_sb.sb_agcount;
 	if (nagcount > oagcount) {
+		void *new_perag, *old_perag;
+
 		xfs_filestream_flush(mp);
+
+		new_perag = kmem_zalloc(sizeof(xfs_perag_t) * nagcount,
+					KM_MAYFAIL);
+		if (!new_perag)
+			return XFS_ERROR(ENOMEM);
+
 		down_write(&mp->m_peraglock);
-		mp->m_perag = kmem_realloc(mp->m_perag,
-			sizeof(xfs_perag_t) * nagcount,
-			sizeof(xfs_perag_t) * oagcount,
-			KM_SLEEP);
-		memset(&mp->m_perag[oagcount], 0,
-			(nagcount - oagcount) * sizeof(xfs_perag_t));
+		memcpy(new_perag, mp->m_perag, sizeof(xfs_perag_t) * oagcount);
+		old_perag = mp->m_perag;
+		mp->m_perag = new_perag;
+
 		mp->m_flags |= XFS_MOUNT_32BITINODES;
 		nagimax = xfs_initialize_perag(mp, nagcount);
 		up_write(&mp->m_peraglock);
+
+		kmem_free(old_perag);
 	}
 	tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFS);
 	tp->t_flags |= XFS_TRANS_RESERVE;
-- 
cgit v1.2.3-70-g09d2


From ca35dcd6cae7d4a780c484c53f45548c4719f82c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Sat, 18 Jul 2009 18:14:54 -0400
Subject: xfs: switch to NOFS allocation under i_lock in xfs_getbmap

xfs_getbmap allocates memory with i_lock held, but i_lock is taken in
reclaim context so all allocations under it must avoid recursions into
the filesystem.

Reported by the new reclaim context tracing in lockdep.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Felix Blyakher <felixb@sgi.com>
Signed-off-by: Felix Blyakher <felixb@sgi.com>
---
 fs/xfs/xfs_bmap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 7928b9983c1d..8ee5b5a76a2a 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -6009,7 +6009,7 @@ xfs_getbmap(
 	 */
 	error = ENOMEM;
 	subnex = 16;
-	map = kmem_alloc(subnex * sizeof(*map), KM_MAYFAIL);
+	map = kmem_alloc(subnex * sizeof(*map), KM_MAYFAIL | KM_NOFS);
 	if (!map)
 		goto out_unlock_ilock;
 
-- 
cgit v1.2.3-70-g09d2


From f41d7fb9da05b604f8a69fb6cac2a0563c8ede4e Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Sat, 18 Jul 2009 18:14:55 -0400
Subject: xfs: switch to NOFS allocation under i_lock in xfs_da_state_alloc

xfs_da_state_alloc is always called with i_lock held, but i_lock is taken in
reclaim context so all allocations under it must avoid recursions into the
filesystem.

Reported by the new reclaim context tracing in lockdep.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Felix Blyakher <felixb@sgi.com>
Signed-off-by: Felix Blyakher <felixb@sgi.com>
---
 fs/xfs/xfs_da_btree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 9ff6e57a5075..bd0bb6dfcdfa 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -2201,7 +2201,7 @@ kmem_zone_t *xfs_dabuf_zone;		/* dabuf zone */
 xfs_da_state_t *
 xfs_da_state_alloc(void)
 {
-	return kmem_zone_zalloc(xfs_da_state_zone, KM_SLEEP);
+	return kmem_zone_zalloc(xfs_da_state_zone, KM_NOFS);
 }
 
 /*
-- 
cgit v1.2.3-70-g09d2


From 73195ed7864ae4a1fb0bea2ed9df59d19b4fde90 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Sat, 18 Jul 2009 18:14:56 -0400
Subject: xfs: switch to NOFS allocation under i_lock in xfs_da_buf_make

i_lock is taken in the reclaim context so all allocations under it
must avoid recursions into the filesystem.

Reported by the new reclaim context tracing in lockdep.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Felix Blyakher <felixb@sgi.com>
Signed-off-by: Felix Blyakher <felixb@sgi.com>
---
 fs/xfs/xfs_da_btree.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index bd0bb6dfcdfa..2847bbc1c534 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -2261,9 +2261,9 @@ xfs_da_buf_make(int nbuf, xfs_buf_t **bps, inst_t *ra)
 	int		off;
 
 	if (nbuf == 1)
-		dabuf = kmem_zone_alloc(xfs_dabuf_zone, KM_SLEEP);
+		dabuf = kmem_zone_alloc(xfs_dabuf_zone, KM_NOFS);
 	else
-		dabuf = kmem_alloc(XFS_DA_BUF_SIZE(nbuf), KM_SLEEP);
+		dabuf = kmem_alloc(XFS_DA_BUF_SIZE(nbuf), KM_NOFS);
 	dabuf->dirty = 0;
 #ifdef XFS_DABUF_DEBUG
 	dabuf->ra = ra;
-- 
cgit v1.2.3-70-g09d2


From 3f52c2f0a07c23771909cc53f2e9451a7f1bf253 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Sat, 18 Jul 2009 18:14:57 -0400
Subject: xfs: switch to NOFS allocation under i_lock in
 xfs_dir_cilookup_result

xfs_dir_cilookup_result is always called with i_lock held, but i_lock is taken
in reclaim context so all allocations under it must avoid recursions into the
filesystem.

Reported by the new reclaim context tracing in lockdep.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Felix Blyakher <felixb@sgi.com>
Signed-off-by: Felix Blyakher <felixb@sgi.com>
---
 fs/xfs/xfs_dir2.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index c657bec6d951..bb1d58eb3982 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -256,7 +256,7 @@ xfs_dir_cilookup_result(
 					!(args->op_flags & XFS_DA_OP_CILOOKUP))
 		return EEXIST;
 
-	args->value = kmem_alloc(len, KM_MAYFAIL);
+	args->value = kmem_alloc(len, KM_NOFS | KM_MAYFAIL);
 	if (!args->value)
 		return ENOMEM;
 
-- 
cgit v1.2.3-70-g09d2


From 36fae17a648e0aee5d9560514d08477ef48dc87f Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Sat, 18 Jul 2009 18:14:58 -0400
Subject: xfs: switch to NOFS allocation under i_lock in
 xfs_buf_associate_memory

xfs_buf_associate_memory is used for setting up the spare buffer for the
log wrap case in xlog_sync which can happen under i_lock when called from
xfs_fsync. The i_lock mutex is taken in reclaim context so all allocations
under it must avoid recursions into the filesystem.  There are a couple
more uses of xfs_buf_associate_memory in the log recovery code that are
also affected by this, but I'd rather keep the code simple than passing on
a gfp_mask argument.  Longer term we should just stop requiring the memoery
allocation in xlog_sync by some smaller rework of the buffer layer.

Reported by the new reclaim context tracing in lockdep.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Felix Blyakher <felixb@sgi.com>
Signed-off-by: Felix Blyakher <felixb@sgi.com>
---
 fs/xfs/linux-2.6/xfs_buf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 1418b916fc27..178c20c13e83 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -770,7 +770,7 @@ xfs_buf_associate_memory(
 	bp->b_pages = NULL;
 	bp->b_addr = mem;
 
-	rval = _xfs_buf_get_pages(bp, page_count, 0);
+	rval = _xfs_buf_get_pages(bp, page_count, XBF_DONT_BLOCK);
 	if (rval)
 		return rval;
 
-- 
cgit v1.2.3-70-g09d2


From 10746e47e722b5688fcd6eba9fbf9b2e64a248a7 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Sat, 18 Jul 2009 18:14:59 -0400
Subject: xfs: switch to NOFS allocation under i_lock in xfs_attr_rmtval_set

xfs_attr_rmtval_set is always called with i_lock held, and i_lock is taken
in reclaim context so all allocations under it must avoid recursions into
the filesystem.

Reported by the new reclaim context tracing in lockdep.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Felix Blyakher <felixb@sgi.com>
Signed-off-by: Felix Blyakher <felixb@sgi.com>
---
 fs/xfs/xfs_attr.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index db15feb906ff..bfb583791fe2 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -2141,8 +2141,8 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
 		dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock),
 		blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
 
-		bp = xfs_buf_get_flags(mp->m_ddev_targp, dblkno,
-							blkcnt, XFS_BUF_LOCK);
+		bp = xfs_buf_get_flags(mp->m_ddev_targp, dblkno, blkcnt,
+				       XFS_BUF_LOCK | XBF_DONT_BLOCK);
 		ASSERT(bp);
 		ASSERT(!XFS_BUF_GETERROR(bp));
 
-- 
cgit v1.2.3-70-g09d2


From 7b02ecb3031b192823bc732ae717febc0a59aa92 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Sat, 18 Jul 2009 18:15:00 -0400
Subject: xfs: switch to NOFS allocation under i_lock in xfs_readlink_bmap

xfs_readlink_bmap is called with i_lock held, but i_lock is taken in
reclaim context so all allocations under it must avoid recursions into
the filesystem.

Reported by the new reclaim context tracing in lockdep.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Felix Blyakher <felixb@sgi.com>
Signed-off-by: Felix Blyakher <felixb@sgi.com>
---
 fs/xfs/xfs_vnodeops.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index c4eca5ed5dab..492d75bae2bf 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -538,7 +538,9 @@ xfs_readlink_bmap(
 		d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
 		byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
 
-		bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0);
+		bp = xfs_buf_read_flags(mp->m_ddev_targp, d, BTOBB(byte_cnt),
+					XBF_LOCK | XBF_MAPPED |
+					XBF_DONT_BLOCK);
 		error = XFS_BUF_GETERROR(bp);
 		if (error) {
 			xfs_ioerror_alert("xfs_readlink",
-- 
cgit v1.2.3-70-g09d2


From ddd3a14e0f030f0f7b900621f67532285b8657ef Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Sat, 18 Jul 2009 18:15:01 -0400
Subject: xfs: switch to NOFS allocation under i_lock in xfs_attr_rmtval_get

xfs_attr_rmtval_get is always called with i_lock held, but i_lock is taken
in reclaim context so all allocations under it must avoid recursions into
the filesystem.

Reported by the new reclaim context tracing in lockdep.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Felix Blyakher <felixb@sgi.com>
Signed-off-by: Felix Blyakher <felixb@sgi.com>
---
 fs/xfs/xfs_attr.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index bfb583791fe2..4ece1906bd41 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -2010,7 +2010,9 @@ xfs_attr_rmtval_get(xfs_da_args_t *args)
 			dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock);
 			blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
 			error = xfs_read_buf(mp, mp->m_ddev_targp, dblkno,
-					     blkcnt, XFS_BUF_LOCK, &bp);
+					     blkcnt,
+					     XFS_BUF_LOCK | XBF_DONT_BLOCK,
+					     &bp);
 			if (error)
 				return(error);
 
-- 
cgit v1.2.3-70-g09d2


From e0c222c411e22f086e929cd69fdcc89336164ec1 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@sandeen.net>
Date: Mon, 20 Jul 2009 10:52:15 -0500
Subject: use XFS_CORRUPTION_ERROR in xfs_btree_check_sblock

In Red Hat Bug 512552
 - Can't write to XFS mount during raid5 resync

a user ran into corruption while resyncing a raid, and we failed
a consistency test, but didn't get much more info; it'd be nice
to call XFS_CORRUPTION_ERROR here so we can see the buffer
contents.

Signed-off-by: Eric Sandeen <sandeen@sandeen.net>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Felix Blyakher <felixb@sgi.com>
---
 fs/xfs/xfs_btree.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index e9df99574829..26717388acf5 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -120,8 +120,8 @@ xfs_btree_check_sblock(
 			XFS_RANDOM_BTREE_CHECK_SBLOCK))) {
 		if (bp)
 			xfs_buftrace("SBTREE ERROR", bp);
-		XFS_ERROR_REPORT("xfs_btree_check_sblock", XFS_ERRLEVEL_LOW,
-				 cur->bc_mp);
+		XFS_CORRUPTION_ERROR("xfs_btree_check_sblock",
+			XFS_ERRLEVEL_LOW, cur->bc_mp, block);
 		return XFS_ERROR(EFSCORRUPTED);
 	}
 	return 0;
-- 
cgit v1.2.3-70-g09d2


From b89d4208de3de442c9025919c4261be0b38e79a4 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 10 Aug 2009 11:32:18 -0300
Subject: xfs: check for dinode realtime flag corruption

Ramon tested XFS with a modified version of fsfuzzer and hit a NULL
pointer dereference in __xfs_get_blocks due to the RT device target
pointer being NULL.

To fix this reject inode with the realtime bit set on a a filesystem
without an RT subvolume during inode read.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Eric Sandeen <sandeen@sandeen.net>
Reviewed-by: Felix Blyakher <felixb@sgi.com>
Reported-by: Ramon de Carvalho Valle <ramon@risesecurity.org>
Tested-by: Ramon de Carvalho Valle <ramon@risesecurity.org>
Signed-off-by: Felix Blyakher <felixb@sgi.com>
---
 fs/xfs/xfs_inode.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'fs')

diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 1f22d65fed0a..da428b3fe0f5 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -343,6 +343,16 @@ xfs_iformat(
 		return XFS_ERROR(EFSCORRUPTED);
 	}
 
+	if (unlikely((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) &&
+		     !ip->i_mount->m_rtdev_targp)) {
+		xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
+			"corrupt dinode %Lu, has realtime flag set.",
+			ip->i_ino);
+		XFS_CORRUPTION_ERROR("xfs_iformat(realtime)",
+				     XFS_ERRLEVEL_LOW, ip->i_mount, dip);
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+
 	switch (ip->i_d.di_mode & S_IFMT) {
 	case S_IFIFO:
 	case S_IFCHR:
-- 
cgit v1.2.3-70-g09d2


From a8914f3a6d72c97328597a556a99daaf5cc288ae Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 10 Aug 2009 11:32:44 -0300
Subject: xfs: fix spin_is_locked assert on uni-processor builds

Without SMP or preemption spin_is_locked always returns false,
so we can't do an assert with it.  Instead use assert_spin_locked,
which does the right thing on all builds.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Eric Sandeen <sandeen@sandeen.net>
Reported-by: Johannes Engel <jcnengel@googlemail.com>
Tested-by: Johannes Engel <jcnengel@googlemail.com>
Signed-off-by: Felix Blyakher <felixb@sgi.com>
---
 fs/xfs/xfs_log.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 3750f04ede0b..9dbdff3ea484 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -3180,7 +3180,7 @@ try_again:
 STATIC void
 xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog)
 {
-	ASSERT(spin_is_locked(&log->l_icloglock));
+	assert_spin_locked(&log->l_icloglock);
 
 	if (iclog->ic_state == XLOG_STATE_ACTIVE) {
 		xlog_state_switch_iclogs(log, iclog, 0);
-- 
cgit v1.2.3-70-g09d2


From 1ae88b2e446261c038f2c0c3150ffae142b227a2 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 12 Aug 2009 09:12:30 -0400
Subject: NFS: Fix an O_DIRECT Oops...

We can't call nfs_readdata_release()/nfs_writedata_release() without
first initialising and referencing args.context. Doing so inside
nfs_direct_read_schedule_segment()/nfs_direct_write_schedule_segment()
causes an Oops.

We should rather be calling nfs_readdata_free()/nfs_writedata_free() in
those cases.

Looking at the O_DIRECT code, the "struct nfs_direct_req" is already
referencing the nfs_open_context for us. Since the readdata and writedata
structures carry a reference to that, we can simplify things by getting rid
of the extra nfs_open_context references, so that we can replace all
instances of nfs_readdata_release()/nfs_writedata_release().

Reported-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Tested-by: Catalin Marinas <catalin.marinas@arm.com>
Cc: stable@kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/nfs/direct.c        | 20 ++++++++++----------
 fs/nfs/read.c          |  6 ++----
 fs/nfs/write.c         |  6 ++----
 include/linux/nfs_fs.h |  5 ++---
 4 files changed, 16 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 489fc01a3204..e4e089a8f294 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -255,7 +255,7 @@ static void nfs_direct_read_release(void *calldata)
 
 	if (put_dreq(dreq))
 		nfs_direct_complete(dreq);
-	nfs_readdata_release(calldata);
+	nfs_readdata_free(data);
 }
 
 static const struct rpc_call_ops nfs_read_direct_ops = {
@@ -314,14 +314,14 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq,
 					data->npages, 1, 0, data->pagevec, NULL);
 		up_read(&current->mm->mmap_sem);
 		if (result < 0) {
-			nfs_readdata_release(data);
+			nfs_readdata_free(data);
 			break;
 		}
 		if ((unsigned)result < data->npages) {
 			bytes = result * PAGE_SIZE;
 			if (bytes <= pgbase) {
 				nfs_direct_release_pages(data->pagevec, result);
-				nfs_readdata_release(data);
+				nfs_readdata_free(data);
 				break;
 			}
 			bytes -= pgbase;
@@ -334,7 +334,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq,
 		data->inode = inode;
 		data->cred = msg.rpc_cred;
 		data->args.fh = NFS_FH(inode);
-		data->args.context = get_nfs_open_context(ctx);
+		data->args.context = ctx;
 		data->args.offset = pos;
 		data->args.pgbase = pgbase;
 		data->args.pages = data->pagevec;
@@ -441,7 +441,7 @@ static void nfs_direct_free_writedata(struct nfs_direct_req *dreq)
 		struct nfs_write_data *data = list_entry(dreq->rewrite_list.next, struct nfs_write_data, pages);
 		list_del(&data->pages);
 		nfs_direct_release_pages(data->pagevec, data->npages);
-		nfs_writedata_release(data);
+		nfs_writedata_free(data);
 	}
 }
 
@@ -534,7 +534,7 @@ static void nfs_direct_commit_release(void *calldata)
 
 	dprintk("NFS: %5u commit returned %d\n", data->task.tk_pid, status);
 	nfs_direct_write_complete(dreq, data->inode);
-	nfs_commitdata_release(calldata);
+	nfs_commit_free(data);
 }
 
 static const struct rpc_call_ops nfs_commit_direct_ops = {
@@ -570,7 +570,7 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
 	data->args.fh = NFS_FH(data->inode);
 	data->args.offset = 0;
 	data->args.count = 0;
-	data->args.context = get_nfs_open_context(dreq->ctx);
+	data->args.context = dreq->ctx;
 	data->res.count = 0;
 	data->res.fattr = &data->fattr;
 	data->res.verf = &data->verf;
@@ -734,14 +734,14 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq,
 					data->npages, 0, 0, data->pagevec, NULL);
 		up_read(&current->mm->mmap_sem);
 		if (result < 0) {
-			nfs_writedata_release(data);
+			nfs_writedata_free(data);
 			break;
 		}
 		if ((unsigned)result < data->npages) {
 			bytes = result * PAGE_SIZE;
 			if (bytes <= pgbase) {
 				nfs_direct_release_pages(data->pagevec, result);
-				nfs_writedata_release(data);
+				nfs_writedata_free(data);
 				break;
 			}
 			bytes -= pgbase;
@@ -756,7 +756,7 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq,
 		data->inode = inode;
 		data->cred = msg.rpc_cred;
 		data->args.fh = NFS_FH(inode);
-		data->args.context = get_nfs_open_context(ctx);
+		data->args.context = ctx;
 		data->args.offset = pos;
 		data->args.pgbase = pgbase;
 		data->args.pages = data->pagevec;
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 73ea5e8d66ce..12c9e66d3f1d 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -60,17 +60,15 @@ struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount)
 	return p;
 }
 
-static void nfs_readdata_free(struct nfs_read_data *p)
+void nfs_readdata_free(struct nfs_read_data *p)
 {
 	if (p && (p->pagevec != &p->page_array[0]))
 		kfree(p->pagevec);
 	mempool_free(p, nfs_rdata_mempool);
 }
 
-void nfs_readdata_release(void *data)
+static void nfs_readdata_release(struct nfs_read_data *rdata)
 {
-	struct nfs_read_data *rdata = data;
-
 	put_nfs_open_context(rdata->args.context);
 	nfs_readdata_free(rdata);
 }
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 0a0a2ff767c3..a34fae21fe10 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -87,17 +87,15 @@ struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount)
 	return p;
 }
 
-static void nfs_writedata_free(struct nfs_write_data *p)
+void nfs_writedata_free(struct nfs_write_data *p)
 {
 	if (p && (p->pagevec != &p->page_array[0]))
 		kfree(p->pagevec);
 	mempool_free(p, nfs_wdata_mempool);
 }
 
-void nfs_writedata_release(void *data)
+static void nfs_writedata_release(struct nfs_write_data *wdata)
 {
-	struct nfs_write_data *wdata = data;
-
 	put_nfs_open_context(wdata->args.context);
 	nfs_writedata_free(wdata);
 }
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index fdffb413b192..f6b90240dd41 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -473,7 +473,6 @@ extern int  nfs_writepages(struct address_space *, struct writeback_control *);
 extern int  nfs_flush_incompatible(struct file *file, struct page *page);
 extern int  nfs_updatepage(struct file *, struct page *, unsigned int, unsigned int);
 extern int nfs_writeback_done(struct rpc_task *, struct nfs_write_data *);
-extern void nfs_writedata_release(void *);
 
 /*
  * Try to write back everything synchronously (but check the
@@ -488,7 +487,6 @@ extern int nfs_wb_page_cancel(struct inode *inode, struct page* page);
 extern int  nfs_commit_inode(struct inode *, int);
 extern struct nfs_write_data *nfs_commitdata_alloc(void);
 extern void nfs_commit_free(struct nfs_write_data *wdata);
-extern void nfs_commitdata_release(void *wdata);
 #else
 static inline int
 nfs_commit_inode(struct inode *inode, int how)
@@ -507,6 +505,7 @@ nfs_have_writebacks(struct inode *inode)
  * Allocate nfs_write_data structures
  */
 extern struct nfs_write_data *nfs_writedata_alloc(unsigned int npages);
+extern void nfs_writedata_free(struct nfs_write_data *);
 
 /*
  * linux/fs/nfs/read.c
@@ -515,7 +514,6 @@ extern int  nfs_readpage(struct file *, struct page *);
 extern int  nfs_readpages(struct file *, struct address_space *,
 		struct list_head *, unsigned);
 extern int  nfs_readpage_result(struct rpc_task *, struct nfs_read_data *);
-extern void nfs_readdata_release(void *data);
 extern int  nfs_readpage_async(struct nfs_open_context *, struct inode *,
 			       struct page *);
 
@@ -523,6 +521,7 @@ extern int  nfs_readpage_async(struct nfs_open_context *, struct inode *,
  * Allocate nfs_read_data structures
  */
 extern struct nfs_read_data *nfs_readdata_alloc(unsigned int npages);
+extern void nfs_readdata_free(struct nfs_read_data *);
 
 /*
  * linux/fs/nfs3proc.c
-- 
cgit v1.2.3-70-g09d2


From d7e623da1a757fbd8c117fa29190ca8bef14dab3 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Tue, 11 Aug 2009 11:20:11 +0100
Subject: GFS2: Fix permissions on "recover" file

Although this file is only ever written and not read by
userspace, it seems that the utils are opening this
file O_RDWR, so we need to allow that.

Also fixes the whitespace which seemed to be broken.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
Cc: David Teigland <teigland@redhat.com>
---
 fs/gfs2/sys.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 23419dc3027b..a7cbfbd340c7 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -386,16 +386,16 @@ static ssize_t jid_show(struct gfs2_sbd *sdp, char *buf)
 #define GDLM_ATTR(_name,_mode,_show,_store) \
 static struct gfs2_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store)
 
-GDLM_ATTR(proto_name,     0444, proto_name_show,	NULL);
-GDLM_ATTR(block,          0644, block_show,		block_store);
-GDLM_ATTR(withdraw,       0644, withdraw_show,		withdraw_store);
-GDLM_ATTR(id,             0444, lkid_show,		NULL);
-GDLM_ATTR(jid,		  0444, jid_show,		NULL);
-GDLM_ATTR(first,          0444, lkfirst_show,		NULL);
-GDLM_ATTR(first_done,     0444, first_done_show,	NULL);
-GDLM_ATTR(recover,        0200, NULL,			recover_store);
-GDLM_ATTR(recover_done,   0444, recover_done_show,	NULL);
-GDLM_ATTR(recover_status, 0444, recover_status_show,	NULL);
+GDLM_ATTR(proto_name,		0444, proto_name_show,		NULL);
+GDLM_ATTR(block,		0644, block_show,		block_store);
+GDLM_ATTR(withdraw,		0644, withdraw_show,		withdraw_store);
+GDLM_ATTR(id,			0444, lkid_show,		NULL);
+GDLM_ATTR(jid,			0444, jid_show,			NULL);
+GDLM_ATTR(first,		0444, lkfirst_show,		NULL);
+GDLM_ATTR(first_done,		0444, first_done_show,		NULL);
+GDLM_ATTR(recover,		0600, NULL,			recover_store);
+GDLM_ATTR(recover_done,		0444, recover_done_show,	NULL);
+GDLM_ATTR(recover_status,	0444, recover_status_show,	NULL);
 
 static struct attribute *lock_module_attrs[] = {
 	&gdlm_attr_proto_name.attr,
-- 
cgit v1.2.3-70-g09d2


From e75bc1c89e0c7dda0b140408ddee2ffaef7ba6d4 Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Fri, 14 Aug 2009 17:18:54 +0300
Subject: nfs: nfs4xdr: get rid of WRITE32

s/WRITE32/*p++ = cpu_to_be32/

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4xdr.c | 291 +++++++++++++++++++++++++++----------------------------
 1 file changed, 145 insertions(+), 146 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 617273e7d47f..9a03b24ead5f 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -712,7 +712,6 @@ struct compound_hdr {
  * task to translate them into Linux-specific versions which are more
  * consistent with the style used in NFSv2/v3...
  */
-#define WRITE32(n)               *p++ = htonl(n)
 #define WRITE64(n)               do {				\
 	*p++ = htonl((uint32_t)((n) >> 32));				\
 	*p++ = htonl((uint32_t)(n));					\
@@ -750,11 +749,11 @@ static void encode_compound_hdr(struct xdr_stream *xdr,
 	dprintk("encode_compound: tag=%.*s\n", (int)hdr->taglen, hdr->tag);
 	BUG_ON(hdr->taglen > NFS4_MAXTAGLEN);
 	RESERVE_SPACE(12+(XDR_QUADLEN(hdr->taglen)<<2));
-	WRITE32(hdr->taglen);
+	*p++ = cpu_to_be32(hdr->taglen);
 	WRITEMEM(hdr->tag, hdr->taglen);
-	WRITE32(hdr->minorversion);
+	*p++ = cpu_to_be32(hdr->minorversion);
 	hdr->nops_p = p;
-	WRITE32(hdr->nops);
+	*p++ = cpu_to_be32(hdr->nops);
 }
 
 static void encode_nops(struct compound_hdr *hdr)
@@ -835,7 +834,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
 	 * We write the bitmap length now, but leave the bitmap and the attribute
 	 * buffer length to be backfilled at the end of this routine.
 	 */
-	WRITE32(2);
+	*p++ = cpu_to_be32(2);
 	q = p;
 	p += 3;
 
@@ -845,39 +844,39 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
 	}
 	if (iap->ia_valid & ATTR_MODE) {
 		bmval1 |= FATTR4_WORD1_MODE;
-		WRITE32(iap->ia_mode & S_IALLUGO);
+		*p++ = cpu_to_be32(iap->ia_mode & S_IALLUGO);
 	}
 	if (iap->ia_valid & ATTR_UID) {
 		bmval1 |= FATTR4_WORD1_OWNER;
-		WRITE32(owner_namelen);
+		*p++ = cpu_to_be32(owner_namelen);
 		WRITEMEM(owner_name, owner_namelen);
 	}
 	if (iap->ia_valid & ATTR_GID) {
 		bmval1 |= FATTR4_WORD1_OWNER_GROUP;
-		WRITE32(owner_grouplen);
+		*p++ = cpu_to_be32(owner_grouplen);
 		WRITEMEM(owner_group, owner_grouplen);
 	}
 	if (iap->ia_valid & ATTR_ATIME_SET) {
 		bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET;
-		WRITE32(NFS4_SET_TO_CLIENT_TIME);
-		WRITE32(0);
-		WRITE32(iap->ia_mtime.tv_sec);
-		WRITE32(iap->ia_mtime.tv_nsec);
+		*p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME);
+		*p++ = cpu_to_be32(0);
+		*p++ = cpu_to_be32(iap->ia_mtime.tv_sec);
+		*p++ = cpu_to_be32(iap->ia_mtime.tv_nsec);
 	}
 	else if (iap->ia_valid & ATTR_ATIME) {
 		bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET;
-		WRITE32(NFS4_SET_TO_SERVER_TIME);
+		*p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME);
 	}
 	if (iap->ia_valid & ATTR_MTIME_SET) {
 		bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET;
-		WRITE32(NFS4_SET_TO_CLIENT_TIME);
-		WRITE32(0);
-		WRITE32(iap->ia_mtime.tv_sec);
-		WRITE32(iap->ia_mtime.tv_nsec);
+		*p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME);
+		*p++ = cpu_to_be32(0);
+		*p++ = cpu_to_be32(iap->ia_mtime.tv_sec);
+		*p++ = cpu_to_be32(iap->ia_mtime.tv_nsec);
 	}
 	else if (iap->ia_valid & ATTR_MTIME) {
 		bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET;
-		WRITE32(NFS4_SET_TO_SERVER_TIME);
+		*p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME);
 	}
 
 	/*
@@ -901,8 +900,8 @@ static void encode_access(struct xdr_stream *xdr, u32 access, struct compound_hd
 	__be32 *p;
 
 	RESERVE_SPACE(8);
-	WRITE32(OP_ACCESS);
-	WRITE32(access);
+	*p++ = cpu_to_be32(OP_ACCESS);
+	*p++ = cpu_to_be32(access);
 	hdr->nops++;
 	hdr->replen += decode_access_maxsz;
 }
@@ -912,8 +911,8 @@ static void encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg
 	__be32 *p;
 
 	RESERVE_SPACE(8+NFS4_STATEID_SIZE);
-	WRITE32(OP_CLOSE);
-	WRITE32(arg->seqid->sequence->counter);
+	*p++ = cpu_to_be32(OP_CLOSE);
+	*p++ = cpu_to_be32(arg->seqid->sequence->counter);
 	WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE);
 	hdr->nops++;
 	hdr->replen += decode_close_maxsz;
@@ -924,9 +923,9 @@ static void encode_commit(struct xdr_stream *xdr, const struct nfs_writeargs *ar
 	__be32 *p;
 
 	RESERVE_SPACE(16);
-	WRITE32(OP_COMMIT);
+	*p++ = cpu_to_be32(OP_COMMIT);
 	WRITE64(args->offset);
-	WRITE32(args->count);
+	*p++ = cpu_to_be32(args->count);
 	hdr->nops++;
 	hdr->replen += decode_commit_maxsz;
 }
@@ -936,20 +935,20 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *
 	__be32 *p;
 
 	RESERVE_SPACE(8);
-	WRITE32(OP_CREATE);
-	WRITE32(create->ftype);
+	*p++ = cpu_to_be32(OP_CREATE);
+	*p++ = cpu_to_be32(create->ftype);
 
 	switch (create->ftype) {
 	case NF4LNK:
 		RESERVE_SPACE(4);
-		WRITE32(create->u.symlink.len);
+		*p++ = cpu_to_be32(create->u.symlink.len);
 		xdr_write_pages(xdr, create->u.symlink.pages, 0, create->u.symlink.len);
 		break;
 
 	case NF4BLK: case NF4CHR:
 		RESERVE_SPACE(8);
-		WRITE32(create->u.device.specdata1);
-		WRITE32(create->u.device.specdata2);
+		*p++ = cpu_to_be32(create->u.device.specdata1);
+		*p++ = cpu_to_be32(create->u.device.specdata2);
 		break;
 
 	default:
@@ -957,7 +956,7 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *
 	}
 
 	RESERVE_SPACE(4 + create->name->len);
-	WRITE32(create->name->len);
+	*p++ = cpu_to_be32(create->name->len);
 	WRITEMEM(create->name->name, create->name->len);
 	hdr->nops++;
 	hdr->replen += decode_create_maxsz;
@@ -970,9 +969,9 @@ static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct c
 	__be32 *p;
 
 	RESERVE_SPACE(12);
-	WRITE32(OP_GETATTR);
-	WRITE32(1);
-	WRITE32(bitmap);
+	*p++ = cpu_to_be32(OP_GETATTR);
+	*p++ = cpu_to_be32(1);
+	*p++ = cpu_to_be32(bitmap);
 	hdr->nops++;
 	hdr->replen += decode_getattr_maxsz;
 }
@@ -982,10 +981,10 @@ static void encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm
 	__be32 *p;
 
 	RESERVE_SPACE(16);
-	WRITE32(OP_GETATTR);
-	WRITE32(2);
-	WRITE32(bm0);
-	WRITE32(bm1);
+	*p++ = cpu_to_be32(OP_GETATTR);
+	*p++ = cpu_to_be32(2);
+	*p++ = cpu_to_be32(bm0);
+	*p++ = cpu_to_be32(bm1);
 	hdr->nops++;
 	hdr->replen += decode_getattr_maxsz;
 }
@@ -1013,7 +1012,7 @@ static void encode_getfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
 	__be32 *p;
 
 	RESERVE_SPACE(4);
-	WRITE32(OP_GETFH);
+	*p++ = cpu_to_be32(OP_GETFH);
 	hdr->nops++;
 	hdr->replen += decode_getfh_maxsz;
 }
@@ -1023,8 +1022,8 @@ static void encode_link(struct xdr_stream *xdr, const struct qstr *name, struct
 	__be32 *p;
 
 	RESERVE_SPACE(8 + name->len);
-	WRITE32(OP_LINK);
-	WRITE32(name->len);
+	*p++ = cpu_to_be32(OP_LINK);
+	*p++ = cpu_to_be32(name->len);
 	WRITEMEM(name->name, name->len);
 	hdr->nops++;
 	hdr->replen += decode_link_maxsz;
@@ -1053,26 +1052,26 @@ static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args
 	__be32 *p;
 
 	RESERVE_SPACE(32);
-	WRITE32(OP_LOCK);
-	WRITE32(nfs4_lock_type(args->fl, args->block));
-	WRITE32(args->reclaim);
+	*p++ = cpu_to_be32(OP_LOCK);
+	*p++ = cpu_to_be32(nfs4_lock_type(args->fl, args->block));
+	*p++ = cpu_to_be32(args->reclaim);
 	WRITE64(args->fl->fl_start);
 	WRITE64(nfs4_lock_length(args->fl));
-	WRITE32(args->new_lock_owner);
+	*p++ = cpu_to_be32(args->new_lock_owner);
 	if (args->new_lock_owner){
 		RESERVE_SPACE(4+NFS4_STATEID_SIZE+32);
-		WRITE32(args->open_seqid->sequence->counter);
+		*p++ = cpu_to_be32(args->open_seqid->sequence->counter);
 		WRITEMEM(args->open_stateid->data, NFS4_STATEID_SIZE);
-		WRITE32(args->lock_seqid->sequence->counter);
+		*p++ = cpu_to_be32(args->lock_seqid->sequence->counter);
 		WRITE64(args->lock_owner.clientid);
-		WRITE32(16);
+		*p++ = cpu_to_be32(16);
 		WRITEMEM("lock id:", 8);
 		WRITE64(args->lock_owner.id);
 	}
 	else {
 		RESERVE_SPACE(NFS4_STATEID_SIZE+4);
 		WRITEMEM(args->lock_stateid->data, NFS4_STATEID_SIZE);
-		WRITE32(args->lock_seqid->sequence->counter);
+		*p++ = cpu_to_be32(args->lock_seqid->sequence->counter);
 	}
 	hdr->nops++;
 	hdr->replen += decode_lock_maxsz;
@@ -1083,12 +1082,12 @@ static void encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *ar
 	__be32 *p;
 
 	RESERVE_SPACE(52);
-	WRITE32(OP_LOCKT);
-	WRITE32(nfs4_lock_type(args->fl, 0));
+	*p++ = cpu_to_be32(OP_LOCKT);
+	*p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0));
 	WRITE64(args->fl->fl_start);
 	WRITE64(nfs4_lock_length(args->fl));
 	WRITE64(args->lock_owner.clientid);
-	WRITE32(16);
+	*p++ = cpu_to_be32(16);
 	WRITEMEM("lock id:", 8);
 	WRITE64(args->lock_owner.id);
 	hdr->nops++;
@@ -1100,9 +1099,9 @@ static void encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *ar
 	__be32 *p;
 
 	RESERVE_SPACE(12+NFS4_STATEID_SIZE+16);
-	WRITE32(OP_LOCKU);
-	WRITE32(nfs4_lock_type(args->fl, 0));
-	WRITE32(args->seqid->sequence->counter);
+	*p++ = cpu_to_be32(OP_LOCKU);
+	*p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0));
+	*p++ = cpu_to_be32(args->seqid->sequence->counter);
 	WRITEMEM(args->stateid->data, NFS4_STATEID_SIZE);
 	WRITE64(args->fl->fl_start);
 	WRITE64(nfs4_lock_length(args->fl));
@@ -1116,8 +1115,8 @@ static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struc
 	__be32 *p;
 
 	RESERVE_SPACE(8 + len);
-	WRITE32(OP_LOOKUP);
-	WRITE32(len);
+	*p++ = cpu_to_be32(OP_LOOKUP);
+	*p++ = cpu_to_be32(len);
 	WRITEMEM(name->name, len);
 	hdr->nops++;
 	hdr->replen += decode_lookup_maxsz;
@@ -1130,18 +1129,18 @@ static void encode_share_access(struct xdr_stream *xdr, fmode_t fmode)
 	RESERVE_SPACE(8);
 	switch (fmode & (FMODE_READ|FMODE_WRITE)) {
 	case FMODE_READ:
-		WRITE32(NFS4_SHARE_ACCESS_READ);
+		*p++ = cpu_to_be32(NFS4_SHARE_ACCESS_READ);
 		break;
 	case FMODE_WRITE:
-		WRITE32(NFS4_SHARE_ACCESS_WRITE);
+		*p++ = cpu_to_be32(NFS4_SHARE_ACCESS_WRITE);
 		break;
 	case FMODE_READ|FMODE_WRITE:
-		WRITE32(NFS4_SHARE_ACCESS_BOTH);
+		*p++ = cpu_to_be32(NFS4_SHARE_ACCESS_BOTH);
 		break;
 	default:
-		WRITE32(0);
+		*p++ = cpu_to_be32(0);
 	}
-	WRITE32(0);		/* for linux, share_deny = 0 always */
+	*p++ = cpu_to_be32(0);		/* for linux, share_deny = 0 always */
 }
 
 static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_openargs *arg)
@@ -1152,12 +1151,12 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena
  * owner 4 = 32
  */
 	RESERVE_SPACE(8);
-	WRITE32(OP_OPEN);
-	WRITE32(arg->seqid->sequence->counter);
+	*p++ = cpu_to_be32(OP_OPEN);
+	*p++ = cpu_to_be32(arg->seqid->sequence->counter);
 	encode_share_access(xdr, arg->fmode);
 	RESERVE_SPACE(28);
 	WRITE64(arg->clientid);
-	WRITE32(16);
+	*p++ = cpu_to_be32(16);
 	WRITEMEM("open id:", 8);
 	WRITE64(arg->id);
 }
@@ -1169,11 +1168,11 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op
 	RESERVE_SPACE(4);
 	switch(arg->open_flags & O_EXCL) {
 	case 0:
-		WRITE32(NFS4_CREATE_UNCHECKED);
+		*p++ = cpu_to_be32(NFS4_CREATE_UNCHECKED);
 		encode_attrs(xdr, arg->u.attrs, arg->server);
 		break;
 	default:
-		WRITE32(NFS4_CREATE_EXCLUSIVE);
+		*p++ = cpu_to_be32(NFS4_CREATE_EXCLUSIVE);
 		encode_nfs4_verifier(xdr, &arg->u.verifier);
 	}
 }
@@ -1185,11 +1184,11 @@ static void encode_opentype(struct xdr_stream *xdr, const struct nfs_openargs *a
 	RESERVE_SPACE(4);
 	switch (arg->open_flags & O_CREAT) {
 	case 0:
-		WRITE32(NFS4_OPEN_NOCREATE);
+		*p++ = cpu_to_be32(NFS4_OPEN_NOCREATE);
 		break;
 	default:
 		BUG_ON(arg->claim != NFS4_OPEN_CLAIM_NULL);
-		WRITE32(NFS4_OPEN_CREATE);
+		*p++ = cpu_to_be32(NFS4_OPEN_CREATE);
 		encode_createmode(xdr, arg);
 	}
 }
@@ -1201,13 +1200,13 @@ static inline void encode_delegation_type(struct xdr_stream *xdr, fmode_t delega
 	RESERVE_SPACE(4);
 	switch (delegation_type) {
 	case 0:
-		WRITE32(NFS4_OPEN_DELEGATE_NONE);
+		*p++ = cpu_to_be32(NFS4_OPEN_DELEGATE_NONE);
 		break;
 	case FMODE_READ:
-		WRITE32(NFS4_OPEN_DELEGATE_READ);
+		*p++ = cpu_to_be32(NFS4_OPEN_DELEGATE_READ);
 		break;
 	case FMODE_WRITE|FMODE_READ:
-		WRITE32(NFS4_OPEN_DELEGATE_WRITE);
+		*p++ = cpu_to_be32(NFS4_OPEN_DELEGATE_WRITE);
 		break;
 	default:
 		BUG();
@@ -1219,7 +1218,7 @@ static inline void encode_claim_null(struct xdr_stream *xdr, const struct qstr *
 	__be32 *p;
 
 	RESERVE_SPACE(4);
-	WRITE32(NFS4_OPEN_CLAIM_NULL);
+	*p++ = cpu_to_be32(NFS4_OPEN_CLAIM_NULL);
 	encode_string(xdr, name->len, name->name);
 }
 
@@ -1228,7 +1227,7 @@ static inline void encode_claim_previous(struct xdr_stream *xdr, fmode_t type)
 	__be32 *p;
 
 	RESERVE_SPACE(4);
-	WRITE32(NFS4_OPEN_CLAIM_PREVIOUS);
+	*p++ = cpu_to_be32(NFS4_OPEN_CLAIM_PREVIOUS);
 	encode_delegation_type(xdr, type);
 }
 
@@ -1237,7 +1236,7 @@ static inline void encode_claim_delegate_cur(struct xdr_stream *xdr, const struc
 	__be32 *p;
 
 	RESERVE_SPACE(4+NFS4_STATEID_SIZE);
-	WRITE32(NFS4_OPEN_CLAIM_DELEGATE_CUR);
+	*p++ = cpu_to_be32(NFS4_OPEN_CLAIM_DELEGATE_CUR);
 	WRITEMEM(stateid->data, NFS4_STATEID_SIZE);
 	encode_string(xdr, name->len, name->name);
 }
@@ -1268,9 +1267,9 @@ static void encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_co
 	__be32 *p;
 
 	RESERVE_SPACE(4+NFS4_STATEID_SIZE+4);
-	WRITE32(OP_OPEN_CONFIRM);
+	*p++ = cpu_to_be32(OP_OPEN_CONFIRM);
 	WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE);
-	WRITE32(arg->seqid->sequence->counter);
+	*p++ = cpu_to_be32(arg->seqid->sequence->counter);
 	hdr->nops++;
 	hdr->replen += decode_open_confirm_maxsz;
 }
@@ -1280,9 +1279,9 @@ static void encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_close
 	__be32 *p;
 
 	RESERVE_SPACE(4+NFS4_STATEID_SIZE+4);
-	WRITE32(OP_OPEN_DOWNGRADE);
+	*p++ = cpu_to_be32(OP_OPEN_DOWNGRADE);
 	WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE);
-	WRITE32(arg->seqid->sequence->counter);
+	*p++ = cpu_to_be32(arg->seqid->sequence->counter);
 	encode_share_access(xdr, arg->fmode);
 	hdr->nops++;
 	hdr->replen += decode_open_downgrade_maxsz;
@@ -1295,8 +1294,8 @@ encode_putfh(struct xdr_stream *xdr, const struct nfs_fh *fh, struct compound_hd
 	__be32 *p;
 
 	RESERVE_SPACE(8 + len);
-	WRITE32(OP_PUTFH);
-	WRITE32(len);
+	*p++ = cpu_to_be32(OP_PUTFH);
+	*p++ = cpu_to_be32(len);
 	WRITEMEM(fh->data, len);
 	hdr->nops++;
 	hdr->replen += decode_putfh_maxsz;
@@ -1307,7 +1306,7 @@ static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
 	__be32 *p;
 
 	RESERVE_SPACE(4);
-	WRITE32(OP_PUTROOTFH);
+	*p++ = cpu_to_be32(OP_PUTROOTFH);
 	hdr->nops++;
 	hdr->replen += decode_putrootfh_maxsz;
 }
@@ -1330,13 +1329,13 @@ static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args,
 	__be32 *p;
 
 	RESERVE_SPACE(4);
-	WRITE32(OP_READ);
+	*p++ = cpu_to_be32(OP_READ);
 
 	encode_stateid(xdr, args->context);
 
 	RESERVE_SPACE(12);
 	WRITE64(args->offset);
-	WRITE32(args->count);
+	*p++ = cpu_to_be32(args->count);
 	hdr->nops++;
 	hdr->replen += decode_read_maxsz;
 }
@@ -1350,19 +1349,19 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
 	__be32 *p;
 
 	RESERVE_SPACE(12+NFS4_VERIFIER_SIZE+20);
-	WRITE32(OP_READDIR);
+	*p++ = cpu_to_be32(OP_READDIR);
 	WRITE64(readdir->cookie);
 	WRITEMEM(readdir->verifier.data, NFS4_VERIFIER_SIZE);
-	WRITE32(readdir->count >> 1);  /* We're not doing readdirplus */
-	WRITE32(readdir->count);
-	WRITE32(2);
+	*p++ = cpu_to_be32(readdir->count >> 1);  /* We're not doing readdirplus */
+	*p++ = cpu_to_be32(readdir->count);
+	*p++ = cpu_to_be32(2);
 	/* Switch to mounted_on_fileid if the server supports it */
 	if (readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)
 		attrs[0] &= ~FATTR4_WORD0_FILEID;
 	else
 		attrs[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
-	WRITE32(attrs[0] & readdir->bitmask[0]);
-	WRITE32(attrs[1] & readdir->bitmask[1]);
+	*p++ = cpu_to_be32(attrs[0] & readdir->bitmask[0]);
+	*p++ = cpu_to_be32(attrs[1] & readdir->bitmask[1]);
 	hdr->nops++;
 	hdr->replen += decode_readdir_maxsz;
 	dprintk("%s: cookie = %Lu, verifier = %08x:%08x, bitmap = %08x:%08x\n",
@@ -1379,7 +1378,7 @@ static void encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink *
 	__be32 *p;
 
 	RESERVE_SPACE(4);
-	WRITE32(OP_READLINK);
+	*p++ = cpu_to_be32(OP_READLINK);
 	hdr->nops++;
 	hdr->replen += decode_readlink_maxsz;
 }
@@ -1389,8 +1388,8 @@ static void encode_remove(struct xdr_stream *xdr, const struct qstr *name, struc
 	__be32 *p;
 
 	RESERVE_SPACE(8 + name->len);
-	WRITE32(OP_REMOVE);
-	WRITE32(name->len);
+	*p++ = cpu_to_be32(OP_REMOVE);
+	*p++ = cpu_to_be32(name->len);
 	WRITEMEM(name->name, name->len);
 	hdr->nops++;
 	hdr->replen += decode_remove_maxsz;
@@ -1401,12 +1400,12 @@ static void encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, co
 	__be32 *p;
 
 	RESERVE_SPACE(8 + oldname->len);
-	WRITE32(OP_RENAME);
-	WRITE32(oldname->len);
+	*p++ = cpu_to_be32(OP_RENAME);
+	*p++ = cpu_to_be32(oldname->len);
 	WRITEMEM(oldname->name, oldname->len);
 
 	RESERVE_SPACE(4 + newname->len);
-	WRITE32(newname->len);
+	*p++ = cpu_to_be32(newname->len);
 	WRITEMEM(newname->name, newname->len);
 	hdr->nops++;
 	hdr->replen += decode_rename_maxsz;
@@ -1417,7 +1416,7 @@ static void encode_renew(struct xdr_stream *xdr, const struct nfs_client *client
 	__be32 *p;
 
 	RESERVE_SPACE(12);
-	WRITE32(OP_RENEW);
+	*p++ = cpu_to_be32(OP_RENEW);
 	WRITE64(client_stateid->cl_clientid);
 	hdr->nops++;
 	hdr->replen += decode_renew_maxsz;
@@ -1429,7 +1428,7 @@ encode_restorefh(struct xdr_stream *xdr, struct compound_hdr *hdr)
 	__be32 *p;
 
 	RESERVE_SPACE(4);
-	WRITE32(OP_RESTOREFH);
+	*p++ = cpu_to_be32(OP_RESTOREFH);
 	hdr->nops++;
 	hdr->replen += decode_restorefh_maxsz;
 }
@@ -1440,15 +1439,15 @@ encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compoun
 	__be32 *p;
 
 	RESERVE_SPACE(4+NFS4_STATEID_SIZE);
-	WRITE32(OP_SETATTR);
+	*p++ = cpu_to_be32(OP_SETATTR);
 	WRITEMEM(zero_stateid.data, NFS4_STATEID_SIZE);
 	RESERVE_SPACE(2*4);
-	WRITE32(1);
-	WRITE32(FATTR4_WORD0_ACL);
+	*p++ = cpu_to_be32(1);
+	*p++ = cpu_to_be32(FATTR4_WORD0_ACL);
 	if (arg->acl_len % 4)
 		return -EINVAL;
 	RESERVE_SPACE(4);
-	WRITE32(arg->acl_len);
+	*p++ = cpu_to_be32(arg->acl_len);
 	xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len);
 	hdr->nops++;
 	hdr->replen += decode_setacl_maxsz;
@@ -1461,7 +1460,7 @@ encode_savefh(struct xdr_stream *xdr, struct compound_hdr *hdr)
 	__be32 *p;
 
 	RESERVE_SPACE(4);
-	WRITE32(OP_SAVEFH);
+	*p++ = cpu_to_be32(OP_SAVEFH);
 	hdr->nops++;
 	hdr->replen += decode_savefh_maxsz;
 }
@@ -1471,7 +1470,7 @@ static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs
 	__be32 *p;
 
 	RESERVE_SPACE(4+NFS4_STATEID_SIZE);
-	WRITE32(OP_SETATTR);
+	*p++ = cpu_to_be32(OP_SETATTR);
 	WRITEMEM(arg->stateid.data, NFS4_STATEID_SIZE);
 	hdr->nops++;
 	hdr->replen += decode_setattr_maxsz;
@@ -1483,16 +1482,16 @@ static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclie
 	__be32 *p;
 
 	RESERVE_SPACE(4 + NFS4_VERIFIER_SIZE);
-	WRITE32(OP_SETCLIENTID);
+	*p++ = cpu_to_be32(OP_SETCLIENTID);
 	WRITEMEM(setclientid->sc_verifier->data, NFS4_VERIFIER_SIZE);
 
 	encode_string(xdr, setclientid->sc_name_len, setclientid->sc_name);
 	RESERVE_SPACE(4);
-	WRITE32(setclientid->sc_prog);
+	*p++ = cpu_to_be32(setclientid->sc_prog);
 	encode_string(xdr, setclientid->sc_netid_len, setclientid->sc_netid);
 	encode_string(xdr, setclientid->sc_uaddr_len, setclientid->sc_uaddr);
 	RESERVE_SPACE(4);
-	WRITE32(setclientid->sc_cb_ident);
+	*p++ = cpu_to_be32(setclientid->sc_cb_ident);
 	hdr->nops++;
 	hdr->replen += decode_setclientid_maxsz;
 }
@@ -1502,7 +1501,7 @@ static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs_
 	__be32 *p;
 
 	RESERVE_SPACE(12 + NFS4_VERIFIER_SIZE);
-	WRITE32(OP_SETCLIENTID_CONFIRM);
+	*p++ = cpu_to_be32(OP_SETCLIENTID_CONFIRM);
 	WRITE64(client_state->cl_clientid);
 	WRITEMEM(client_state->cl_confirm.data, NFS4_VERIFIER_SIZE);
 	hdr->nops++;
@@ -1514,14 +1513,14 @@ static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *arg
 	__be32 *p;
 
 	RESERVE_SPACE(4);
-	WRITE32(OP_WRITE);
+	*p++ = cpu_to_be32(OP_WRITE);
 
 	encode_stateid(xdr, args->context);
 
 	RESERVE_SPACE(16);
 	WRITE64(args->offset);
-	WRITE32(args->stable);
-	WRITE32(args->count);
+	*p++ = cpu_to_be32(args->stable);
+	*p++ = cpu_to_be32(args->count);
 
 	xdr_write_pages(xdr, args->pages, args->pgbase, args->count);
 	hdr->nops++;
@@ -1534,7 +1533,7 @@ static void encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *state
 
 	RESERVE_SPACE(4+NFS4_STATEID_SIZE);
 
-	WRITE32(OP_DELEGRETURN);
+	*p++ = cpu_to_be32(OP_DELEGRETURN);
 	WRITEMEM(stateid->data, NFS4_STATEID_SIZE);
 	hdr->nops++;
 	hdr->replen += decode_delegreturn_maxsz;
@@ -1549,15 +1548,15 @@ static void encode_exchange_id(struct xdr_stream *xdr,
 	__be32 *p;
 
 	RESERVE_SPACE(4 + sizeof(args->verifier->data));
-	WRITE32(OP_EXCHANGE_ID);
+	*p++ = cpu_to_be32(OP_EXCHANGE_ID);
 	WRITEMEM(args->verifier->data, sizeof(args->verifier->data));
 
 	encode_string(xdr, args->id_len, args->id);
 
 	RESERVE_SPACE(12);
-	WRITE32(args->flags);
-	WRITE32(0);	/* zero length state_protect4_a */
-	WRITE32(0);	/* zero length implementation id array */
+	*p++ = cpu_to_be32(args->flags);
+	*p++ = cpu_to_be32(0);	/* zero length state_protect4_a */
+	*p++ = cpu_to_be32(0);	/* zero length implementation id array */
 	hdr->nops++;
 	hdr->replen += decode_exchange_id_maxsz;
 }
@@ -1572,54 +1571,54 @@ static void encode_create_session(struct xdr_stream *xdr,
 	struct nfs_client *clp = args->client;
 
 	RESERVE_SPACE(4);
-	WRITE32(OP_CREATE_SESSION);
+	*p++ = cpu_to_be32(OP_CREATE_SESSION);
 
 	RESERVE_SPACE(8);
 	WRITE64(clp->cl_ex_clid);
 
 	RESERVE_SPACE(8);
-	WRITE32(clp->cl_seqid);			/*Sequence id */
-	WRITE32(args->flags);			/*flags */
+	*p++ = cpu_to_be32(clp->cl_seqid);			/*Sequence id */
+	*p++ = cpu_to_be32(args->flags);			/*flags */
 
 	RESERVE_SPACE(2*28);			/* 2 channel_attrs */
 	/* Fore Channel */
-	WRITE32(args->fc_attrs.headerpadsz);	/* header padding size */
-	WRITE32(args->fc_attrs.max_rqst_sz);	/* max req size */
-	WRITE32(args->fc_attrs.max_resp_sz);	/* max resp size */
-	WRITE32(args->fc_attrs.max_resp_sz_cached);	/* Max resp sz cached */
-	WRITE32(args->fc_attrs.max_ops);	/* max operations */
-	WRITE32(args->fc_attrs.max_reqs);	/* max requests */
-	WRITE32(0);				/* rdmachannel_attrs */
+	*p++ = cpu_to_be32(args->fc_attrs.headerpadsz);	/* header padding size */
+	*p++ = cpu_to_be32(args->fc_attrs.max_rqst_sz);	/* max req size */
+	*p++ = cpu_to_be32(args->fc_attrs.max_resp_sz);	/* max resp size */
+	*p++ = cpu_to_be32(args->fc_attrs.max_resp_sz_cached);	/* Max resp sz cached */
+	*p++ = cpu_to_be32(args->fc_attrs.max_ops);	/* max operations */
+	*p++ = cpu_to_be32(args->fc_attrs.max_reqs);	/* max requests */
+	*p++ = cpu_to_be32(0);				/* rdmachannel_attrs */
 
 	/* Back Channel */
-	WRITE32(args->fc_attrs.headerpadsz);	/* header padding size */
-	WRITE32(args->bc_attrs.max_rqst_sz);	/* max req size */
-	WRITE32(args->bc_attrs.max_resp_sz);	/* max resp size */
-	WRITE32(args->bc_attrs.max_resp_sz_cached);	/* Max resp sz cached */
-	WRITE32(args->bc_attrs.max_ops);	/* max operations */
-	WRITE32(args->bc_attrs.max_reqs);	/* max requests */
-	WRITE32(0);				/* rdmachannel_attrs */
+	*p++ = cpu_to_be32(args->fc_attrs.headerpadsz);	/* header padding size */
+	*p++ = cpu_to_be32(args->bc_attrs.max_rqst_sz);	/* max req size */
+	*p++ = cpu_to_be32(args->bc_attrs.max_resp_sz);	/* max resp size */
+	*p++ = cpu_to_be32(args->bc_attrs.max_resp_sz_cached);	/* Max resp sz cached */
+	*p++ = cpu_to_be32(args->bc_attrs.max_ops);	/* max operations */
+	*p++ = cpu_to_be32(args->bc_attrs.max_reqs);	/* max requests */
+	*p++ = cpu_to_be32(0);				/* rdmachannel_attrs */
 
 	RESERVE_SPACE(4);
-	WRITE32(args->cb_program);		/* cb_program */
+	*p++ = cpu_to_be32(args->cb_program);		/* cb_program */
 
 	RESERVE_SPACE(4);			/* # of security flavors */
-	WRITE32(1);
+	*p++ = cpu_to_be32(1);
 
 	RESERVE_SPACE(4);
-	WRITE32(RPC_AUTH_UNIX);			/* auth_sys */
+	*p++ = cpu_to_be32(RPC_AUTH_UNIX);			/* auth_sys */
 
 	/* authsys_parms rfc1831 */
 	RESERVE_SPACE(4);
-	WRITE32((u32)clp->cl_boot_time.tv_nsec);	/* stamp */
+	*p++ = cpu_to_be32((u32)clp->cl_boot_time.tv_nsec);	/* stamp */
 	len = scnprintf(machine_name, sizeof(machine_name), "%s",
 			clp->cl_ipaddr);
 	RESERVE_SPACE(16 + len);
-	WRITE32(len);
+	*p++ = cpu_to_be32(len);
 	WRITEMEM(machine_name, len);
-	WRITE32(0);				/* UID */
-	WRITE32(0);				/* GID */
-	WRITE32(0);				/* No more gids */
+	*p++ = cpu_to_be32(0);				/* UID */
+	*p++ = cpu_to_be32(0);				/* GID */
+	*p++ = cpu_to_be32(0);				/* No more gids */
 	hdr->nops++;
 	hdr->replen += decode_create_session_maxsz;
 }
@@ -1630,7 +1629,7 @@ static void encode_destroy_session(struct xdr_stream *xdr,
 {
 	__be32 *p;
 	RESERVE_SPACE(4 + NFS4_MAX_SESSIONID_LEN);
-	WRITE32(OP_DESTROY_SESSION);
+	*p++ = cpu_to_be32(OP_DESTROY_SESSION);
 	WRITEMEM(session->sess_id.data, NFS4_MAX_SESSIONID_LEN);
 	hdr->nops++;
 	hdr->replen += decode_destroy_session_maxsz;
@@ -1656,7 +1655,7 @@ static void encode_sequence(struct xdr_stream *xdr,
 	slot = tp->slots + args->sa_slotid;
 
 	RESERVE_SPACE(4);
-	WRITE32(OP_SEQUENCE);
+	*p++ = cpu_to_be32(OP_SEQUENCE);
 
 	/*
 	 * Sessionid + seqid + slotid + max slotid + cache_this
@@ -1672,10 +1671,10 @@ static void encode_sequence(struct xdr_stream *xdr,
 		tp->highest_used_slotid, args->sa_cache_this);
 	RESERVE_SPACE(NFS4_MAX_SESSIONID_LEN + 16);
 	WRITEMEM(session->sess_id.data, NFS4_MAX_SESSIONID_LEN);
-	WRITE32(slot->seq_nr);
-	WRITE32(args->sa_slotid);
-	WRITE32(tp->highest_used_slotid);
-	WRITE32(args->sa_cache_this);
+	*p++ = cpu_to_be32(slot->seq_nr);
+	*p++ = cpu_to_be32(args->sa_slotid);
+	*p++ = cpu_to_be32(tp->highest_used_slotid);
+	*p++ = cpu_to_be32(args->sa_cache_this);
 	hdr->nops++;
 	hdr->replen += decode_sequence_maxsz;
 #endif /* CONFIG_NFS_V4_1 */
-- 
cgit v1.2.3-70-g09d2


From b95be5a976848febff82edb21d5b4351b3997bf6 Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Fri, 14 Aug 2009 17:19:01 +0300
Subject: nfs: nfs4xdr: get rid of WRITE64

s/WRITE64/p = xdr_encode_hyper(p, /

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4xdr.c | 44 ++++++++++++++++++++------------------------
 1 file changed, 20 insertions(+), 24 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 9a03b24ead5f..5d80766fc672 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -712,10 +712,6 @@ struct compound_hdr {
  * task to translate them into Linux-specific versions which are more
  * consistent with the style used in NFSv2/v3...
  */
-#define WRITE64(n)               do {				\
-	*p++ = htonl((uint32_t)((n) >> 32));				\
-	*p++ = htonl((uint32_t)(n));					\
-} while (0)
 #define WRITEMEM(ptr,nbytes)     do {				\
 	p = xdr_encode_opaque_fixed(p, ptr, nbytes);		\
 } while (0)
@@ -840,7 +836,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
 
 	if (iap->ia_valid & ATTR_SIZE) {
 		bmval0 |= FATTR4_WORD0_SIZE;
-		WRITE64(iap->ia_size);
+		p = xdr_encode_hyper(p, iap->ia_size);
 	}
 	if (iap->ia_valid & ATTR_MODE) {
 		bmval1 |= FATTR4_WORD1_MODE;
@@ -924,7 +920,7 @@ static void encode_commit(struct xdr_stream *xdr, const struct nfs_writeargs *ar
 
 	RESERVE_SPACE(16);
 	*p++ = cpu_to_be32(OP_COMMIT);
-	WRITE64(args->offset);
+	p = xdr_encode_hyper(p, args->offset);
 	*p++ = cpu_to_be32(args->count);
 	hdr->nops++;
 	hdr->replen += decode_commit_maxsz;
@@ -1055,18 +1051,18 @@ static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args
 	*p++ = cpu_to_be32(OP_LOCK);
 	*p++ = cpu_to_be32(nfs4_lock_type(args->fl, args->block));
 	*p++ = cpu_to_be32(args->reclaim);
-	WRITE64(args->fl->fl_start);
-	WRITE64(nfs4_lock_length(args->fl));
+	p = xdr_encode_hyper(p, args->fl->fl_start);
+	p = xdr_encode_hyper(p, nfs4_lock_length(args->fl));
 	*p++ = cpu_to_be32(args->new_lock_owner);
 	if (args->new_lock_owner){
 		RESERVE_SPACE(4+NFS4_STATEID_SIZE+32);
 		*p++ = cpu_to_be32(args->open_seqid->sequence->counter);
 		WRITEMEM(args->open_stateid->data, NFS4_STATEID_SIZE);
 		*p++ = cpu_to_be32(args->lock_seqid->sequence->counter);
-		WRITE64(args->lock_owner.clientid);
+		p = xdr_encode_hyper(p, args->lock_owner.clientid);
 		*p++ = cpu_to_be32(16);
 		WRITEMEM("lock id:", 8);
-		WRITE64(args->lock_owner.id);
+		p = xdr_encode_hyper(p, args->lock_owner.id);
 	}
 	else {
 		RESERVE_SPACE(NFS4_STATEID_SIZE+4);
@@ -1084,12 +1080,12 @@ static void encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *ar
 	RESERVE_SPACE(52);
 	*p++ = cpu_to_be32(OP_LOCKT);
 	*p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0));
-	WRITE64(args->fl->fl_start);
-	WRITE64(nfs4_lock_length(args->fl));
-	WRITE64(args->lock_owner.clientid);
+	p = xdr_encode_hyper(p, args->fl->fl_start);
+	p = xdr_encode_hyper(p, nfs4_lock_length(args->fl));
+	p = xdr_encode_hyper(p, args->lock_owner.clientid);
 	*p++ = cpu_to_be32(16);
 	WRITEMEM("lock id:", 8);
-	WRITE64(args->lock_owner.id);
+	p = xdr_encode_hyper(p, args->lock_owner.id);
 	hdr->nops++;
 	hdr->replen += decode_lockt_maxsz;
 }
@@ -1103,8 +1099,8 @@ static void encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *ar
 	*p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0));
 	*p++ = cpu_to_be32(args->seqid->sequence->counter);
 	WRITEMEM(args->stateid->data, NFS4_STATEID_SIZE);
-	WRITE64(args->fl->fl_start);
-	WRITE64(nfs4_lock_length(args->fl));
+	p = xdr_encode_hyper(p, args->fl->fl_start);
+	p = xdr_encode_hyper(p, nfs4_lock_length(args->fl));
 	hdr->nops++;
 	hdr->replen += decode_locku_maxsz;
 }
@@ -1155,10 +1151,10 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena
 	*p++ = cpu_to_be32(arg->seqid->sequence->counter);
 	encode_share_access(xdr, arg->fmode);
 	RESERVE_SPACE(28);
-	WRITE64(arg->clientid);
+	p = xdr_encode_hyper(p, arg->clientid);
 	*p++ = cpu_to_be32(16);
 	WRITEMEM("open id:", 8);
-	WRITE64(arg->id);
+	p = xdr_encode_hyper(p, arg->id);
 }
 
 static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_openargs *arg)
@@ -1334,7 +1330,7 @@ static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args,
 	encode_stateid(xdr, args->context);
 
 	RESERVE_SPACE(12);
-	WRITE64(args->offset);
+	p = xdr_encode_hyper(p, args->offset);
 	*p++ = cpu_to_be32(args->count);
 	hdr->nops++;
 	hdr->replen += decode_read_maxsz;
@@ -1350,7 +1346,7 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
 
 	RESERVE_SPACE(12+NFS4_VERIFIER_SIZE+20);
 	*p++ = cpu_to_be32(OP_READDIR);
-	WRITE64(readdir->cookie);
+	p = xdr_encode_hyper(p, readdir->cookie);
 	WRITEMEM(readdir->verifier.data, NFS4_VERIFIER_SIZE);
 	*p++ = cpu_to_be32(readdir->count >> 1);  /* We're not doing readdirplus */
 	*p++ = cpu_to_be32(readdir->count);
@@ -1417,7 +1413,7 @@ static void encode_renew(struct xdr_stream *xdr, const struct nfs_client *client
 
 	RESERVE_SPACE(12);
 	*p++ = cpu_to_be32(OP_RENEW);
-	WRITE64(client_stateid->cl_clientid);
+	p = xdr_encode_hyper(p, client_stateid->cl_clientid);
 	hdr->nops++;
 	hdr->replen += decode_renew_maxsz;
 }
@@ -1502,7 +1498,7 @@ static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs_
 
 	RESERVE_SPACE(12 + NFS4_VERIFIER_SIZE);
 	*p++ = cpu_to_be32(OP_SETCLIENTID_CONFIRM);
-	WRITE64(client_state->cl_clientid);
+	p = xdr_encode_hyper(p, client_state->cl_clientid);
 	WRITEMEM(client_state->cl_confirm.data, NFS4_VERIFIER_SIZE);
 	hdr->nops++;
 	hdr->replen += decode_setclientid_confirm_maxsz;
@@ -1518,7 +1514,7 @@ static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *arg
 	encode_stateid(xdr, args->context);
 
 	RESERVE_SPACE(16);
-	WRITE64(args->offset);
+	p = xdr_encode_hyper(p, args->offset);
 	*p++ = cpu_to_be32(args->stable);
 	*p++ = cpu_to_be32(args->count);
 
@@ -1574,7 +1570,7 @@ static void encode_create_session(struct xdr_stream *xdr,
 	*p++ = cpu_to_be32(OP_CREATE_SESSION);
 
 	RESERVE_SPACE(8);
-	WRITE64(clp->cl_ex_clid);
+	p = xdr_encode_hyper(p, clp->cl_ex_clid);
 
 	RESERVE_SPACE(8);
 	*p++ = cpu_to_be32(clp->cl_seqid);			/*Sequence id */
-- 
cgit v1.2.3-70-g09d2


From 93f0cf25944695e1229fe90a2897af0211fbd425 Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Fri, 14 Aug 2009 17:19:06 +0300
Subject: nfs: nfs4xdr: get rid of WRITEMEM

s/WRITEMEM(/p = xdr_encode_opaque_fixed(p, /

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4xdr.c | 68 ++++++++++++++++++++++++++------------------------------
 1 file changed, 32 insertions(+), 36 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 5d80766fc672..17915c8ea7fa 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -712,10 +712,6 @@ struct compound_hdr {
  * task to translate them into Linux-specific versions which are more
  * consistent with the style used in NFSv2/v3...
  */
-#define WRITEMEM(ptr,nbytes)     do {				\
-	p = xdr_encode_opaque_fixed(p, ptr, nbytes);		\
-} while (0)
-
 #define RESERVE_SPACE(nbytes)	do {				\
 	p = xdr_reserve_space(xdr, nbytes);			\
 	BUG_ON(!p);						\
@@ -746,7 +742,7 @@ static void encode_compound_hdr(struct xdr_stream *xdr,
 	BUG_ON(hdr->taglen > NFS4_MAXTAGLEN);
 	RESERVE_SPACE(12+(XDR_QUADLEN(hdr->taglen)<<2));
 	*p++ = cpu_to_be32(hdr->taglen);
-	WRITEMEM(hdr->tag, hdr->taglen);
+	p = xdr_encode_opaque_fixed(p, hdr->tag, hdr->taglen);
 	*p++ = cpu_to_be32(hdr->minorversion);
 	hdr->nops_p = p;
 	*p++ = cpu_to_be32(hdr->nops);
@@ -845,12 +841,12 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
 	if (iap->ia_valid & ATTR_UID) {
 		bmval1 |= FATTR4_WORD1_OWNER;
 		*p++ = cpu_to_be32(owner_namelen);
-		WRITEMEM(owner_name, owner_namelen);
+		p = xdr_encode_opaque_fixed(p, owner_name, owner_namelen);
 	}
 	if (iap->ia_valid & ATTR_GID) {
 		bmval1 |= FATTR4_WORD1_OWNER_GROUP;
 		*p++ = cpu_to_be32(owner_grouplen);
-		WRITEMEM(owner_group, owner_grouplen);
+		p = xdr_encode_opaque_fixed(p, owner_group, owner_grouplen);
 	}
 	if (iap->ia_valid & ATTR_ATIME_SET) {
 		bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET;
@@ -909,7 +905,7 @@ static void encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg
 	RESERVE_SPACE(8+NFS4_STATEID_SIZE);
 	*p++ = cpu_to_be32(OP_CLOSE);
 	*p++ = cpu_to_be32(arg->seqid->sequence->counter);
-	WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE);
+	p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE);
 	hdr->nops++;
 	hdr->replen += decode_close_maxsz;
 }
@@ -953,7 +949,7 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *
 
 	RESERVE_SPACE(4 + create->name->len);
 	*p++ = cpu_to_be32(create->name->len);
-	WRITEMEM(create->name->name, create->name->len);
+	p = xdr_encode_opaque_fixed(p, create->name->name, create->name->len);
 	hdr->nops++;
 	hdr->replen += decode_create_maxsz;
 
@@ -1020,7 +1016,7 @@ static void encode_link(struct xdr_stream *xdr, const struct qstr *name, struct
 	RESERVE_SPACE(8 + name->len);
 	*p++ = cpu_to_be32(OP_LINK);
 	*p++ = cpu_to_be32(name->len);
-	WRITEMEM(name->name, name->len);
+	p = xdr_encode_opaque_fixed(p, name->name, name->len);
 	hdr->nops++;
 	hdr->replen += decode_link_maxsz;
 }
@@ -1057,16 +1053,16 @@ static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args
 	if (args->new_lock_owner){
 		RESERVE_SPACE(4+NFS4_STATEID_SIZE+32);
 		*p++ = cpu_to_be32(args->open_seqid->sequence->counter);
-		WRITEMEM(args->open_stateid->data, NFS4_STATEID_SIZE);
+		p = xdr_encode_opaque_fixed(p, args->open_stateid->data, NFS4_STATEID_SIZE);
 		*p++ = cpu_to_be32(args->lock_seqid->sequence->counter);
 		p = xdr_encode_hyper(p, args->lock_owner.clientid);
 		*p++ = cpu_to_be32(16);
-		WRITEMEM("lock id:", 8);
+		p = xdr_encode_opaque_fixed(p, "lock id:", 8);
 		p = xdr_encode_hyper(p, args->lock_owner.id);
 	}
 	else {
 		RESERVE_SPACE(NFS4_STATEID_SIZE+4);
-		WRITEMEM(args->lock_stateid->data, NFS4_STATEID_SIZE);
+		p = xdr_encode_opaque_fixed(p, args->lock_stateid->data, NFS4_STATEID_SIZE);
 		*p++ = cpu_to_be32(args->lock_seqid->sequence->counter);
 	}
 	hdr->nops++;
@@ -1084,7 +1080,7 @@ static void encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *ar
 	p = xdr_encode_hyper(p, nfs4_lock_length(args->fl));
 	p = xdr_encode_hyper(p, args->lock_owner.clientid);
 	*p++ = cpu_to_be32(16);
-	WRITEMEM("lock id:", 8);
+	p = xdr_encode_opaque_fixed(p, "lock id:", 8);
 	p = xdr_encode_hyper(p, args->lock_owner.id);
 	hdr->nops++;
 	hdr->replen += decode_lockt_maxsz;
@@ -1098,7 +1094,7 @@ static void encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *ar
 	*p++ = cpu_to_be32(OP_LOCKU);
 	*p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0));
 	*p++ = cpu_to_be32(args->seqid->sequence->counter);
-	WRITEMEM(args->stateid->data, NFS4_STATEID_SIZE);
+	p = xdr_encode_opaque_fixed(p, args->stateid->data, NFS4_STATEID_SIZE);
 	p = xdr_encode_hyper(p, args->fl->fl_start);
 	p = xdr_encode_hyper(p, nfs4_lock_length(args->fl));
 	hdr->nops++;
@@ -1113,7 +1109,7 @@ static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struc
 	RESERVE_SPACE(8 + len);
 	*p++ = cpu_to_be32(OP_LOOKUP);
 	*p++ = cpu_to_be32(len);
-	WRITEMEM(name->name, len);
+	p = xdr_encode_opaque_fixed(p, name->name, len);
 	hdr->nops++;
 	hdr->replen += decode_lookup_maxsz;
 }
@@ -1153,7 +1149,7 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena
 	RESERVE_SPACE(28);
 	p = xdr_encode_hyper(p, arg->clientid);
 	*p++ = cpu_to_be32(16);
-	WRITEMEM("open id:", 8);
+	p = xdr_encode_opaque_fixed(p, "open id:", 8);
 	p = xdr_encode_hyper(p, arg->id);
 }
 
@@ -1233,7 +1229,7 @@ static inline void encode_claim_delegate_cur(struct xdr_stream *xdr, const struc
 
 	RESERVE_SPACE(4+NFS4_STATEID_SIZE);
 	*p++ = cpu_to_be32(NFS4_OPEN_CLAIM_DELEGATE_CUR);
-	WRITEMEM(stateid->data, NFS4_STATEID_SIZE);
+	p = xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE);
 	encode_string(xdr, name->len, name->name);
 }
 
@@ -1264,7 +1260,7 @@ static void encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_co
 
 	RESERVE_SPACE(4+NFS4_STATEID_SIZE+4);
 	*p++ = cpu_to_be32(OP_OPEN_CONFIRM);
-	WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE);
+	p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE);
 	*p++ = cpu_to_be32(arg->seqid->sequence->counter);
 	hdr->nops++;
 	hdr->replen += decode_open_confirm_maxsz;
@@ -1276,7 +1272,7 @@ static void encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_close
 
 	RESERVE_SPACE(4+NFS4_STATEID_SIZE+4);
 	*p++ = cpu_to_be32(OP_OPEN_DOWNGRADE);
-	WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE);
+	p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE);
 	*p++ = cpu_to_be32(arg->seqid->sequence->counter);
 	encode_share_access(xdr, arg->fmode);
 	hdr->nops++;
@@ -1292,7 +1288,7 @@ encode_putfh(struct xdr_stream *xdr, const struct nfs_fh *fh, struct compound_hd
 	RESERVE_SPACE(8 + len);
 	*p++ = cpu_to_be32(OP_PUTFH);
 	*p++ = cpu_to_be32(len);
-	WRITEMEM(fh->data, len);
+	p = xdr_encode_opaque_fixed(p, fh->data, len);
 	hdr->nops++;
 	hdr->replen += decode_putfh_maxsz;
 }
@@ -1315,9 +1311,9 @@ static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context
 	RESERVE_SPACE(NFS4_STATEID_SIZE);
 	if (ctx->state != NULL) {
 		nfs4_copy_stateid(&stateid, ctx->state, ctx->lockowner);
-		WRITEMEM(stateid.data, NFS4_STATEID_SIZE);
+		p = xdr_encode_opaque_fixed(p, stateid.data, NFS4_STATEID_SIZE);
 	} else
-		WRITEMEM(zero_stateid.data, NFS4_STATEID_SIZE);
+		p = xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE);
 }
 
 static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args, struct compound_hdr *hdr)
@@ -1347,7 +1343,7 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
 	RESERVE_SPACE(12+NFS4_VERIFIER_SIZE+20);
 	*p++ = cpu_to_be32(OP_READDIR);
 	p = xdr_encode_hyper(p, readdir->cookie);
-	WRITEMEM(readdir->verifier.data, NFS4_VERIFIER_SIZE);
+	p = xdr_encode_opaque_fixed(p, readdir->verifier.data, NFS4_VERIFIER_SIZE);
 	*p++ = cpu_to_be32(readdir->count >> 1);  /* We're not doing readdirplus */
 	*p++ = cpu_to_be32(readdir->count);
 	*p++ = cpu_to_be32(2);
@@ -1386,7 +1382,7 @@ static void encode_remove(struct xdr_stream *xdr, const struct qstr *name, struc
 	RESERVE_SPACE(8 + name->len);
 	*p++ = cpu_to_be32(OP_REMOVE);
 	*p++ = cpu_to_be32(name->len);
-	WRITEMEM(name->name, name->len);
+	p = xdr_encode_opaque_fixed(p, name->name, name->len);
 	hdr->nops++;
 	hdr->replen += decode_remove_maxsz;
 }
@@ -1398,11 +1394,11 @@ static void encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, co
 	RESERVE_SPACE(8 + oldname->len);
 	*p++ = cpu_to_be32(OP_RENAME);
 	*p++ = cpu_to_be32(oldname->len);
-	WRITEMEM(oldname->name, oldname->len);
+	p = xdr_encode_opaque_fixed(p, oldname->name, oldname->len);
 
 	RESERVE_SPACE(4 + newname->len);
 	*p++ = cpu_to_be32(newname->len);
-	WRITEMEM(newname->name, newname->len);
+	p = xdr_encode_opaque_fixed(p, newname->name, newname->len);
 	hdr->nops++;
 	hdr->replen += decode_rename_maxsz;
 }
@@ -1436,7 +1432,7 @@ encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compoun
 
 	RESERVE_SPACE(4+NFS4_STATEID_SIZE);
 	*p++ = cpu_to_be32(OP_SETATTR);
-	WRITEMEM(zero_stateid.data, NFS4_STATEID_SIZE);
+	p = xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE);
 	RESERVE_SPACE(2*4);
 	*p++ = cpu_to_be32(1);
 	*p++ = cpu_to_be32(FATTR4_WORD0_ACL);
@@ -1467,7 +1463,7 @@ static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs
 
 	RESERVE_SPACE(4+NFS4_STATEID_SIZE);
 	*p++ = cpu_to_be32(OP_SETATTR);
-	WRITEMEM(arg->stateid.data, NFS4_STATEID_SIZE);
+	p = xdr_encode_opaque_fixed(p, arg->stateid.data, NFS4_STATEID_SIZE);
 	hdr->nops++;
 	hdr->replen += decode_setattr_maxsz;
 	encode_attrs(xdr, arg->iap, server);
@@ -1479,7 +1475,7 @@ static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclie
 
 	RESERVE_SPACE(4 + NFS4_VERIFIER_SIZE);
 	*p++ = cpu_to_be32(OP_SETCLIENTID);
-	WRITEMEM(setclientid->sc_verifier->data, NFS4_VERIFIER_SIZE);
+	p = xdr_encode_opaque_fixed(p, setclientid->sc_verifier->data, NFS4_VERIFIER_SIZE);
 
 	encode_string(xdr, setclientid->sc_name_len, setclientid->sc_name);
 	RESERVE_SPACE(4);
@@ -1499,7 +1495,7 @@ static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs_
 	RESERVE_SPACE(12 + NFS4_VERIFIER_SIZE);
 	*p++ = cpu_to_be32(OP_SETCLIENTID_CONFIRM);
 	p = xdr_encode_hyper(p, client_state->cl_clientid);
-	WRITEMEM(client_state->cl_confirm.data, NFS4_VERIFIER_SIZE);
+	p = xdr_encode_opaque_fixed(p, client_state->cl_confirm.data, NFS4_VERIFIER_SIZE);
 	hdr->nops++;
 	hdr->replen += decode_setclientid_confirm_maxsz;
 }
@@ -1530,7 +1526,7 @@ static void encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *state
 	RESERVE_SPACE(4+NFS4_STATEID_SIZE);
 
 	*p++ = cpu_to_be32(OP_DELEGRETURN);
-	WRITEMEM(stateid->data, NFS4_STATEID_SIZE);
+	p = xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE);
 	hdr->nops++;
 	hdr->replen += decode_delegreturn_maxsz;
 }
@@ -1545,7 +1541,7 @@ static void encode_exchange_id(struct xdr_stream *xdr,
 
 	RESERVE_SPACE(4 + sizeof(args->verifier->data));
 	*p++ = cpu_to_be32(OP_EXCHANGE_ID);
-	WRITEMEM(args->verifier->data, sizeof(args->verifier->data));
+	p = xdr_encode_opaque_fixed(p, args->verifier->data, sizeof(args->verifier->data));
 
 	encode_string(xdr, args->id_len, args->id);
 
@@ -1611,7 +1607,7 @@ static void encode_create_session(struct xdr_stream *xdr,
 			clp->cl_ipaddr);
 	RESERVE_SPACE(16 + len);
 	*p++ = cpu_to_be32(len);
-	WRITEMEM(machine_name, len);
+	p = xdr_encode_opaque_fixed(p, machine_name, len);
 	*p++ = cpu_to_be32(0);				/* UID */
 	*p++ = cpu_to_be32(0);				/* GID */
 	*p++ = cpu_to_be32(0);				/* No more gids */
@@ -1626,7 +1622,7 @@ static void encode_destroy_session(struct xdr_stream *xdr,
 	__be32 *p;
 	RESERVE_SPACE(4 + NFS4_MAX_SESSIONID_LEN);
 	*p++ = cpu_to_be32(OP_DESTROY_SESSION);
-	WRITEMEM(session->sess_id.data, NFS4_MAX_SESSIONID_LEN);
+	p = xdr_encode_opaque_fixed(p, session->sess_id.data, NFS4_MAX_SESSIONID_LEN);
 	hdr->nops++;
 	hdr->replen += decode_destroy_session_maxsz;
 }
@@ -1666,7 +1662,7 @@ static void encode_sequence(struct xdr_stream *xdr,
 		slot->seq_nr, args->sa_slotid,
 		tp->highest_used_slotid, args->sa_cache_this);
 	RESERVE_SPACE(NFS4_MAX_SESSIONID_LEN + 16);
-	WRITEMEM(session->sess_id.data, NFS4_MAX_SESSIONID_LEN);
+	p = xdr_encode_opaque_fixed(p, session->sess_id.data, NFS4_MAX_SESSIONID_LEN);
 	*p++ = cpu_to_be32(slot->seq_nr);
 	*p++ = cpu_to_be32(args->sa_slotid);
 	*p++ = cpu_to_be32(tp->highest_used_slotid);
-- 
cgit v1.2.3-70-g09d2


From 42edd698125b76a38bd9999015202db036dfbc76 Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Fri, 14 Aug 2009 17:19:13 +0300
Subject: nfs: nfs4xdr: optimize RESERVE_SPACE in encode_create_session and
 encode_sequence

Coalesce multilpe constant RESERVE_SPACEs into one

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4xdr.c | 22 +++++-----------------
 1 file changed, 5 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 17915c8ea7fa..d460d81758ae 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1562,17 +1562,15 @@ static void encode_create_session(struct xdr_stream *xdr,
 	uint32_t len;
 	struct nfs_client *clp = args->client;
 
-	RESERVE_SPACE(4);
-	*p++ = cpu_to_be32(OP_CREATE_SESSION);
+	len = scnprintf(machine_name, sizeof(machine_name), "%s",
+			clp->cl_ipaddr);
 
-	RESERVE_SPACE(8);
+	RESERVE_SPACE(20 + 2*28 + 20 + len + 12);
+	*p++ = cpu_to_be32(OP_CREATE_SESSION);
 	p = xdr_encode_hyper(p, clp->cl_ex_clid);
-
-	RESERVE_SPACE(8);
 	*p++ = cpu_to_be32(clp->cl_seqid);			/*Sequence id */
 	*p++ = cpu_to_be32(args->flags);			/*flags */
 
-	RESERVE_SPACE(2*28);			/* 2 channel_attrs */
 	/* Fore Channel */
 	*p++ = cpu_to_be32(args->fc_attrs.headerpadsz);	/* header padding size */
 	*p++ = cpu_to_be32(args->fc_attrs.max_rqst_sz);	/* max req size */
@@ -1591,21 +1589,12 @@ static void encode_create_session(struct xdr_stream *xdr,
 	*p++ = cpu_to_be32(args->bc_attrs.max_reqs);	/* max requests */
 	*p++ = cpu_to_be32(0);				/* rdmachannel_attrs */
 
-	RESERVE_SPACE(4);
 	*p++ = cpu_to_be32(args->cb_program);		/* cb_program */
-
-	RESERVE_SPACE(4);			/* # of security flavors */
 	*p++ = cpu_to_be32(1);
-
-	RESERVE_SPACE(4);
 	*p++ = cpu_to_be32(RPC_AUTH_UNIX);			/* auth_sys */
 
 	/* authsys_parms rfc1831 */
-	RESERVE_SPACE(4);
 	*p++ = cpu_to_be32((u32)clp->cl_boot_time.tv_nsec);	/* stamp */
-	len = scnprintf(machine_name, sizeof(machine_name), "%s",
-			clp->cl_ipaddr);
-	RESERVE_SPACE(16 + len);
 	*p++ = cpu_to_be32(len);
 	p = xdr_encode_opaque_fixed(p, machine_name, len);
 	*p++ = cpu_to_be32(0);				/* UID */
@@ -1646,7 +1635,7 @@ static void encode_sequence(struct xdr_stream *xdr,
 	WARN_ON(args->sa_slotid == NFS4_MAX_SLOT_TABLE);
 	slot = tp->slots + args->sa_slotid;
 
-	RESERVE_SPACE(4);
+	RESERVE_SPACE(4 + NFS4_MAX_SESSIONID_LEN + 16);
 	*p++ = cpu_to_be32(OP_SEQUENCE);
 
 	/*
@@ -1661,7 +1650,6 @@ static void encode_sequence(struct xdr_stream *xdr,
 		((u32 *)session->sess_id.data)[3],
 		slot->seq_nr, args->sa_slotid,
 		tp->highest_used_slotid, args->sa_cache_this);
-	RESERVE_SPACE(NFS4_MAX_SESSIONID_LEN + 16);
 	p = xdr_encode_opaque_fixed(p, session->sess_id.data, NFS4_MAX_SESSIONID_LEN);
 	*p++ = cpu_to_be32(slot->seq_nr);
 	*p++ = cpu_to_be32(args->sa_slotid);
-- 
cgit v1.2.3-70-g09d2


From 2220f13a8b90d2259f3094cb54cf4de67d8eee2d Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Fri, 14 Aug 2009 17:19:18 +0300
Subject: nfs: nfs4xdr: encode_compound_hdr does not have to round up reserved
 bytes

This is already done by xdr_reserve_space and since encode_compound_hdr
is adding a byte count to "12" which is already word aligned, the xdr
level rounding will work just as well.

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4xdr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index d460d81758ae..7c2b162dd845 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -740,7 +740,7 @@ static void encode_compound_hdr(struct xdr_stream *xdr,
 
 	dprintk("encode_compound: tag=%.*s\n", (int)hdr->taglen, hdr->tag);
 	BUG_ON(hdr->taglen > NFS4_MAXTAGLEN);
-	RESERVE_SPACE(12+(XDR_QUADLEN(hdr->taglen)<<2));
+	RESERVE_SPACE(12 + hdr->taglen);
 	*p++ = cpu_to_be32(hdr->taglen);
 	p = xdr_encode_opaque_fixed(p, hdr->tag, hdr->taglen);
 	*p++ = cpu_to_be32(hdr->minorversion);
-- 
cgit v1.2.3-70-g09d2


From 13c65ce90006badccd5663e558e3c85869ae5ce6 Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Fri, 14 Aug 2009 17:19:25 +0300
Subject: nfs: nfs4xdr: change RESERVE_SPACE macro into a static helper

In order to open code and expose the result pointer assignment.

Alternatively, we can open code the call to xdr_reserve_space
and do the BUG_ON an the error case at the call site.

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4xdr.c | 138 ++++++++++++++++++++++++++-----------------------------
 1 file changed, 65 insertions(+), 73 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 7c2b162dd845..3bcde49fbea8 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -702,20 +702,12 @@ struct compound_hdr {
 	u32		minorversion;
 };
 
-/*
- * START OF "GENERIC" ENCODE ROUTINES.
- *   These may look a little ugly since they are imported from a "generic"
- * set of XDR encode/decode routines which are intended to be shared by
- * all of our NFSv4 implementations (OpenBSD, MacOS X...).
- *
- * If the pain of reading these is too great, it should be a straightforward
- * task to translate them into Linux-specific versions which are more
- * consistent with the style used in NFSv2/v3...
- */
-#define RESERVE_SPACE(nbytes)	do {				\
-	p = xdr_reserve_space(xdr, nbytes);			\
-	BUG_ON(!p);						\
-} while (0)
+static __be32 *reserve_space(struct xdr_stream *xdr, size_t nbytes)
+{
+	__be32 *p = xdr_reserve_space(xdr, nbytes);
+	BUG_ON(!p);
+	return p;
+}
 
 static void encode_string(struct xdr_stream *xdr, unsigned int len, const char *str)
 {
@@ -740,7 +732,7 @@ static void encode_compound_hdr(struct xdr_stream *xdr,
 
 	dprintk("encode_compound: tag=%.*s\n", (int)hdr->taglen, hdr->tag);
 	BUG_ON(hdr->taglen > NFS4_MAXTAGLEN);
-	RESERVE_SPACE(12 + hdr->taglen);
+	p = reserve_space(xdr, 12 + hdr->taglen);
 	*p++ = cpu_to_be32(hdr->taglen);
 	p = xdr_encode_opaque_fixed(p, hdr->tag, hdr->taglen);
 	*p++ = cpu_to_be32(hdr->minorversion);
@@ -820,7 +812,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
 		len += 16;
 	else if (iap->ia_valid & ATTR_MTIME)
 		len += 4;
-	RESERVE_SPACE(len);
+	p = reserve_space(xdr, len);
 
 	/*
 	 * We write the bitmap length now, but leave the bitmap and the attribute
@@ -891,7 +883,7 @@ static void encode_access(struct xdr_stream *xdr, u32 access, struct compound_hd
 {
 	__be32 *p;
 
-	RESERVE_SPACE(8);
+	p = reserve_space(xdr, 8);
 	*p++ = cpu_to_be32(OP_ACCESS);
 	*p++ = cpu_to_be32(access);
 	hdr->nops++;
@@ -902,7 +894,7 @@ static void encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg
 {
 	__be32 *p;
 
-	RESERVE_SPACE(8+NFS4_STATEID_SIZE);
+	p = reserve_space(xdr, 8+NFS4_STATEID_SIZE);
 	*p++ = cpu_to_be32(OP_CLOSE);
 	*p++ = cpu_to_be32(arg->seqid->sequence->counter);
 	p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE);
@@ -914,7 +906,7 @@ static void encode_commit(struct xdr_stream *xdr, const struct nfs_writeargs *ar
 {
 	__be32 *p;
 
-	RESERVE_SPACE(16);
+	p = reserve_space(xdr, 16);
 	*p++ = cpu_to_be32(OP_COMMIT);
 	p = xdr_encode_hyper(p, args->offset);
 	*p++ = cpu_to_be32(args->count);
@@ -926,19 +918,19 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *
 {
 	__be32 *p;
 
-	RESERVE_SPACE(8);
+	p = reserve_space(xdr, 8);
 	*p++ = cpu_to_be32(OP_CREATE);
 	*p++ = cpu_to_be32(create->ftype);
 
 	switch (create->ftype) {
 	case NF4LNK:
-		RESERVE_SPACE(4);
+		p = reserve_space(xdr, 4);
 		*p++ = cpu_to_be32(create->u.symlink.len);
 		xdr_write_pages(xdr, create->u.symlink.pages, 0, create->u.symlink.len);
 		break;
 
 	case NF4BLK: case NF4CHR:
-		RESERVE_SPACE(8);
+		p = reserve_space(xdr, 8);
 		*p++ = cpu_to_be32(create->u.device.specdata1);
 		*p++ = cpu_to_be32(create->u.device.specdata2);
 		break;
@@ -947,7 +939,7 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *
 		break;
 	}
 
-	RESERVE_SPACE(4 + create->name->len);
+	p = reserve_space(xdr, 4 + create->name->len);
 	*p++ = cpu_to_be32(create->name->len);
 	p = xdr_encode_opaque_fixed(p, create->name->name, create->name->len);
 	hdr->nops++;
@@ -960,7 +952,7 @@ static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct c
 {
 	__be32 *p;
 
-	RESERVE_SPACE(12);
+	p = reserve_space(xdr, 12);
 	*p++ = cpu_to_be32(OP_GETATTR);
 	*p++ = cpu_to_be32(1);
 	*p++ = cpu_to_be32(bitmap);
@@ -972,7 +964,7 @@ static void encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm
 {
 	__be32 *p;
 
-	RESERVE_SPACE(16);
+	p = reserve_space(xdr, 16);
 	*p++ = cpu_to_be32(OP_GETATTR);
 	*p++ = cpu_to_be32(2);
 	*p++ = cpu_to_be32(bm0);
@@ -1003,7 +995,7 @@ static void encode_getfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
 {
 	__be32 *p;
 
-	RESERVE_SPACE(4);
+	p = reserve_space(xdr, 4);
 	*p++ = cpu_to_be32(OP_GETFH);
 	hdr->nops++;
 	hdr->replen += decode_getfh_maxsz;
@@ -1013,7 +1005,7 @@ static void encode_link(struct xdr_stream *xdr, const struct qstr *name, struct
 {
 	__be32 *p;
 
-	RESERVE_SPACE(8 + name->len);
+	p = reserve_space(xdr, 8 + name->len);
 	*p++ = cpu_to_be32(OP_LINK);
 	*p++ = cpu_to_be32(name->len);
 	p = xdr_encode_opaque_fixed(p, name->name, name->len);
@@ -1043,7 +1035,7 @@ static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args
 {
 	__be32 *p;
 
-	RESERVE_SPACE(32);
+	p = reserve_space(xdr, 32);
 	*p++ = cpu_to_be32(OP_LOCK);
 	*p++ = cpu_to_be32(nfs4_lock_type(args->fl, args->block));
 	*p++ = cpu_to_be32(args->reclaim);
@@ -1051,7 +1043,7 @@ static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args
 	p = xdr_encode_hyper(p, nfs4_lock_length(args->fl));
 	*p++ = cpu_to_be32(args->new_lock_owner);
 	if (args->new_lock_owner){
-		RESERVE_SPACE(4+NFS4_STATEID_SIZE+32);
+		p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+32);
 		*p++ = cpu_to_be32(args->open_seqid->sequence->counter);
 		p = xdr_encode_opaque_fixed(p, args->open_stateid->data, NFS4_STATEID_SIZE);
 		*p++ = cpu_to_be32(args->lock_seqid->sequence->counter);
@@ -1061,7 +1053,7 @@ static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args
 		p = xdr_encode_hyper(p, args->lock_owner.id);
 	}
 	else {
-		RESERVE_SPACE(NFS4_STATEID_SIZE+4);
+		p = reserve_space(xdr, NFS4_STATEID_SIZE+4);
 		p = xdr_encode_opaque_fixed(p, args->lock_stateid->data, NFS4_STATEID_SIZE);
 		*p++ = cpu_to_be32(args->lock_seqid->sequence->counter);
 	}
@@ -1073,7 +1065,7 @@ static void encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *ar
 {
 	__be32 *p;
 
-	RESERVE_SPACE(52);
+	p = reserve_space(xdr, 52);
 	*p++ = cpu_to_be32(OP_LOCKT);
 	*p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0));
 	p = xdr_encode_hyper(p, args->fl->fl_start);
@@ -1090,7 +1082,7 @@ static void encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *ar
 {
 	__be32 *p;
 
-	RESERVE_SPACE(12+NFS4_STATEID_SIZE+16);
+	p = reserve_space(xdr, 12+NFS4_STATEID_SIZE+16);
 	*p++ = cpu_to_be32(OP_LOCKU);
 	*p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0));
 	*p++ = cpu_to_be32(args->seqid->sequence->counter);
@@ -1106,7 +1098,7 @@ static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struc
 	int len = name->len;
 	__be32 *p;
 
-	RESERVE_SPACE(8 + len);
+	p = reserve_space(xdr, 8 + len);
 	*p++ = cpu_to_be32(OP_LOOKUP);
 	*p++ = cpu_to_be32(len);
 	p = xdr_encode_opaque_fixed(p, name->name, len);
@@ -1118,7 +1110,7 @@ static void encode_share_access(struct xdr_stream *xdr, fmode_t fmode)
 {
 	__be32 *p;
 
-	RESERVE_SPACE(8);
+	p = reserve_space(xdr, 8);
 	switch (fmode & (FMODE_READ|FMODE_WRITE)) {
 	case FMODE_READ:
 		*p++ = cpu_to_be32(NFS4_SHARE_ACCESS_READ);
@@ -1142,11 +1134,11 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena
  * opcode 4, seqid 4, share_access 4, share_deny 4, clientid 8, ownerlen 4,
  * owner 4 = 32
  */
-	RESERVE_SPACE(8);
+	p = reserve_space(xdr, 8);
 	*p++ = cpu_to_be32(OP_OPEN);
 	*p++ = cpu_to_be32(arg->seqid->sequence->counter);
 	encode_share_access(xdr, arg->fmode);
-	RESERVE_SPACE(28);
+	p = reserve_space(xdr, 28);
 	p = xdr_encode_hyper(p, arg->clientid);
 	*p++ = cpu_to_be32(16);
 	p = xdr_encode_opaque_fixed(p, "open id:", 8);
@@ -1157,7 +1149,7 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op
 {
 	__be32 *p;
 
-	RESERVE_SPACE(4);
+	p = reserve_space(xdr, 4);
 	switch(arg->open_flags & O_EXCL) {
 	case 0:
 		*p++ = cpu_to_be32(NFS4_CREATE_UNCHECKED);
@@ -1173,7 +1165,7 @@ static void encode_opentype(struct xdr_stream *xdr, const struct nfs_openargs *a
 {
 	__be32 *p;
 
-	RESERVE_SPACE(4);
+	p = reserve_space(xdr, 4);
 	switch (arg->open_flags & O_CREAT) {
 	case 0:
 		*p++ = cpu_to_be32(NFS4_OPEN_NOCREATE);
@@ -1189,7 +1181,7 @@ static inline void encode_delegation_type(struct xdr_stream *xdr, fmode_t delega
 {
 	__be32 *p;
 
-	RESERVE_SPACE(4);
+	p = reserve_space(xdr, 4);
 	switch (delegation_type) {
 	case 0:
 		*p++ = cpu_to_be32(NFS4_OPEN_DELEGATE_NONE);
@@ -1209,7 +1201,7 @@ static inline void encode_claim_null(struct xdr_stream *xdr, const struct qstr *
 {
 	__be32 *p;
 
-	RESERVE_SPACE(4);
+	p = reserve_space(xdr, 4);
 	*p++ = cpu_to_be32(NFS4_OPEN_CLAIM_NULL);
 	encode_string(xdr, name->len, name->name);
 }
@@ -1218,7 +1210,7 @@ static inline void encode_claim_previous(struct xdr_stream *xdr, fmode_t type)
 {
 	__be32 *p;
 
-	RESERVE_SPACE(4);
+	p = reserve_space(xdr, 4);
 	*p++ = cpu_to_be32(NFS4_OPEN_CLAIM_PREVIOUS);
 	encode_delegation_type(xdr, type);
 }
@@ -1227,7 +1219,7 @@ static inline void encode_claim_delegate_cur(struct xdr_stream *xdr, const struc
 {
 	__be32 *p;
 
-	RESERVE_SPACE(4+NFS4_STATEID_SIZE);
+	p = reserve_space(xdr, 4+NFS4_STATEID_SIZE);
 	*p++ = cpu_to_be32(NFS4_OPEN_CLAIM_DELEGATE_CUR);
 	p = xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE);
 	encode_string(xdr, name->len, name->name);
@@ -1258,7 +1250,7 @@ static void encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_co
 {
 	__be32 *p;
 
-	RESERVE_SPACE(4+NFS4_STATEID_SIZE+4);
+	p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4);
 	*p++ = cpu_to_be32(OP_OPEN_CONFIRM);
 	p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE);
 	*p++ = cpu_to_be32(arg->seqid->sequence->counter);
@@ -1270,7 +1262,7 @@ static void encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_close
 {
 	__be32 *p;
 
-	RESERVE_SPACE(4+NFS4_STATEID_SIZE+4);
+	p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4);
 	*p++ = cpu_to_be32(OP_OPEN_DOWNGRADE);
 	p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE);
 	*p++ = cpu_to_be32(arg->seqid->sequence->counter);
@@ -1285,7 +1277,7 @@ encode_putfh(struct xdr_stream *xdr, const struct nfs_fh *fh, struct compound_hd
 	int len = fh->size;
 	__be32 *p;
 
-	RESERVE_SPACE(8 + len);
+	p = reserve_space(xdr, 8 + len);
 	*p++ = cpu_to_be32(OP_PUTFH);
 	*p++ = cpu_to_be32(len);
 	p = xdr_encode_opaque_fixed(p, fh->data, len);
@@ -1297,7 +1289,7 @@ static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
 {
 	__be32 *p;
 
-	RESERVE_SPACE(4);
+	p = reserve_space(xdr, 4);
 	*p++ = cpu_to_be32(OP_PUTROOTFH);
 	hdr->nops++;
 	hdr->replen += decode_putrootfh_maxsz;
@@ -1308,7 +1300,7 @@ static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context
 	nfs4_stateid stateid;
 	__be32 *p;
 
-	RESERVE_SPACE(NFS4_STATEID_SIZE);
+	p = reserve_space(xdr, NFS4_STATEID_SIZE);
 	if (ctx->state != NULL) {
 		nfs4_copy_stateid(&stateid, ctx->state, ctx->lockowner);
 		p = xdr_encode_opaque_fixed(p, stateid.data, NFS4_STATEID_SIZE);
@@ -1320,12 +1312,12 @@ static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args,
 {
 	__be32 *p;
 
-	RESERVE_SPACE(4);
+	p = reserve_space(xdr, 4);
 	*p++ = cpu_to_be32(OP_READ);
 
 	encode_stateid(xdr, args->context);
 
-	RESERVE_SPACE(12);
+	p = reserve_space(xdr, 12);
 	p = xdr_encode_hyper(p, args->offset);
 	*p++ = cpu_to_be32(args->count);
 	hdr->nops++;
@@ -1340,7 +1332,7 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
 	};
 	__be32 *p;
 
-	RESERVE_SPACE(12+NFS4_VERIFIER_SIZE+20);
+	p = reserve_space(xdr, 12+NFS4_VERIFIER_SIZE+20);
 	*p++ = cpu_to_be32(OP_READDIR);
 	p = xdr_encode_hyper(p, readdir->cookie);
 	p = xdr_encode_opaque_fixed(p, readdir->verifier.data, NFS4_VERIFIER_SIZE);
@@ -1369,7 +1361,7 @@ static void encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink *
 {
 	__be32 *p;
 
-	RESERVE_SPACE(4);
+	p = reserve_space(xdr, 4);
 	*p++ = cpu_to_be32(OP_READLINK);
 	hdr->nops++;
 	hdr->replen += decode_readlink_maxsz;
@@ -1379,7 +1371,7 @@ static void encode_remove(struct xdr_stream *xdr, const struct qstr *name, struc
 {
 	__be32 *p;
 
-	RESERVE_SPACE(8 + name->len);
+	p = reserve_space(xdr, 8 + name->len);
 	*p++ = cpu_to_be32(OP_REMOVE);
 	*p++ = cpu_to_be32(name->len);
 	p = xdr_encode_opaque_fixed(p, name->name, name->len);
@@ -1391,12 +1383,12 @@ static void encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, co
 {
 	__be32 *p;
 
-	RESERVE_SPACE(8 + oldname->len);
+	p = reserve_space(xdr, 8 + oldname->len);
 	*p++ = cpu_to_be32(OP_RENAME);
 	*p++ = cpu_to_be32(oldname->len);
 	p = xdr_encode_opaque_fixed(p, oldname->name, oldname->len);
 
-	RESERVE_SPACE(4 + newname->len);
+	p = reserve_space(xdr, 4 + newname->len);
 	*p++ = cpu_to_be32(newname->len);
 	p = xdr_encode_opaque_fixed(p, newname->name, newname->len);
 	hdr->nops++;
@@ -1407,7 +1399,7 @@ static void encode_renew(struct xdr_stream *xdr, const struct nfs_client *client
 {
 	__be32 *p;
 
-	RESERVE_SPACE(12);
+	p = reserve_space(xdr, 12);
 	*p++ = cpu_to_be32(OP_RENEW);
 	p = xdr_encode_hyper(p, client_stateid->cl_clientid);
 	hdr->nops++;
@@ -1419,7 +1411,7 @@ encode_restorefh(struct xdr_stream *xdr, struct compound_hdr *hdr)
 {
 	__be32 *p;
 
-	RESERVE_SPACE(4);
+	p = reserve_space(xdr, 4);
 	*p++ = cpu_to_be32(OP_RESTOREFH);
 	hdr->nops++;
 	hdr->replen += decode_restorefh_maxsz;
@@ -1430,15 +1422,15 @@ encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compoun
 {
 	__be32 *p;
 
-	RESERVE_SPACE(4+NFS4_STATEID_SIZE);
+	p = reserve_space(xdr, 4+NFS4_STATEID_SIZE);
 	*p++ = cpu_to_be32(OP_SETATTR);
 	p = xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE);
-	RESERVE_SPACE(2*4);
+	p = reserve_space(xdr, 2*4);
 	*p++ = cpu_to_be32(1);
 	*p++ = cpu_to_be32(FATTR4_WORD0_ACL);
 	if (arg->acl_len % 4)
 		return -EINVAL;
-	RESERVE_SPACE(4);
+	p = reserve_space(xdr, 4);
 	*p++ = cpu_to_be32(arg->acl_len);
 	xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len);
 	hdr->nops++;
@@ -1451,7 +1443,7 @@ encode_savefh(struct xdr_stream *xdr, struct compound_hdr *hdr)
 {
 	__be32 *p;
 
-	RESERVE_SPACE(4);
+	p = reserve_space(xdr, 4);
 	*p++ = cpu_to_be32(OP_SAVEFH);
 	hdr->nops++;
 	hdr->replen += decode_savefh_maxsz;
@@ -1461,7 +1453,7 @@ static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs
 {
 	__be32 *p;
 
-	RESERVE_SPACE(4+NFS4_STATEID_SIZE);
+	p = reserve_space(xdr, 4+NFS4_STATEID_SIZE);
 	*p++ = cpu_to_be32(OP_SETATTR);
 	p = xdr_encode_opaque_fixed(p, arg->stateid.data, NFS4_STATEID_SIZE);
 	hdr->nops++;
@@ -1473,16 +1465,16 @@ static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclie
 {
 	__be32 *p;
 
-	RESERVE_SPACE(4 + NFS4_VERIFIER_SIZE);
+	p = reserve_space(xdr, 4 + NFS4_VERIFIER_SIZE);
 	*p++ = cpu_to_be32(OP_SETCLIENTID);
 	p = xdr_encode_opaque_fixed(p, setclientid->sc_verifier->data, NFS4_VERIFIER_SIZE);
 
 	encode_string(xdr, setclientid->sc_name_len, setclientid->sc_name);
-	RESERVE_SPACE(4);
+	p = reserve_space(xdr, 4);
 	*p++ = cpu_to_be32(setclientid->sc_prog);
 	encode_string(xdr, setclientid->sc_netid_len, setclientid->sc_netid);
 	encode_string(xdr, setclientid->sc_uaddr_len, setclientid->sc_uaddr);
-	RESERVE_SPACE(4);
+	p = reserve_space(xdr, 4);
 	*p++ = cpu_to_be32(setclientid->sc_cb_ident);
 	hdr->nops++;
 	hdr->replen += decode_setclientid_maxsz;
@@ -1492,7 +1484,7 @@ static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs_
 {
 	__be32 *p;
 
-	RESERVE_SPACE(12 + NFS4_VERIFIER_SIZE);
+	p = reserve_space(xdr, 12 + NFS4_VERIFIER_SIZE);
 	*p++ = cpu_to_be32(OP_SETCLIENTID_CONFIRM);
 	p = xdr_encode_hyper(p, client_state->cl_clientid);
 	p = xdr_encode_opaque_fixed(p, client_state->cl_confirm.data, NFS4_VERIFIER_SIZE);
@@ -1504,12 +1496,12 @@ static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *arg
 {
 	__be32 *p;
 
-	RESERVE_SPACE(4);
+	p = reserve_space(xdr, 4);
 	*p++ = cpu_to_be32(OP_WRITE);
 
 	encode_stateid(xdr, args->context);
 
-	RESERVE_SPACE(16);
+	p = reserve_space(xdr, 16);
 	p = xdr_encode_hyper(p, args->offset);
 	*p++ = cpu_to_be32(args->stable);
 	*p++ = cpu_to_be32(args->count);
@@ -1523,7 +1515,7 @@ static void encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *state
 {
 	__be32 *p;
 
-	RESERVE_SPACE(4+NFS4_STATEID_SIZE);
+	p = reserve_space(xdr, 4+NFS4_STATEID_SIZE);
 
 	*p++ = cpu_to_be32(OP_DELEGRETURN);
 	p = xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE);
@@ -1539,13 +1531,13 @@ static void encode_exchange_id(struct xdr_stream *xdr,
 {
 	__be32 *p;
 
-	RESERVE_SPACE(4 + sizeof(args->verifier->data));
+	p = reserve_space(xdr, 4 + sizeof(args->verifier->data));
 	*p++ = cpu_to_be32(OP_EXCHANGE_ID);
 	p = xdr_encode_opaque_fixed(p, args->verifier->data, sizeof(args->verifier->data));
 
 	encode_string(xdr, args->id_len, args->id);
 
-	RESERVE_SPACE(12);
+	p = reserve_space(xdr, 12);
 	*p++ = cpu_to_be32(args->flags);
 	*p++ = cpu_to_be32(0);	/* zero length state_protect4_a */
 	*p++ = cpu_to_be32(0);	/* zero length implementation id array */
@@ -1565,7 +1557,7 @@ static void encode_create_session(struct xdr_stream *xdr,
 	len = scnprintf(machine_name, sizeof(machine_name), "%s",
 			clp->cl_ipaddr);
 
-	RESERVE_SPACE(20 + 2*28 + 20 + len + 12);
+	p = reserve_space(xdr, 20 + 2*28 + 20 + len + 12);
 	*p++ = cpu_to_be32(OP_CREATE_SESSION);
 	p = xdr_encode_hyper(p, clp->cl_ex_clid);
 	*p++ = cpu_to_be32(clp->cl_seqid);			/*Sequence id */
@@ -1609,7 +1601,7 @@ static void encode_destroy_session(struct xdr_stream *xdr,
 				   struct compound_hdr *hdr)
 {
 	__be32 *p;
-	RESERVE_SPACE(4 + NFS4_MAX_SESSIONID_LEN);
+	p = reserve_space(xdr, 4 + NFS4_MAX_SESSIONID_LEN);
 	*p++ = cpu_to_be32(OP_DESTROY_SESSION);
 	p = xdr_encode_opaque_fixed(p, session->sess_id.data, NFS4_MAX_SESSIONID_LEN);
 	hdr->nops++;
@@ -1635,7 +1627,7 @@ static void encode_sequence(struct xdr_stream *xdr,
 	WARN_ON(args->sa_slotid == NFS4_MAX_SLOT_TABLE);
 	slot = tp->slots + args->sa_slotid;
 
-	RESERVE_SPACE(4 + NFS4_MAX_SESSIONID_LEN + 16);
+	p = reserve_space(xdr, 4 + NFS4_MAX_SESSIONID_LEN + 16);
 	*p++ = cpu_to_be32(OP_SEQUENCE);
 
 	/*
-- 
cgit v1.2.3-70-g09d2


From 345585132a204859fbb7d8b662e9b6e5b563c6dc Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Fri, 14 Aug 2009 17:19:30 +0300
Subject: nfs: nfs4xdr: optimize low level encoding

do not increment encoding ptr if not needed.

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4xdr.c | 128 +++++++++++++++++++++++++++----------------------------
 1 file changed, 64 insertions(+), 64 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 3bcde49fbea8..d75f821cad01 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -737,7 +737,7 @@ static void encode_compound_hdr(struct xdr_stream *xdr,
 	p = xdr_encode_opaque_fixed(p, hdr->tag, hdr->taglen);
 	*p++ = cpu_to_be32(hdr->minorversion);
 	hdr->nops_p = p;
-	*p++ = cpu_to_be32(hdr->nops);
+	*p = cpu_to_be32(hdr->nops);
 }
 
 static void encode_nops(struct compound_hdr *hdr)
@@ -874,7 +874,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
 	len = (char *)p - (char *)q - 12;
 	*q++ = htonl(bmval0);
 	*q++ = htonl(bmval1);
-	*q++ = htonl(len);
+	*q = htonl(len);
 
 /* out: */
 }
@@ -885,7 +885,7 @@ static void encode_access(struct xdr_stream *xdr, u32 access, struct compound_hd
 
 	p = reserve_space(xdr, 8);
 	*p++ = cpu_to_be32(OP_ACCESS);
-	*p++ = cpu_to_be32(access);
+	*p = cpu_to_be32(access);
 	hdr->nops++;
 	hdr->replen += decode_access_maxsz;
 }
@@ -897,7 +897,7 @@ static void encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg
 	p = reserve_space(xdr, 8+NFS4_STATEID_SIZE);
 	*p++ = cpu_to_be32(OP_CLOSE);
 	*p++ = cpu_to_be32(arg->seqid->sequence->counter);
-	p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE);
+	xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE);
 	hdr->nops++;
 	hdr->replen += decode_close_maxsz;
 }
@@ -909,7 +909,7 @@ static void encode_commit(struct xdr_stream *xdr, const struct nfs_writeargs *ar
 	p = reserve_space(xdr, 16);
 	*p++ = cpu_to_be32(OP_COMMIT);
 	p = xdr_encode_hyper(p, args->offset);
-	*p++ = cpu_to_be32(args->count);
+	*p = cpu_to_be32(args->count);
 	hdr->nops++;
 	hdr->replen += decode_commit_maxsz;
 }
@@ -920,19 +920,19 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *
 
 	p = reserve_space(xdr, 8);
 	*p++ = cpu_to_be32(OP_CREATE);
-	*p++ = cpu_to_be32(create->ftype);
+	*p = cpu_to_be32(create->ftype);
 
 	switch (create->ftype) {
 	case NF4LNK:
 		p = reserve_space(xdr, 4);
-		*p++ = cpu_to_be32(create->u.symlink.len);
+		*p = cpu_to_be32(create->u.symlink.len);
 		xdr_write_pages(xdr, create->u.symlink.pages, 0, create->u.symlink.len);
 		break;
 
 	case NF4BLK: case NF4CHR:
 		p = reserve_space(xdr, 8);
 		*p++ = cpu_to_be32(create->u.device.specdata1);
-		*p++ = cpu_to_be32(create->u.device.specdata2);
+		*p = cpu_to_be32(create->u.device.specdata2);
 		break;
 
 	default:
@@ -941,7 +941,7 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *
 
 	p = reserve_space(xdr, 4 + create->name->len);
 	*p++ = cpu_to_be32(create->name->len);
-	p = xdr_encode_opaque_fixed(p, create->name->name, create->name->len);
+	xdr_encode_opaque_fixed(p, create->name->name, create->name->len);
 	hdr->nops++;
 	hdr->replen += decode_create_maxsz;
 
@@ -955,7 +955,7 @@ static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct c
 	p = reserve_space(xdr, 12);
 	*p++ = cpu_to_be32(OP_GETATTR);
 	*p++ = cpu_to_be32(1);
-	*p++ = cpu_to_be32(bitmap);
+	*p = cpu_to_be32(bitmap);
 	hdr->nops++;
 	hdr->replen += decode_getattr_maxsz;
 }
@@ -968,7 +968,7 @@ static void encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm
 	*p++ = cpu_to_be32(OP_GETATTR);
 	*p++ = cpu_to_be32(2);
 	*p++ = cpu_to_be32(bm0);
-	*p++ = cpu_to_be32(bm1);
+	*p = cpu_to_be32(bm1);
 	hdr->nops++;
 	hdr->replen += decode_getattr_maxsz;
 }
@@ -996,7 +996,7 @@ static void encode_getfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
 	__be32 *p;
 
 	p = reserve_space(xdr, 4);
-	*p++ = cpu_to_be32(OP_GETFH);
+	*p = cpu_to_be32(OP_GETFH);
 	hdr->nops++;
 	hdr->replen += decode_getfh_maxsz;
 }
@@ -1008,7 +1008,7 @@ static void encode_link(struct xdr_stream *xdr, const struct qstr *name, struct
 	p = reserve_space(xdr, 8 + name->len);
 	*p++ = cpu_to_be32(OP_LINK);
 	*p++ = cpu_to_be32(name->len);
-	p = xdr_encode_opaque_fixed(p, name->name, name->len);
+	xdr_encode_opaque_fixed(p, name->name, name->len);
 	hdr->nops++;
 	hdr->replen += decode_link_maxsz;
 }
@@ -1041,7 +1041,7 @@ static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args
 	*p++ = cpu_to_be32(args->reclaim);
 	p = xdr_encode_hyper(p, args->fl->fl_start);
 	p = xdr_encode_hyper(p, nfs4_lock_length(args->fl));
-	*p++ = cpu_to_be32(args->new_lock_owner);
+	*p = cpu_to_be32(args->new_lock_owner);
 	if (args->new_lock_owner){
 		p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+32);
 		*p++ = cpu_to_be32(args->open_seqid->sequence->counter);
@@ -1050,12 +1050,12 @@ static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args
 		p = xdr_encode_hyper(p, args->lock_owner.clientid);
 		*p++ = cpu_to_be32(16);
 		p = xdr_encode_opaque_fixed(p, "lock id:", 8);
-		p = xdr_encode_hyper(p, args->lock_owner.id);
+		xdr_encode_hyper(p, args->lock_owner.id);
 	}
 	else {
 		p = reserve_space(xdr, NFS4_STATEID_SIZE+4);
 		p = xdr_encode_opaque_fixed(p, args->lock_stateid->data, NFS4_STATEID_SIZE);
-		*p++ = cpu_to_be32(args->lock_seqid->sequence->counter);
+		*p = cpu_to_be32(args->lock_seqid->sequence->counter);
 	}
 	hdr->nops++;
 	hdr->replen += decode_lock_maxsz;
@@ -1073,7 +1073,7 @@ static void encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *ar
 	p = xdr_encode_hyper(p, args->lock_owner.clientid);
 	*p++ = cpu_to_be32(16);
 	p = xdr_encode_opaque_fixed(p, "lock id:", 8);
-	p = xdr_encode_hyper(p, args->lock_owner.id);
+	xdr_encode_hyper(p, args->lock_owner.id);
 	hdr->nops++;
 	hdr->replen += decode_lockt_maxsz;
 }
@@ -1088,7 +1088,7 @@ static void encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *ar
 	*p++ = cpu_to_be32(args->seqid->sequence->counter);
 	p = xdr_encode_opaque_fixed(p, args->stateid->data, NFS4_STATEID_SIZE);
 	p = xdr_encode_hyper(p, args->fl->fl_start);
-	p = xdr_encode_hyper(p, nfs4_lock_length(args->fl));
+	xdr_encode_hyper(p, nfs4_lock_length(args->fl));
 	hdr->nops++;
 	hdr->replen += decode_locku_maxsz;
 }
@@ -1101,7 +1101,7 @@ static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struc
 	p = reserve_space(xdr, 8 + len);
 	*p++ = cpu_to_be32(OP_LOOKUP);
 	*p++ = cpu_to_be32(len);
-	p = xdr_encode_opaque_fixed(p, name->name, len);
+	xdr_encode_opaque_fixed(p, name->name, len);
 	hdr->nops++;
 	hdr->replen += decode_lookup_maxsz;
 }
@@ -1124,7 +1124,7 @@ static void encode_share_access(struct xdr_stream *xdr, fmode_t fmode)
 	default:
 		*p++ = cpu_to_be32(0);
 	}
-	*p++ = cpu_to_be32(0);		/* for linux, share_deny = 0 always */
+	*p = cpu_to_be32(0);		/* for linux, share_deny = 0 always */
 }
 
 static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_openargs *arg)
@@ -1136,13 +1136,13 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena
  */
 	p = reserve_space(xdr, 8);
 	*p++ = cpu_to_be32(OP_OPEN);
-	*p++ = cpu_to_be32(arg->seqid->sequence->counter);
+	*p = cpu_to_be32(arg->seqid->sequence->counter);
 	encode_share_access(xdr, arg->fmode);
 	p = reserve_space(xdr, 28);
 	p = xdr_encode_hyper(p, arg->clientid);
 	*p++ = cpu_to_be32(16);
 	p = xdr_encode_opaque_fixed(p, "open id:", 8);
-	p = xdr_encode_hyper(p, arg->id);
+	xdr_encode_hyper(p, arg->id);
 }
 
 static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_openargs *arg)
@@ -1152,11 +1152,11 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op
 	p = reserve_space(xdr, 4);
 	switch(arg->open_flags & O_EXCL) {
 	case 0:
-		*p++ = cpu_to_be32(NFS4_CREATE_UNCHECKED);
+		*p = cpu_to_be32(NFS4_CREATE_UNCHECKED);
 		encode_attrs(xdr, arg->u.attrs, arg->server);
 		break;
 	default:
-		*p++ = cpu_to_be32(NFS4_CREATE_EXCLUSIVE);
+		*p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE);
 		encode_nfs4_verifier(xdr, &arg->u.verifier);
 	}
 }
@@ -1168,11 +1168,11 @@ static void encode_opentype(struct xdr_stream *xdr, const struct nfs_openargs *a
 	p = reserve_space(xdr, 4);
 	switch (arg->open_flags & O_CREAT) {
 	case 0:
-		*p++ = cpu_to_be32(NFS4_OPEN_NOCREATE);
+		*p = cpu_to_be32(NFS4_OPEN_NOCREATE);
 		break;
 	default:
 		BUG_ON(arg->claim != NFS4_OPEN_CLAIM_NULL);
-		*p++ = cpu_to_be32(NFS4_OPEN_CREATE);
+		*p = cpu_to_be32(NFS4_OPEN_CREATE);
 		encode_createmode(xdr, arg);
 	}
 }
@@ -1184,13 +1184,13 @@ static inline void encode_delegation_type(struct xdr_stream *xdr, fmode_t delega
 	p = reserve_space(xdr, 4);
 	switch (delegation_type) {
 	case 0:
-		*p++ = cpu_to_be32(NFS4_OPEN_DELEGATE_NONE);
+		*p = cpu_to_be32(NFS4_OPEN_DELEGATE_NONE);
 		break;
 	case FMODE_READ:
-		*p++ = cpu_to_be32(NFS4_OPEN_DELEGATE_READ);
+		*p = cpu_to_be32(NFS4_OPEN_DELEGATE_READ);
 		break;
 	case FMODE_WRITE|FMODE_READ:
-		*p++ = cpu_to_be32(NFS4_OPEN_DELEGATE_WRITE);
+		*p = cpu_to_be32(NFS4_OPEN_DELEGATE_WRITE);
 		break;
 	default:
 		BUG();
@@ -1202,7 +1202,7 @@ static inline void encode_claim_null(struct xdr_stream *xdr, const struct qstr *
 	__be32 *p;
 
 	p = reserve_space(xdr, 4);
-	*p++ = cpu_to_be32(NFS4_OPEN_CLAIM_NULL);
+	*p = cpu_to_be32(NFS4_OPEN_CLAIM_NULL);
 	encode_string(xdr, name->len, name->name);
 }
 
@@ -1211,7 +1211,7 @@ static inline void encode_claim_previous(struct xdr_stream *xdr, fmode_t type)
 	__be32 *p;
 
 	p = reserve_space(xdr, 4);
-	*p++ = cpu_to_be32(NFS4_OPEN_CLAIM_PREVIOUS);
+	*p = cpu_to_be32(NFS4_OPEN_CLAIM_PREVIOUS);
 	encode_delegation_type(xdr, type);
 }
 
@@ -1221,7 +1221,7 @@ static inline void encode_claim_delegate_cur(struct xdr_stream *xdr, const struc
 
 	p = reserve_space(xdr, 4+NFS4_STATEID_SIZE);
 	*p++ = cpu_to_be32(NFS4_OPEN_CLAIM_DELEGATE_CUR);
-	p = xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE);
+	xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE);
 	encode_string(xdr, name->len, name->name);
 }
 
@@ -1253,7 +1253,7 @@ static void encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_co
 	p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4);
 	*p++ = cpu_to_be32(OP_OPEN_CONFIRM);
 	p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE);
-	*p++ = cpu_to_be32(arg->seqid->sequence->counter);
+	*p = cpu_to_be32(arg->seqid->sequence->counter);
 	hdr->nops++;
 	hdr->replen += decode_open_confirm_maxsz;
 }
@@ -1265,7 +1265,7 @@ static void encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_close
 	p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4);
 	*p++ = cpu_to_be32(OP_OPEN_DOWNGRADE);
 	p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE);
-	*p++ = cpu_to_be32(arg->seqid->sequence->counter);
+	*p = cpu_to_be32(arg->seqid->sequence->counter);
 	encode_share_access(xdr, arg->fmode);
 	hdr->nops++;
 	hdr->replen += decode_open_downgrade_maxsz;
@@ -1280,7 +1280,7 @@ encode_putfh(struct xdr_stream *xdr, const struct nfs_fh *fh, struct compound_hd
 	p = reserve_space(xdr, 8 + len);
 	*p++ = cpu_to_be32(OP_PUTFH);
 	*p++ = cpu_to_be32(len);
-	p = xdr_encode_opaque_fixed(p, fh->data, len);
+	xdr_encode_opaque_fixed(p, fh->data, len);
 	hdr->nops++;
 	hdr->replen += decode_putfh_maxsz;
 }
@@ -1290,7 +1290,7 @@ static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
 	__be32 *p;
 
 	p = reserve_space(xdr, 4);
-	*p++ = cpu_to_be32(OP_PUTROOTFH);
+	*p = cpu_to_be32(OP_PUTROOTFH);
 	hdr->nops++;
 	hdr->replen += decode_putrootfh_maxsz;
 }
@@ -1303,9 +1303,9 @@ static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context
 	p = reserve_space(xdr, NFS4_STATEID_SIZE);
 	if (ctx->state != NULL) {
 		nfs4_copy_stateid(&stateid, ctx->state, ctx->lockowner);
-		p = xdr_encode_opaque_fixed(p, stateid.data, NFS4_STATEID_SIZE);
+		xdr_encode_opaque_fixed(p, stateid.data, NFS4_STATEID_SIZE);
 	} else
-		p = xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE);
+		xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE);
 }
 
 static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args, struct compound_hdr *hdr)
@@ -1313,13 +1313,13 @@ static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args,
 	__be32 *p;
 
 	p = reserve_space(xdr, 4);
-	*p++ = cpu_to_be32(OP_READ);
+	*p = cpu_to_be32(OP_READ);
 
 	encode_stateid(xdr, args->context);
 
 	p = reserve_space(xdr, 12);
 	p = xdr_encode_hyper(p, args->offset);
-	*p++ = cpu_to_be32(args->count);
+	*p = cpu_to_be32(args->count);
 	hdr->nops++;
 	hdr->replen += decode_read_maxsz;
 }
@@ -1345,7 +1345,7 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
 	else
 		attrs[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
 	*p++ = cpu_to_be32(attrs[0] & readdir->bitmask[0]);
-	*p++ = cpu_to_be32(attrs[1] & readdir->bitmask[1]);
+	*p = cpu_to_be32(attrs[1] & readdir->bitmask[1]);
 	hdr->nops++;
 	hdr->replen += decode_readdir_maxsz;
 	dprintk("%s: cookie = %Lu, verifier = %08x:%08x, bitmap = %08x:%08x\n",
@@ -1362,7 +1362,7 @@ static void encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink *
 	__be32 *p;
 
 	p = reserve_space(xdr, 4);
-	*p++ = cpu_to_be32(OP_READLINK);
+	*p = cpu_to_be32(OP_READLINK);
 	hdr->nops++;
 	hdr->replen += decode_readlink_maxsz;
 }
@@ -1374,7 +1374,7 @@ static void encode_remove(struct xdr_stream *xdr, const struct qstr *name, struc
 	p = reserve_space(xdr, 8 + name->len);
 	*p++ = cpu_to_be32(OP_REMOVE);
 	*p++ = cpu_to_be32(name->len);
-	p = xdr_encode_opaque_fixed(p, name->name, name->len);
+	xdr_encode_opaque_fixed(p, name->name, name->len);
 	hdr->nops++;
 	hdr->replen += decode_remove_maxsz;
 }
@@ -1386,11 +1386,11 @@ static void encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, co
 	p = reserve_space(xdr, 8 + oldname->len);
 	*p++ = cpu_to_be32(OP_RENAME);
 	*p++ = cpu_to_be32(oldname->len);
-	p = xdr_encode_opaque_fixed(p, oldname->name, oldname->len);
+	xdr_encode_opaque_fixed(p, oldname->name, oldname->len);
 
 	p = reserve_space(xdr, 4 + newname->len);
 	*p++ = cpu_to_be32(newname->len);
-	p = xdr_encode_opaque_fixed(p, newname->name, newname->len);
+	xdr_encode_opaque_fixed(p, newname->name, newname->len);
 	hdr->nops++;
 	hdr->replen += decode_rename_maxsz;
 }
@@ -1401,7 +1401,7 @@ static void encode_renew(struct xdr_stream *xdr, const struct nfs_client *client
 
 	p = reserve_space(xdr, 12);
 	*p++ = cpu_to_be32(OP_RENEW);
-	p = xdr_encode_hyper(p, client_stateid->cl_clientid);
+	xdr_encode_hyper(p, client_stateid->cl_clientid);
 	hdr->nops++;
 	hdr->replen += decode_renew_maxsz;
 }
@@ -1412,7 +1412,7 @@ encode_restorefh(struct xdr_stream *xdr, struct compound_hdr *hdr)
 	__be32 *p;
 
 	p = reserve_space(xdr, 4);
-	*p++ = cpu_to_be32(OP_RESTOREFH);
+	*p = cpu_to_be32(OP_RESTOREFH);
 	hdr->nops++;
 	hdr->replen += decode_restorefh_maxsz;
 }
@@ -1424,14 +1424,14 @@ encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compoun
 
 	p = reserve_space(xdr, 4+NFS4_STATEID_SIZE);
 	*p++ = cpu_to_be32(OP_SETATTR);
-	p = xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE);
+	xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE);
 	p = reserve_space(xdr, 2*4);
 	*p++ = cpu_to_be32(1);
-	*p++ = cpu_to_be32(FATTR4_WORD0_ACL);
+	*p = cpu_to_be32(FATTR4_WORD0_ACL);
 	if (arg->acl_len % 4)
 		return -EINVAL;
 	p = reserve_space(xdr, 4);
-	*p++ = cpu_to_be32(arg->acl_len);
+	*p = cpu_to_be32(arg->acl_len);
 	xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len);
 	hdr->nops++;
 	hdr->replen += decode_setacl_maxsz;
@@ -1444,7 +1444,7 @@ encode_savefh(struct xdr_stream *xdr, struct compound_hdr *hdr)
 	__be32 *p;
 
 	p = reserve_space(xdr, 4);
-	*p++ = cpu_to_be32(OP_SAVEFH);
+	*p = cpu_to_be32(OP_SAVEFH);
 	hdr->nops++;
 	hdr->replen += decode_savefh_maxsz;
 }
@@ -1455,7 +1455,7 @@ static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs
 
 	p = reserve_space(xdr, 4+NFS4_STATEID_SIZE);
 	*p++ = cpu_to_be32(OP_SETATTR);
-	p = xdr_encode_opaque_fixed(p, arg->stateid.data, NFS4_STATEID_SIZE);
+	xdr_encode_opaque_fixed(p, arg->stateid.data, NFS4_STATEID_SIZE);
 	hdr->nops++;
 	hdr->replen += decode_setattr_maxsz;
 	encode_attrs(xdr, arg->iap, server);
@@ -1467,15 +1467,15 @@ static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclie
 
 	p = reserve_space(xdr, 4 + NFS4_VERIFIER_SIZE);
 	*p++ = cpu_to_be32(OP_SETCLIENTID);
-	p = xdr_encode_opaque_fixed(p, setclientid->sc_verifier->data, NFS4_VERIFIER_SIZE);
+	xdr_encode_opaque_fixed(p, setclientid->sc_verifier->data, NFS4_VERIFIER_SIZE);
 
 	encode_string(xdr, setclientid->sc_name_len, setclientid->sc_name);
 	p = reserve_space(xdr, 4);
-	*p++ = cpu_to_be32(setclientid->sc_prog);
+	*p = cpu_to_be32(setclientid->sc_prog);
 	encode_string(xdr, setclientid->sc_netid_len, setclientid->sc_netid);
 	encode_string(xdr, setclientid->sc_uaddr_len, setclientid->sc_uaddr);
 	p = reserve_space(xdr, 4);
-	*p++ = cpu_to_be32(setclientid->sc_cb_ident);
+	*p = cpu_to_be32(setclientid->sc_cb_ident);
 	hdr->nops++;
 	hdr->replen += decode_setclientid_maxsz;
 }
@@ -1487,7 +1487,7 @@ static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs_
 	p = reserve_space(xdr, 12 + NFS4_VERIFIER_SIZE);
 	*p++ = cpu_to_be32(OP_SETCLIENTID_CONFIRM);
 	p = xdr_encode_hyper(p, client_state->cl_clientid);
-	p = xdr_encode_opaque_fixed(p, client_state->cl_confirm.data, NFS4_VERIFIER_SIZE);
+	xdr_encode_opaque_fixed(p, client_state->cl_confirm.data, NFS4_VERIFIER_SIZE);
 	hdr->nops++;
 	hdr->replen += decode_setclientid_confirm_maxsz;
 }
@@ -1497,14 +1497,14 @@ static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *arg
 	__be32 *p;
 
 	p = reserve_space(xdr, 4);
-	*p++ = cpu_to_be32(OP_WRITE);
+	*p = cpu_to_be32(OP_WRITE);
 
 	encode_stateid(xdr, args->context);
 
 	p = reserve_space(xdr, 16);
 	p = xdr_encode_hyper(p, args->offset);
 	*p++ = cpu_to_be32(args->stable);
-	*p++ = cpu_to_be32(args->count);
+	*p = cpu_to_be32(args->count);
 
 	xdr_write_pages(xdr, args->pages, args->pgbase, args->count);
 	hdr->nops++;
@@ -1518,7 +1518,7 @@ static void encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *state
 	p = reserve_space(xdr, 4+NFS4_STATEID_SIZE);
 
 	*p++ = cpu_to_be32(OP_DELEGRETURN);
-	p = xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE);
+	xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE);
 	hdr->nops++;
 	hdr->replen += decode_delegreturn_maxsz;
 }
@@ -1533,14 +1533,14 @@ static void encode_exchange_id(struct xdr_stream *xdr,
 
 	p = reserve_space(xdr, 4 + sizeof(args->verifier->data));
 	*p++ = cpu_to_be32(OP_EXCHANGE_ID);
-	p = xdr_encode_opaque_fixed(p, args->verifier->data, sizeof(args->verifier->data));
+	xdr_encode_opaque_fixed(p, args->verifier->data, sizeof(args->verifier->data));
 
 	encode_string(xdr, args->id_len, args->id);
 
 	p = reserve_space(xdr, 12);
 	*p++ = cpu_to_be32(args->flags);
 	*p++ = cpu_to_be32(0);	/* zero length state_protect4_a */
-	*p++ = cpu_to_be32(0);	/* zero length implementation id array */
+	*p = cpu_to_be32(0);	/* zero length implementation id array */
 	hdr->nops++;
 	hdr->replen += decode_exchange_id_maxsz;
 }
@@ -1591,7 +1591,7 @@ static void encode_create_session(struct xdr_stream *xdr,
 	p = xdr_encode_opaque_fixed(p, machine_name, len);
 	*p++ = cpu_to_be32(0);				/* UID */
 	*p++ = cpu_to_be32(0);				/* GID */
-	*p++ = cpu_to_be32(0);				/* No more gids */
+	*p = cpu_to_be32(0);				/* No more gids */
 	hdr->nops++;
 	hdr->replen += decode_create_session_maxsz;
 }
@@ -1603,7 +1603,7 @@ static void encode_destroy_session(struct xdr_stream *xdr,
 	__be32 *p;
 	p = reserve_space(xdr, 4 + NFS4_MAX_SESSIONID_LEN);
 	*p++ = cpu_to_be32(OP_DESTROY_SESSION);
-	p = xdr_encode_opaque_fixed(p, session->sess_id.data, NFS4_MAX_SESSIONID_LEN);
+	xdr_encode_opaque_fixed(p, session->sess_id.data, NFS4_MAX_SESSIONID_LEN);
 	hdr->nops++;
 	hdr->replen += decode_destroy_session_maxsz;
 }
@@ -1646,7 +1646,7 @@ static void encode_sequence(struct xdr_stream *xdr,
 	*p++ = cpu_to_be32(slot->seq_nr);
 	*p++ = cpu_to_be32(args->sa_slotid);
 	*p++ = cpu_to_be32(tp->highest_used_slotid);
-	*p++ = cpu_to_be32(args->sa_cache_this);
+	*p = cpu_to_be32(args->sa_cache_this);
 	hdr->nops++;
 	hdr->replen += decode_sequence_maxsz;
 #endif /* CONFIG_NFS_V4_1 */
-- 
cgit v1.2.3-70-g09d2


From 811652bd6edd66dd35bf9caacdfe96d19f75a47e Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Fri, 14 Aug 2009 17:19:34 +0300
Subject: nfs: nfs4xdr: merge xdr_encode_int+xdr_encode_opaque_fixed into
 xdr_encode_opaque

use encode_string where appropriate.

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4xdr.c | 42 ++++++++++++++----------------------------
 1 file changed, 14 insertions(+), 28 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index d75f821cad01..efa78ad98228 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -732,9 +732,8 @@ static void encode_compound_hdr(struct xdr_stream *xdr,
 
 	dprintk("encode_compound: tag=%.*s\n", (int)hdr->taglen, hdr->tag);
 	BUG_ON(hdr->taglen > NFS4_MAXTAGLEN);
-	p = reserve_space(xdr, 12 + hdr->taglen);
-	*p++ = cpu_to_be32(hdr->taglen);
-	p = xdr_encode_opaque_fixed(p, hdr->tag, hdr->taglen);
+	p = reserve_space(xdr, 4 + hdr->taglen + 8);
+	p = xdr_encode_opaque(p, hdr->tag, hdr->taglen);
 	*p++ = cpu_to_be32(hdr->minorversion);
 	hdr->nops_p = p;
 	*p = cpu_to_be32(hdr->nops);
@@ -832,13 +831,11 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
 	}
 	if (iap->ia_valid & ATTR_UID) {
 		bmval1 |= FATTR4_WORD1_OWNER;
-		*p++ = cpu_to_be32(owner_namelen);
-		p = xdr_encode_opaque_fixed(p, owner_name, owner_namelen);
+		p = xdr_encode_opaque(p, owner_name, owner_namelen);
 	}
 	if (iap->ia_valid & ATTR_GID) {
 		bmval1 |= FATTR4_WORD1_OWNER_GROUP;
-		*p++ = cpu_to_be32(owner_grouplen);
-		p = xdr_encode_opaque_fixed(p, owner_group, owner_grouplen);
+		p = xdr_encode_opaque(p, owner_group, owner_grouplen);
 	}
 	if (iap->ia_valid & ATTR_ATIME_SET) {
 		bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET;
@@ -939,9 +936,7 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *
 		break;
 	}
 
-	p = reserve_space(xdr, 4 + create->name->len);
-	*p++ = cpu_to_be32(create->name->len);
-	xdr_encode_opaque_fixed(p, create->name->name, create->name->len);
+	encode_string(xdr, create->name->len, create->name->name);
 	hdr->nops++;
 	hdr->replen += decode_create_maxsz;
 
@@ -1007,8 +1002,7 @@ static void encode_link(struct xdr_stream *xdr, const struct qstr *name, struct
 
 	p = reserve_space(xdr, 8 + name->len);
 	*p++ = cpu_to_be32(OP_LINK);
-	*p++ = cpu_to_be32(name->len);
-	xdr_encode_opaque_fixed(p, name->name, name->len);
+	xdr_encode_opaque(p, name->name, name->len);
 	hdr->nops++;
 	hdr->replen += decode_link_maxsz;
 }
@@ -1100,8 +1094,7 @@ static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struc
 
 	p = reserve_space(xdr, 8 + len);
 	*p++ = cpu_to_be32(OP_LOOKUP);
-	*p++ = cpu_to_be32(len);
-	xdr_encode_opaque_fixed(p, name->name, len);
+	xdr_encode_opaque(p, name->name, len);
 	hdr->nops++;
 	hdr->replen += decode_lookup_maxsz;
 }
@@ -1279,8 +1272,7 @@ encode_putfh(struct xdr_stream *xdr, const struct nfs_fh *fh, struct compound_hd
 
 	p = reserve_space(xdr, 8 + len);
 	*p++ = cpu_to_be32(OP_PUTFH);
-	*p++ = cpu_to_be32(len);
-	xdr_encode_opaque_fixed(p, fh->data, len);
+	xdr_encode_opaque(p, fh->data, len);
 	hdr->nops++;
 	hdr->replen += decode_putfh_maxsz;
 }
@@ -1373,8 +1365,7 @@ static void encode_remove(struct xdr_stream *xdr, const struct qstr *name, struc
 
 	p = reserve_space(xdr, 8 + name->len);
 	*p++ = cpu_to_be32(OP_REMOVE);
-	*p++ = cpu_to_be32(name->len);
-	xdr_encode_opaque_fixed(p, name->name, name->len);
+	xdr_encode_opaque(p, name->name, name->len);
 	hdr->nops++;
 	hdr->replen += decode_remove_maxsz;
 }
@@ -1383,14 +1374,10 @@ static void encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, co
 {
 	__be32 *p;
 
-	p = reserve_space(xdr, 8 + oldname->len);
-	*p++ = cpu_to_be32(OP_RENAME);
-	*p++ = cpu_to_be32(oldname->len);
-	xdr_encode_opaque_fixed(p, oldname->name, oldname->len);
-
-	p = reserve_space(xdr, 4 + newname->len);
-	*p++ = cpu_to_be32(newname->len);
-	xdr_encode_opaque_fixed(p, newname->name, newname->len);
+	p = reserve_space(xdr, 4);
+	*p = cpu_to_be32(OP_RENAME);
+	encode_string(xdr, oldname->len, oldname->name);
+	encode_string(xdr, newname->len, newname->name);
 	hdr->nops++;
 	hdr->replen += decode_rename_maxsz;
 }
@@ -1587,8 +1574,7 @@ static void encode_create_session(struct xdr_stream *xdr,
 
 	/* authsys_parms rfc1831 */
 	*p++ = cpu_to_be32((u32)clp->cl_boot_time.tv_nsec);	/* stamp */
-	*p++ = cpu_to_be32(len);
-	p = xdr_encode_opaque_fixed(p, machine_name, len);
+	p = xdr_encode_opaque(p, machine_name, len);
 	*p++ = cpu_to_be32(0);				/* UID */
 	*p++ = cpu_to_be32(0);				/* GID */
 	*p = cpu_to_be32(0);				/* No more gids */
-- 
cgit v1.2.3-70-g09d2


From 6f723f7710024bb151ca8c5277ce8c71beec4db8 Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Fri, 14 Aug 2009 17:19:37 +0300
Subject: nfs: nfs4xdr: get rid of READ32

s/READ32\((.*)\)/\1 = be32_to_cpup(p++)/

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4xdr.c | 145 +++++++++++++++++++++++++++----------------------------
 1 file changed, 72 insertions(+), 73 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index efa78ad98228..5f6b46f17c5b 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -2433,7 +2433,6 @@ static int nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req, uint32_t *p,
  * task to translate them into Linux-specific versions which are more
  * consistent with the style used in NFSv2/v3...
  */
-#define READ32(x)         (x) = ntohl(*p++)
 #define READ64(x)         do {			\
 	(x) = (u64)ntohl(*p++) << 32;		\
 	(x) |= ntohl(*p++);			\
@@ -2464,7 +2463,7 @@ static int decode_opaque_inline(struct xdr_stream *xdr, unsigned int *len, char
 	__be32 *p;
 
 	READ_BUF(4);
-	READ32(*len);
+	*len = be32_to_cpup(p++);
 	READ_BUF(*len);
 	*string = (char *)p;
 	return 0;
@@ -2475,13 +2474,13 @@ static int decode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr)
 	__be32 *p;
 
 	READ_BUF(8);
-	READ32(hdr->status);
-	READ32(hdr->taglen);
+	hdr->status = be32_to_cpup(p++);
+	hdr->taglen = be32_to_cpup(p++);
 
 	READ_BUF(hdr->taglen + 4);
 	hdr->tag = (char *)p;
 	p += XDR_QUADLEN(hdr->taglen);
-	READ32(hdr->nops);
+	hdr->nops = be32_to_cpup(p++);
 	if (unlikely(hdr->nops < 1))
 		return nfs4_stat_to_errno(hdr->status);
 	return 0;
@@ -2494,14 +2493,14 @@ static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
 	int32_t nfserr;
 
 	READ_BUF(8);
-	READ32(opnum);
+	opnum = be32_to_cpup(p++);
 	if (opnum != expected) {
 		dprintk("nfs: Server returned operation"
 			" %d but we issued a request for %d\n",
 				opnum, expected);
 		return -EIO;
 	}
-	READ32(nfserr);
+	nfserr = be32_to_cpup(p++);
 	if (nfserr != NFS_OK)
 		return nfs4_stat_to_errno(nfserr);
 	return 0;
@@ -2524,14 +2523,14 @@ static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap)
 	__be32 *p;
 
 	READ_BUF(4);
-	READ32(bmlen);
+	bmlen = be32_to_cpup(p++);
 
 	bitmap[0] = bitmap[1] = 0;
 	READ_BUF((bmlen << 2));
 	if (bmlen > 0) {
-		READ32(bitmap[0]);
+		bitmap[0] = be32_to_cpup(p++);
 		if (bmlen > 1)
-			READ32(bitmap[1]);
+			bitmap[1] = be32_to_cpup(p++);
 	}
 	return 0;
 }
@@ -2541,7 +2540,7 @@ static inline int decode_attr_length(struct xdr_stream *xdr, uint32_t *attrlen,
 	__be32 *p;
 
 	READ_BUF(4);
-	READ32(*attrlen);
+	*attrlen = be32_to_cpup(p++);
 	*savep = xdr->p;
 	return 0;
 }
@@ -2567,7 +2566,7 @@ static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_TYPE)) {
 		READ_BUF(4);
-		READ32(*type);
+		*type = be32_to_cpup(p++);
 		if (*type < NF4REG || *type > NF4NAMEDATTR) {
 			dprintk("%s: bad type %d\n", __func__, *type);
 			return -EIO;
@@ -2625,7 +2624,7 @@ static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, ui
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_LINK_SUPPORT)) {
 		READ_BUF(4);
-		READ32(*res);
+		*res = be32_to_cpup(p++);
 		bitmap[0] &= ~FATTR4_WORD0_LINK_SUPPORT;
 	}
 	dprintk("%s: link support=%s\n", __func__, *res == 0 ? "false" : "true");
@@ -2641,7 +2640,7 @@ static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap,
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_SYMLINK_SUPPORT)) {
 		READ_BUF(4);
-		READ32(*res);
+		*res = be32_to_cpup(p++);
 		bitmap[0] &= ~FATTR4_WORD0_SYMLINK_SUPPORT;
 	}
 	dprintk("%s: symlink support=%s\n", __func__, *res == 0 ? "false" : "true");
@@ -2679,7 +2678,7 @@ static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_LEASE_TIME)) {
 		READ_BUF(4);
-		READ32(*res);
+		*res = be32_to_cpup(p++);
 		bitmap[0] &= ~FATTR4_WORD0_LEASE_TIME;
 	}
 	dprintk("%s: file size=%u\n", __func__, (unsigned int)*res);
@@ -2695,7 +2694,7 @@ static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_ACLSUPPORT)) {
 		READ_BUF(4);
-		READ32(*res);
+		*res = be32_to_cpup(p++);
 		bitmap[0] &= ~FATTR4_WORD0_ACLSUPPORT;
 	}
 	dprintk("%s: ACLs supported=%u\n", __func__, (unsigned int)*res);
@@ -2796,7 +2795,7 @@ static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path)
 	int status = 0;
 
 	READ_BUF(4);
-	READ32(n);
+	n = be32_to_cpup(p++);
 	if (n == 0)
 		goto root_path;
 	dprintk("path ");
@@ -2848,7 +2847,7 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
 	if (unlikely(status != 0))
 		goto out;
 	READ_BUF(4);
-	READ32(n);
+	n = be32_to_cpup(p++);
 	if (n <= 0)
 		goto out_eio;
 	res->nlocations = 0;
@@ -2857,7 +2856,7 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
 		struct nfs4_fs_location *loc = &res->locations[res->nlocations];
 
 		READ_BUF(4);
-		READ32(m);
+		m = be32_to_cpup(p++);
 
 		loc->nservers = 0;
 		dprintk("%s: servers ", __func__);
@@ -2928,7 +2927,7 @@ static int decode_attr_maxlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_MAXLINK)) {
 		READ_BUF(4);
-		READ32(*maxlink);
+		*maxlink = be32_to_cpup(p++);
 		bitmap[0] &= ~FATTR4_WORD0_MAXLINK;
 	}
 	dprintk("%s: maxlink=%u\n", __func__, *maxlink);
@@ -2945,7 +2944,7 @@ static int decode_attr_maxname(struct xdr_stream *xdr, uint32_t *bitmap, uint32_
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_MAXNAME)) {
 		READ_BUF(4);
-		READ32(*maxname);
+		*maxname = be32_to_cpup(p++);
 		bitmap[0] &= ~FATTR4_WORD0_MAXNAME;
 	}
 	dprintk("%s: maxname=%u\n", __func__, *maxname);
@@ -3005,7 +3004,7 @@ static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *m
 		return -EIO;
 	if (likely(bitmap[1] & FATTR4_WORD1_MODE)) {
 		READ_BUF(4);
-		READ32(tmp);
+		tmp = be32_to_cpup(p++);
 		*mode = tmp & ~S_IFMT;
 		bitmap[1] &= ~FATTR4_WORD1_MODE;
 		ret = NFS_ATTR_FATTR_MODE;
@@ -3024,7 +3023,7 @@ static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t
 		return -EIO;
 	if (likely(bitmap[1] & FATTR4_WORD1_NUMLINKS)) {
 		READ_BUF(4);
-		READ32(*nlink);
+		*nlink = be32_to_cpup(p++);
 		bitmap[1] &= ~FATTR4_WORD1_NUMLINKS;
 		ret = NFS_ATTR_FATTR_NLINK;
 	}
@@ -3043,7 +3042,7 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
 		return -EIO;
 	if (likely(bitmap[1] & FATTR4_WORD1_OWNER)) {
 		READ_BUF(4);
-		READ32(len);
+		len = be32_to_cpup(p++);
 		READ_BUF(len);
 		if (len < XDR_MAX_NETOBJ) {
 			if (nfs_map_name_to_uid(clp, (char *)p, len, uid) == 0)
@@ -3071,7 +3070,7 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
 		return -EIO;
 	if (likely(bitmap[1] & FATTR4_WORD1_OWNER_GROUP)) {
 		READ_BUF(4);
-		READ32(len);
+		len = be32_to_cpup(p++);
 		READ_BUF(len);
 		if (len < XDR_MAX_NETOBJ) {
 			if (nfs_map_group_to_gid(clp, (char *)p, len, gid) == 0)
@@ -3101,8 +3100,8 @@ static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rde
 		dev_t tmp;
 
 		READ_BUF(8);
-		READ32(major);
-		READ32(minor);
+		major = be32_to_cpup(p++);
+		minor = be32_to_cpup(p++);
 		tmp = MKDEV(major, minor);
 		if (MAJOR(tmp) == major && MINOR(tmp) == minor)
 			*rdev = tmp;
@@ -3191,7 +3190,7 @@ static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time)
 
 	READ_BUF(12);
 	READ64(sec);
-	READ32(nsec);
+	nsec = be32_to_cpup(p++);
 	time->tv_sec = (time_t)sec;
 	time->tv_nsec = (long)nsec;
 	return 0;
@@ -3273,7 +3272,7 @@ static int decode_change_info(struct xdr_stream *xdr, struct nfs4_change_info *c
 	__be32 *p;
 
 	READ_BUF(20);
-	READ32(cinfo->atomic);
+	cinfo->atomic = be32_to_cpup(p++);
 	READ64(cinfo->before);
 	READ64(cinfo->after);
 	return 0;
@@ -3289,8 +3288,8 @@ static int decode_access(struct xdr_stream *xdr, struct nfs4_accessres *access)
 	if (status)
 		return status;
 	READ_BUF(8);
-	READ32(supp);
-	READ32(acc);
+	supp = be32_to_cpup(p++);
+	acc = be32_to_cpup(p++);
 	access->supported = supp;
 	access->access = acc;
 	return 0;
@@ -3336,7 +3335,7 @@ static int decode_create(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
 	if ((status = decode_change_info(xdr, cinfo)))
 		return status;
 	READ_BUF(4);
-	READ32(bmlen);
+	bmlen = be32_to_cpup(p++);
 	READ_BUF(bmlen << 2);
 	return 0;
 }
@@ -3591,7 +3590,7 @@ static int decode_getfh(struct xdr_stream *xdr, struct nfs_fh *fh)
 		return status;
 
 	READ_BUF(4);
-	READ32(len);
+	len = be32_to_cpup(p++);
 	if (len > NFS4_FHSIZE)
 		return -EIO;
 	fh->size = len;
@@ -3622,7 +3621,7 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl)
 	READ_BUF(32);
 	READ64(offset);
 	READ64(length);
-	READ32(type);
+	type = be32_to_cpup(p++);
 	if (fl != NULL) {
 		fl->fl_start = (loff_t)offset;
 		fl->fl_end = fl->fl_start + (loff_t)length - 1;
@@ -3634,7 +3633,7 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl)
 		fl->fl_pid = 0;
 	}
 	READ64(clientid);
-	READ32(namelen);
+	namelen = be32_to_cpup(p++);
 	READ_BUF(namelen);
 	return -NFS4ERR_DENIED;
 }
@@ -3695,14 +3694,14 @@ static int decode_space_limit(struct xdr_stream *xdr, u64 *maxsize)
 	uint32_t limit_type, nblocks, blocksize;
 
 	READ_BUF(12);
-	READ32(limit_type);
+	limit_type = be32_to_cpup(p++);
 	switch (limit_type) {
 	case 1:
 		READ64(*maxsize);
 		break;
 	case 2:
-		READ32(nblocks);
-		READ32(blocksize);
+		nblocks = be32_to_cpup(p++);
+		blocksize = be32_to_cpup(p++);
 		*maxsize = (uint64_t)nblocks * (uint64_t)blocksize;
 	}
 	return 0;
@@ -3714,14 +3713,14 @@ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
 	uint32_t delegation_type;
 
 	READ_BUF(4);
-	READ32(delegation_type);
+	delegation_type = be32_to_cpup(p++);
 	if (delegation_type == NFS4_OPEN_DELEGATE_NONE) {
 		res->delegation_type = 0;
 		return 0;
 	}
 	READ_BUF(NFS4_STATEID_SIZE+4);
 	COPYMEM(res->delegation.data, NFS4_STATEID_SIZE);
-	READ32(res->do_recall);
+	res->do_recall = be32_to_cpup(p++);
 
 	switch (delegation_type) {
 	case NFS4_OPEN_DELEGATE_READ:
@@ -3752,15 +3751,15 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
 	decode_change_info(xdr, &res->cinfo);
 
 	READ_BUF(8);
-	READ32(res->rflags);
-	READ32(bmlen);
+	res->rflags = be32_to_cpup(p++);
+	bmlen = be32_to_cpup(p++);
 	if (bmlen > 10)
 		goto xdr_error;
 
 	READ_BUF(bmlen << 2);
 	savewords = min_t(uint32_t, bmlen, NFS4_BITMAP_SIZE);
 	for (i = 0; i < savewords; ++i)
-		READ32(res->attrset[i]);
+		res->attrset[i] = be32_to_cpup(p++);
 	for (; i < NFS4_BITMAP_SIZE; i++)
 		res->attrset[i] = 0;
 
@@ -3821,8 +3820,8 @@ static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs_
 	if (status)
 		return status;
 	READ_BUF(8);
-	READ32(eof);
-	READ32(count);
+	eof = be32_to_cpup(p++);
+	count = be32_to_cpup(p++);
 	hdrlen = (u8 *) p - (u8 *) iov->iov_base;
 	recvd = req->rq_rcv_buf.len - hdrlen;
 	if (count > recvd) {
@@ -3948,7 +3947,7 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
 
 	/* Convert length of symlink */
 	READ_BUF(4);
-	READ32(len);
+	len = be32_to_cpup(p++);
 	if (len >= rcvbuf->page_len || len <= 0) {
 		dprintk("nfs: server returned giant symlink!\n");
 		return -ENAMETOOLONG;
@@ -4070,7 +4069,7 @@ static int decode_setattr(struct xdr_stream *xdr)
 	if (status)
 		return status;
 	READ_BUF(4);
-	READ32(bmlen);
+	bmlen = be32_to_cpup(p++);
 	READ_BUF(bmlen << 2);
 	return 0;
 }
@@ -4082,13 +4081,13 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp)
 	int32_t nfserr;
 
 	READ_BUF(8);
-	READ32(opnum);
+	opnum = be32_to_cpup(p++);
 	if (opnum != OP_SETCLIENTID) {
 		dprintk("nfs: decode_setclientid: Server returned operation"
 			" %d\n", opnum);
 		return -EIO;
 	}
-	READ32(nfserr);
+	nfserr = be32_to_cpup(p++);
 	if (nfserr == NFS_OK) {
 		READ_BUF(8 + NFS4_VERIFIER_SIZE);
 		READ64(clp->cl_clientid);
@@ -4098,12 +4097,12 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp)
 
 		/* skip netid string */
 		READ_BUF(4);
-		READ32(len);
+		len = be32_to_cpup(p++);
 		READ_BUF(len);
 
 		/* skip uaddr string */
 		READ_BUF(4);
-		READ32(len);
+		len = be32_to_cpup(p++);
 		READ_BUF(len);
 		return -NFSERR_CLID_INUSE;
 	} else
@@ -4127,8 +4126,8 @@ static int decode_write(struct xdr_stream *xdr, struct nfs_writeres *res)
 		return status;
 
 	READ_BUF(16);
-	READ32(res->count);
-	READ32(res->verf->committed);
+	res->count = be32_to_cpup(p++);
+	res->verf->committed = be32_to_cpup(p++);
 	COPYMEM(res->verf->verifier, 8);
 	return 0;
 }
@@ -4154,11 +4153,11 @@ static int decode_exchange_id(struct xdr_stream *xdr,
 	READ_BUF(8);
 	READ64(clp->cl_ex_clid);
 	READ_BUF(12);
-	READ32(clp->cl_seqid);
-	READ32(clp->cl_exchange_flags);
+	clp->cl_seqid = be32_to_cpup(p++);
+	clp->cl_exchange_flags = be32_to_cpup(p++);
 
 	/* We ask for SP4_NONE */
-	READ32(dummy);
+	dummy = be32_to_cpup(p++);
 	if (dummy != SP4_NONE)
 		return -EIO;
 
@@ -4167,17 +4166,17 @@ static int decode_exchange_id(struct xdr_stream *xdr,
 
 	/* Throw away Major id */
 	READ_BUF(4);
-	READ32(dummy);
+	dummy = be32_to_cpup(p++);
 	READ_BUF(dummy);
 
 	/* Throw away server_scope */
 	READ_BUF(4);
-	READ32(dummy);
+	dummy = be32_to_cpup(p++);
 	READ_BUF(dummy);
 
 	/* Throw away Implementation id array */
 	READ_BUF(4);
-	READ32(dummy);
+	dummy = be32_to_cpup(p++);
 	READ_BUF(dummy);
 
 	return 0;
@@ -4190,13 +4189,13 @@ static int decode_chan_attrs(struct xdr_stream *xdr,
 	u32 nr_attrs;
 
 	READ_BUF(28);
-	READ32(attrs->headerpadsz);
-	READ32(attrs->max_rqst_sz);
-	READ32(attrs->max_resp_sz);
-	READ32(attrs->max_resp_sz_cached);
-	READ32(attrs->max_ops);
-	READ32(attrs->max_reqs);
-	READ32(nr_attrs);
+	attrs->headerpadsz = be32_to_cpup(p++);
+	attrs->max_rqst_sz = be32_to_cpup(p++);
+	attrs->max_resp_sz = be32_to_cpup(p++);
+	attrs->max_resp_sz_cached = be32_to_cpup(p++);
+	attrs->max_ops = be32_to_cpup(p++);
+	attrs->max_reqs = be32_to_cpup(p++);
+	nr_attrs = be32_to_cpup(p++);
 	if (unlikely(nr_attrs > 1)) {
 		printk(KERN_WARNING "%s: Invalid rdma channel attrs count %u\n",
 			__func__, nr_attrs);
@@ -4226,8 +4225,8 @@ static int decode_create_session(struct xdr_stream *xdr,
 
 	/* seqid, flags */
 	READ_BUF(8);
-	READ32(clp->cl_seqid);
-	READ32(session->flags);
+	clp->cl_seqid = be32_to_cpup(p++);
+	session->flags = be32_to_cpup(p++);
 
 	/* Channel attributes */
 	status = decode_chan_attrs(xdr, &session->fc_attrs);
@@ -4275,23 +4274,23 @@ static int decode_sequence(struct xdr_stream *xdr,
 		goto out_err;
 	}
 	/* seqid */
-	READ32(dummy);
+	dummy = be32_to_cpup(p++);
 	if (dummy != slot->seq_nr) {
 		dprintk("%s Invalid sequence number\n", __func__);
 		goto out_err;
 	}
 	/* slot id */
-	READ32(dummy);
+	dummy = be32_to_cpup(p++);
 	if (dummy != res->sr_slotid) {
 		dprintk("%s Invalid slot id\n", __func__);
 		goto out_err;
 	}
 	/* highest slot id - currently not processed */
-	READ32(dummy);
+	dummy = be32_to_cpup(p++);
 	/* target highest slot id - currently not processed */
-	READ32(dummy);
+	dummy = be32_to_cpup(p++);
 	/* result flags - currently not processed */
-	READ32(dummy);
+	dummy = be32_to_cpup(p++);
 	status = 0;
 out_err:
 	res->sr_status = status;
-- 
cgit v1.2.3-70-g09d2


From 3ceb4dbb993fdab6a6fafc69db36686278871134 Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Fri, 14 Aug 2009 17:19:41 +0300
Subject: nfs: nfs4xdr: get rid of READ64

s/READ64\(\*(.*)\)/p = xdr_decode_hyper(p, \1)/
s/READ64\((.*)\)/p = xdr_decode_hyper(p, &\1)/

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4xdr.c | 54 +++++++++++++++++++++++++-----------------------------
 1 file changed, 25 insertions(+), 29 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 5f6b46f17c5b..238189ceae3f 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -2433,10 +2433,6 @@ static int nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req, uint32_t *p,
  * task to translate them into Linux-specific versions which are more
  * consistent with the style used in NFSv2/v3...
  */
-#define READ64(x)         do {			\
-	(x) = (u64)ntohl(*p++) << 32;		\
-	(x) |= ntohl(*p++);			\
-} while (0)
 #define READTIME(x)       do {			\
 	p++;					\
 	(x.tv_sec) = ntohl(*p++);		\
@@ -2588,7 +2584,7 @@ static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_CHANGE)) {
 		READ_BUF(8);
-		READ64(*change);
+		p = xdr_decode_hyper(p, change);
 		bitmap[0] &= ~FATTR4_WORD0_CHANGE;
 		ret = NFS_ATTR_FATTR_CHANGE;
 	}
@@ -2607,7 +2603,7 @@ static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_SIZE)) {
 		READ_BUF(8);
-		READ64(*size);
+		p = xdr_decode_hyper(p, size);
 		bitmap[0] &= ~FATTR4_WORD0_SIZE;
 		ret = NFS_ATTR_FATTR_SIZE;
 	}
@@ -2658,8 +2654,8 @@ static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_FSID)) {
 		READ_BUF(16);
-		READ64(fsid->major);
-		READ64(fsid->minor);
+		p = xdr_decode_hyper(p, &fsid->major);
+		p = xdr_decode_hyper(p, &fsid->minor);
 		bitmap[0] &= ~FATTR4_WORD0_FSID;
 		ret = NFS_ATTR_FATTR_FSID;
 	}
@@ -2711,7 +2707,7 @@ static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_FILEID)) {
 		READ_BUF(8);
-		READ64(*fileid);
+		p = xdr_decode_hyper(p, fileid);
 		bitmap[0] &= ~FATTR4_WORD0_FILEID;
 		ret = NFS_ATTR_FATTR_FILEID;
 	}
@@ -2729,7 +2725,7 @@ static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitma
 		return -EIO;
 	if (likely(bitmap[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)) {
 		READ_BUF(8);
-		READ64(*fileid);
+		p = xdr_decode_hyper(p, fileid);
 		bitmap[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
 		ret = NFS_ATTR_FATTR_FILEID;
 	}
@@ -2747,7 +2743,7 @@ static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uin
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_FILES_AVAIL)) {
 		READ_BUF(8);
-		READ64(*res);
+		p = xdr_decode_hyper(p, res);
 		bitmap[0] &= ~FATTR4_WORD0_FILES_AVAIL;
 	}
 	dprintk("%s: files avail=%Lu\n", __func__, (unsigned long long)*res);
@@ -2764,7 +2760,7 @@ static int decode_attr_files_free(struct xdr_stream *xdr, uint32_t *bitmap, uint
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_FILES_FREE)) {
 		READ_BUF(8);
-		READ64(*res);
+		p = xdr_decode_hyper(p, res);
 		bitmap[0] &= ~FATTR4_WORD0_FILES_FREE;
 	}
 	dprintk("%s: files free=%Lu\n", __func__, (unsigned long long)*res);
@@ -2781,7 +2777,7 @@ static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uin
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_FILES_TOTAL)) {
 		READ_BUF(8);
-		READ64(*res);
+		p = xdr_decode_hyper(p, res);
 		bitmap[0] &= ~FATTR4_WORD0_FILES_TOTAL;
 	}
 	dprintk("%s: files total=%Lu\n", __func__, (unsigned long long)*res);
@@ -2910,7 +2906,7 @@ static int decode_attr_maxfilesize(struct xdr_stream *xdr, uint32_t *bitmap, uin
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_MAXFILESIZE)) {
 		READ_BUF(8);
-		READ64(*res);
+		p = xdr_decode_hyper(p, res);
 		bitmap[0] &= ~FATTR4_WORD0_MAXFILESIZE;
 	}
 	dprintk("%s: maxfilesize=%Lu\n", __func__, (unsigned long long)*res);
@@ -2962,7 +2958,7 @@ static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_
 	if (likely(bitmap[0] & FATTR4_WORD0_MAXREAD)) {
 		uint64_t maxread;
 		READ_BUF(8);
-		READ64(maxread);
+		p = xdr_decode_hyper(p, &maxread);
 		if (maxread > 0x7FFFFFFF)
 			maxread = 0x7FFFFFFF;
 		*res = (uint32_t)maxread;
@@ -2983,7 +2979,7 @@ static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32
 	if (likely(bitmap[0] & FATTR4_WORD0_MAXWRITE)) {
 		uint64_t maxwrite;
 		READ_BUF(8);
-		READ64(maxwrite);
+		p = xdr_decode_hyper(p, &maxwrite);
 		if (maxwrite > 0x7FFFFFFF)
 			maxwrite = 0x7FFFFFFF;
 		*res = (uint32_t)maxwrite;
@@ -3122,7 +3118,7 @@ static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uin
 		return -EIO;
 	if (likely(bitmap[1] & FATTR4_WORD1_SPACE_AVAIL)) {
 		READ_BUF(8);
-		READ64(*res);
+		p = xdr_decode_hyper(p, res);
 		bitmap[1] &= ~FATTR4_WORD1_SPACE_AVAIL;
 	}
 	dprintk("%s: space avail=%Lu\n", __func__, (unsigned long long)*res);
@@ -3139,7 +3135,7 @@ static int decode_attr_space_free(struct xdr_stream *xdr, uint32_t *bitmap, uint
 		return -EIO;
 	if (likely(bitmap[1] & FATTR4_WORD1_SPACE_FREE)) {
 		READ_BUF(8);
-		READ64(*res);
+		p = xdr_decode_hyper(p, res);
 		bitmap[1] &= ~FATTR4_WORD1_SPACE_FREE;
 	}
 	dprintk("%s: space free=%Lu\n", __func__, (unsigned long long)*res);
@@ -3156,7 +3152,7 @@ static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uin
 		return -EIO;
 	if (likely(bitmap[1] & FATTR4_WORD1_SPACE_TOTAL)) {
 		READ_BUF(8);
-		READ64(*res);
+		p = xdr_decode_hyper(p, res);
 		bitmap[1] &= ~FATTR4_WORD1_SPACE_TOTAL;
 	}
 	dprintk("%s: space total=%Lu\n", __func__, (unsigned long long)*res);
@@ -3173,7 +3169,7 @@ static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint
 		return -EIO;
 	if (likely(bitmap[1] & FATTR4_WORD1_SPACE_USED)) {
 		READ_BUF(8);
-		READ64(*used);
+		p = xdr_decode_hyper(p, used);
 		bitmap[1] &= ~FATTR4_WORD1_SPACE_USED;
 		ret = NFS_ATTR_FATTR_SPACE_USED;
 	}
@@ -3189,7 +3185,7 @@ static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time)
 	uint32_t nsec;
 
 	READ_BUF(12);
-	READ64(sec);
+	p = xdr_decode_hyper(p, &sec);
 	nsec = be32_to_cpup(p++);
 	time->tv_sec = (time_t)sec;
 	time->tv_nsec = (long)nsec;
@@ -3273,8 +3269,8 @@ static int decode_change_info(struct xdr_stream *xdr, struct nfs4_change_info *c
 
 	READ_BUF(20);
 	cinfo->atomic = be32_to_cpup(p++);
-	READ64(cinfo->before);
-	READ64(cinfo->after);
+	p = xdr_decode_hyper(p, &cinfo->before);
+	p = xdr_decode_hyper(p, &cinfo->after);
 	return 0;
 }
 
@@ -3619,8 +3615,8 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl)
 	uint32_t namelen, type;
 
 	READ_BUF(32);
-	READ64(offset);
-	READ64(length);
+	p = xdr_decode_hyper(p, &offset);
+	p = xdr_decode_hyper(p, &length);
 	type = be32_to_cpup(p++);
 	if (fl != NULL) {
 		fl->fl_start = (loff_t)offset;
@@ -3632,7 +3628,7 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl)
 			fl->fl_type = F_RDLCK;
 		fl->fl_pid = 0;
 	}
-	READ64(clientid);
+	p = xdr_decode_hyper(p, &clientid);
 	namelen = be32_to_cpup(p++);
 	READ_BUF(namelen);
 	return -NFS4ERR_DENIED;
@@ -3697,7 +3693,7 @@ static int decode_space_limit(struct xdr_stream *xdr, u64 *maxsize)
 	limit_type = be32_to_cpup(p++);
 	switch (limit_type) {
 	case 1:
-		READ64(*maxsize);
+		p = xdr_decode_hyper(p, maxsize);
 		break;
 	case 2:
 		nblocks = be32_to_cpup(p++);
@@ -4090,7 +4086,7 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp)
 	nfserr = be32_to_cpup(p++);
 	if (nfserr == NFS_OK) {
 		READ_BUF(8 + NFS4_VERIFIER_SIZE);
-		READ64(clp->cl_clientid);
+		p = xdr_decode_hyper(p, &clp->cl_clientid);
 		COPYMEM(clp->cl_confirm.data, NFS4_VERIFIER_SIZE);
 	} else if (nfserr == NFSERR_CLID_INUSE) {
 		uint32_t len;
@@ -4151,7 +4147,7 @@ static int decode_exchange_id(struct xdr_stream *xdr,
 		return status;
 
 	READ_BUF(8);
-	READ64(clp->cl_ex_clid);
+	p = xdr_decode_hyper(p, &clp->cl_ex_clid);
 	READ_BUF(12);
 	clp->cl_seqid = be32_to_cpup(p++);
 	clp->cl_exchange_flags = be32_to_cpup(p++);
-- 
cgit v1.2.3-70-g09d2


From c816fd3406462702dee2e3859e70132c3aab7c10 Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Fri, 14 Aug 2009 17:19:44 +0300
Subject: nfs: nfs4xdr: get rid of READTIME

It has no users.

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4xdr.c | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 238189ceae3f..0c26bb2d43d9 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -2433,11 +2433,6 @@ static int nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req, uint32_t *p,
  * task to translate them into Linux-specific versions which are more
  * consistent with the style used in NFSv2/v3...
  */
-#define READTIME(x)       do {			\
-	p++;					\
-	(x.tv_sec) = ntohl(*p++);		\
-	(x.tv_nsec) = ntohl(*p++);		\
-} while (0)
 #define COPYMEM(x,nbytes) do {			\
 	memcpy((x), p, nbytes);			\
 	p += XDR_QUADLEN(nbytes);		\
-- 
cgit v1.2.3-70-g09d2


From 686841b3cc3a71918b45ed148be7a01a4f10e3f8 Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Fri, 14 Aug 2009 17:19:48 +0300
Subject: nfs: nfs4xdr: introduce print_overflow_msg

Part fo the nfs4xdr cleanup.  READ_BUF will go away.

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4xdr.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 0c26bb2d43d9..8255ec7079d4 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -2441,14 +2441,18 @@ static int nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req, uint32_t *p,
 #define READ_BUF(nbytes)  do { \
 	p = xdr_inline_decode(xdr, nbytes); \
 	if (unlikely(!p)) { \
-		dprintk("nfs: %s: prematurely hit end of receive" \
-				" buffer\n", __func__); \
-		dprintk("nfs: %s: xdr->p=%p, bytes=%u, xdr->end=%p\n", \
-				__func__, xdr->p, nbytes, xdr->end); \
+		print_overflow_msg(__func__, xdr); \
 		return -EIO; \
 	} \
 } while (0)
 
+static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
+{
+	dprintk("nfs: %s: prematurely hit end of receive buffer. "
+		"Remaining buffer length is %tu words.\n",
+		func, xdr->end - xdr->p);
+}
+
 static int decode_opaque_inline(struct xdr_stream *xdr, unsigned int *len, char **string)
 {
 	__be32 *p;
-- 
cgit v1.2.3-70-g09d2


From 07d30434cfe2f1a1553143c6b20f1fe68d2ef80a Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Fri, 14 Aug 2009 17:19:52 +0300
Subject: nfs: nfs4xdr: introduce decode_opaque_fixed and decode_stateid
 helpers

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4xdr.c | 71 +++++++++++++++++++++++++++++++-------------------------
 1 file changed, 40 insertions(+), 31 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 8255ec7079d4..86e6983ef4fa 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -3290,19 +3290,34 @@ static int decode_access(struct xdr_stream *xdr, struct nfs4_accessres *access)
 	return 0;
 }
 
-static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res)
+static int decode_opaque_fixed(struct xdr_stream *xdr, void *buf, size_t len)
 {
 	__be32 *p;
+
+	p = xdr_inline_decode(xdr, len);
+	if (likely(p)) {
+		memcpy(buf, p, len);
+		return 0;
+	}
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
+{
+	return decode_opaque_fixed(xdr, stateid->data, NFS4_STATEID_SIZE);
+}
+
+static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res)
+{
 	int status;
 
 	status = decode_op_hdr(xdr, OP_CLOSE);
 	if (status != -EIO)
 		nfs_increment_open_seqid(status, res->seqid);
-	if (status)
-		return status;
-	READ_BUF(NFS4_STATEID_SIZE);
-	COPYMEM(res->stateid.data, NFS4_STATEID_SIZE);
-	return 0;
+	if (!status)
+		status = decode_stateid(xdr, &res->stateid);
+	return status;
 }
 
 static int decode_commit(struct xdr_stream *xdr, struct nfs_writeres *res)
@@ -3635,15 +3650,15 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl)
 
 static int decode_lock(struct xdr_stream *xdr, struct nfs_lock_res *res)
 {
-	__be32 *p;
 	int status;
 
 	status = decode_op_hdr(xdr, OP_LOCK);
 	if (status == -EIO)
 		goto out;
 	if (status == 0) {
-		READ_BUF(NFS4_STATEID_SIZE);
-		COPYMEM(res->stateid.data, NFS4_STATEID_SIZE);
+		status = decode_stateid(xdr, &res->stateid);
+		if (unlikely(status))
+			goto out;
 	} else if (status == -NFS4ERR_DENIED)
 		status = decode_lock_denied(xdr, NULL);
 	if (res->open_seqid != NULL)
@@ -3664,16 +3679,13 @@ static int decode_lockt(struct xdr_stream *xdr, struct nfs_lockt_res *res)
 
 static int decode_locku(struct xdr_stream *xdr, struct nfs_locku_res *res)
 {
-	__be32 *p;
 	int status;
 
 	status = decode_op_hdr(xdr, OP_LOCKU);
 	if (status != -EIO)
 		nfs_increment_lock_seqid(status, res->seqid);
-	if (status == 0) {
-		READ_BUF(NFS4_STATEID_SIZE);
-		COPYMEM(res->stateid.data, NFS4_STATEID_SIZE);
-	}
+	if (status == 0)
+		status = decode_stateid(xdr, &res->stateid);
 	return status;
 }
 
@@ -3706,6 +3718,7 @@ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
 {
 	__be32 *p;
 	uint32_t delegation_type;
+	int status;
 
 	READ_BUF(4);
 	delegation_type = be32_to_cpup(p++);
@@ -3713,8 +3726,10 @@ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
 		res->delegation_type = 0;
 		return 0;
 	}
-	READ_BUF(NFS4_STATEID_SIZE+4);
-	COPYMEM(res->delegation.data, NFS4_STATEID_SIZE);
+	status = decode_stateid(xdr, &res->delegation);
+	if (unlikely(status))
+		return status;
+	READ_BUF(4);
 	res->do_recall = be32_to_cpup(p++);
 
 	switch (delegation_type) {
@@ -3738,10 +3753,10 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
 	status = decode_op_hdr(xdr, OP_OPEN);
 	if (status != -EIO)
 		nfs_increment_open_seqid(status, res->seqid);
-	if (status)
+	if (!status)
+		status = decode_stateid(xdr, &res->stateid);
+	if (unlikely(status))
 		return status;
-	READ_BUF(NFS4_STATEID_SIZE);
-	COPYMEM(res->stateid.data, NFS4_STATEID_SIZE);
 
 	decode_change_info(xdr, &res->cinfo);
 
@@ -3766,32 +3781,26 @@ xdr_error:
 
 static int decode_open_confirm(struct xdr_stream *xdr, struct nfs_open_confirmres *res)
 {
-	__be32 *p;
 	int status;
 
 	status = decode_op_hdr(xdr, OP_OPEN_CONFIRM);
 	if (status != -EIO)
 		nfs_increment_open_seqid(status, res->seqid);
-	if (status)
-		return status;
-	READ_BUF(NFS4_STATEID_SIZE);
-	COPYMEM(res->stateid.data, NFS4_STATEID_SIZE);
-	return 0;
+	if (!status)
+		status = decode_stateid(xdr, &res->stateid);
+	return status;
 }
 
 static int decode_open_downgrade(struct xdr_stream *xdr, struct nfs_closeres *res)
 {
-	__be32 *p;
 	int status;
 
 	status = decode_op_hdr(xdr, OP_OPEN_DOWNGRADE);
 	if (status != -EIO)
 		nfs_increment_open_seqid(status, res->seqid);
-	if (status)
-		return status;
-	READ_BUF(NFS4_STATEID_SIZE);
-	COPYMEM(res->stateid.data, NFS4_STATEID_SIZE);
-	return 0;
+	if (!status)
+		status = decode_stateid(xdr, &res->stateid);
+	return status;
 }
 
 static int decode_putfh(struct xdr_stream *xdr)
-- 
cgit v1.2.3-70-g09d2


From db942bbd09563e169cc5d9004c32c1de33220fd1 Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Fri, 14 Aug 2009 17:19:56 +0300
Subject: nfs: nfs4xdr: introduce decode_verifier helper

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
[Trond: Fixed up an 'uninitialised variable' issue in decode_readdir]
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4xdr.c | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 86e6983ef4fa..404f2e6373f2 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -3320,17 +3320,19 @@ static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res)
 	return status;
 }
 
+static int decode_verifier(struct xdr_stream *xdr, void *verifier)
+{
+	return decode_opaque_fixed(xdr, verifier, 8);
+}
+
 static int decode_commit(struct xdr_stream *xdr, struct nfs_writeres *res)
 {
-	__be32 *p;
 	int status;
 
 	status = decode_op_hdr(xdr, OP_COMMIT);
-	if (status)
-		return status;
-	READ_BUF(8);
-	COPYMEM(res->verf->verifier, 8);
-	return 0;
+	if (!status)
+		status = decode_verifier(xdr, res->verf->verifier);
+	return status;
 }
 
 static int decode_create(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
@@ -3852,17 +3854,17 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n
 	int		status;
 
 	status = decode_op_hdr(xdr, OP_READDIR);
-	if (status)
+	if (!status)
+		status = decode_verifier(xdr, readdir->verifier.data);
+	if (unlikely(status))
 		return status;
-	READ_BUF(8);
-	COPYMEM(readdir->verifier.data, 8);
 	dprintk("%s: verifier = %08x:%08x\n",
 			__func__,
 			((u32 *)readdir->verifier.data)[0],
 			((u32 *)readdir->verifier.data)[1]);
 
 
-	hdrlen = (char *) p - (char *) iov->iov_base;
+	hdrlen = (char *) xdr->p - (char *) iov->iov_base;
 	recvd = rcvbuf->len - hdrlen;
 	if (pglen > recvd)
 		pglen = recvd;
-- 
cgit v1.2.3-70-g09d2


From e78291e4e07520348b0634095cf19ed3bc868965 Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Fri, 14 Aug 2009 17:20:00 +0300
Subject: nfs: nfs4xdr: introduce decode_sessionid helper

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4xdr.c | 25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 404f2e6373f2..00630f42a2b4 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -4212,6 +4212,11 @@ static int decode_chan_attrs(struct xdr_stream *xdr,
 	return 0;
 }
 
+static int decode_sessionid(struct xdr_stream *xdr, struct nfs4_sessionid *sid)
+{
+	return decode_opaque_fixed(xdr, sid->data, NFS4_MAX_SESSIONID_LEN);
+}
+
 static int decode_create_session(struct xdr_stream *xdr,
 				 struct nfs41_create_session_res *res)
 {
@@ -4221,14 +4226,11 @@ static int decode_create_session(struct xdr_stream *xdr,
 	struct nfs4_session *session = clp->cl_session;
 
 	status = decode_op_hdr(xdr, OP_CREATE_SESSION);
-
-	if (status)
+	if (!status)
+		status = decode_sessionid(xdr, &session->sess_id);
+	if (unlikely(status))
 		return status;
 
-	/* sessionid */
-	READ_BUF(NFS4_MAX_SESSIONID_LEN);
-	COPYMEM(&session->sess_id, NFS4_MAX_SESSIONID_LEN);
-
 	/* seqid, flags */
 	READ_BUF(8);
 	clp->cl_seqid = be32_to_cpup(p++);
@@ -4262,7 +4264,9 @@ static int decode_sequence(struct xdr_stream *xdr,
 		return 0;
 
 	status = decode_op_hdr(xdr, OP_SEQUENCE);
-	if (status)
+	if (!status)
+		status = decode_sessionid(xdr, &id);
+	if (unlikely(status))
 		goto out_err;
 
 	/*
@@ -4271,15 +4275,16 @@ static int decode_sequence(struct xdr_stream *xdr,
 	 */
 	status = -ESERVERFAULT;
 
-	slot = &res->sr_session->fc_slot_table.slots[res->sr_slotid];
-	READ_BUF(NFS4_MAX_SESSIONID_LEN + 20);
-	COPYMEM(id.data, NFS4_MAX_SESSIONID_LEN);
 	if (memcmp(id.data, res->sr_session->sess_id.data,
 		   NFS4_MAX_SESSIONID_LEN)) {
 		dprintk("%s Invalid session id\n", __func__);
 		goto out_err;
 	}
+
+	READ_BUF(20);
+
 	/* seqid */
+	slot = &res->sr_session->fc_slot_table.slots[res->sr_slotid];
 	dummy = be32_to_cpup(p++);
 	if (dummy != slot->seq_nr) {
 		dprintk("%s Invalid sequence number\n", __func__);
-- 
cgit v1.2.3-70-g09d2


From 99398d0655ada44ae464a1c93d13cd438a306ecd Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Fri, 14 Aug 2009 17:20:05 +0300
Subject: nfs: nfs4xdr: get rid of COPYMEM

Just directly call memcpy.

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4xdr.c | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 00630f42a2b4..f69aafc23d44 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -2433,11 +2433,6 @@ static int nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req, uint32_t *p,
  * task to translate them into Linux-specific versions which are more
  * consistent with the style used in NFSv2/v3...
  */
-#define COPYMEM(x,nbytes) do {			\
-	memcpy((x), p, nbytes);			\
-	p += XDR_QUADLEN(nbytes);		\
-} while (0)
-
 #define READ_BUF(nbytes)  do { \
 	p = xdr_inline_decode(xdr, nbytes); \
 	if (unlikely(!p)) { \
@@ -3607,7 +3602,7 @@ static int decode_getfh(struct xdr_stream *xdr, struct nfs_fh *fh)
 		return -EIO;
 	fh->size = len;
 	READ_BUF(len);
-	COPYMEM(fh->data, len);
+	memcpy(fh->data, p, len);
 	return 0;
 }
 
@@ -4097,7 +4092,7 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp)
 	if (nfserr == NFS_OK) {
 		READ_BUF(8 + NFS4_VERIFIER_SIZE);
 		p = xdr_decode_hyper(p, &clp->cl_clientid);
-		COPYMEM(clp->cl_confirm.data, NFS4_VERIFIER_SIZE);
+		memcpy(clp->cl_confirm.data, p, NFS4_VERIFIER_SIZE);
 	} else if (nfserr == NFSERR_CLID_INUSE) {
 		uint32_t len;
 
@@ -4134,7 +4129,7 @@ static int decode_write(struct xdr_stream *xdr, struct nfs_writeres *res)
 	READ_BUF(16);
 	res->count = be32_to_cpup(p++);
 	res->verf->committed = be32_to_cpup(p++);
-	COPYMEM(res->verf->verifier, 8);
+	memcpy(res->verf->verifier, p, 8);
 	return 0;
 }
 
-- 
cgit v1.2.3-70-g09d2


From 2460ba57c49c36dfef0b62c929461de09240fe17 Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Fri, 14 Aug 2009 17:20:10 +0300
Subject: nfs: nfs4xdr: simplify decode_exchange_id by reusing
 decode_opaque_inline

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4xdr.c | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index f69aafc23d44..3f49da0960e3 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -4144,6 +4144,7 @@ static int decode_exchange_id(struct xdr_stream *xdr,
 {
 	__be32 *p;
 	uint32_t dummy;
+	char *dummy_str;
 	int status;
 	struct nfs_client *clp = res->client;
 
@@ -4166,19 +4167,19 @@ static int decode_exchange_id(struct xdr_stream *xdr,
 	READ_BUF(8);
 
 	/* Throw away Major id */
-	READ_BUF(4);
-	dummy = be32_to_cpup(p++);
-	READ_BUF(dummy);
+	status = decode_opaque_inline(xdr, &dummy, &dummy_str);
+	if (unlikely(status))
+		return status;
 
 	/* Throw away server_scope */
-	READ_BUF(4);
-	dummy = be32_to_cpup(p++);
-	READ_BUF(dummy);
+	status = decode_opaque_inline(xdr, &dummy, &dummy_str);
+	if (unlikely(status))
+		return status;
 
 	/* Throw away Implementation id array */
-	READ_BUF(4);
-	dummy = be32_to_cpup(p++);
-	READ_BUF(dummy);
+	status = decode_opaque_inline(xdr, &dummy, &dummy_str);
+	if (unlikely(status))
+		return status;
 
 	return 0;
 }
-- 
cgit v1.2.3-70-g09d2


From c0eae66ece40bdb8cd88c5106834b71a1c9f421c Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Fri, 14 Aug 2009 17:20:14 +0300
Subject: nfs: nfs4xdr: get rid of READ_BUF

Use xdr_inline_decode instead.
Open code debug printout and error return.

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4xdr.c | 475 ++++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 379 insertions(+), 96 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 3f49da0960e3..b0d690c3a24c 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -2423,24 +2423,6 @@ static int nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req, uint32_t *p,
 }
 #endif /* CONFIG_NFS_V4_1 */
 
-/*
- * START OF "GENERIC" DECODE ROUTINES.
- *   These may look a little ugly since they are imported from a "generic"
- * set of XDR encode/decode routines which are intended to be shared by
- * all of our NFSv4 implementations (OpenBSD, MacOS X...).
- *
- * If the pain of reading these is too great, it should be a straightforward
- * task to translate them into Linux-specific versions which are more
- * consistent with the style used in NFSv2/v3...
- */
-#define READ_BUF(nbytes)  do { \
-	p = xdr_inline_decode(xdr, nbytes); \
-	if (unlikely(!p)) { \
-		print_overflow_msg(__func__, xdr); \
-		return -EIO; \
-	} \
-} while (0)
-
 static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
 {
 	dprintk("nfs: %s: prematurely hit end of receive buffer. "
@@ -2452,28 +2434,42 @@ static int decode_opaque_inline(struct xdr_stream *xdr, unsigned int *len, char
 {
 	__be32 *p;
 
-	READ_BUF(4);
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
 	*len = be32_to_cpup(p++);
-	READ_BUF(*len);
+	p = xdr_inline_decode(xdr, *len);
+	if (unlikely(!p))
+		goto out_overflow;
 	*string = (char *)p;
 	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr)
 {
 	__be32 *p;
 
-	READ_BUF(8);
+	p = xdr_inline_decode(xdr, 8);
+	if (unlikely(!p))
+		goto out_overflow;
 	hdr->status = be32_to_cpup(p++);
 	hdr->taglen = be32_to_cpup(p++);
 
-	READ_BUF(hdr->taglen + 4);
+	p = xdr_inline_decode(xdr, hdr->taglen + 4);
+	if (unlikely(!p))
+		goto out_overflow;
 	hdr->tag = (char *)p;
 	p += XDR_QUADLEN(hdr->taglen);
 	hdr->nops = be32_to_cpup(p++);
 	if (unlikely(hdr->nops < 1))
 		return nfs4_stat_to_errno(hdr->status);
 	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
@@ -2482,7 +2478,9 @@ static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
 	uint32_t opnum;
 	int32_t nfserr;
 
-	READ_BUF(8);
+	p = xdr_inline_decode(xdr, 8);
+	if (unlikely(!p))
+		goto out_overflow;
 	opnum = be32_to_cpup(p++);
 	if (opnum != expected) {
 		dprintk("nfs: Server returned operation"
@@ -2494,6 +2492,9 @@ static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
 	if (nfserr != NFS_OK)
 		return nfs4_stat_to_errno(nfserr);
 	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 /* Dummy routine */
@@ -2503,8 +2504,11 @@ static int decode_ace(struct xdr_stream *xdr, void *ace, struct nfs_client *clp)
 	unsigned int strlen;
 	char *str;
 
-	READ_BUF(12);
-	return decode_opaque_inline(xdr, &strlen, &str);
+	p = xdr_inline_decode(xdr, 12);
+	if (likely(p))
+		return decode_opaque_inline(xdr, &strlen, &str);
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap)
@@ -2512,27 +2516,39 @@ static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap)
 	uint32_t bmlen;
 	__be32 *p;
 
-	READ_BUF(4);
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
 	bmlen = be32_to_cpup(p++);
 
 	bitmap[0] = bitmap[1] = 0;
-	READ_BUF((bmlen << 2));
+	p = xdr_inline_decode(xdr, (bmlen << 2));
+	if (unlikely(!p))
+		goto out_overflow;
 	if (bmlen > 0) {
 		bitmap[0] = be32_to_cpup(p++);
 		if (bmlen > 1)
 			bitmap[1] = be32_to_cpup(p++);
 	}
 	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static inline int decode_attr_length(struct xdr_stream *xdr, uint32_t *attrlen, __be32 **savep)
 {
 	__be32 *p;
 
-	READ_BUF(4);
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
 	*attrlen = be32_to_cpup(p++);
 	*savep = xdr->p;
 	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *bitmask)
@@ -2555,7 +2571,9 @@ static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *
 	if (unlikely(bitmap[0] & (FATTR4_WORD0_TYPE - 1U)))
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_TYPE)) {
-		READ_BUF(4);
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
 		*type = be32_to_cpup(p++);
 		if (*type < NF4REG || *type > NF4NAMEDATTR) {
 			dprintk("%s: bad type %d\n", __func__, *type);
@@ -2566,6 +2584,9 @@ static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *
 	}
 	dprintk("%s: type=0%o\n", __func__, nfs_type2fmt[*type]);
 	return ret;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *change)
@@ -2577,7 +2598,9 @@ static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t
 	if (unlikely(bitmap[0] & (FATTR4_WORD0_CHANGE - 1U)))
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_CHANGE)) {
-		READ_BUF(8);
+		p = xdr_inline_decode(xdr, 8);
+		if (unlikely(!p))
+			goto out_overflow;
 		p = xdr_decode_hyper(p, change);
 		bitmap[0] &= ~FATTR4_WORD0_CHANGE;
 		ret = NFS_ATTR_FATTR_CHANGE;
@@ -2585,6 +2608,9 @@ static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t
 	dprintk("%s: change attribute=%Lu\n", __func__,
 			(unsigned long long)*change);
 	return ret;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *size)
@@ -2596,13 +2622,18 @@ static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *
 	if (unlikely(bitmap[0] & (FATTR4_WORD0_SIZE - 1U)))
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_SIZE)) {
-		READ_BUF(8);
+		p = xdr_inline_decode(xdr, 8);
+		if (unlikely(!p))
+			goto out_overflow;
 		p = xdr_decode_hyper(p, size);
 		bitmap[0] &= ~FATTR4_WORD0_SIZE;
 		ret = NFS_ATTR_FATTR_SIZE;
 	}
 	dprintk("%s: file size=%Lu\n", __func__, (unsigned long long)*size);
 	return ret;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -2613,12 +2644,17 @@ static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, ui
 	if (unlikely(bitmap[0] & (FATTR4_WORD0_LINK_SUPPORT - 1U)))
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_LINK_SUPPORT)) {
-		READ_BUF(4);
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
 		*res = be32_to_cpup(p++);
 		bitmap[0] &= ~FATTR4_WORD0_LINK_SUPPORT;
 	}
 	dprintk("%s: link support=%s\n", __func__, *res == 0 ? "false" : "true");
 	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -2629,12 +2665,17 @@ static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap,
 	if (unlikely(bitmap[0] & (FATTR4_WORD0_SYMLINK_SUPPORT - 1U)))
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_SYMLINK_SUPPORT)) {
-		READ_BUF(4);
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
 		*res = be32_to_cpup(p++);
 		bitmap[0] &= ~FATTR4_WORD0_SYMLINK_SUPPORT;
 	}
 	dprintk("%s: symlink support=%s\n", __func__, *res == 0 ? "false" : "true");
 	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fsid *fsid)
@@ -2647,7 +2688,9 @@ static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs
 	if (unlikely(bitmap[0] & (FATTR4_WORD0_FSID - 1U)))
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_FSID)) {
-		READ_BUF(16);
+		p = xdr_inline_decode(xdr, 16);
+		if (unlikely(!p))
+			goto out_overflow;
 		p = xdr_decode_hyper(p, &fsid->major);
 		p = xdr_decode_hyper(p, &fsid->minor);
 		bitmap[0] &= ~FATTR4_WORD0_FSID;
@@ -2657,6 +2700,9 @@ static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs
 			(unsigned long long)fsid->major,
 			(unsigned long long)fsid->minor);
 	return ret;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -2667,12 +2713,17 @@ static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint
 	if (unlikely(bitmap[0] & (FATTR4_WORD0_LEASE_TIME - 1U)))
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_LEASE_TIME)) {
-		READ_BUF(4);
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
 		*res = be32_to_cpup(p++);
 		bitmap[0] &= ~FATTR4_WORD0_LEASE_TIME;
 	}
 	dprintk("%s: file size=%u\n", __func__, (unsigned int)*res);
 	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -2683,12 +2734,17 @@ static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint
 	if (unlikely(bitmap[0] & (FATTR4_WORD0_ACLSUPPORT - 1U)))
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_ACLSUPPORT)) {
-		READ_BUF(4);
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
 		*res = be32_to_cpup(p++);
 		bitmap[0] &= ~FATTR4_WORD0_ACLSUPPORT;
 	}
 	dprintk("%s: ACLs supported=%u\n", __func__, (unsigned int)*res);
 	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid)
@@ -2700,13 +2756,18 @@ static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t
 	if (unlikely(bitmap[0] & (FATTR4_WORD0_FILEID - 1U)))
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_FILEID)) {
-		READ_BUF(8);
+		p = xdr_inline_decode(xdr, 8);
+		if (unlikely(!p))
+			goto out_overflow;
 		p = xdr_decode_hyper(p, fileid);
 		bitmap[0] &= ~FATTR4_WORD0_FILEID;
 		ret = NFS_ATTR_FATTR_FILEID;
 	}
 	dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid);
 	return ret;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid)
@@ -2718,13 +2779,18 @@ static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitma
 	if (unlikely(bitmap[1] & (FATTR4_WORD1_MOUNTED_ON_FILEID - 1U)))
 		return -EIO;
 	if (likely(bitmap[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)) {
-		READ_BUF(8);
+		p = xdr_inline_decode(xdr, 8);
+		if (unlikely(!p))
+			goto out_overflow;
 		p = xdr_decode_hyper(p, fileid);
 		bitmap[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
 		ret = NFS_ATTR_FATTR_FILEID;
 	}
 	dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid);
 	return ret;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -2736,12 +2802,17 @@ static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uin
 	if (unlikely(bitmap[0] & (FATTR4_WORD0_FILES_AVAIL - 1U)))
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_FILES_AVAIL)) {
-		READ_BUF(8);
+		p = xdr_inline_decode(xdr, 8);
+		if (unlikely(!p))
+			goto out_overflow;
 		p = xdr_decode_hyper(p, res);
 		bitmap[0] &= ~FATTR4_WORD0_FILES_AVAIL;
 	}
 	dprintk("%s: files avail=%Lu\n", __func__, (unsigned long long)*res);
 	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_files_free(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -2753,12 +2824,17 @@ static int decode_attr_files_free(struct xdr_stream *xdr, uint32_t *bitmap, uint
 	if (unlikely(bitmap[0] & (FATTR4_WORD0_FILES_FREE - 1U)))
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_FILES_FREE)) {
-		READ_BUF(8);
+		p = xdr_inline_decode(xdr, 8);
+		if (unlikely(!p))
+			goto out_overflow;
 		p = xdr_decode_hyper(p, res);
 		bitmap[0] &= ~FATTR4_WORD0_FILES_FREE;
 	}
 	dprintk("%s: files free=%Lu\n", __func__, (unsigned long long)*res);
 	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -2770,12 +2846,17 @@ static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uin
 	if (unlikely(bitmap[0] & (FATTR4_WORD0_FILES_TOTAL - 1U)))
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_FILES_TOTAL)) {
-		READ_BUF(8);
+		p = xdr_inline_decode(xdr, 8);
+		if (unlikely(!p))
+			goto out_overflow;
 		p = xdr_decode_hyper(p, res);
 		bitmap[0] &= ~FATTR4_WORD0_FILES_TOTAL;
 	}
 	dprintk("%s: files total=%Lu\n", __func__, (unsigned long long)*res);
 	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path)
@@ -2784,7 +2865,9 @@ static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path)
 	__be32 *p;
 	int status = 0;
 
-	READ_BUF(4);
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
 	n = be32_to_cpup(p++);
 	if (n == 0)
 		goto root_path;
@@ -2819,6 +2902,9 @@ out_eio:
 	dprintk(" status %d", status);
 	status = -EIO;
 	goto out;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs4_fs_locations *res)
@@ -2836,7 +2922,9 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
 	status = decode_pathname(xdr, &res->fs_path);
 	if (unlikely(status != 0))
 		goto out;
-	READ_BUF(4);
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
 	n = be32_to_cpup(p++);
 	if (n <= 0)
 		goto out_eio;
@@ -2845,7 +2933,9 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
 		u32 m;
 		struct nfs4_fs_location *loc = &res->locations[res->nlocations];
 
-		READ_BUF(4);
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
 		m = be32_to_cpup(p++);
 
 		loc->nservers = 0;
@@ -2885,6 +2975,8 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
 out:
 	dprintk("%s: fs_locations done, error = %d\n", __func__, status);
 	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
 out_eio:
 	status = -EIO;
 	goto out;
@@ -2899,12 +2991,17 @@ static int decode_attr_maxfilesize(struct xdr_stream *xdr, uint32_t *bitmap, uin
 	if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXFILESIZE - 1U)))
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_MAXFILESIZE)) {
-		READ_BUF(8);
+		p = xdr_inline_decode(xdr, 8);
+		if (unlikely(!p))
+			goto out_overflow;
 		p = xdr_decode_hyper(p, res);
 		bitmap[0] &= ~FATTR4_WORD0_MAXFILESIZE;
 	}
 	dprintk("%s: maxfilesize=%Lu\n", __func__, (unsigned long long)*res);
 	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_maxlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *maxlink)
@@ -2916,12 +3013,17 @@ static int decode_attr_maxlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_
 	if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXLINK - 1U)))
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_MAXLINK)) {
-		READ_BUF(4);
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
 		*maxlink = be32_to_cpup(p++);
 		bitmap[0] &= ~FATTR4_WORD0_MAXLINK;
 	}
 	dprintk("%s: maxlink=%u\n", __func__, *maxlink);
 	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_maxname(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *maxname)
@@ -2933,12 +3035,17 @@ static int decode_attr_maxname(struct xdr_stream *xdr, uint32_t *bitmap, uint32_
 	if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXNAME - 1U)))
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_MAXNAME)) {
-		READ_BUF(4);
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
 		*maxname = be32_to_cpup(p++);
 		bitmap[0] &= ~FATTR4_WORD0_MAXNAME;
 	}
 	dprintk("%s: maxname=%u\n", __func__, *maxname);
 	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -2951,7 +3058,9 @@ static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_MAXREAD)) {
 		uint64_t maxread;
-		READ_BUF(8);
+		p = xdr_inline_decode(xdr, 8);
+		if (unlikely(!p))
+			goto out_overflow;
 		p = xdr_decode_hyper(p, &maxread);
 		if (maxread > 0x7FFFFFFF)
 			maxread = 0x7FFFFFFF;
@@ -2960,6 +3069,9 @@ static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_
 	}
 	dprintk("%s: maxread=%lu\n", __func__, (unsigned long)*res);
 	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -2972,7 +3084,9 @@ static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_MAXWRITE)) {
 		uint64_t maxwrite;
-		READ_BUF(8);
+		p = xdr_inline_decode(xdr, 8);
+		if (unlikely(!p))
+			goto out_overflow;
 		p = xdr_decode_hyper(p, &maxwrite);
 		if (maxwrite > 0x7FFFFFFF)
 			maxwrite = 0x7FFFFFFF;
@@ -2981,6 +3095,9 @@ static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32
 	}
 	dprintk("%s: maxwrite=%lu\n", __func__, (unsigned long)*res);
 	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *mode)
@@ -2993,7 +3110,9 @@ static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *m
 	if (unlikely(bitmap[1] & (FATTR4_WORD1_MODE - 1U)))
 		return -EIO;
 	if (likely(bitmap[1] & FATTR4_WORD1_MODE)) {
-		READ_BUF(4);
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
 		tmp = be32_to_cpup(p++);
 		*mode = tmp & ~S_IFMT;
 		bitmap[1] &= ~FATTR4_WORD1_MODE;
@@ -3001,6 +3120,9 @@ static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *m
 	}
 	dprintk("%s: file mode=0%o\n", __func__, (unsigned int)*mode);
 	return ret;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *nlink)
@@ -3012,13 +3134,18 @@ static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t
 	if (unlikely(bitmap[1] & (FATTR4_WORD1_NUMLINKS - 1U)))
 		return -EIO;
 	if (likely(bitmap[1] & FATTR4_WORD1_NUMLINKS)) {
-		READ_BUF(4);
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
 		*nlink = be32_to_cpup(p++);
 		bitmap[1] &= ~FATTR4_WORD1_NUMLINKS;
 		ret = NFS_ATTR_FATTR_NLINK;
 	}
 	dprintk("%s: nlink=%u\n", __func__, (unsigned int)*nlink);
 	return ret;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, uint32_t *uid)
@@ -3031,9 +3158,13 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
 	if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER - 1U)))
 		return -EIO;
 	if (likely(bitmap[1] & FATTR4_WORD1_OWNER)) {
-		READ_BUF(4);
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
 		len = be32_to_cpup(p++);
-		READ_BUF(len);
+		p = xdr_inline_decode(xdr, len);
+		if (unlikely(!p))
+			goto out_overflow;
 		if (len < XDR_MAX_NETOBJ) {
 			if (nfs_map_name_to_uid(clp, (char *)p, len, uid) == 0)
 				ret = NFS_ATTR_FATTR_OWNER;
@@ -3047,6 +3178,9 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
 	}
 	dprintk("%s: uid=%d\n", __func__, (int)*uid);
 	return ret;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, uint32_t *gid)
@@ -3059,9 +3193,13 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
 	if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER_GROUP - 1U)))
 		return -EIO;
 	if (likely(bitmap[1] & FATTR4_WORD1_OWNER_GROUP)) {
-		READ_BUF(4);
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
 		len = be32_to_cpup(p++);
-		READ_BUF(len);
+		p = xdr_inline_decode(xdr, len);
+		if (unlikely(!p))
+			goto out_overflow;
 		if (len < XDR_MAX_NETOBJ) {
 			if (nfs_map_group_to_gid(clp, (char *)p, len, gid) == 0)
 				ret = NFS_ATTR_FATTR_GROUP;
@@ -3075,6 +3213,9 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
 	}
 	dprintk("%s: gid=%d\n", __func__, (int)*gid);
 	return ret;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rdev)
@@ -3089,7 +3230,9 @@ static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rde
 	if (likely(bitmap[1] & FATTR4_WORD1_RAWDEV)) {
 		dev_t tmp;
 
-		READ_BUF(8);
+		p = xdr_inline_decode(xdr, 8);
+		if (unlikely(!p))
+			goto out_overflow;
 		major = be32_to_cpup(p++);
 		minor = be32_to_cpup(p++);
 		tmp = MKDEV(major, minor);
@@ -3100,6 +3243,9 @@ static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rde
 	}
 	dprintk("%s: rdev=(0x%x:0x%x)\n", __func__, major, minor);
 	return ret;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -3111,12 +3257,17 @@ static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uin
 	if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_AVAIL - 1U)))
 		return -EIO;
 	if (likely(bitmap[1] & FATTR4_WORD1_SPACE_AVAIL)) {
-		READ_BUF(8);
+		p = xdr_inline_decode(xdr, 8);
+		if (unlikely(!p))
+			goto out_overflow;
 		p = xdr_decode_hyper(p, res);
 		bitmap[1] &= ~FATTR4_WORD1_SPACE_AVAIL;
 	}
 	dprintk("%s: space avail=%Lu\n", __func__, (unsigned long long)*res);
 	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_space_free(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -3128,12 +3279,17 @@ static int decode_attr_space_free(struct xdr_stream *xdr, uint32_t *bitmap, uint
 	if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_FREE - 1U)))
 		return -EIO;
 	if (likely(bitmap[1] & FATTR4_WORD1_SPACE_FREE)) {
-		READ_BUF(8);
+		p = xdr_inline_decode(xdr, 8);
+		if (unlikely(!p))
+			goto out_overflow;
 		p = xdr_decode_hyper(p, res);
 		bitmap[1] &= ~FATTR4_WORD1_SPACE_FREE;
 	}
 	dprintk("%s: space free=%Lu\n", __func__, (unsigned long long)*res);
 	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -3145,12 +3301,17 @@ static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uin
 	if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_TOTAL - 1U)))
 		return -EIO;
 	if (likely(bitmap[1] & FATTR4_WORD1_SPACE_TOTAL)) {
-		READ_BUF(8);
+		p = xdr_inline_decode(xdr, 8);
+		if (unlikely(!p))
+			goto out_overflow;
 		p = xdr_decode_hyper(p, res);
 		bitmap[1] &= ~FATTR4_WORD1_SPACE_TOTAL;
 	}
 	dprintk("%s: space total=%Lu\n", __func__, (unsigned long long)*res);
 	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *used)
@@ -3162,7 +3323,9 @@ static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint
 	if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_USED - 1U)))
 		return -EIO;
 	if (likely(bitmap[1] & FATTR4_WORD1_SPACE_USED)) {
-		READ_BUF(8);
+		p = xdr_inline_decode(xdr, 8);
+		if (unlikely(!p))
+			goto out_overflow;
 		p = xdr_decode_hyper(p, used);
 		bitmap[1] &= ~FATTR4_WORD1_SPACE_USED;
 		ret = NFS_ATTR_FATTR_SPACE_USED;
@@ -3170,6 +3333,9 @@ static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint
 	dprintk("%s: space used=%Lu\n", __func__,
 			(unsigned long long)*used);
 	return ret;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time)
@@ -3178,12 +3344,17 @@ static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time)
 	uint64_t sec;
 	uint32_t nsec;
 
-	READ_BUF(12);
+	p = xdr_inline_decode(xdr, 12);
+	if (unlikely(!p))
+		goto out_overflow;
 	p = xdr_decode_hyper(p, &sec);
 	nsec = be32_to_cpup(p++);
 	time->tv_sec = (time_t)sec;
 	time->tv_nsec = (long)nsec;
 	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_attr_time_access(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time)
@@ -3261,11 +3432,16 @@ static int decode_change_info(struct xdr_stream *xdr, struct nfs4_change_info *c
 {
 	__be32 *p;
 
-	READ_BUF(20);
+	p = xdr_inline_decode(xdr, 20);
+	if (unlikely(!p))
+		goto out_overflow;
 	cinfo->atomic = be32_to_cpup(p++);
 	p = xdr_decode_hyper(p, &cinfo->before);
 	p = xdr_decode_hyper(p, &cinfo->after);
 	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_access(struct xdr_stream *xdr, struct nfs4_accessres *access)
@@ -3277,12 +3453,17 @@ static int decode_access(struct xdr_stream *xdr, struct nfs4_accessres *access)
 	status = decode_op_hdr(xdr, OP_ACCESS);
 	if (status)
 		return status;
-	READ_BUF(8);
+	p = xdr_inline_decode(xdr, 8);
+	if (unlikely(!p))
+		goto out_overflow;
 	supp = be32_to_cpup(p++);
 	acc = be32_to_cpup(p++);
 	access->supported = supp;
 	access->access = acc;
 	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_opaque_fixed(struct xdr_stream *xdr, void *buf, size_t len)
@@ -3341,10 +3522,16 @@ static int decode_create(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
 		return status;
 	if ((status = decode_change_info(xdr, cinfo)))
 		return status;
-	READ_BUF(4);
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
 	bmlen = be32_to_cpup(p++);
-	READ_BUF(bmlen << 2);
-	return 0;
+	p = xdr_inline_decode(xdr, bmlen << 2);
+	if (likely(p))
+		return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_res *res)
@@ -3596,14 +3783,21 @@ static int decode_getfh(struct xdr_stream *xdr, struct nfs_fh *fh)
 	if (status)
 		return status;
 
-	READ_BUF(4);
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
 	len = be32_to_cpup(p++);
 	if (len > NFS4_FHSIZE)
 		return -EIO;
 	fh->size = len;
-	READ_BUF(len);
+	p = xdr_inline_decode(xdr, len);
+	if (unlikely(!p))
+		goto out_overflow;
 	memcpy(fh->data, p, len);
 	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_link(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
@@ -3625,7 +3819,9 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl)
 	__be32 *p;
 	uint32_t namelen, type;
 
-	READ_BUF(32);
+	p = xdr_inline_decode(xdr, 32);
+	if (unlikely(!p))
+		goto out_overflow;
 	p = xdr_decode_hyper(p, &offset);
 	p = xdr_decode_hyper(p, &length);
 	type = be32_to_cpup(p++);
@@ -3641,8 +3837,12 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl)
 	}
 	p = xdr_decode_hyper(p, &clientid);
 	namelen = be32_to_cpup(p++);
-	READ_BUF(namelen);
-	return -NFS4ERR_DENIED;
+	p = xdr_inline_decode(xdr, namelen);
+	if (likely(p))
+		return -NFS4ERR_DENIED;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_lock(struct xdr_stream *xdr, struct nfs_lock_res *res)
@@ -3697,7 +3897,9 @@ static int decode_space_limit(struct xdr_stream *xdr, u64 *maxsize)
 	__be32 *p;
 	uint32_t limit_type, nblocks, blocksize;
 
-	READ_BUF(12);
+	p = xdr_inline_decode(xdr, 12);
+	if (unlikely(!p))
+		goto out_overflow;
 	limit_type = be32_to_cpup(p++);
 	switch (limit_type) {
 	case 1:
@@ -3709,6 +3911,9 @@ static int decode_space_limit(struct xdr_stream *xdr, u64 *maxsize)
 		*maxsize = (uint64_t)nblocks * (uint64_t)blocksize;
 	}
 	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
@@ -3717,7 +3922,9 @@ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
 	uint32_t delegation_type;
 	int status;
 
-	READ_BUF(4);
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
 	delegation_type = be32_to_cpup(p++);
 	if (delegation_type == NFS4_OPEN_DELEGATE_NONE) {
 		res->delegation_type = 0;
@@ -3726,7 +3933,9 @@ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
 	status = decode_stateid(xdr, &res->delegation);
 	if (unlikely(status))
 		return status;
-	READ_BUF(4);
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
 	res->do_recall = be32_to_cpup(p++);
 
 	switch (delegation_type) {
@@ -3739,6 +3948,9 @@ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
 				return -EIO;
 	}
 	return decode_ace(xdr, NULL, res->server->nfs_client);
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
@@ -3757,13 +3969,17 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
 
 	decode_change_info(xdr, &res->cinfo);
 
-	READ_BUF(8);
+	p = xdr_inline_decode(xdr, 8);
+	if (unlikely(!p))
+		goto out_overflow;
 	res->rflags = be32_to_cpup(p++);
 	bmlen = be32_to_cpup(p++);
 	if (bmlen > 10)
 		goto xdr_error;
 
-	READ_BUF(bmlen << 2);
+	p = xdr_inline_decode(xdr, bmlen << 2);
+	if (unlikely(!p))
+		goto out_overflow;
 	savewords = min_t(uint32_t, bmlen, NFS4_BITMAP_SIZE);
 	for (i = 0; i < savewords; ++i)
 		res->attrset[i] = be32_to_cpup(p++);
@@ -3774,6 +3990,9 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
 xdr_error:
 	dprintk("%s: Bitmap too large! Length = %u\n", __func__, bmlen);
 	return -EIO;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_open_confirm(struct xdr_stream *xdr, struct nfs_open_confirmres *res)
@@ -3820,7 +4039,9 @@ static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs_
 	status = decode_op_hdr(xdr, OP_READ);
 	if (status)
 		return status;
-	READ_BUF(8);
+	p = xdr_inline_decode(xdr, 8);
+	if (unlikely(!p))
+		goto out_overflow;
 	eof = be32_to_cpup(p++);
 	count = be32_to_cpup(p++);
 	hdrlen = (u8 *) p - (u8 *) iov->iov_base;
@@ -3835,6 +4056,9 @@ static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs_
 	res->eof = eof;
 	res->count = count;
 	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs4_readdir_res *readdir)
@@ -3947,7 +4171,9 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
 		return status;
 
 	/* Convert length of symlink */
-	READ_BUF(4);
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
 	len = be32_to_cpup(p++);
 	if (len >= rcvbuf->page_len || len <= 0) {
 		dprintk("nfs: server returned giant symlink!\n");
@@ -3972,6 +4198,9 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
 	kaddr[len+rcvbuf->page_base] = '\0';
 	kunmap_atomic(kaddr, KM_USER0);
 	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_remove(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
@@ -4069,10 +4298,16 @@ static int decode_setattr(struct xdr_stream *xdr)
 	status = decode_op_hdr(xdr, OP_SETATTR);
 	if (status)
 		return status;
-	READ_BUF(4);
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
 	bmlen = be32_to_cpup(p++);
-	READ_BUF(bmlen << 2);
-	return 0;
+	p = xdr_inline_decode(xdr, bmlen << 2);
+	if (likely(p))
+		return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp)
@@ -4081,7 +4316,9 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp)
 	uint32_t opnum;
 	int32_t nfserr;
 
-	READ_BUF(8);
+	p = xdr_inline_decode(xdr, 8);
+	if (unlikely(!p))
+		goto out_overflow;
 	opnum = be32_to_cpup(p++);
 	if (opnum != OP_SETCLIENTID) {
 		dprintk("nfs: decode_setclientid: Server returned operation"
@@ -4090,26 +4327,39 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp)
 	}
 	nfserr = be32_to_cpup(p++);
 	if (nfserr == NFS_OK) {
-		READ_BUF(8 + NFS4_VERIFIER_SIZE);
+		p = xdr_inline_decode(xdr, 8 + NFS4_VERIFIER_SIZE);
+		if (unlikely(!p))
+			goto out_overflow;
 		p = xdr_decode_hyper(p, &clp->cl_clientid);
 		memcpy(clp->cl_confirm.data, p, NFS4_VERIFIER_SIZE);
 	} else if (nfserr == NFSERR_CLID_INUSE) {
 		uint32_t len;
 
 		/* skip netid string */
-		READ_BUF(4);
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
 		len = be32_to_cpup(p++);
-		READ_BUF(len);
+		p = xdr_inline_decode(xdr, len);
+		if (unlikely(!p))
+			goto out_overflow;
 
 		/* skip uaddr string */
-		READ_BUF(4);
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
 		len = be32_to_cpup(p++);
-		READ_BUF(len);
+		p = xdr_inline_decode(xdr, len);
+		if (unlikely(!p))
+			goto out_overflow;
 		return -NFSERR_CLID_INUSE;
 	} else
 		return nfs4_stat_to_errno(nfserr);
 
 	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_setclientid_confirm(struct xdr_stream *xdr)
@@ -4126,11 +4376,16 @@ static int decode_write(struct xdr_stream *xdr, struct nfs_writeres *res)
 	if (status)
 		return status;
 
-	READ_BUF(16);
+	p = xdr_inline_decode(xdr, 16);
+	if (unlikely(!p))
+		goto out_overflow;
 	res->count = be32_to_cpup(p++);
 	res->verf->committed = be32_to_cpup(p++);
 	memcpy(res->verf->verifier, p, 8);
 	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_delegreturn(struct xdr_stream *xdr)
@@ -4152,9 +4407,13 @@ static int decode_exchange_id(struct xdr_stream *xdr,
 	if (status)
 		return status;
 
-	READ_BUF(8);
+	p = xdr_inline_decode(xdr, 8);
+	if (unlikely(!p))
+		goto out_overflow;
 	p = xdr_decode_hyper(p, &clp->cl_ex_clid);
-	READ_BUF(12);
+	p = xdr_inline_decode(xdr, 12);
+	if (unlikely(!p))
+		goto out_overflow;
 	clp->cl_seqid = be32_to_cpup(p++);
 	clp->cl_exchange_flags = be32_to_cpup(p++);
 
@@ -4164,7 +4423,9 @@ static int decode_exchange_id(struct xdr_stream *xdr,
 		return -EIO;
 
 	/* Throw away minor_id */
-	READ_BUF(8);
+	p = xdr_inline_decode(xdr, 8);
+	if (unlikely(!p))
+		goto out_overflow;
 
 	/* Throw away Major id */
 	status = decode_opaque_inline(xdr, &dummy, &dummy_str);
@@ -4182,6 +4443,9 @@ static int decode_exchange_id(struct xdr_stream *xdr,
 		return status;
 
 	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_chan_attrs(struct xdr_stream *xdr,
@@ -4190,7 +4454,9 @@ static int decode_chan_attrs(struct xdr_stream *xdr,
 	__be32 *p;
 	u32 nr_attrs;
 
-	READ_BUF(28);
+	p = xdr_inline_decode(xdr, 28);
+	if (unlikely(!p))
+		goto out_overflow;
 	attrs->headerpadsz = be32_to_cpup(p++);
 	attrs->max_rqst_sz = be32_to_cpup(p++);
 	attrs->max_resp_sz = be32_to_cpup(p++);
@@ -4203,9 +4469,15 @@ static int decode_chan_attrs(struct xdr_stream *xdr,
 			__func__, nr_attrs);
 		return -EINVAL;
 	}
-	if (nr_attrs == 1)
-		READ_BUF(4); /* skip rdma_attrs */
+	if (nr_attrs == 1) {
+		p = xdr_inline_decode(xdr, 4); /* skip rdma_attrs */
+		if (unlikely(!p))
+			goto out_overflow;
+	}
 	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_sessionid(struct xdr_stream *xdr, struct nfs4_sessionid *sid)
@@ -4228,7 +4500,9 @@ static int decode_create_session(struct xdr_stream *xdr,
 		return status;
 
 	/* seqid, flags */
-	READ_BUF(8);
+	p = xdr_inline_decode(xdr, 8);
+	if (unlikely(!p))
+		goto out_overflow;
 	clp->cl_seqid = be32_to_cpup(p++);
 	session->flags = be32_to_cpup(p++);
 
@@ -4237,6 +4511,9 @@ static int decode_create_session(struct xdr_stream *xdr,
 	if (!status)
 		status = decode_chan_attrs(xdr, &session->bc_attrs);
 	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
 }
 
 static int decode_destroy_session(struct xdr_stream *xdr, void *dummy)
@@ -4277,7 +4554,9 @@ static int decode_sequence(struct xdr_stream *xdr,
 		goto out_err;
 	}
 
-	READ_BUF(20);
+	p = xdr_inline_decode(xdr, 20);
+	if (unlikely(!p))
+		goto out_overflow;
 
 	/* seqid */
 	slot = &res->sr_session->fc_slot_table.slots[res->sr_slotid];
@@ -4302,6 +4581,10 @@ static int decode_sequence(struct xdr_stream *xdr,
 out_err:
 	res->sr_status = status;
 	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	status = -EIO;
+	goto out_err;
 #else  /* CONFIG_NFS_V4_1 */
 	return 0;
 #endif /* CONFIG_NFS_V4_1 */
-- 
cgit v1.2.3-70-g09d2


From cccddf4f5580131c9b963900e1d3400655e633cc Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Fri, 14 Aug 2009 17:20:19 +0300
Subject: nfs: nfs4xdr: optimize low level decoding

do not increment decoding ptr if not needed.

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4xdr.c | 118 +++++++++++++++++++++++++++----------------------------
 1 file changed, 59 insertions(+), 59 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index b0d690c3a24c..14b6f513648f 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -2437,7 +2437,7 @@ static int decode_opaque_inline(struct xdr_stream *xdr, unsigned int *len, char
 	p = xdr_inline_decode(xdr, 4);
 	if (unlikely(!p))
 		goto out_overflow;
-	*len = be32_to_cpup(p++);
+	*len = be32_to_cpup(p);
 	p = xdr_inline_decode(xdr, *len);
 	if (unlikely(!p))
 		goto out_overflow;
@@ -2456,14 +2456,14 @@ static int decode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr)
 	if (unlikely(!p))
 		goto out_overflow;
 	hdr->status = be32_to_cpup(p++);
-	hdr->taglen = be32_to_cpup(p++);
+	hdr->taglen = be32_to_cpup(p);
 
 	p = xdr_inline_decode(xdr, hdr->taglen + 4);
 	if (unlikely(!p))
 		goto out_overflow;
 	hdr->tag = (char *)p;
 	p += XDR_QUADLEN(hdr->taglen);
-	hdr->nops = be32_to_cpup(p++);
+	hdr->nops = be32_to_cpup(p);
 	if (unlikely(hdr->nops < 1))
 		return nfs4_stat_to_errno(hdr->status);
 	return 0;
@@ -2488,7 +2488,7 @@ static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
 				opnum, expected);
 		return -EIO;
 	}
-	nfserr = be32_to_cpup(p++);
+	nfserr = be32_to_cpup(p);
 	if (nfserr != NFS_OK)
 		return nfs4_stat_to_errno(nfserr);
 	return 0;
@@ -2519,7 +2519,7 @@ static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap)
 	p = xdr_inline_decode(xdr, 4);
 	if (unlikely(!p))
 		goto out_overflow;
-	bmlen = be32_to_cpup(p++);
+	bmlen = be32_to_cpup(p);
 
 	bitmap[0] = bitmap[1] = 0;
 	p = xdr_inline_decode(xdr, (bmlen << 2));
@@ -2528,7 +2528,7 @@ static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap)
 	if (bmlen > 0) {
 		bitmap[0] = be32_to_cpup(p++);
 		if (bmlen > 1)
-			bitmap[1] = be32_to_cpup(p++);
+			bitmap[1] = be32_to_cpup(p);
 	}
 	return 0;
 out_overflow:
@@ -2543,7 +2543,7 @@ static inline int decode_attr_length(struct xdr_stream *xdr, uint32_t *attrlen,
 	p = xdr_inline_decode(xdr, 4);
 	if (unlikely(!p))
 		goto out_overflow;
-	*attrlen = be32_to_cpup(p++);
+	*attrlen = be32_to_cpup(p);
 	*savep = xdr->p;
 	return 0;
 out_overflow:
@@ -2574,7 +2574,7 @@ static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *
 		p = xdr_inline_decode(xdr, 4);
 		if (unlikely(!p))
 			goto out_overflow;
-		*type = be32_to_cpup(p++);
+		*type = be32_to_cpup(p);
 		if (*type < NF4REG || *type > NF4NAMEDATTR) {
 			dprintk("%s: bad type %d\n", __func__, *type);
 			return -EIO;
@@ -2601,7 +2601,7 @@ static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t
 		p = xdr_inline_decode(xdr, 8);
 		if (unlikely(!p))
 			goto out_overflow;
-		p = xdr_decode_hyper(p, change);
+		xdr_decode_hyper(p, change);
 		bitmap[0] &= ~FATTR4_WORD0_CHANGE;
 		ret = NFS_ATTR_FATTR_CHANGE;
 	}
@@ -2625,7 +2625,7 @@ static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *
 		p = xdr_inline_decode(xdr, 8);
 		if (unlikely(!p))
 			goto out_overflow;
-		p = xdr_decode_hyper(p, size);
+		xdr_decode_hyper(p, size);
 		bitmap[0] &= ~FATTR4_WORD0_SIZE;
 		ret = NFS_ATTR_FATTR_SIZE;
 	}
@@ -2647,7 +2647,7 @@ static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, ui
 		p = xdr_inline_decode(xdr, 4);
 		if (unlikely(!p))
 			goto out_overflow;
-		*res = be32_to_cpup(p++);
+		*res = be32_to_cpup(p);
 		bitmap[0] &= ~FATTR4_WORD0_LINK_SUPPORT;
 	}
 	dprintk("%s: link support=%s\n", __func__, *res == 0 ? "false" : "true");
@@ -2668,7 +2668,7 @@ static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap,
 		p = xdr_inline_decode(xdr, 4);
 		if (unlikely(!p))
 			goto out_overflow;
-		*res = be32_to_cpup(p++);
+		*res = be32_to_cpup(p);
 		bitmap[0] &= ~FATTR4_WORD0_SYMLINK_SUPPORT;
 	}
 	dprintk("%s: symlink support=%s\n", __func__, *res == 0 ? "false" : "true");
@@ -2692,7 +2692,7 @@ static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs
 		if (unlikely(!p))
 			goto out_overflow;
 		p = xdr_decode_hyper(p, &fsid->major);
-		p = xdr_decode_hyper(p, &fsid->minor);
+		xdr_decode_hyper(p, &fsid->minor);
 		bitmap[0] &= ~FATTR4_WORD0_FSID;
 		ret = NFS_ATTR_FATTR_FSID;
 	}
@@ -2716,7 +2716,7 @@ static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint
 		p = xdr_inline_decode(xdr, 4);
 		if (unlikely(!p))
 			goto out_overflow;
-		*res = be32_to_cpup(p++);
+		*res = be32_to_cpup(p);
 		bitmap[0] &= ~FATTR4_WORD0_LEASE_TIME;
 	}
 	dprintk("%s: file size=%u\n", __func__, (unsigned int)*res);
@@ -2737,7 +2737,7 @@ static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint
 		p = xdr_inline_decode(xdr, 4);
 		if (unlikely(!p))
 			goto out_overflow;
-		*res = be32_to_cpup(p++);
+		*res = be32_to_cpup(p);
 		bitmap[0] &= ~FATTR4_WORD0_ACLSUPPORT;
 	}
 	dprintk("%s: ACLs supported=%u\n", __func__, (unsigned int)*res);
@@ -2759,7 +2759,7 @@ static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t
 		p = xdr_inline_decode(xdr, 8);
 		if (unlikely(!p))
 			goto out_overflow;
-		p = xdr_decode_hyper(p, fileid);
+		xdr_decode_hyper(p, fileid);
 		bitmap[0] &= ~FATTR4_WORD0_FILEID;
 		ret = NFS_ATTR_FATTR_FILEID;
 	}
@@ -2782,7 +2782,7 @@ static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitma
 		p = xdr_inline_decode(xdr, 8);
 		if (unlikely(!p))
 			goto out_overflow;
-		p = xdr_decode_hyper(p, fileid);
+		xdr_decode_hyper(p, fileid);
 		bitmap[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
 		ret = NFS_ATTR_FATTR_FILEID;
 	}
@@ -2805,7 +2805,7 @@ static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uin
 		p = xdr_inline_decode(xdr, 8);
 		if (unlikely(!p))
 			goto out_overflow;
-		p = xdr_decode_hyper(p, res);
+		xdr_decode_hyper(p, res);
 		bitmap[0] &= ~FATTR4_WORD0_FILES_AVAIL;
 	}
 	dprintk("%s: files avail=%Lu\n", __func__, (unsigned long long)*res);
@@ -2827,7 +2827,7 @@ static int decode_attr_files_free(struct xdr_stream *xdr, uint32_t *bitmap, uint
 		p = xdr_inline_decode(xdr, 8);
 		if (unlikely(!p))
 			goto out_overflow;
-		p = xdr_decode_hyper(p, res);
+		xdr_decode_hyper(p, res);
 		bitmap[0] &= ~FATTR4_WORD0_FILES_FREE;
 	}
 	dprintk("%s: files free=%Lu\n", __func__, (unsigned long long)*res);
@@ -2849,7 +2849,7 @@ static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uin
 		p = xdr_inline_decode(xdr, 8);
 		if (unlikely(!p))
 			goto out_overflow;
-		p = xdr_decode_hyper(p, res);
+		xdr_decode_hyper(p, res);
 		bitmap[0] &= ~FATTR4_WORD0_FILES_TOTAL;
 	}
 	dprintk("%s: files total=%Lu\n", __func__, (unsigned long long)*res);
@@ -2868,7 +2868,7 @@ static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path)
 	p = xdr_inline_decode(xdr, 4);
 	if (unlikely(!p))
 		goto out_overflow;
-	n = be32_to_cpup(p++);
+	n = be32_to_cpup(p);
 	if (n == 0)
 		goto root_path;
 	dprintk("path ");
@@ -2925,7 +2925,7 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
 	p = xdr_inline_decode(xdr, 4);
 	if (unlikely(!p))
 		goto out_overflow;
-	n = be32_to_cpup(p++);
+	n = be32_to_cpup(p);
 	if (n <= 0)
 		goto out_eio;
 	res->nlocations = 0;
@@ -2936,7 +2936,7 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
 		p = xdr_inline_decode(xdr, 4);
 		if (unlikely(!p))
 			goto out_overflow;
-		m = be32_to_cpup(p++);
+		m = be32_to_cpup(p);
 
 		loc->nservers = 0;
 		dprintk("%s: servers ", __func__);
@@ -2994,7 +2994,7 @@ static int decode_attr_maxfilesize(struct xdr_stream *xdr, uint32_t *bitmap, uin
 		p = xdr_inline_decode(xdr, 8);
 		if (unlikely(!p))
 			goto out_overflow;
-		p = xdr_decode_hyper(p, res);
+		xdr_decode_hyper(p, res);
 		bitmap[0] &= ~FATTR4_WORD0_MAXFILESIZE;
 	}
 	dprintk("%s: maxfilesize=%Lu\n", __func__, (unsigned long long)*res);
@@ -3016,7 +3016,7 @@ static int decode_attr_maxlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_
 		p = xdr_inline_decode(xdr, 4);
 		if (unlikely(!p))
 			goto out_overflow;
-		*maxlink = be32_to_cpup(p++);
+		*maxlink = be32_to_cpup(p);
 		bitmap[0] &= ~FATTR4_WORD0_MAXLINK;
 	}
 	dprintk("%s: maxlink=%u\n", __func__, *maxlink);
@@ -3038,7 +3038,7 @@ static int decode_attr_maxname(struct xdr_stream *xdr, uint32_t *bitmap, uint32_
 		p = xdr_inline_decode(xdr, 4);
 		if (unlikely(!p))
 			goto out_overflow;
-		*maxname = be32_to_cpup(p++);
+		*maxname = be32_to_cpup(p);
 		bitmap[0] &= ~FATTR4_WORD0_MAXNAME;
 	}
 	dprintk("%s: maxname=%u\n", __func__, *maxname);
@@ -3061,7 +3061,7 @@ static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_
 		p = xdr_inline_decode(xdr, 8);
 		if (unlikely(!p))
 			goto out_overflow;
-		p = xdr_decode_hyper(p, &maxread);
+		xdr_decode_hyper(p, &maxread);
 		if (maxread > 0x7FFFFFFF)
 			maxread = 0x7FFFFFFF;
 		*res = (uint32_t)maxread;
@@ -3087,7 +3087,7 @@ static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32
 		p = xdr_inline_decode(xdr, 8);
 		if (unlikely(!p))
 			goto out_overflow;
-		p = xdr_decode_hyper(p, &maxwrite);
+		xdr_decode_hyper(p, &maxwrite);
 		if (maxwrite > 0x7FFFFFFF)
 			maxwrite = 0x7FFFFFFF;
 		*res = (uint32_t)maxwrite;
@@ -3113,7 +3113,7 @@ static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *m
 		p = xdr_inline_decode(xdr, 4);
 		if (unlikely(!p))
 			goto out_overflow;
-		tmp = be32_to_cpup(p++);
+		tmp = be32_to_cpup(p);
 		*mode = tmp & ~S_IFMT;
 		bitmap[1] &= ~FATTR4_WORD1_MODE;
 		ret = NFS_ATTR_FATTR_MODE;
@@ -3137,7 +3137,7 @@ static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t
 		p = xdr_inline_decode(xdr, 4);
 		if (unlikely(!p))
 			goto out_overflow;
-		*nlink = be32_to_cpup(p++);
+		*nlink = be32_to_cpup(p);
 		bitmap[1] &= ~FATTR4_WORD1_NUMLINKS;
 		ret = NFS_ATTR_FATTR_NLINK;
 	}
@@ -3161,7 +3161,7 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
 		p = xdr_inline_decode(xdr, 4);
 		if (unlikely(!p))
 			goto out_overflow;
-		len = be32_to_cpup(p++);
+		len = be32_to_cpup(p);
 		p = xdr_inline_decode(xdr, len);
 		if (unlikely(!p))
 			goto out_overflow;
@@ -3196,7 +3196,7 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
 		p = xdr_inline_decode(xdr, 4);
 		if (unlikely(!p))
 			goto out_overflow;
-		len = be32_to_cpup(p++);
+		len = be32_to_cpup(p);
 		p = xdr_inline_decode(xdr, len);
 		if (unlikely(!p))
 			goto out_overflow;
@@ -3234,7 +3234,7 @@ static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rde
 		if (unlikely(!p))
 			goto out_overflow;
 		major = be32_to_cpup(p++);
-		minor = be32_to_cpup(p++);
+		minor = be32_to_cpup(p);
 		tmp = MKDEV(major, minor);
 		if (MAJOR(tmp) == major && MINOR(tmp) == minor)
 			*rdev = tmp;
@@ -3260,7 +3260,7 @@ static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uin
 		p = xdr_inline_decode(xdr, 8);
 		if (unlikely(!p))
 			goto out_overflow;
-		p = xdr_decode_hyper(p, res);
+		xdr_decode_hyper(p, res);
 		bitmap[1] &= ~FATTR4_WORD1_SPACE_AVAIL;
 	}
 	dprintk("%s: space avail=%Lu\n", __func__, (unsigned long long)*res);
@@ -3282,7 +3282,7 @@ static int decode_attr_space_free(struct xdr_stream *xdr, uint32_t *bitmap, uint
 		p = xdr_inline_decode(xdr, 8);
 		if (unlikely(!p))
 			goto out_overflow;
-		p = xdr_decode_hyper(p, res);
+		xdr_decode_hyper(p, res);
 		bitmap[1] &= ~FATTR4_WORD1_SPACE_FREE;
 	}
 	dprintk("%s: space free=%Lu\n", __func__, (unsigned long long)*res);
@@ -3304,7 +3304,7 @@ static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uin
 		p = xdr_inline_decode(xdr, 8);
 		if (unlikely(!p))
 			goto out_overflow;
-		p = xdr_decode_hyper(p, res);
+		xdr_decode_hyper(p, res);
 		bitmap[1] &= ~FATTR4_WORD1_SPACE_TOTAL;
 	}
 	dprintk("%s: space total=%Lu\n", __func__, (unsigned long long)*res);
@@ -3326,7 +3326,7 @@ static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint
 		p = xdr_inline_decode(xdr, 8);
 		if (unlikely(!p))
 			goto out_overflow;
-		p = xdr_decode_hyper(p, used);
+		xdr_decode_hyper(p, used);
 		bitmap[1] &= ~FATTR4_WORD1_SPACE_USED;
 		ret = NFS_ATTR_FATTR_SPACE_USED;
 	}
@@ -3348,7 +3348,7 @@ static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time)
 	if (unlikely(!p))
 		goto out_overflow;
 	p = xdr_decode_hyper(p, &sec);
-	nsec = be32_to_cpup(p++);
+	nsec = be32_to_cpup(p);
 	time->tv_sec = (time_t)sec;
 	time->tv_nsec = (long)nsec;
 	return 0;
@@ -3437,7 +3437,7 @@ static int decode_change_info(struct xdr_stream *xdr, struct nfs4_change_info *c
 		goto out_overflow;
 	cinfo->atomic = be32_to_cpup(p++);
 	p = xdr_decode_hyper(p, &cinfo->before);
-	p = xdr_decode_hyper(p, &cinfo->after);
+	xdr_decode_hyper(p, &cinfo->after);
 	return 0;
 out_overflow:
 	print_overflow_msg(__func__, xdr);
@@ -3457,7 +3457,7 @@ static int decode_access(struct xdr_stream *xdr, struct nfs4_accessres *access)
 	if (unlikely(!p))
 		goto out_overflow;
 	supp = be32_to_cpup(p++);
-	acc = be32_to_cpup(p++);
+	acc = be32_to_cpup(p);
 	access->supported = supp;
 	access->access = acc;
 	return 0;
@@ -3525,7 +3525,7 @@ static int decode_create(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
 	p = xdr_inline_decode(xdr, 4);
 	if (unlikely(!p))
 		goto out_overflow;
-	bmlen = be32_to_cpup(p++);
+	bmlen = be32_to_cpup(p);
 	p = xdr_inline_decode(xdr, bmlen << 2);
 	if (likely(p))
 		return 0;
@@ -3786,7 +3786,7 @@ static int decode_getfh(struct xdr_stream *xdr, struct nfs_fh *fh)
 	p = xdr_inline_decode(xdr, 4);
 	if (unlikely(!p))
 		goto out_overflow;
-	len = be32_to_cpup(p++);
+	len = be32_to_cpup(p);
 	if (len > NFS4_FHSIZE)
 		return -EIO;
 	fh->size = len;
@@ -3836,7 +3836,7 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl)
 		fl->fl_pid = 0;
 	}
 	p = xdr_decode_hyper(p, &clientid);
-	namelen = be32_to_cpup(p++);
+	namelen = be32_to_cpup(p);
 	p = xdr_inline_decode(xdr, namelen);
 	if (likely(p))
 		return -NFS4ERR_DENIED;
@@ -3903,11 +3903,11 @@ static int decode_space_limit(struct xdr_stream *xdr, u64 *maxsize)
 	limit_type = be32_to_cpup(p++);
 	switch (limit_type) {
 	case 1:
-		p = xdr_decode_hyper(p, maxsize);
+		xdr_decode_hyper(p, maxsize);
 		break;
 	case 2:
 		nblocks = be32_to_cpup(p++);
-		blocksize = be32_to_cpup(p++);
+		blocksize = be32_to_cpup(p);
 		*maxsize = (uint64_t)nblocks * (uint64_t)blocksize;
 	}
 	return 0;
@@ -3925,7 +3925,7 @@ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
 	p = xdr_inline_decode(xdr, 4);
 	if (unlikely(!p))
 		goto out_overflow;
-	delegation_type = be32_to_cpup(p++);
+	delegation_type = be32_to_cpup(p);
 	if (delegation_type == NFS4_OPEN_DELEGATE_NONE) {
 		res->delegation_type = 0;
 		return 0;
@@ -3936,7 +3936,7 @@ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
 	p = xdr_inline_decode(xdr, 4);
 	if (unlikely(!p))
 		goto out_overflow;
-	res->do_recall = be32_to_cpup(p++);
+	res->do_recall = be32_to_cpup(p);
 
 	switch (delegation_type) {
 	case NFS4_OPEN_DELEGATE_READ:
@@ -3973,7 +3973,7 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
 	if (unlikely(!p))
 		goto out_overflow;
 	res->rflags = be32_to_cpup(p++);
-	bmlen = be32_to_cpup(p++);
+	bmlen = be32_to_cpup(p);
 	if (bmlen > 10)
 		goto xdr_error;
 
@@ -4043,7 +4043,7 @@ static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs_
 	if (unlikely(!p))
 		goto out_overflow;
 	eof = be32_to_cpup(p++);
-	count = be32_to_cpup(p++);
+	count = be32_to_cpup(p);
 	hdrlen = (u8 *) p - (u8 *) iov->iov_base;
 	recvd = req->rq_rcv_buf.len - hdrlen;
 	if (count > recvd) {
@@ -4174,7 +4174,7 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
 	p = xdr_inline_decode(xdr, 4);
 	if (unlikely(!p))
 		goto out_overflow;
-	len = be32_to_cpup(p++);
+	len = be32_to_cpup(p);
 	if (len >= rcvbuf->page_len || len <= 0) {
 		dprintk("nfs: server returned giant symlink!\n");
 		return -ENAMETOOLONG;
@@ -4301,7 +4301,7 @@ static int decode_setattr(struct xdr_stream *xdr)
 	p = xdr_inline_decode(xdr, 4);
 	if (unlikely(!p))
 		goto out_overflow;
-	bmlen = be32_to_cpup(p++);
+	bmlen = be32_to_cpup(p);
 	p = xdr_inline_decode(xdr, bmlen << 2);
 	if (likely(p))
 		return 0;
@@ -4325,7 +4325,7 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp)
 			" %d\n", opnum);
 		return -EIO;
 	}
-	nfserr = be32_to_cpup(p++);
+	nfserr = be32_to_cpup(p);
 	if (nfserr == NFS_OK) {
 		p = xdr_inline_decode(xdr, 8 + NFS4_VERIFIER_SIZE);
 		if (unlikely(!p))
@@ -4339,7 +4339,7 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp)
 		p = xdr_inline_decode(xdr, 4);
 		if (unlikely(!p))
 			goto out_overflow;
-		len = be32_to_cpup(p++);
+		len = be32_to_cpup(p);
 		p = xdr_inline_decode(xdr, len);
 		if (unlikely(!p))
 			goto out_overflow;
@@ -4348,7 +4348,7 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp)
 		p = xdr_inline_decode(xdr, 4);
 		if (unlikely(!p))
 			goto out_overflow;
-		len = be32_to_cpup(p++);
+		len = be32_to_cpup(p);
 		p = xdr_inline_decode(xdr, len);
 		if (unlikely(!p))
 			goto out_overflow;
@@ -4410,7 +4410,7 @@ static int decode_exchange_id(struct xdr_stream *xdr,
 	p = xdr_inline_decode(xdr, 8);
 	if (unlikely(!p))
 		goto out_overflow;
-	p = xdr_decode_hyper(p, &clp->cl_ex_clid);
+	xdr_decode_hyper(p, &clp->cl_ex_clid);
 	p = xdr_inline_decode(xdr, 12);
 	if (unlikely(!p))
 		goto out_overflow;
@@ -4418,7 +4418,7 @@ static int decode_exchange_id(struct xdr_stream *xdr,
 	clp->cl_exchange_flags = be32_to_cpup(p++);
 
 	/* We ask for SP4_NONE */
-	dummy = be32_to_cpup(p++);
+	dummy = be32_to_cpup(p);
 	if (dummy != SP4_NONE)
 		return -EIO;
 
@@ -4463,7 +4463,7 @@ static int decode_chan_attrs(struct xdr_stream *xdr,
 	attrs->max_resp_sz_cached = be32_to_cpup(p++);
 	attrs->max_ops = be32_to_cpup(p++);
 	attrs->max_reqs = be32_to_cpup(p++);
-	nr_attrs = be32_to_cpup(p++);
+	nr_attrs = be32_to_cpup(p);
 	if (unlikely(nr_attrs > 1)) {
 		printk(KERN_WARNING "%s: Invalid rdma channel attrs count %u\n",
 			__func__, nr_attrs);
@@ -4504,7 +4504,7 @@ static int decode_create_session(struct xdr_stream *xdr,
 	if (unlikely(!p))
 		goto out_overflow;
 	clp->cl_seqid = be32_to_cpup(p++);
-	session->flags = be32_to_cpup(p++);
+	session->flags = be32_to_cpup(p);
 
 	/* Channel attributes */
 	status = decode_chan_attrs(xdr, &session->fc_attrs);
@@ -4576,7 +4576,7 @@ static int decode_sequence(struct xdr_stream *xdr,
 	/* target highest slot id - currently not processed */
 	dummy = be32_to_cpup(p++);
 	/* result flags - currently not processed */
-	dummy = be32_to_cpup(p++);
+	dummy = be32_to_cpup(p);
 	status = 0;
 out_err:
 	res->sr_status = status;
-- 
cgit v1.2.3-70-g09d2


From b2add73dbf93fd50f00564d7abc3e2b9aa9dd20c Mon Sep 17 00:00:00 2001
From: Guillaume Knispel <gknispel@proformatique.com>
Date: Sat, 15 Aug 2009 19:30:24 +0200
Subject: poll/select: initialize triggered field of struct poll_wqueues

The triggered field of struct poll_wqueues introduced in commit
5f820f648c92a5ecc771a96b3c29aa6e90013bba ("poll: allow f_op->poll to
sleep").

It was first set to 1 in pollwake() (now __pollwake() ), tested and
later set to 0 in poll_schedule_timeout(), but not initialized before.

As a result when the process needs to sleep, triggered was likely to be
non-zero even if pollwake() is not called before the first
poll_schedule_timeout(), meaning schedule_hrtimeout_range() would not be
called and an extra loop calling all ->poll() would be done.

This patch initialize triggered to 0 in poll_initwait() so the ->poll()
are not called twice before the process goes to sleep when it needs to.

Signed-off-by: Guillaume Knispel <gknispel@proformatique.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Tejun Heo <tj@kernel.org>
Cc: stable@kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/select.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/select.c b/fs/select.c
index d870237e42c7..8084834e123e 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -110,6 +110,7 @@ void poll_initwait(struct poll_wqueues *pwq)
 {
 	init_poll_funcptr(&pwq->pt, __pollwait);
 	pwq->polling_task = current;
+	pwq->triggered = 0;
 	pwq->error = 0;
 	pwq->table = NULL;
 	pwq->inline_index = 0;
-- 
cgit v1.2.3-70-g09d2


From bc990f5cb424cdca9dda866785d088e2c2110ecc Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Sun, 16 Aug 2009 20:36:34 -0400
Subject: xfs: fix locking in xfs_iget_cache_hit

The locking in xfs_iget_cache_hit currently has numerous problems:

 - we clear the reclaim tag without i_flags_lock which protects
   modifications to it
 - we call inode_init_always which can sleep with pag_ici_lock
   held (this is oss.sgi.com BZ #819)
 - we acquire and drop i_flags_lock a lot and thus provide no
   consistency between the various flags we set/clear under it

This patch fixes all that with a major revamp of the locking in
the function.  The new version acquires i_flags_lock early and
only drops it once we need to call into inode_init_always or before
calling xfs_ilock.

This patch fixes a bug seen in the wild where we race modifying the
reclaim tag.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Felix Blyakher <felixb@sgi.com>
Reviewed-by: Eric Sandeen <sandeen@sandeen.net>
Signed-off-by: Felix Blyakher <felixb@sgi.com>
---
 fs/xfs/linux-2.6/xfs_sync.c |  13 ++++-
 fs/xfs/linux-2.6/xfs_sync.h |   1 +
 fs/xfs/xfs_iget.c           | 113 +++++++++++++++++++++++---------------------
 3 files changed, 70 insertions(+), 57 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index b619d6b8ca43..98ef624d9baf 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -708,6 +708,16 @@ xfs_reclaim_inode(
 	return 0;
 }
 
+void
+__xfs_inode_set_reclaim_tag(
+	struct xfs_perag	*pag,
+	struct xfs_inode	*ip)
+{
+	radix_tree_tag_set(&pag->pag_ici_root,
+			   XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
+			   XFS_ICI_RECLAIM_TAG);
+}
+
 /*
  * We set the inode flag atomically with the radix tree tag.
  * Once we get tag lookups on the radix tree, this inode flag
@@ -722,8 +732,7 @@ xfs_inode_set_reclaim_tag(
 
 	read_lock(&pag->pag_ici_lock);
 	spin_lock(&ip->i_flags_lock);
-	radix_tree_tag_set(&pag->pag_ici_root,
-			XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
+	__xfs_inode_set_reclaim_tag(pag, ip);
 	__xfs_iflags_set(ip, XFS_IRECLAIMABLE);
 	spin_unlock(&ip->i_flags_lock);
 	read_unlock(&pag->pag_ici_lock);
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index 2a10301c99c7..59120602588a 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -48,6 +48,7 @@ int xfs_reclaim_inode(struct xfs_inode *ip, int locked, int sync_mode);
 int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
 
 void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
+void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip);
 void xfs_inode_clear_reclaim_tag(struct xfs_inode *ip);
 void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
 				struct xfs_inode *ip);
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 34ec86923f7e..ecbf8b4d2e2e 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -191,80 +191,82 @@ xfs_iget_cache_hit(
 	int			flags,
 	int			lock_flags) __releases(pag->pag_ici_lock)
 {
+	struct inode		*inode = VFS_I(ip);
 	struct xfs_mount	*mp = ip->i_mount;
-	int			error = EAGAIN;
+	int			error;
+
+	spin_lock(&ip->i_flags_lock);
 
 	/*
-	 * If INEW is set this inode is being set up
-	 * If IRECLAIM is set this inode is being torn down
-	 * Pause and try again.
+	 * If we are racing with another cache hit that is currently
+	 * instantiating this inode or currently recycling it out of
+	 * reclaimabe state, wait for the initialisation to complete
+	 * before continuing.
+	 *
+	 * XXX(hch): eventually we should do something equivalent to
+	 *	     wait_on_inode to wait for these flags to be cleared
+	 *	     instead of polling for it.
 	 */
-	if (xfs_iflags_test(ip, (XFS_INEW|XFS_IRECLAIM))) {
+	if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
 		XFS_STATS_INC(xs_ig_frecycle);
+		error = EAGAIN;
 		goto out_error;
 	}
 
-	/* If IRECLAIMABLE is set, we've torn down the vfs inode part */
-	if (xfs_iflags_test(ip, XFS_IRECLAIMABLE)) {
-
-		/*
-		 * If lookup is racing with unlink, then we should return an
-		 * error immediately so we don't remove it from the reclaim
-		 * list and potentially leak the inode.
-		 */
-		if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
-			error = ENOENT;
-			goto out_error;
-		}
+	/*
+	 * If lookup is racing with unlink return an error immediately.
+	 */
+	if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
+		error = ENOENT;
+		goto out_error;
+	}
 
+	/*
+	 * If IRECLAIMABLE is set, we've torn down the VFS inode already.
+	 * Need to carefully get it back into useable state.
+	 */
+	if (ip->i_flags & XFS_IRECLAIMABLE) {
 		xfs_itrace_exit_tag(ip, "xfs_iget.alloc");
 
 		/*
-		 * We need to re-initialise the VFS inode as it has been
-		 * 'freed' by the VFS. Do this here so we can deal with
-		 * errors cleanly, then tag it so it can be set up correctly
-		 * later.
+		 * We need to set XFS_INEW atomically with clearing the
+		 * reclaimable tag so that we do have an indicator of the
+		 * inode still being initialized.
 		 */
-		if (inode_init_always(mp->m_super, VFS_I(ip))) {
-			error = ENOMEM;
-			goto out_error;
-		}
+		ip->i_flags |= XFS_INEW;
+		ip->i_flags &= ~XFS_IRECLAIMABLE;
+		__xfs_inode_clear_reclaim_tag(mp, pag, ip);
 
-		/*
-		 * We must set the XFS_INEW flag before clearing the
-		 * XFS_IRECLAIMABLE flag so that if a racing lookup does
-		 * not find the XFS_IRECLAIMABLE above but has the igrab()
-		 * below succeed we can safely check XFS_INEW to detect
-		 * that this inode is still being initialised.
-		 */
-		xfs_iflags_set(ip, XFS_INEW);
-		xfs_iflags_clear(ip, XFS_IRECLAIMABLE);
+		spin_unlock(&ip->i_flags_lock);
+		read_unlock(&pag->pag_ici_lock);
 
-		/* clear the radix tree reclaim flag as well. */
-		__xfs_inode_clear_reclaim_tag(mp, pag, ip);
-	} else if (!igrab(VFS_I(ip))) {
+		error = -inode_init_always(mp->m_super, inode);
+		if (error) {
+			/*
+			 * Re-initializing the inode failed, and we are in deep
+			 * trouble.  Try to re-add it to the reclaim list.
+			 */
+			read_lock(&pag->pag_ici_lock);
+			spin_lock(&ip->i_flags_lock);
+
+			ip->i_flags &= ~XFS_INEW;
+			ip->i_flags |= XFS_IRECLAIMABLE;
+			__xfs_inode_set_reclaim_tag(pag, ip);
+			goto out_error;
+		}
+		inode->i_state = I_LOCK|I_NEW;
+	} else {
 		/* If the VFS inode is being torn down, pause and try again. */
-		XFS_STATS_INC(xs_ig_frecycle);
-		goto out_error;
-	} else if (xfs_iflags_test(ip, XFS_INEW)) {
-		/*
-		 * We are racing with another cache hit that is
-		 * currently recycling this inode out of the XFS_IRECLAIMABLE
-		 * state. Wait for the initialisation to complete before
-		 * continuing.
-		 */
-		wait_on_inode(VFS_I(ip));
-	}
+		if (!igrab(inode)) {
+			error = EAGAIN;
+			goto out_error;
+		}
 
-	if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
-		error = ENOENT;
-		iput(VFS_I(ip));
-		goto out_error;
+		/* We've got a live one. */
+		spin_unlock(&ip->i_flags_lock);
+		read_unlock(&pag->pag_ici_lock);
 	}
 
-	/* We've got a live one. */
-	read_unlock(&pag->pag_ici_lock);
-
 	if (lock_flags != 0)
 		xfs_ilock(ip, lock_flags);
 
@@ -274,6 +276,7 @@ xfs_iget_cache_hit(
 	return 0;
 
 out_error:
+	spin_unlock(&ip->i_flags_lock);
 	read_unlock(&pag->pag_ici_lock);
 	return error;
 }
-- 
cgit v1.2.3-70-g09d2


From 8633ecfaba4ce53c094f8db92dbd7ac474a7aa36 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Fri, 31 Jul 2009 11:07:29 +0100
Subject: GFS2: Add online uevent to GFS2

We already have an offline uevent (used when a withdraw occurs)
but no online uevent. This adds an online uevent so that userspace
will be able to detect a successful mount by means other than
not receiving a remove event after the add & recovery (change)
uevents.

It has also been added to the remount path as well - we can't use
a change uevent there as older GFS2 userspace acts on change uevents
according to the state that it thinks the fs is in, so we can't
easily add any new ones.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/ops_fstype.c | 13 ++++++++++++-
 fs/gfs2/super.c      |  1 +
 fs/gfs2/super.h      |  4 ++--
 3 files changed, 15 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 7bc3c45cd676..39021c020176 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -1133,6 +1133,17 @@ void gfs2_lm_unmount(struct gfs2_sbd *sdp)
 		lm->lm_unmount(sdp);
 }
 
+void gfs2_online_uevent(struct gfs2_sbd *sdp)
+{
+	struct super_block *sb = sdp->sd_vfs;
+	char ro[20];
+	char spectator[20];
+	char *envp[] = { ro, spectator, NULL };
+	sprintf(ro, "RDONLY=%d", (sb->s_flags & MS_RDONLY) ? 1 : 0);
+	sprintf(spectator, "SPECTATOR=%d", sdp->sd_args.ar_spectator ? 1 : 0);
+	kobject_uevent_env(&sdp->sd_kobj, KOBJ_ONLINE, envp);
+}
+
 /**
  * fill_super - Read in superblock
  * @sb: The VFS superblock
@@ -1236,7 +1247,7 @@ static int fill_super(struct super_block *sb, void *data, int silent)
 	}
 
 	gfs2_glock_dq_uninit(&mount_gh);
-
+	gfs2_online_uevent(sdp);
 	return 0;
 
 fail_threads:
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index f522bb017973..85bd2bc8c1de 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1084,6 +1084,7 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
 	gt->gt_log_flush_secs = args.ar_commit;
 	spin_unlock(&gt->gt_spin);
 
+	gfs2_online_uevent(sdp);
 	return 0;
 }
 
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index 22e0417ed996..911c954cefbd 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -25,7 +25,7 @@ static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp)
 	return x;
 }
 
-void gfs2_jindex_free(struct gfs2_sbd *sdp);
+extern void gfs2_jindex_free(struct gfs2_sbd *sdp);
 
 extern int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *data);
 
@@ -36,7 +36,7 @@ extern int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename,
 				     struct gfs2_inode **ipp);
 
 extern int gfs2_make_fs_rw(struct gfs2_sbd *sdp);
-
+extern void gfs2_online_uevent(struct gfs2_sbd *sdp);
 extern int gfs2_statfs_init(struct gfs2_sbd *sdp);
 extern void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free,
 			       s64 dinodes);
-- 
cgit v1.2.3-70-g09d2


From 440d6da207e9eef2b576331cfedd5ecae9068663 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Fri, 31 Jul 2009 12:16:25 +0100
Subject: GFS2: Add some more info to uevents

With each uevent, we now always include the journal ID. We
can't call it JID since that is already in use by some of
the individual events relating to recovery, so we use
JOURNALID instead. We don't send the JOURNALID for spectator
mounts, since there isn't one.

Also the ADD event now has both RDONLY and SPECTATOR information
to match that of the ONLINE event.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/sys.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index a7cbfbd340c7..dda4bf1e9eb4 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -519,7 +519,14 @@ static struct attribute_group lock_module_group = {
 
 int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
 {
+	struct super_block *sb = sdp->sd_vfs;
 	int error;
+	char ro[20];
+	char spectator[20];
+	char *envp[] = { ro, spectator, NULL };
+
+	sprintf(ro, "RDONLY=%d", (sb->s_flags & MS_RDONLY) ? 1 : 0);
+	sprintf(spectator, "SPECTATOR=%d", sdp->sd_args.ar_spectator ? 1 : 0);
 
 	sdp->sd_kobj.kset = gfs2_kset;
 	error = kobject_init_and_add(&sdp->sd_kobj, &gfs2_ktype, NULL,
@@ -535,7 +542,7 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
 	if (error)
 		goto fail_tune;
 
-	kobject_uevent(&sdp->sd_kobj, KOBJ_ADD);
+	kobject_uevent_env(&sdp->sd_kobj, KOBJ_ADD, envp);
 	return 0;
 
 fail_tune:
@@ -554,7 +561,6 @@ void gfs2_sys_fs_del(struct gfs2_sbd *sdp)
 	kobject_put(&sdp->sd_kobj);
 }
 
-
 static int gfs2_uevent(struct kset *kset, struct kobject *kobj,
 		       struct kobj_uevent_env *env)
 {
@@ -563,6 +569,8 @@ static int gfs2_uevent(struct kset *kset, struct kobject *kobj,
 
 	add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name);
 	add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name);
+	if (!sdp->sd_args.ar_spectator)
+		add_uevent_var(env, "JOURNALID=%u", sdp->sd_lockstruct.ls_jid);
 	if (gfs2_uuid_valid(uuid)) {
 		add_uevent_var(env, "UUID=%02X%02X%02X%02X-%02X%02X-%02X%02X-"
 			       "%02X%02X-%02X%02X%02X%02X%02X%02X",
@@ -578,7 +586,6 @@ static struct kset_uevent_ops gfs2_uevent_ops = {
 	.uevent = gfs2_uevent,
 };
 
-
 int gfs2_sys_init(void)
 {
 	gfs2_kset = kset_create_and_add("gfs2", &gfs2_uevent_ops, fs_kobj);
-- 
cgit v1.2.3-70-g09d2


From 6050b9c74f24513191fc6b5e4b3583e38d146bf5 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Fri, 31 Jul 2009 16:19:40 +0100
Subject: GFS2: Improve error handling in inode allocation

A little while back, block allocation was given some improved
error handling which meant that -EIO was returned in the case
of there being a problem in the resource group data. In addition
a message is printed explaning what went wrong and how to fix it.
This extends that error handling so that it also covers inode
allocation too.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/inode.c |  2 +-
 fs/gfs2/rgrp.c  | 36 +++++++++++++++++++++++++-----------
 fs/gfs2/rgrp.h  |  2 +-
 3 files changed, 27 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 2f94bd723698..f9b4fe886540 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -731,7 +731,7 @@ static int alloc_dinode(struct gfs2_inode *dip, u64 *no_addr, u64 *generation)
 	if (error)
 		goto out_ipreserv;
 
-	*no_addr = gfs2_alloc_di(dip, generation);
+	error = gfs2_alloc_di(dip, no_addr, generation);
 
 	gfs2_trans_end(sdp);
 
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index fba795798d3a..3d0193af19c3 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1459,6 +1459,16 @@ int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl)
 	return 0;
 }
 
+static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd)
+{
+	struct gfs2_sbd *sdp = rgd->rd_sbd;
+	fs_warn(sdp, "rgrp %llu has an error, marking it readonly until umount\n",
+	        (unsigned long long)rgd->rd_addr);
+	fs_warn(sdp, "umount on all nodes and run fsck.gfs2 to fix the error\n");
+	gfs2_rgrp_dump(NULL, rgd->rd_gl);
+	rgd->rd_flags |= GFS2_RDF_ERROR;
+}
+
 /**
  * gfs2_alloc_block - Allocate one or more blocks
  * @ip: the inode to allocate the block for
@@ -1520,22 +1530,20 @@ int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n)
 	return 0;
 
 rgrp_error:
-	fs_warn(sdp, "rgrp %llu has an error, marking it readonly until umount\n",
-	        (unsigned long long)rgd->rd_addr);
-	fs_warn(sdp, "umount on all nodes and run fsck.gfs2 to fix the error\n");
-	gfs2_rgrp_dump(NULL, rgd->rd_gl);
-	rgd->rd_flags |= GFS2_RDF_ERROR;
+	gfs2_rgrp_error(rgd);
 	return -EIO;
 }
 
 /**
  * gfs2_alloc_di - Allocate a dinode
  * @dip: the directory that the inode is going in
+ * @bn: the block number which is allocated
+ * @generation: the generation number of the inode
  *
- * Returns: the block allocated
+ * Returns: 0 on success or error
  */
 
-u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation)
+int gfs2_alloc_di(struct gfs2_inode *dip, u64 *bn, u64 *generation)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
 	struct gfs2_alloc *al = dip->i_alloc;
@@ -1546,12 +1554,13 @@ u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation)
 
 	blk = rgblk_search(rgd, rgd->rd_last_alloc,
 			   GFS2_BLKST_FREE, GFS2_BLKST_DINODE, &n);
-	BUG_ON(blk == BFITNOENT);
 
-	rgd->rd_last_alloc = blk;
+	/* Since all blocks are reserved in advance, this shouldn't happen */
+	if (blk == BFITNOENT)
+		goto rgrp_error;
 
+	rgd->rd_last_alloc = blk;
 	block = rgd->rd_data0 + blk;
-
 	gfs2_assert_withdraw(sdp, rgd->rd_free);
 	rgd->rd_free--;
 	rgd->rd_dinodes++;
@@ -1568,7 +1577,12 @@ u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation)
 	rgd->rd_free_clone--;
 	spin_unlock(&sdp->sd_rindex_spin);
 	trace_gfs2_block_alloc(dip, block, 1, GFS2_BLKST_DINODE);
-	return block;
+	*bn = block;
+	return 0;
+
+rgrp_error:
+	gfs2_rgrp_error(rgd);
+	return -EIO;
 }
 
 /**
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 1e76ff0f3e00..a8dedd78245b 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -47,7 +47,7 @@ extern void gfs2_inplace_release(struct gfs2_inode *ip);
 extern unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block);
 
 extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n);
-extern u64 gfs2_alloc_di(struct gfs2_inode *ip, u64 *generation);
+extern int gfs2_alloc_di(struct gfs2_inode *ip, u64 *bn, u64 *generation);
 
 extern void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen);
 extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen);
-- 
cgit v1.2.3-70-g09d2


From 05164e5b37a8329bfbcf72f526302cb2dd885fbd Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Mon, 17 Aug 2009 11:06:43 +0100
Subject: GFS2: Replace assertion with proper error handling

One fewer assert, one more place we can recover gracefully
if there is an error.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/rgrp.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 3d0193af19c3..c681c54fbd60 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1561,7 +1561,9 @@ int gfs2_alloc_di(struct gfs2_inode *dip, u64 *bn, u64 *generation)
 
 	rgd->rd_last_alloc = blk;
 	block = rgd->rd_data0 + blk;
-	gfs2_assert_withdraw(sdp, rgd->rd_free);
+	if (rgd->rd_free == 0)
+		goto rgrp_error;
+
 	rgd->rd_free--;
 	rgd->rd_dinodes++;
 	*generation = rgd->rd_igeneration++;
-- 
cgit v1.2.3-70-g09d2


From 31e54b01f3f00b595aac02e887960e5dcc575844 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Thu, 13 Aug 2009 12:18:08 +0100
Subject: GFS2: Add sysfs link to device

This adds a link from the per-gfs2 sb sysfs directory to
the block device upon which the filesystem is mounted. The
link is called "device", strangely enough :-)

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/sys.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'fs')

diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index dda4bf1e9eb4..0d4f7e974e9f 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -16,6 +16,7 @@
 #include <linux/kobject.h>
 #include <asm/uaccess.h>
 #include <linux/gfs2_ondisk.h>
+#include <linux/genhd.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -542,9 +543,17 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
 	if (error)
 		goto fail_tune;
 
+	error = sysfs_create_link(&sdp->sd_kobj,
+				  &disk_to_dev(sb->s_bdev->bd_disk)->kobj,
+				  "device");
+	if (error)
+		goto fail_lock_module;
+
 	kobject_uevent_env(&sdp->sd_kobj, KOBJ_ADD, envp);
 	return 0;
 
+fail_lock_module:
+	sysfs_remove_group(&sdp->sd_kobj, &lock_module_group);
 fail_tune:
 	sysfs_remove_group(&sdp->sd_kobj, &tune_group);
 fail_reg:
@@ -556,6 +565,7 @@ fail:
 
 void gfs2_sys_fs_del(struct gfs2_sbd *sdp)
 {
+	sysfs_remove_link(&sdp->sd_kobj, "device");
 	sysfs_remove_group(&sdp->sd_kobj, &tune_group);
 	sysfs_remove_group(&sdp->sd_kobj, &lock_module_group);
 	kobject_put(&sdp->sd_kobj);
-- 
cgit v1.2.3-70-g09d2


From ada508274b8698a33cb0e5bd037db0f9dc781795 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 3 Aug 2009 18:24:21 +0200
Subject: ocfs2: Handle quota file corruption more gracefully

ocfs2_read_virt_blocks() does BUG when we try to read a block from a file
beyond its end. Since this can happen due to filesystem corruption, it
is not really an appropriate answer. Make ocfs2_read_quota_block() check
the condition and handle it by calling ocfs2_error() and returning EIO.

[ Modified to print ip_blkno in the error - Joel ]

Reported-by: Tristan Ye <tristan.ye@oracle.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/quota_global.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index bf7742d0ee3b..44f2a5e1d042 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -23,6 +23,7 @@
 #include "sysfile.h"
 #include "dlmglue.h"
 #include "uptodate.h"
+#include "super.h"
 #include "quota.h"
 
 static struct workqueue_struct *ocfs2_quota_wq = NULL;
@@ -114,6 +115,15 @@ int ocfs2_read_quota_block(struct inode *inode, u64 v_block,
 	int rc = 0;
 	struct buffer_head *tmp = *bh;
 
+	if (i_size_read(inode) >> inode->i_sb->s_blocksize_bits <= v_block) {
+		ocfs2_error(inode->i_sb,
+			    "Quota file %llu is probably corrupted! Requested "
+			    "to read block %Lu but file has size only %Lu\n",
+			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
+			    (unsigned long long)v_block,
+			    (unsigned long long)i_size_read(inode));
+		return -EIO;
+	}
 	rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, 0,
 				    ocfs2_validate_quota_block);
 	if (rc)
-- 
cgit v1.2.3-70-g09d2


From 60e2ec48665b8495360ca4a6004c5cd52beb2bc1 Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Wed, 12 Aug 2009 14:42:47 +0800
Subject: ocfs2: release the buffer head in ocfs2_do_truncate.

In ocfs2_do_truncate, we forget to release last_eb_bh which
will cause memleak. So call brelse in the end.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/alloc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index f9a3e8942669..ab513ddaeff2 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -6851,7 +6851,7 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
 	}
 	status = 0;
 bail:
-
+	brelse(last_eb_bh);
 	mlog_exit(status);
 	return status;
 }
-- 
cgit v1.2.3-70-g09d2


From eef3a116be11d35396efb2a8cc7345fd3221e294 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Sun, 16 Aug 2009 21:51:44 -0400
Subject: notify: unused event private race

inotify decides if private data it passed to get added to an event was
used by checking list_empty().  But it's possible that the event may
have been dequeued and the private event removed so it would look empty.

The fix is to use the return code from fsnotify_add_notify_event rather
than looking at the list.

Signed-off-by: Eric Paris <eparis@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/notify/inotify/inotify_fsnotify.c | 13 +++++++------
 fs/notify/inotify/inotify_user.c     |  7 +++----
 fs/notify/notification.c             |  7 +++----
 3 files changed, 13 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 47cd258fd24d..5dcbafe72d71 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -62,13 +62,14 @@ static int inotify_handle_event(struct fsnotify_group *group, struct fsnotify_ev
 	event_priv->wd = wd;
 
 	ret = fsnotify_add_notify_event(group, event, fsn_event_priv);
-	/* EEXIST is not an error */
-	if (ret == -EEXIST)
-		ret = 0;
-
-	/* did event_priv get attached? */
-	if (list_empty(&fsn_event_priv->event_list))
+	if (ret) {
 		inotify_free_event_priv(fsn_event_priv);
+		/* EEXIST says we tail matched, EOVERFLOW isn't something
+		 * to report up the stack. */
+		if ((ret == -EEXIST) ||
+		    (ret == -EOVERFLOW))
+			ret = 0;
+	}
 
 	/*
 	 * If we hold the entry until after the event is on the queue
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index f30d9bbc2e1b..c172a7a17b17 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -386,6 +386,7 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry,
 	struct fsnotify_event *ignored_event;
 	struct inotify_event_private_data *event_priv;
 	struct fsnotify_event_private_data *fsn_event_priv;
+	int ret;
 
 	ignored_event = fsnotify_create_event(NULL, FS_IN_IGNORED, NULL,
 					      FSNOTIFY_EVENT_NONE, NULL, 0,
@@ -404,10 +405,8 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry,
 	fsn_event_priv->group = group;
 	event_priv->wd = ientry->wd;
 
-	fsnotify_add_notify_event(group, ignored_event, fsn_event_priv);
-
-	/* did the private data get added? */
-	if (list_empty(&fsn_event_priv->event_list))
+	ret = fsnotify_add_notify_event(group, ignored_event, fsn_event_priv);
+	if (ret)
 		inotify_free_event_priv(fsn_event_priv);
 
 skip_send_ignore:
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index 521368574e97..74b3cf30bc6b 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -171,9 +171,7 @@ int fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_even
 	struct list_head *list = &group->notification_list;
 	struct fsnotify_event_holder *last_holder;
 	struct fsnotify_event *last_event;
-
-	/* easy to tell if priv was attached to the event */
-	INIT_LIST_HEAD(&priv->event_list);
+	int ret = 0;
 
 	/*
 	 * There is one fsnotify_event_holder embedded inside each fsnotify_event.
@@ -194,6 +192,7 @@ alloc_holder:
 
 	if (group->q_len >= group->max_events) {
 		event = &q_overflow_event;
+		ret = -EOVERFLOW;
 		/* sorry, no private data on the overflow event */
 		priv = NULL;
 	}
@@ -235,7 +234,7 @@ alloc_holder:
 	mutex_unlock(&group->notification_mutex);
 
 	wake_up(&group->notification_waitq);
-	return 0;
+	return ret;
 }
 
 /*
-- 
cgit v1.2.3-70-g09d2


From cd94c8bbef8d4b796a7ed4c551355a334604fd36 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Sun, 16 Aug 2009 21:51:49 -0400
Subject: inotify: tail drop inotify q_overflow events

In f44aebcc the tail drop logic of events with no file backing
(q_overflow and in_ignored) was reversed so IN_IGNORED events would
never be tail dropped.  This now means that Q_OVERFLOW events are NOT
tail dropped.  The fix is to not tail drop IN_IGNORED, but to tail drop
Q_OVERFLOW.

Signed-off-by: Eric Paris <eparis@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/notify/notification.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index 74b3cf30bc6b..3816d5750dd5 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -153,6 +153,10 @@ static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new
 				return true;
 			break;
 		case (FSNOTIFY_EVENT_NONE):
+			if (old->mask & FS_Q_OVERFLOW)
+				return true;
+			else if (old->mask & FS_IN_IGNORED)
+				return false;
 			return false;
 		};
 	}
-- 
cgit v1.2.3-70-g09d2


From 08e53fcb0db34baca3db84a457b6d67faabee4c6 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Sun, 16 Aug 2009 21:51:55 -0400
Subject: inotify: start watch descriptor count at 1

The inotify_add_watch man page specifies that inotify_add_watch() will
return a non-negative integer.  However, historically the inotify
watches started at 1, not at 0.

Turns out that the inotifywait program provided by the inotify-tools
package doesn't properly handle a 0 watch descriptor.  In 7e790dd5 we
changed from starting at 1 to starting at 0.  This patch starts at 1,
just like in previous kernels, but also just like in previous kernels
it's possible for it to wrap back to 0.  This preserves the kernel
functionality exactly like it was before the patch (neither method broke
the spec)

Signed-off-by: Eric Paris <eparis@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/notify/inotify/inotify_user.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index c172a7a17b17..dc32ed8323ba 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -567,7 +567,7 @@ static struct fsnotify_group *inotify_new_group(struct user_struct *user, unsign
 
 	spin_lock_init(&group->inotify_data.idr_lock);
 	idr_init(&group->inotify_data.idr);
-	group->inotify_data.last_wd = 0;
+	group->inotify_data.last_wd = 1;
 	group->inotify_data.user = user;
 	group->inotify_data.fa = NULL;
 
-- 
cgit v1.2.3-70-g09d2


From 50fb6d2bd7062708892ae7147f30c3ee905b7a3d Mon Sep 17 00:00:00 2001
From: Abhishek Kulkarni <adkulkar@umail.iu.edu>
Date: Sun, 19 Jul 2009 13:41:57 -0600
Subject: 9p: Check for error in return value of v9fs_fid_add

Check if v9fs_fid_add was successful or not based on its
return value.

Signed-off-by: Abhishek Kulkarni <adkulkar@umail.iu.edu>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/vfs_inode.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 81f8bbf12f9f..1fa5f15eaddc 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -470,7 +470,10 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
 		dentry->d_op = &v9fs_dentry_operations;
 
 	d_instantiate(dentry, inode);
-	v9fs_fid_add(dentry, fid);
+	err = v9fs_fid_add(dentry, fid);
+	if (err < 0)
+		goto error;
+
 	return ofid;
 
 error:
-- 
cgit v1.2.3-70-g09d2


From 2bb541157fe2602af7b9952096d0524f6f9c1e73 Mon Sep 17 00:00:00 2001
From: Abhishek Kulkarni <adkulkar@umail.iu.edu>
Date: Sun, 19 Jul 2009 13:41:56 -0600
Subject: 9p: Fix possible inode leak in v9fs_get_inode.

Add a missing iput when cleaning up if v9fs_get_inode
fails after returning a valid inode.

Signed-off-by: Abhishek Kulkarni <adkulkar@umail.iu.edu>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/vfs_inode.c | 105 +++++++++++++++++++++++++++++-------------------------
 1 file changed, 56 insertions(+), 49 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 1fa5f15eaddc..0c8af1abf603 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -207,65 +207,72 @@ v9fs_blank_wstat(struct p9_wstat *wstat)
 
 struct inode *v9fs_get_inode(struct super_block *sb, int mode)
 {
+	int err;
 	struct inode *inode;
 	struct v9fs_session_info *v9ses = sb->s_fs_info;
 
 	P9_DPRINTK(P9_DEBUG_VFS, "super block: %p mode: %o\n", sb, mode);
 
 	inode = new_inode(sb);
-	if (inode) {
-		inode->i_mode = mode;
-		inode->i_uid = current_fsuid();
-		inode->i_gid = current_fsgid();
-		inode->i_blocks = 0;
-		inode->i_rdev = 0;
-		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
-		inode->i_mapping->a_ops = &v9fs_addr_operations;
-
-		switch (mode & S_IFMT) {
-		case S_IFIFO:
-		case S_IFBLK:
-		case S_IFCHR:
-		case S_IFSOCK:
-			if (!v9fs_extended(v9ses)) {
-				P9_DPRINTK(P9_DEBUG_ERROR,
-				      "special files without extended mode\n");
-				return ERR_PTR(-EINVAL);
-			}
-			init_special_inode(inode, inode->i_mode,
-					   inode->i_rdev);
-			break;
-		case S_IFREG:
-			inode->i_op = &v9fs_file_inode_operations;
-			inode->i_fop = &v9fs_file_operations;
-			break;
-		case S_IFLNK:
-			if (!v9fs_extended(v9ses)) {
-				P9_DPRINTK(P9_DEBUG_ERROR,
-					"extended modes used w/o 9P2000.u\n");
-				return ERR_PTR(-EINVAL);
-			}
-			inode->i_op = &v9fs_symlink_inode_operations;
-			break;
-		case S_IFDIR:
-			inc_nlink(inode);
-			if (v9fs_extended(v9ses))
-				inode->i_op = &v9fs_dir_inode_operations_ext;
-			else
-				inode->i_op = &v9fs_dir_inode_operations;
-			inode->i_fop = &v9fs_dir_operations;
-			break;
-		default:
+	if (!inode) {
+		P9_EPRINTK(KERN_WARNING, "Problem allocating inode\n");
+		return -ENOMEM;
+	}
+
+	inode->i_mode = mode;
+	inode->i_uid = current_fsuid();
+	inode->i_gid = current_fsgid();
+	inode->i_blocks = 0;
+	inode->i_rdev = 0;
+	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	inode->i_mapping->a_ops = &v9fs_addr_operations;
+
+	switch (mode & S_IFMT) {
+	case S_IFIFO:
+	case S_IFBLK:
+	case S_IFCHR:
+	case S_IFSOCK:
+		if (!v9fs_extended(v9ses)) {
 			P9_DPRINTK(P9_DEBUG_ERROR,
-				"BAD mode 0x%x S_IFMT 0x%x\n",
-				mode, mode & S_IFMT);
-			return ERR_PTR(-EINVAL);
+				   "special files without extended mode\n");
+			err = -EINVAL;
+			goto error;
 		}
-	} else {
-		P9_EPRINTK(KERN_WARNING, "Problem allocating inode\n");
-		return ERR_PTR(-ENOMEM);
+		init_special_inode(inode, inode->i_mode, inode->i_rdev);
+		break;
+	case S_IFREG:
+		inode->i_op = &v9fs_file_inode_operations;
+		inode->i_fop = &v9fs_file_operations;
+		break;
+	case S_IFLNK:
+		if (!v9fs_extended(v9ses)) {
+			P9_DPRINTK(P9_DEBUG_ERROR,
+				   "extended modes used w/o 9P2000.u\n");
+			err = -EINVAL;
+			goto error;
+		}
+		inode->i_op = &v9fs_symlink_inode_operations;
+		break;
+	case S_IFDIR:
+		inc_nlink(inode);
+		if (v9fs_extended(v9ses))
+			inode->i_op = &v9fs_dir_inode_operations_ext;
+		else
+			inode->i_op = &v9fs_dir_inode_operations;
+		inode->i_fop = &v9fs_dir_operations;
+		break;
+	default:
+		P9_DPRINTK(P9_DEBUG_ERROR, "BAD mode 0x%x S_IFMT 0x%x\n",
+			   mode, mode & S_IFMT);
+		err = -EINVAL;
+		goto error;
 	}
+
 	return inode;
+
+error:
+	iput(inode);
+	return ERR_PTR(err);
 }
 
 /*
-- 
cgit v1.2.3-70-g09d2


From 0e15597ebfe00e28857185f46aba00f400480ffe Mon Sep 17 00:00:00 2001
From: Abhishek Kulkarni <adkulkar@umail.iu.edu>
Date: Sun, 19 Jul 2009 13:41:55 -0600
Subject: 9p: minor comment fixes

Fix the comments -- mostly the improper and/or missing descriptions
of function parameters.

Signed-off-by: Abhishek Kulkarni <adkulkar@umail.iu.edu>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/vfs_inode.c     |  3 +--
 net/9p/client.c       | 14 +++++++-------
 net/9p/trans_fd.c     |  8 ++++----
 net/9p/trans_rdma.c   |  9 +++++----
 net/9p/trans_virtio.c | 11 ++++-------
 5 files changed, 21 insertions(+), 24 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 0c8af1abf603..f22668afd0d6 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -171,7 +171,6 @@ int v9fs_uflags2omode(int uflags, int extended)
 
 /**
  * v9fs_blank_wstat - helper function to setup a 9P stat structure
- * @v9ses: 9P session info (for determining extended mode)
  * @wstat: structure to initialize
  *
  */
@@ -410,9 +409,9 @@ v9fs_open_created(struct inode *inode, struct file *file)
  * @v9ses: session information
  * @dir: directory that dentry is being created in
  * @dentry:  dentry that is being created
+ * @extension: 9p2000.u extension string to support devices, etc.
  * @perm: create permissions
  * @mode: open mode
- * @extension: 9p2000.u extension string to support devices, etc.
  *
  */
 static struct p9_fid *
diff --git a/net/9p/client.c b/net/9p/client.c
index 787ccddb85ea..7bbd2d5ae8d3 100644
--- a/net/9p/client.c
+++ b/net/9p/client.c
@@ -60,9 +60,9 @@ static struct p9_req_t *
 p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...);
 
 /**
- * v9fs_parse_options - parse mount options into session structure
- * @options: options string passed from mount
- * @v9ses: existing v9fs session information
+ * parse_options - parse mount options into client structure
+ * @opts: options string passed from mount
+ * @clnt: existing v9fs client information
  *
  * Return 0 upon success, -ERRNO upon failure
  */
@@ -232,7 +232,7 @@ EXPORT_SYMBOL(p9_tag_lookup);
 
 /**
  * p9_tag_init - setup tags structure and contents
- * @tags: tags structure from the client struct
+ * @c:  v9fs client struct
  *
  * This initializes the tags structure for each client instance.
  *
@@ -258,7 +258,7 @@ error:
 
 /**
  * p9_tag_cleanup - cleans up tags structure and reclaims resources
- * @tags: tags structure from the client struct
+ * @c:  v9fs client struct
  *
  * This frees resources associated with the tags structure
  *
@@ -430,8 +430,8 @@ static int p9_check_errors(struct p9_client *c, struct p9_req_t *req)
 
 /**
  * p9_client_flush - flush (cancel) a request
- * c: client state
- * req: request to cancel
+ * @c: client state
+ * @oldreq: request to cancel
  *
  * This sents a flush for a particular requests and links
  * the flush request to the original request.  The current
diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c
index 8c2588e4edc0..8d934dd7fd54 100644
--- a/net/9p/trans_fd.c
+++ b/net/9p/trans_fd.c
@@ -119,8 +119,8 @@ struct p9_poll_wait {
  * @wpos: write position for current frame
  * @wsize: amount of data to write for current frame
  * @wbuf: current write buffer
+ * @poll_pending_link: pending links to be polled per conn
  * @poll_wait: array of wait_q's for various worker threads
- * @poll_waddr: ????
  * @pt: poll state
  * @rq: current read work
  * @wq: current write work
@@ -700,9 +700,9 @@ static int p9_fd_cancel(struct p9_client *client, struct p9_req_t *req)
 }
 
 /**
- * parse_options - parse mount options into session structure
- * @options: options string passed from mount
- * @opts: transport-specific structure to parse options into
+ * parse_opts - parse mount options into p9_fd_opts structure
+ * @params: options string passed from mount
+ * @opts: fd transport-specific structure to parse options into
  *
  * Returns 0 upon success, -ERRNO upon failure
  */
diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c
index ac4990041ebb..65cb29db03f8 100644
--- a/net/9p/trans_rdma.c
+++ b/net/9p/trans_rdma.c
@@ -67,14 +67,15 @@
  * @pd: Protection Domain pointer
  * @qp: Queue Pair pointer
  * @cq: Completion Queue pointer
+ * @dm_mr: DMA Memory Region pointer
  * @lkey: The local access only memory region key
  * @timeout: Number of uSecs to wait for connection management events
  * @sq_depth: The depth of the Send Queue
  * @sq_sem: Semaphore for the SQ
  * @rq_depth: The depth of the Receive Queue.
+ * @rq_count: Count of requests in the Receive Queue.
  * @addr: The remote peer's address
  * @req_lock: Protects the active request list
- * @send_wait: Wait list when the SQ fills up
  * @cm_done: Completion event for connection management tracking
  */
 struct p9_trans_rdma {
@@ -154,9 +155,9 @@ static match_table_t tokens = {
 };
 
 /**
- * parse_options - parse mount options into session structure
- * @options: options string passed from mount
- * @opts: transport-specific structure to parse options into
+ * parse_opts - parse mount options into rdma options structure
+ * @params: options string passed from mount
+ * @opts: rdma transport-specific structure to parse options into
  *
  * Returns 0 upon success, -ERRNO upon failure
  */
diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c
index a49484e67e1d..9bf0b737aa51 100644
--- a/net/9p/trans_virtio.c
+++ b/net/9p/trans_virtio.c
@@ -57,11 +57,9 @@ static int chan_index;
  * @initialized: whether the channel is initialized
  * @inuse: whether the channel is in use
  * @lock: protects multiple elements within this structure
+ * @client: client instance
  * @vdev: virtio dev associated with this channel
  * @vq: virtio queue associated with this channel
- * @tagpool: accounting for tag ids (and request slots)
- * @reqs: array of request slots
- * @max_tag: current number of request_slots allocated
  * @sg: scatter gather list which is used to pack a request (protected?)
  *
  * We keep all per-channel information in a structure.
@@ -92,7 +90,7 @@ static unsigned int rest_of_page(void *data)
 
 /**
  * p9_virtio_close - reclaim resources of a channel
- * @trans: transport state
+ * @client: client instance
  *
  * This reclaims a channel by freeing its resources and
  * reseting its inuse flag.
@@ -181,9 +179,8 @@ static int p9_virtio_cancel(struct p9_client *client, struct p9_req_t *req)
 
 /**
  * p9_virtio_request - issue a request
- * @t: transport state
- * @tc: &p9_fcall request to transmit
- * @rc: &p9_fcall to put reponse into
+ * @client: client instance issuing the request
+ * @req: request to be issued
  *
  */
 
-- 
cgit v1.2.3-70-g09d2


From 02bc35672b2fdf251e264adca5407792f63191e4 Mon Sep 17 00:00:00 2001
From: Abhishek Kulkarni <adkulkar@umail.iu.edu>
Date: Sun, 19 Jul 2009 13:41:54 -0600
Subject: 9p: Fix possible memleak in v9fs_inode_from fid.

Add missing p9stat_free in v9fs_inode_from_fid to avoid
any possible leaks.

Signed-off-by: Abhishek Kulkarni <adkulkar@umail.iu.edu>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/vfs_inode.c | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index f22668afd0d6..fac30d21851f 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -344,30 +344,25 @@ v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
 
 	ret = NULL;
 	st = p9_client_stat(fid);
-	if (IS_ERR(st)) {
-		err = PTR_ERR(st);
-		st = NULL;
-		goto error;
-	}
+	if (IS_ERR(st))
+		return ERR_CAST(st);
 
 	umode = p9mode2unixmode(v9ses, st->mode);
 	ret = v9fs_get_inode(sb, umode);
 	if (IS_ERR(ret)) {
 		err = PTR_ERR(ret);
-		ret = NULL;
 		goto error;
 	}
 
 	v9fs_stat2inode(st, ret, sb);
 	ret->i_ino = v9fs_qid2ino(&st->qid);
+	p9stat_free(st);
 	kfree(st);
 	return ret;
 
 error:
+	p9stat_free(st);
 	kfree(st);
-	if (ret)
-		iput(ret);
-
 	return ERR_PTR(err);
 }
 
-- 
cgit v1.2.3-70-g09d2


From 4f4038328da5eb9cc237b51d3fe68138fd3fea14 Mon Sep 17 00:00:00 2001
From: Abhishek Kulkarni <adkulkar@umail.iu.edu>
Date: Sun, 19 Jul 2009 13:41:53 -0600
Subject: 9p: Fix v9fs show_options

Add the delimiter ',' before the options when they are passed
and check if no option parameters are passed to prevent displaying
NULL in /proc/mounts.

Signed-off-by: Abhishek Kulkarni <adkulkar@umail.iu.edu>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/vfs_super.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 38d695d66a0b..a9d7d08cfbe8 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -220,8 +220,8 @@ static void v9fs_kill_super(struct super_block *s)
 static int v9fs_show_options(struct seq_file *m, struct vfsmount *mnt)
 {
 	struct v9fs_session_info *v9ses = mnt->mnt_sb->s_fs_info;
-
-	seq_printf(m, "%s", v9ses->options);
+	if (v9ses->options != NULL)
+		seq_printf(m, ",%s", v9ses->options);
 	return 0;
 }
 
-- 
cgit v1.2.3-70-g09d2


From 1b5ab3e86712b6be38ebbe0d821387c1d8f91d7c Mon Sep 17 00:00:00 2001
From: Abhishek Kulkarni <adkulkar@umail.iu.edu>
Date: Sun, 19 Jul 2009 13:41:52 -0600
Subject: 9p: Fix possible regressions when ->get_sb fails.

->get_sb can fail causing some badness. this patch fixes
   * clear sb->fs_s_info in kill_sb.
   * deactivate_locked_super() calls kill_sb (v9fs_kill_super) which closes the
     destroys the client, clunks all its fids and closes the v9fs session.
     Attempting to do it twice will cause an oops.

Signed-off-by: Abhishek Kulkarni <adkulkar@umail.iu.edu>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/vfs_super.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index a9d7d08cfbe8..2495af4ad9a6 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -120,7 +120,6 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
 
 	P9_DPRINTK(P9_DEBUG_VFS, " \n");
 
-	st = NULL;
 	v9ses = kzalloc(sizeof(struct v9fs_session_info), GFP_KERNEL);
 	if (!v9ses)
 		return -ENOMEM;
@@ -173,10 +172,8 @@ P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n");
 	simple_set_mnt(mnt, sb);
 	return 0;
 
-release_sb:
-	deactivate_locked_super(sb);
-
 free_stat:
+	p9stat_free(st);
 	kfree(st);
 
 clunk_fid:
@@ -185,7 +182,12 @@ clunk_fid:
 close_session:
 	v9fs_session_close(v9ses);
 	kfree(v9ses);
+	return retval;
 
+release_sb:
+	p9stat_free(st);
+	kfree(st);
+	deactivate_locked_super(sb);
 	return retval;
 }
 
@@ -207,6 +209,7 @@ static void v9fs_kill_super(struct super_block *s)
 
 	v9fs_session_close(v9ses);
 	kfree(v9ses);
+	s->s_fs_info = NULL;
 	P9_DPRINTK(P9_DEBUG_VFS, "exiting kill_super\n");
 }
 
-- 
cgit v1.2.3-70-g09d2


From 4d3297ca5bf37ff5956f76fb352e009880aad62d Mon Sep 17 00:00:00 2001
From: Abhishek Kulkarni <adkulkar@umail.iu.edu>
Date: Sun, 19 Jul 2009 13:41:51 -0600
Subject: 9p: Remove redundant inode uid/gid assignment

Remove a redundant update of inode's i_uid and i_gid
after v9fs_get_inode() since the latter already sets up
a new inode and sets the proper uid and gid values.

Signed-off-by: Abhishek Kulkarni <adkulkar@umail.iu.edu>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/vfs_super.c | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 2495af4ad9a6..072dce094477 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -113,8 +113,6 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
 	struct v9fs_session_info *v9ses = NULL;
 	struct p9_wstat *st = NULL;
 	int mode = S_IRWXUGO | S_ISVTX;
-	uid_t uid = current_fsuid();
-	gid_t gid = current_fsgid();
 	struct p9_fid *fid;
 	int retval = 0;
 
@@ -149,9 +147,6 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
 		goto release_sb;
 	}
 
-	inode->i_uid = uid;
-	inode->i_gid = gid;
-
 	root = d_alloc_root(inode);
 	if (!root) {
 		iput(inode);
-- 
cgit v1.2.3-70-g09d2


From 5fd131893793567c361ae64cbeb28a2a753bbe35 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 30 Jul 2009 17:01:53 +0200
Subject: ocfs2: Don't oops in ocfs2_kill_sb on a failed mount

If we fail to mount the filesystem, we have to be careful not to dereference
uninitialized structures in ocfs2_kill_sb.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/super.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index b0ee0fdf799a..a3f8871d21fd 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1218,13 +1218,17 @@ static void ocfs2_kill_sb(struct super_block *sb)
 {
 	struct ocfs2_super *osb = OCFS2_SB(sb);
 
+	/* Failed mount? */
+	if (!osb || atomic_read(&osb->vol_state) == VOLUME_DISABLED)
+		goto out;
+
 	/* Prevent further queueing of inode drop events */
 	spin_lock(&dentry_list_lock);
 	ocfs2_set_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED);
 	spin_unlock(&dentry_list_lock);
 	/* Wait for work to finish and/or remove it */
 	cancel_work_sync(&osb->dentry_lock_work);
-
+out:
 	kill_block_super(sb);
 }
 
-- 
cgit v1.2.3-70-g09d2


From 48559b4c30708ebdc849483da9fb83ee08c6c908 Mon Sep 17 00:00:00 2001
From: Abhishek Kulkarni <adkulkar@umail.iu.edu>
Date: Mon, 17 Aug 2009 16:32:18 -0500
Subject: 9p: Add missing cast for the error return value in v9fs_get_inode

Cast the error return value (ENOMEM) in v9fs_get_inode() to its
correct type using ERR_PTR.

Signed-off-by: Abhishek Kulkarni <adkulkar@umail.iu.edu>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/vfs_inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index fac30d21851f..06a223d50a81 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -215,7 +215,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
 	inode = new_inode(sb);
 	if (!inode) {
 		P9_EPRINTK(KERN_WARNING, "Problem allocating inode\n");
-		return -ENOMEM;
+		return ERR_PTR(-ENOMEM);
 	}
 
 	inode->i_mode = mode;
-- 
cgit v1.2.3-70-g09d2


From 4b53e4b500779230aedd5355940aeaaed0b5353b Mon Sep 17 00:00:00 2001
From: Abhishek Kulkarni <adkulkar@umail.iu.edu>
Date: Mon, 17 Aug 2009 16:42:28 -0500
Subject: 9p: remove unnecessary v9fses->options which duplicates the mount
 string

The mount options string is saved in sb->s_options. This patch removes
the redundant duplicating of the mount options. Also, since we are not
displaying anything special in show options, we replace v9fs_show_options
with generic_show_options for now.

Signed-off-by: Abhishek Kulkarni <adkulkar@umail.iu.edu>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/v9fs.c      | 21 +++++----------------
 fs/9p/v9fs.h      |  1 -
 fs/9p/vfs_super.c | 23 +++++------------------
 3 files changed, 10 insertions(+), 35 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 332b5ff02fec..f7003cfac63d 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -76,7 +76,7 @@ static const match_table_t tokens = {
  * Return 0 upon success, -ERRNO upon failure.
  */
 
-static int v9fs_parse_options(struct v9fs_session_info *v9ses)
+static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
 {
 	char *options;
 	substring_t args[MAX_OPT_ARGS];
@@ -90,10 +90,10 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses)
 	v9ses->debug = 0;
 	v9ses->cache = 0;
 
-	if (!v9ses->options)
+	if (!opts)
 		return 0;
 
-	options = kstrdup(v9ses->options, GFP_KERNEL);
+	options = kstrdup(opts, GFP_KERNEL);
 	if (!options) {
 		P9_DPRINTK(P9_DEBUG_ERROR,
 			   "failed to allocate copy of option string\n");
@@ -206,24 +206,14 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 	v9ses->uid = ~0;
 	v9ses->dfltuid = V9FS_DEFUID;
 	v9ses->dfltgid = V9FS_DEFGID;
-	if (data) {
-		v9ses->options = kstrdup(data, GFP_KERNEL);
-		if (!v9ses->options) {
-			P9_DPRINTK(P9_DEBUG_ERROR,
-			   "failed to allocate copy of option string\n");
-			retval = -ENOMEM;
-			goto error;
-		}
-	}
 
-	rc = v9fs_parse_options(v9ses);
+	rc = v9fs_parse_options(v9ses, data);
 	if (rc < 0) {
 		retval = rc;
 		goto error;
 	}
 
-	v9ses->clnt = p9_client_create(dev_name, v9ses->options);
-
+	v9ses->clnt = p9_client_create(dev_name, data);
 	if (IS_ERR(v9ses->clnt)) {
 		retval = PTR_ERR(v9ses->clnt);
 		v9ses->clnt = NULL;
@@ -280,7 +270,6 @@ void v9fs_session_close(struct v9fs_session_info *v9ses)
 
 	__putname(v9ses->uname);
 	__putname(v9ses->aname);
-	kfree(v9ses->options);
 }
 
 /**
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index a7d567192998..38762bf102a9 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -85,7 +85,6 @@ struct v9fs_session_info {
 	unsigned int afid;
 	unsigned int cache;
 
-	char *options;		/* copy of mount options */
 	char *uname;		/* user name to mount as */
 	char *aname;		/* name of remote hierarchy being mounted */
 	unsigned int maxdata;	/* max data for client interface */
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 072dce094477..8961f1a8f668 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -81,7 +81,7 @@ static int v9fs_set_super(struct super_block *s, void *data)
 
 static void
 v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
-		int flags)
+		int flags, void *data)
 {
 	sb->s_maxbytes = MAX_LFS_FILESIZE;
 	sb->s_blocksize_bits = fls(v9ses->maxdata - 1);
@@ -91,6 +91,8 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
 
 	sb->s_flags = flags | MS_ACTIVE | MS_SYNCHRONOUS | MS_DIRSYNC |
 	    MS_NOATIME;
+
+	save_mount_options(sb, data);
 }
 
 /**
@@ -139,7 +141,7 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
 		retval = PTR_ERR(sb);
 		goto free_stat;
 	}
-	v9fs_fill_super(sb, v9ses, flags);
+	v9fs_fill_super(sb, v9ses, flags, data);
 
 	inode = v9fs_get_inode(sb, S_IFDIR | mode);
 	if (IS_ERR(inode)) {
@@ -208,21 +210,6 @@ static void v9fs_kill_super(struct super_block *s)
 	P9_DPRINTK(P9_DEBUG_VFS, "exiting kill_super\n");
 }
 
-/**
- * v9fs_show_options - Show mount options in /proc/mounts
- * @m: seq_file to write to
- * @mnt: mount descriptor
- *
- */
-
-static int v9fs_show_options(struct seq_file *m, struct vfsmount *mnt)
-{
-	struct v9fs_session_info *v9ses = mnt->mnt_sb->s_fs_info;
-	if (v9ses->options != NULL)
-		seq_printf(m, ",%s", v9ses->options);
-	return 0;
-}
-
 static void
 v9fs_umount_begin(struct super_block *sb)
 {
@@ -235,7 +222,7 @@ v9fs_umount_begin(struct super_block *sb)
 static const struct super_operations v9fs_super_ops = {
 	.statfs = simple_statfs,
 	.clear_inode = v9fs_clear_inode,
-	.show_options = v9fs_show_options,
+	.show_options = generic_show_options,
 	.umount_begin = v9fs_umount_begin,
 };
 
-- 
cgit v1.2.3-70-g09d2


From 50797481a7bdee548589506d7d7b48b08bc14dcd Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Fri, 18 Sep 2009 13:34:02 -0400
Subject: ext4: Avoid group preallocation for closed files

Currently the group preallocation code tries to find a large (512)
free block from which to do per-cpu group allocation for small files.
The problem with this scheme is that it leaves the filesystem horribly
fragmented.  In the worst case, if the filesystem is unmounted and
remounted (after a system shutdown, for example) we forget the fact
that wee were using a particular (now-partially filled) 512 block
extent.  So the next time we try to allocate space for a small file,
we will find *another* completely free 512 block chunk to allocate
small files.  Given that there are 32,768 blocks in a block group,
after 64 iterations of "mount, write one 4k file in a directory,
unmount", the block group will have 64 files, each separated by 511
blocks, and the block group will no longer have any free 512
completely free chunks of blocks for group preallocation space.

So if we try to allocate blocks for a file that has been closed, such
that we know the final size of the file, and the filesystem is not
busy, avoid using group preallocation.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h    | 30 +++++++++++++++++++++++++++++-
 fs/ext4/mballoc.c | 10 +++++++++-
 2 files changed, 38 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 70aa951ecb3c..2e9a2036c114 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -952,6 +952,7 @@ struct ext4_sb_info {
 	atomic_t s_mb_lost_chunks;
 	atomic_t s_mb_preallocated;
 	atomic_t s_mb_discarded;
+	atomic_t s_lock_busy;
 
 	/* locality groups */
 	struct ext4_locality_group *s_locality_groups;
@@ -1593,15 +1594,42 @@ struct ext4_group_info {
 #define EXT4_MB_GRP_NEED_INIT(grp)	\
 	(test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
 
+#define EXT4_MAX_CONTENTION		8
+#define EXT4_CONTENTION_THRESHOLD	2
+
 static inline spinlock_t *ext4_group_lock_ptr(struct super_block *sb,
 					      ext4_group_t group)
 {
 	return bgl_lock_ptr(EXT4_SB(sb)->s_blockgroup_lock, group);
 }
 
+/*
+ * Returns true if the filesystem is busy enough that attempts to
+ * access the block group locks has run into contention.
+ */
+static inline int ext4_fs_is_busy(struct ext4_sb_info *sbi)
+{
+	return (atomic_read(&sbi->s_lock_busy) > EXT4_CONTENTION_THRESHOLD);
+}
+
 static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
 {
-	spin_lock(ext4_group_lock_ptr(sb, group));
+	spinlock_t *lock = ext4_group_lock_ptr(sb, group);
+	if (spin_trylock(lock))
+		/*
+		 * We're able to grab the lock right away, so drop the
+		 * lock contention counter.
+		 */
+		atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, -1, 0);
+	else {
+		/*
+		 * The lock is busy, so bump the contention counter,
+		 * and then wait on the spin lock.
+		 */
+		atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, 1,
+				  EXT4_MAX_CONTENTION);
+		spin_lock(lock);
+	}
 }
 
 static inline void ext4_unlock_group(struct super_block *sb,
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 90a30ce822fc..faf5bd056a93 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4191,9 +4191,17 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
 		return;
 
 	size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
-	isize = i_size_read(ac->ac_inode) >> bsbits;
+	isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
+		>> bsbits;
 	size = max(size, isize);
 
+	if ((size == isize) &&
+	    !ext4_fs_is_busy(sbi) &&
+	    (atomic_read(&ac->ac_inode->i_writecount) == 0)) {
+		ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC;
+		return;
+	}
+
 	/* don't use group allocation for large files */
 	if (size >= sbi->s_mb_stream_request) {
 		ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
-- 
cgit v1.2.3-70-g09d2


From 84fe3bef59dc45a1cb0d2f9b0aefa8f1fbfbdf98 Mon Sep 17 00:00:00 2001
From: Mingming <cmm@us.ibm.com>
Date: Tue, 1 Sep 2009 08:44:37 -0400
Subject: ext4: Compile warning fix when EXT_DEBUG enabled
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When EXT_DEBUG is enabled I received the following compile warning on
PPC64:

  CC [M]  fs/ext4/inode.o
  CC [M]  fs/ext4/extents.o
fs/ext4/extents.c: In function ‘ext4_ext_rm_leaf’:
fs/ext4/extents.c:2097: warning: format ‘%lu’ expects type ‘long unsigned int’, but argument 2 has type ‘ext4_lblk_t’
fs/ext4/extents.c: In function ‘ext4_ext_get_blocks’:
fs/ext4/extents.c:2789: warning: format ‘%u’ expects type ‘unsigned int’, but argument 4 has type ‘long unsigned int’
fs/ext4/extents.c:2852: warning: format ‘%lu’ expects type ‘long unsigned int’, but argument 3 has type ‘ext4_lblk_t’
fs/ext4/extents.c:2953: warning: format ‘%lu’ expects type ‘long unsigned int’, but argument 4 has type ‘unsigned int’
  CC [M]  fs/ext4/migrate.o

The patch fixes compile warning.

Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

Index: linux-2.6.31-rc4/fs/ext4/extents.c
===================================================================
---
 fs/ext4/extents.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 73ebfb44ad75..c583ebd1e41f 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2094,7 +2094,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 		else
 			uninitialized = 0;
 
-		ext_debug("remove ext %lu:%u\n", ex_ee_block, ex_ee_len);
+		ext_debug("remove ext %u:%d\n", ex_ee_block, ex_ee_len);
 		path[depth].p_ext = ex;
 
 		a = ex_ee_block > start ? ex_ee_block : start;
@@ -2786,7 +2786,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 	struct ext4_allocation_request ar;
 
 	__clear_bit(BH_New, &bh_result->b_state);
-	ext_debug("blocks %u/%u requested for inode %u\n",
+	ext_debug("blocks %u/%u requested for inode %lu\n",
 			iblock, max_blocks, inode->i_ino);
 
 	/* check in cache */
@@ -2849,7 +2849,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 			newblock = iblock - ee_block + ee_start;
 			/* number of remaining blocks in the extent */
 			allocated = ee_len - (iblock - ee_block);
-			ext_debug("%u fit into %lu:%d -> %llu\n", iblock,
+			ext_debug("%u fit into %u:%d -> %llu\n", iblock,
 					ee_block, ee_len, newblock);
 
 			/* Do not put uninitialized extent in the cache */
@@ -2950,7 +2950,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 	newblock = ext4_mb_new_blocks(handle, &ar, &err);
 	if (!newblock)
 		goto out2;
-	ext_debug("allocate new block: goal %llu, found %llu/%lu\n",
+	ext_debug("allocate new block: goal %llu, found %llu/%u\n",
 		  ar.goal, newblock, allocated);
 
 	/* try to insert new extent into found leaf and return */
-- 
cgit v1.2.3-70-g09d2


From 553f9008939638335836eec834f4dea310c17eae Mon Sep 17 00:00:00 2001
From: Mingming <cmm@us.ibm.com>
Date: Fri, 18 Sep 2009 13:34:55 -0400
Subject: ext4: Show unwritten extent flag in ext4_ext_show_leaf()

ext4_ext_show_leaf() will display the leaf extents when extent
debugging is enabled.

Printing out the unwritten bit is useful for debugging unwritten
extent, allow us to see the unwritten extents vs written extents,
after the unwritten extents are splitted or converted.

Signed-off-by: Mingming Cao <cmm@us.ibm.com>
---
 fs/ext4/ext4_extents.h |  3 +--
 fs/ext4/extents.c      | 31 ++++++++++++++++++++++---------
 2 files changed, 23 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 20a84105a10b..9538633879d7 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -43,8 +43,7 @@
 #define CHECK_BINSEARCH__
 
 /*
- * If EXT_DEBUG is defined you can use the 'extdebug' mount option
- * to get lots of info about what's going on.
+ * Turn on EXT_DEBUG to get lots of info about extents operations.
  */
 #define EXT_DEBUG__
 #ifdef EXT_DEBUG
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index c583ebd1e41f..13db43408533 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -437,8 +437,9 @@ static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
 		  ext_debug("  %d->%llu", le32_to_cpu(path->p_idx->ei_block),
 			    idx_pblock(path->p_idx));
 		} else if (path->p_ext) {
-			ext_debug("  %d:%d:%llu ",
+			ext_debug("  %d:[%d]%d:%llu ",
 				  le32_to_cpu(path->p_ext->ee_block),
+				  ext4_ext_is_uninitialized(path->p_ext),
 				  ext4_ext_get_actual_len(path->p_ext),
 				  ext_pblock(path->p_ext));
 		} else
@@ -460,8 +461,11 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
 	eh = path[depth].p_hdr;
 	ex = EXT_FIRST_EXTENT(eh);
 
+	ext_debug("Displaying leaf extents for inode %lu\n", inode->i_ino);
+
 	for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
-		ext_debug("%d:%d:%llu ", le32_to_cpu(ex->ee_block),
+		ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block),
+			  ext4_ext_is_uninitialized(ex),
 			  ext4_ext_get_actual_len(ex), ext_pblock(ex));
 	}
 	ext_debug("\n");
@@ -580,9 +584,10 @@ ext4_ext_binsearch(struct inode *inode,
 	}
 
 	path->p_ext = l - 1;
-	ext_debug("  -> %d:%llu:%d ",
+	ext_debug("  -> %d:%llu:[%d]%d ",
 			le32_to_cpu(path->p_ext->ee_block),
 			ext_pblock(path->p_ext),
+			ext4_ext_is_uninitialized(path->p_ext),
 			ext4_ext_get_actual_len(path->p_ext));
 
 #ifdef CHECK_BINSEARCH
@@ -850,9 +855,10 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 	path[depth].p_ext++;
 	while (path[depth].p_ext <=
 			EXT_MAX_EXTENT(path[depth].p_hdr)) {
-		ext_debug("move %d:%llu:%d in new leaf %llu\n",
+		ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
 				le32_to_cpu(path[depth].p_ext->ee_block),
 				ext_pblock(path[depth].p_ext),
+				ext4_ext_is_uninitialized(path[depth].p_ext),
 				ext4_ext_get_actual_len(path[depth].p_ext),
 				newblock);
 		/*memmove(ex++, path[depth].p_ext++,
@@ -1580,9 +1586,11 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
 
 	/* try to insert block into found extent and return */
 	if (ex && ext4_can_extents_be_merged(inode, ex, newext)) {
-		ext_debug("append %d block to %d:%d (from %llu)\n",
+		ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n",
+				ext4_ext_is_uninitialized(newext),
 				ext4_ext_get_actual_len(newext),
 				le32_to_cpu(ex->ee_block),
+				ext4_ext_is_uninitialized(ex),
 				ext4_ext_get_actual_len(ex), ext_pblock(ex));
 		err = ext4_ext_get_access(handle, inode, path + depth);
 		if (err)
@@ -1651,9 +1659,10 @@ has_space:
 
 	if (!nearex) {
 		/* there is no extent in this leaf, create first one */
-		ext_debug("first extent in the leaf: %d:%llu:%d\n",
+		ext_debug("first extent in the leaf: %d:%llu:[%d]%d\n",
 				le32_to_cpu(newext->ee_block),
 				ext_pblock(newext),
+				ext4_ext_is_uninitialized(newext),
 				ext4_ext_get_actual_len(newext));
 		path[depth].p_ext = EXT_FIRST_EXTENT(eh);
 	} else if (le32_to_cpu(newext->ee_block)
@@ -1663,10 +1672,11 @@ has_space:
 			len = EXT_MAX_EXTENT(eh) - nearex;
 			len = (len - 1) * sizeof(struct ext4_extent);
 			len = len < 0 ? 0 : len;
-			ext_debug("insert %d:%llu:%d after: nearest 0x%p, "
+			ext_debug("insert %d:%llu:[%d]%d after: nearest 0x%p, "
 					"move %d from 0x%p to 0x%p\n",
 					le32_to_cpu(newext->ee_block),
 					ext_pblock(newext),
+					ext4_ext_is_uninitialized(newext),
 					ext4_ext_get_actual_len(newext),
 					nearex, len, nearex + 1, nearex + 2);
 			memmove(nearex + 2, nearex + 1, len);
@@ -1676,10 +1686,11 @@ has_space:
 		BUG_ON(newext->ee_block == nearex->ee_block);
 		len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext4_extent);
 		len = len < 0 ? 0 : len;
-		ext_debug("insert %d:%llu:%d before: nearest 0x%p, "
+		ext_debug("insert %d:%llu:[%d]%d before: nearest 0x%p, "
 				"move %d from 0x%p to 0x%p\n",
 				le32_to_cpu(newext->ee_block),
 				ext_pblock(newext),
+				ext4_ext_is_uninitialized(newext),
 				ext4_ext_get_actual_len(newext),
 				nearex, len, nearex + 1, nearex + 2);
 		memmove(nearex + 1, nearex, len);
@@ -2094,7 +2105,8 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 		else
 			uninitialized = 0;
 
-		ext_debug("remove ext %u:%d\n", ex_ee_block, ex_ee_len);
+		ext_debug("remove ext %u:[%d]%d\n", ex_ee_block,
+			 uninitialized, ex_ee_len);
 		path[depth].p_ext = ex;
 
 		a = ex_ee_block > start ? ex_ee_block : start;
@@ -2743,6 +2755,7 @@ insert:
 	} else if (err)
 		goto fix_extent_len;
 out:
+	ext4_ext_show_leaf(inode, path);
 	return err ? err : allocated;
 
 fix_extent_len:
-- 
cgit v1.2.3-70-g09d2


From 9599b0e597d810be9b8f759ea6e9619c4f983c5e Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 17 Aug 2009 21:23:17 -0400
Subject: jbd2: Annotate transaction start also for jbd2_journal_restart()

lockdep annotation for a transaction start has been at the end of
jbd2_journal_start(). But a transaction is also started from
jbd2_journal_restart(). Move the lockdep annotation to start_this_handle()
which covers both cases.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/jbd2/transaction.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 4516a87ca55c..a0512700542f 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -238,6 +238,8 @@ repeat_locked:
 		  __jbd2_log_space_left(journal));
 	spin_unlock(&transaction->t_handle_lock);
 	spin_unlock(&journal->j_state_lock);
+
+	lock_map_acquire(&handle->h_lockdep_map);
 out:
 	if (unlikely(new_transaction))		/* It's usually NULL */
 		kfree(new_transaction);
@@ -303,8 +305,6 @@ handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
 		handle = ERR_PTR(err);
 		goto out;
 	}
-
-	lock_map_acquire(&handle->h_lockdep_map);
 out:
 	return handle;
 }
@@ -426,6 +426,7 @@ int jbd2_journal_restart(handle_t *handle, int nblocks)
 	__jbd2_log_start_commit(journal, transaction->t_tid);
 	spin_unlock(&journal->j_state_lock);
 
+	lock_map_release(&handle->h_lockdep_map);
 	handle->h_buffer_credits = nblocks;
 	ret = start_this_handle(journal, handle);
 	return ret;
-- 
cgit v1.2.3-70-g09d2


From 487caeef9fc08c0565e082c40a8aaf58dad92bbb Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 17 Aug 2009 22:17:20 -0400
Subject: ext4: Fix possible deadlock between ext4_truncate() and
 ext4_get_blocks()

During truncate we are sometimes forced to start a new transaction as
the amount of blocks to be journaled is both quite large and hard to
predict. So far we restarted a transaction while holding i_data_sem
and that violates lock ordering because i_data_sem ranks below a
transaction start (and it can lead to a real deadlock with
ext4_get_blocks() mapping blocks in some page while having a
transaction open).

We fix the problem by dropping the i_data_sem before restarting the
transaction and acquire it afterwards. It's slightly subtle that this
works:

1) By the time ext4_truncate() is called, all the page cache for the
truncated part of the file is dropped so get_block() should not be
called on it (we only have to invalidate extent cache after we
reacquire i_data_sem because some extent from not-truncated part could
extend also into the part we are going to truncate).

2) Writes, migrate or defrag hold i_mutex so they are stopped for all
the time of the truncate.

This bug has been found and analyzed by Theodore Tso <tytso@mit.edu>.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h    |  1 +
 fs/ext4/extents.c | 15 ++++++++++++---
 fs/ext4/inode.c   | 23 +++++++++++++++++++----
 3 files changed, 32 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 2e9a2036c114..fb21663ffe54 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1370,6 +1370,7 @@ extern int ext4_change_inode_journal_flag(struct inode *, int);
 extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
 extern int ext4_can_truncate(struct inode *inode);
 extern void ext4_truncate(struct inode *);
+extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
 extern void ext4_set_inode_flags(struct inode *);
 extern void ext4_get_inode_flags(struct ext4_inode_info *);
 extern int ext4_alloc_da_blocks(struct inode *inode);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 13db43408533..8c20caf4aa5c 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -93,7 +93,9 @@ static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb)
 	ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
 }
 
-static int ext4_ext_journal_restart(handle_t *handle, int needed)
+static int ext4_ext_truncate_extend_restart(handle_t *handle,
+					    struct inode *inode,
+					    int needed)
 {
 	int err;
 
@@ -104,7 +106,14 @@ static int ext4_ext_journal_restart(handle_t *handle, int needed)
 	err = ext4_journal_extend(handle, needed);
 	if (err <= 0)
 		return err;
-	return ext4_journal_restart(handle, needed);
+	err = ext4_truncate_restart_trans(handle, inode, needed);
+	/*
+	 * We have dropped i_data_sem so someone might have cached again
+	 * an extent we are going to truncate.
+	 */
+	ext4_ext_invalidate_cache(inode);
+
+	return err;
 }
 
 /*
@@ -2150,7 +2159,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 		}
 		credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
 
-		err = ext4_ext_journal_restart(handle, credits);
+		err = ext4_ext_truncate_extend_restart(handle, inode, credits);
 		if (err)
 			goto out;
 
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 9a4c929b16dc..d61fb523308f 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -192,11 +192,24 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
  * so before we call here everything must be consistently dirtied against
  * this transaction.
  */
-static int ext4_journal_test_restart(handle_t *handle, struct inode *inode)
+ int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
+				 int nblocks)
 {
+	int ret;
+
+	/*
+	 * Drop i_data_sem to avoid deadlock with ext4_get_blocks At this
+	 * moment, get_block can be called only for blocks inside i_size since
+	 * page cache has been already dropped and writes are blocked by
+	 * i_mutex. So we can safely drop the i_data_sem here.
+	 */
 	BUG_ON(EXT4_JOURNAL(inode) == NULL);
 	jbd_debug(2, "restarting handle %p\n", handle);
-	return ext4_journal_restart(handle, blocks_for_truncate(inode));
+	up_write(&EXT4_I(inode)->i_data_sem);
+	ret = ext4_journal_restart(handle, blocks_for_truncate(inode));
+	down_write(&EXT4_I(inode)->i_data_sem);
+
+	return ret;
 }
 
 /*
@@ -3658,7 +3671,8 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
 			ext4_handle_dirty_metadata(handle, inode, bh);
 		}
 		ext4_mark_inode_dirty(handle, inode);
-		ext4_journal_test_restart(handle, inode);
+		ext4_truncate_restart_trans(handle, inode,
+					    blocks_for_truncate(inode));
 		if (bh) {
 			BUFFER_TRACE(bh, "retaking write access");
 			ext4_journal_get_write_access(handle, bh);
@@ -3869,7 +3883,8 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
 				return;
 			if (try_to_extend_transaction(handle, inode)) {
 				ext4_mark_inode_dirty(handle, inode);
-				ext4_journal_test_restart(handle, inode);
+				ext4_truncate_restart_trans(handle, inode,
+					    blocks_for_truncate(inode));
 			}
 
 			ext4_free_blocks(handle, inode, nr, 1, 1);
-- 
cgit v1.2.3-70-g09d2


From bf43d84b185e2ff54598f8c58a5a8e63148b6e90 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Mon, 17 Aug 2009 23:48:51 -0400
Subject: ext4: reject too-large filesystems on 32-bit kernels

ext4 will happily mount a > 16T filesystem on a 32-bit box, but
this is not safe; writes to the block device will wrap past 16T
and the page cache can't index past 16T (232 index * 4k pages).

Adding another test to the existing "too many sectors" test
should do the trick.

Add a comment, a relevant return value, and fix the reference
to the CONFIG_LBD(AF) option as well.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/super.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index fe3f376b7df2..de67c3657b83 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2550,12 +2550,19 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		goto failed_mount;
 	}
 
-	if (ext4_blocks_count(es) >
-		    (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
+	/*
+	 * Test whether we have more sectors than will fit in sector_t,
+	 * and whether the max offset is addressable by the page cache.
+	 */
+	if ((ext4_blocks_count(es) >
+	     (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) ||
+	    (ext4_blocks_count(es) >
+	     (pgoff_t)(~0ULL) >> (PAGE_CACHE_SHIFT - sb->s_blocksize_bits))) {
 		ext4_msg(sb, KERN_ERR, "filesystem"
-			" too large to mount safely");
+			 " too large to mount safely on this system");
 		if (sizeof(sector_t) < 8)
 			ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled");
+		ret = -EFBIG;
 		goto failed_mount;
 	}
 
-- 
cgit v1.2.3-70-g09d2


From 0373130d5bc783751c1fbad948886916a21d4559 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Mon, 17 Aug 2009 23:51:29 -0400
Subject: ext4: open-code ext4_mb_update_group_info

ext4_mb_update_group_info is only called in one place, and it's
extremely simple.  There's no reason to have it in a separate function
in a separate file as far as I can tell, it just obfuscates what's
really going on.

Perhaps it was intended to keep the grp->bb_* manipulation local to
mballoc.c but we're already accessing other grp-> fields in balloc.c
directly so this seems ok.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/balloc.c  | 2 +-
 fs/ext4/ext4.h    | 2 --
 fs/ext4/mballoc.c | 9 ---------
 3 files changed, 1 insertion(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index e2126d70dff5..1d0418980f8d 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -478,7 +478,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
 	 * new bitmap information
 	 */
 	set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
-	ext4_mb_update_group_info(grp, blocks_freed);
+	grp->bb_free += blocks_freed;
 	up_write(&grp->alloc_sem);
 
 	/* We dirtied the bitmap block */
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index fb21663ffe54..02b22885eb02 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1343,8 +1343,6 @@ extern void ext4_mb_free_blocks(handle_t *, struct inode *,
 		ext4_fsblk_t, unsigned long, int, unsigned long *);
 extern int ext4_mb_add_groupinfo(struct super_block *sb,
 		ext4_group_t i, struct ext4_group_desc *desc);
-extern void ext4_mb_update_group_info(struct ext4_group_info *grp,
-		ext4_grpblk_t add);
 extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t);
 extern void ext4_mb_put_buddy_cache_lock(struct super_block *,
 						ext4_group_t, int);
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index faf5bd056a93..833d87236881 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2554,15 +2554,6 @@ exit_meta_group_info:
 	return -ENOMEM;
 } /* ext4_mb_add_groupinfo */
 
-/*
- * Update an existing group.
- * This function is used for online resize
- */
-void ext4_mb_update_group_info(struct ext4_group_info *grp, ext4_grpblk_t add)
-{
-	grp->bb_free += add;
-}
-
 static int ext4_mb_init_backend(struct super_block *sb)
 {
 	ext4_group_t ngroups = ext4_get_groups_count(sb);
-- 
cgit v1.2.3-70-g09d2


From 38877f4e8dbbec12c6fde85ee1fce1dc27ef3290 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Mon, 17 Aug 2009 23:55:24 -0400
Subject: simplify some logic in ext4_mb_normalize_request

While reading through some of the mballoc code it seems that a couple
spots in the size normalization function could be streamlined.

The test for non-overlapping PAs can be or'd for the start & end
conditions, and the tests for adjacent PAs can be else-if'd -
it's essentially independently testing:

	if (A + B <= C)
		...
	if (A > C)
		...

These cannot both be true so it seems like the else-if might
be slightly more efficient and/or informative.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/mballoc.c | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 833d87236881..1a9cd7de04dd 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -3184,23 +3184,18 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
 		BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end ||
 			ac->ac_o_ex.fe_logical < pa->pa_lstart));
 
-		/* skip PA normalized request doesn't overlap with */
-		if (pa->pa_lstart >= end) {
-			spin_unlock(&pa->pa_lock);
-			continue;
-		}
-		if (pa_end <= start) {
+		/* skip PAs this normalized request doesn't overlap with */
+		if (pa->pa_lstart >= end || pa_end <= start) {
 			spin_unlock(&pa->pa_lock);
 			continue;
 		}
 		BUG_ON(pa->pa_lstart <= start && pa_end >= end);
 
+		/* adjust start or end to be adjacent to this pa */
 		if (pa_end <= ac->ac_o_ex.fe_logical) {
 			BUG_ON(pa_end < start);
 			start = pa_end;
-		}
-
-		if (pa->pa_lstart > ac->ac_o_ex.fe_logical) {
+		} else if (pa->pa_lstart > ac->ac_o_ex.fe_logical) {
 			BUG_ON(pa->pa_lstart > end);
 			end = pa->pa_lstart;
 		}
-- 
cgit v1.2.3-70-g09d2


From a13fb1a4533f26c1e2b0204d5283b696689645af Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Tue, 18 Aug 2009 00:20:23 -0400
Subject: ext4: Add feature set check helper for mount & remount paths

A user reported that although his root ext4 filesystem was mounting
fine, other filesystems would not mount, with the:

"Filesystem with huge files cannot be mounted RDWR without CONFIG_LBDAF"

error on his 32-bit box built without CONFIG_LBDAF.  This is because
the test at mount time for this situation was not being re-checked
on remount, and the normal boot process makes an ro->rw transition,
so this was being missed.

Refactor to make a common helper function to test the filesystem
features against the type of mount request (RO vs. RW) so that we
stay consistent.

Addresses Red-Hat-Bugzilla: #517650

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/super.c | 91 +++++++++++++++++++++++++++++++--------------------------
 1 file changed, 49 insertions(+), 42 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index de67c3657b83..4037fe0b5a5c 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2254,6 +2254,49 @@ static struct kobj_type ext4_ktype = {
 	.release	= ext4_sb_release,
 };
 
+/*
+ * Check whether this filesystem can be mounted based on
+ * the features present and the RDONLY/RDWR mount requested.
+ * Returns 1 if this filesystem can be mounted as requested,
+ * 0 if it cannot be.
+ */
+static int ext4_feature_set_ok(struct super_block *sb, int readonly)
+{
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP)) {
+		ext4_msg(sb, KERN_ERR,
+			"Couldn't mount because of "
+			"unsupported optional features (%x)",
+			(le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) &
+			~EXT4_FEATURE_INCOMPAT_SUPP));
+		return 0;
+	}
+
+	if (readonly)
+		return 1;
+
+	/* Check that feature set is OK for a read-write mount */
+	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP)) {
+		ext4_msg(sb, KERN_ERR, "couldn't mount RDWR because of "
+			 "unsupported optional features (%x)",
+			 (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) &
+				~EXT4_FEATURE_RO_COMPAT_SUPP));
+		return 0;
+	}
+	/*
+	 * Large file size enabled file system can only be mounted
+	 * read-write on 32-bit systems if kernel is built with CONFIG_LBDAF
+	 */
+	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) {
+		if (sizeof(blkcnt_t) < sizeof(u64)) {
+			ext4_msg(sb, KERN_ERR, "Filesystem with huge files "
+				 "cannot be mounted RDWR without "
+				 "CONFIG_LBDAF");
+			return 0;
+		}
+	}
+	return 1;
+}
+
 static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 				__releases(kernel_lock)
 				__acquires(kernel_lock)
@@ -2275,7 +2318,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	unsigned int db_count;
 	unsigned int i;
 	int needs_recovery, has_huge_files;
-	int features;
 	__u64 blocks_count;
 	int err;
 	unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
@@ -2402,39 +2444,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	 * previously didn't change the revision level when setting the flags,
 	 * so there is a chance incompat flags are set on a rev 0 filesystem.
 	 */
-	features = EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP);
-	if (features) {
-		ext4_msg(sb, KERN_ERR,
-			"Couldn't mount because of "
-			"unsupported optional features (%x)",
-			(le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) &
-			~EXT4_FEATURE_INCOMPAT_SUPP));
-		goto failed_mount;
-	}
-	features = EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP);
-	if (!(sb->s_flags & MS_RDONLY) && features) {
-		ext4_msg(sb, KERN_ERR,
-			"Couldn't mount RDWR because of "
-			"unsupported optional features (%x)",
-			(le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) &
-			~EXT4_FEATURE_RO_COMPAT_SUPP));
+	if (!ext4_feature_set_ok(sb, (sb->s_flags & MS_RDONLY)))
 		goto failed_mount;
-	}
-	has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb,
-				    EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
-	if (has_huge_files) {
-		/*
-		 * Large file size enabled file system can only be
-		 * mount if kernel is build with CONFIG_LBDAF
-		 */
-		if (sizeof(root->i_blocks) < sizeof(u64) &&
-				!(sb->s_flags & MS_RDONLY)) {
-			ext4_msg(sb, KERN_ERR, "Filesystem with huge "
-					"files cannot be mounted read-write "
-					"without CONFIG_LBDAF");
-			goto failed_mount;
-		}
-	}
+
 	blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
 
 	if (blocksize < EXT4_MIN_BLOCK_SIZE ||
@@ -2470,6 +2482,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		}
 	}
 
+	has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb,
+				EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
 	sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits,
 						      has_huge_files);
 	sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files);
@@ -3485,18 +3499,11 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 			if (sbi->s_journal)
 				ext4_mark_recovery_complete(sb, es);
 		} else {
-			int ret;
-			if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb,
-					~EXT4_FEATURE_RO_COMPAT_SUPP))) {
-				ext4_msg(sb, KERN_WARNING, "couldn't "
-				       "remount RDWR because of unsupported "
-				       "optional features (%x)",
-				(le32_to_cpu(sbi->s_es->s_feature_ro_compat) &
-					~EXT4_FEATURE_RO_COMPAT_SUPP));
+			/* Make sure we can mount this feature set readwrite */
+			if (!ext4_feature_set_ok(sb, 0)) {
 				err = -EROFS;
 				goto restore_opts;
 			}
-
 			/*
 			 * Make sure the group descriptor checksums
 			 * are sane.  If they aren't, refuse to remount r/w.
-- 
cgit v1.2.3-70-g09d2


From 1154ecbd2f8298ef75609f5f8ed5aca96be599fb Mon Sep 17 00:00:00 2001
From: Zhang Qiang <zhangqiang.buaa@gmail.com>
Date: Tue, 18 Aug 2009 14:58:24 +0800
Subject: nilfs2: missing a read lock for segment writer in
 nilfs_attach_checkpoint()

'ns_cno' of structure 'the_nilfs' must be protected from segment
writer, in other words, the caller of nilfs_get_checkpoint should hold
read lock for nilfs->ns_segctor_sem.  This patch adds the lock/unlock
operations in nilfs_attach_checkpoint() when calling
nilfs_cpfile_get_checkpoint().

Signed-off-by: Zhang Qiang <zhangqiang.buaa@gmail.com>
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/super.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 8e2ec43b18f4..151964f0de4c 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -416,8 +416,10 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno)
 	if (unlikely(err))
 		goto failed;
 
+	down_read(&nilfs->ns_segctor_sem);
 	err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, cno, 0, &raw_cp,
 					  &bh_cp);
+	up_read(&nilfs->ns_segctor_sem);
 	if (unlikely(err)) {
 		if (err == -ENOENT || err == -EINVAL) {
 			printk(KERN_ERR
-- 
cgit v1.2.3-70-g09d2


From 970343cd49048446da4189986cf960273588d71a Mon Sep 17 00:00:00 2001
From: Wengang Wang <wen.gang.wang@oracle.com>
Date: Tue, 18 Aug 2009 14:25:03 +0800
Subject: GFS2: free disk inode which is deleted by remote node -V2

this patch is for the same problem that Benjamin Marzinski fixes at commit
b94a170e96dc416828af9d350ae2e34b70ae7347

quotation of the original problem:

---cut here---
When a file is deleted from a gfs2 filesystem on one node, a dcache
entry for it may still exist on other nodes in the cluster. If this
happens, gfs2 will be unable to free this file on disk. Because of this,
it's possible to have a gfs2 filesystem with no files on it and no free
space. With this patch, when a node receives a callback notifying it
that the file is being deleted on another node, it schedules a new
workqueue thread to remove the file's dcache entry.
---end cut---

after applying Benjamin's patch, I think there is still a case in which the disk
inode remains even when "no space" is hit. the case is that when running
d_prune_aliases() against the inode, there are one or more dentries(aliases)
which have reference count number > 0. in this case the dentries won't be pruned.
and even later, the reference count becomes to 0, the dentries can still be
cached in memory. unfortunately, no callback come again, things come back to
the state before the callback runs. thus the on disk inode remains there until
in memoryinode is removed for some other reason(shrinking inode cache or unmount
the volume..).

this patch is to remove those dentries when their reference count becomes to 0 and
the inode is deleted by remote node. for implementation, gfs2_dentry_delete() is
added as dentry_operations.d_delete. the function returns true when the inode is
deleted by remote node. in dput(), gfs2_dentry_delete() is called and since it
returns true, the dentry is unhashed from dcache and then removed. when all dentries
are removed, the in memory inode get removed so that the on disk inode is freed.

Signed-off-by: Wengang Wang <wen.gang.wang@oracle.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/dentry.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'fs')

diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c
index 022c66cd5606..91beddadd388 100644
--- a/fs/gfs2/dentry.c
+++ b/fs/gfs2/dentry.c
@@ -107,8 +107,26 @@ static int gfs2_dhash(struct dentry *dentry, struct qstr *str)
 	return 0;
 }
 
+static int gfs2_dentry_delete(struct dentry *dentry)
+{
+	struct gfs2_inode *ginode;
+
+	if (!dentry->d_inode)
+		return 0;
+
+	ginode = GFS2_I(dentry->d_inode);
+	if (!ginode->i_iopen_gh.gh_gl)
+		return 0;
+
+	if (test_bit(GLF_DEMOTE, &ginode->i_iopen_gh.gh_gl->gl_flags))
+		return 1;
+
+	return 0;
+}
+
 const struct dentry_operations gfs2_dops = {
 	.d_revalidate = gfs2_drevalidate,
 	.d_hash = gfs2_dhash,
+	.d_delete = gfs2_dentry_delete,
 };
 
-- 
cgit v1.2.3-70-g09d2


From a924586036833086b262a371b09d1266c23bb4d1 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Wed, 19 Aug 2009 00:29:43 +0900
Subject: nilfs2: fix oopses with doubly mounted snapshots

will fix kernel oopses like the following:

 # mount -t nilfs2 -r -o cp=20 /dev/sdb1 /test1
 # mount -t nilfs2 -r -o cp=20 /dev/sdb1 /test2
 # umount /test1
 # umount /test2

BUG: sleeping function called from invalid context at arch/x86/mm/fault.c:1069
in_atomic(): 0, irqs_disabled(): 1, pid: 3886, name: umount.nilfs2
1 lock held by umount.nilfs2/3886:
 #0:  (&type->s_umount_key#31){+.+...}, at: [<c10b398a>] deactivate_super+0x52/0x6c
irq event stamp: 1219
hardirqs last  enabled at (1219): [<c135c774>] __mutex_unlock_slowpath+0xf8/0x119
hardirqs last disabled at (1218): [<c135c6d5>] __mutex_unlock_slowpath+0x59/0x119
softirqs last  enabled at (1214): [<c1033316>] __do_softirq+0x1a5/0x1ad
softirqs last disabled at (1205): [<c1033354>] do_softirq+0x36/0x5a
Pid: 3886, comm: umount.nilfs2 Not tainted 2.6.31-rc6 #55
Call Trace:
 [<c1023549>] __might_sleep+0x107/0x10e
 [<c13603c0>] do_page_fault+0x246/0x397
 [<c136017a>] ? do_page_fault+0x0/0x397
 [<c135e753>] error_code+0x6b/0x70
 [<c136017a>] ? do_page_fault+0x0/0x397
 [<c104f805>] ? __lock_acquire+0x91/0x12fd
 [<c1050a62>] ? __lock_acquire+0x12ee/0x12fd
 [<c1050a62>] ? __lock_acquire+0x12ee/0x12fd
 [<c1050b2b>] lock_acquire+0xba/0xdd
 [<d0d17d3f>] ? nilfs_detach_segment_constructor+0x2f/0x2fa [nilfs2]
 [<c135d4fe>] down_write+0x2a/0x46
 [<d0d17d3f>] ? nilfs_detach_segment_constructor+0x2f/0x2fa [nilfs2]
 [<d0d17d3f>] nilfs_detach_segment_constructor+0x2f/0x2fa [nilfs2]
 [<c104ea2c>] ? mark_held_locks+0x43/0x5b
 [<c104ecb1>] ? trace_hardirqs_on_caller+0x10b/0x133
 [<c104ece4>] ? trace_hardirqs_on+0xb/0xd
 [<d0d09ac1>] nilfs_put_super+0x2f/0xca [nilfs2]
 [<c10b3352>] generic_shutdown_super+0x49/0xb8
 [<c10b33de>] kill_block_super+0x1d/0x31
 [<c10e6599>] ? vfs_quota_off+0x0/0x12
 [<c10b398f>] deactivate_super+0x57/0x6c
 [<c10c4bc3>] mntput_no_expire+0x8c/0xb4
 [<c10c5094>] sys_umount+0x27f/0x2a4
 [<c10c50c6>] sys_oldumount+0xd/0xf
 [<c10031a4>] sysenter_do_call+0x12/0x38
 ...

This turns out to be a bug brought by an -rc1 patch ("nilfs2: simplify
remaining sget() use").

In the patch, a new "put resource" function, nilfs_put_sbinfo()
was introduced to delay freeing nilfs_sb_info struct.

But the nilfs_put_sbinfo() mistakenly used atomic_dec_and_test()
function to check the reference count, and it caused the nilfs_sb_info
was freed when user mounted a snapshot twice.

This bug also suggests there was unseen memory leak in usual mount
/umount operations for nilfs.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/the_nilfs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index e8adbffc626f..1b9caafb8662 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -253,7 +253,7 @@ nilfs_detach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
 
 static inline void nilfs_put_sbinfo(struct nilfs_sb_info *sbi)
 {
-	if (!atomic_dec_and_test(&sbi->s_count))
+	if (atomic_dec_and_test(&sbi->s_count))
 		kfree(sbi);
 }
 
-- 
cgit v1.2.3-70-g09d2


From b5711b8e5a437ca7d35321d19de568b4f76a7739 Mon Sep 17 00:00:00 2001
From: Casey Dahlin <cdahlin@redhat.com>
Date: Tue, 28 Jul 2009 12:29:05 -0500
Subject: dlm: fix double-release of socket in error exit path

The last correction to the tcp_connect_to_sock error exit path,
commit a89d63a159b1ba5833be2bef00adf8ad8caac8be, can free an already
freed socket, due to collision with a previous (incomplete) attempt
to fix the same issue, commit 311f6fc77c51926dbdfbeab0a5d88d70f01fa3f4.

Signed-off-by: Casey Dahlin <cdahlin@redhat.com>
Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/lowcomms.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 618a60f03886..210d52c48808 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -926,10 +926,8 @@ static void tcp_connect_to_sock(struct connection *con)
 		goto out_err;
 
 	memset(&saddr, 0, sizeof(saddr));
-	if (dlm_nodeid_to_addr(con->nodeid, &saddr)) {
-		sock_release(sock);
+	if (dlm_nodeid_to_addr(con->nodeid, &saddr))
 		goto out_err;
-	}
 
 	sock->sk->sk_user_data = con;
 	con->rx_action = receive_from_sock;
-- 
cgit v1.2.3-70-g09d2


From 89a4eb4b66e8f4d395e14a14d262dac4d6ca52f0 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Tue, 18 Aug 2009 14:11:08 -0700
Subject: vfs: make get_sb_pseudo set s_maxbytes to value that can be cast to
 signed

get_sb_pseudo sets s_maxbytes to ~0ULL which becomes negative when cast
to a signed value.  Fix it to use MAX_LFS_FILESIZE which casts properly
to a positive signed value.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Reviewed-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Steve French <smfrench@gmail.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Robert Love <rlove@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/libfs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/libfs.c b/fs/libfs.c
index ddfa89948c3f..dcec3d3ea64f 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -217,7 +217,7 @@ int get_sb_pseudo(struct file_system_type *fs_type, char *name,
 		return PTR_ERR(s);
 
 	s->s_flags = MS_NOUSER;
-	s->s_maxbytes = ~0ULL;
+	s->s_maxbytes = MAX_LFS_FILESIZE;
 	s->s_blocksize = PAGE_SIZE;
 	s->s_blocksize_bits = PAGE_SHIFT;
 	s->s_magic = magic;
-- 
cgit v1.2.3-70-g09d2


From 0753ba01e126020bf0f8150934903b48935b697d Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Tue, 18 Aug 2009 14:11:10 -0700
Subject: mm: revert "oom: move oom_adj value"

The commit 2ff05b2b (oom: move oom_adj value) moveed the oom_adj value to
the mm_struct.  It was a very good first step for sanitize OOM.

However Paul Menage reported the commit makes regression to his job
scheduler.  Current OOM logic can kill OOM_DISABLED process.

Why? His program has the code of similar to the following.

	...
	set_oom_adj(OOM_DISABLE); /* The job scheduler never killed by oom */
	...
	if (vfork() == 0) {
		set_oom_adj(0); /* Invoked child can be killed */
		execve("foo-bar-cmd");
	}
	....

vfork() parent and child are shared the same mm_struct.  then above
set_oom_adj(0) doesn't only change oom_adj for vfork() child, it's also
change oom_adj for vfork() parent.  Then, vfork() parent (job scheduler)
lost OOM immune and it was killed.

Actually, fork-setting-exec idiom is very frequently used in userland program.
We must not break this assumption.

Then, this patch revert commit 2ff05b2b and related commit.

Reverted commit list
---------------------
- commit 2ff05b2b4e (oom: move oom_adj value from task_struct to mm_struct)
- commit 4d8b9135c3 (oom: avoid unnecessary mm locking and scanning for OOM_DISABLE)
- commit 8123681022 (oom: only oom kill exiting tasks with attached memory)
- commit 933b787b57 (mm: copy over oom_adj value at fork time)

Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Cc: David Rientjes <rientjes@google.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/filesystems/proc.txt | 15 +++------
 fs/proc/base.c                     | 19 ++---------
 include/linux/mm_types.h           |  2 --
 include/linux/sched.h              |  1 +
 kernel/fork.c                      |  1 -
 mm/oom_kill.c                      | 64 +++++++++++++++++++++++---------------
 6 files changed, 48 insertions(+), 54 deletions(-)

(limited to 'fs')

diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index fad18f9456e4..ffead13f9443 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -1167,13 +1167,11 @@ CHAPTER 3: PER-PROCESS PARAMETERS
 3.1 /proc/<pid>/oom_adj - Adjust the oom-killer score
 ------------------------------------------------------
 
-This file can be used to adjust the score used to select which processes should
-be killed in an out-of-memory situation.  The oom_adj value is a characteristic
-of the task's mm, so all threads that share an mm with pid will have the same
-oom_adj value.  A high value will increase the likelihood of this process being
-killed by the oom-killer.  Valid values are in the range -16 to +15 as
-explained below and a special value of -17, which disables oom-killing
-altogether for threads sharing pid's mm.
+This file can be used to adjust the score used to select which processes
+should be killed in an  out-of-memory  situation.  Giving it a high score will
+increase the likelihood of this process being killed by the oom-killer.  Valid
+values are in the range -16 to +15, plus the special value -17, which disables
+oom-killing altogether for this process.
 
 The process to be killed in an out-of-memory situation is selected among all others
 based on its badness score. This value equals the original memory size of the process
@@ -1187,9 +1185,6 @@ the parent's score if they do not share the same memory. Thus forking servers
 are the prime candidates to be killed. Having only one 'hungry' child will make
 parent less preferable than the child.
 
-/proc/<pid>/oom_adj cannot be changed for kthreads since they are immune from
-oom-killing already.
-
 /proc/<pid>/oom_score shows process' current badness score.
 
 The following heuristics are then applied:
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 175db258942f..6f742f6658a9 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1003,12 +1003,7 @@ static ssize_t oom_adjust_read(struct file *file, char __user *buf,
 
 	if (!task)
 		return -ESRCH;
-	task_lock(task);
-	if (task->mm)
-		oom_adjust = task->mm->oom_adj;
-	else
-		oom_adjust = OOM_DISABLE;
-	task_unlock(task);
+	oom_adjust = task->oomkilladj;
 	put_task_struct(task);
 
 	len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust);
@@ -1037,19 +1032,11 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
 	task = get_proc_task(file->f_path.dentry->d_inode);
 	if (!task)
 		return -ESRCH;
-	task_lock(task);
-	if (!task->mm) {
-		task_unlock(task);
-		put_task_struct(task);
-		return -EINVAL;
-	}
-	if (oom_adjust < task->mm->oom_adj && !capable(CAP_SYS_RESOURCE)) {
-		task_unlock(task);
+	if (oom_adjust < task->oomkilladj && !capable(CAP_SYS_RESOURCE)) {
 		put_task_struct(task);
 		return -EACCES;
 	}
-	task->mm->oom_adj = oom_adjust;
-	task_unlock(task);
+	task->oomkilladj = oom_adjust;
 	put_task_struct(task);
 	if (end - buffer == 0)
 		return -EIO;
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 7acc8439d9b3..0042090a4d70 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -240,8 +240,6 @@ struct mm_struct {
 
 	unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */
 
-	s8 oom_adj;	/* OOM kill score adjustment (bit shift) */
-
 	cpumask_t cpu_vm_mask;
 
 	/* Architecture-specific MM context */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3ab08e4bb6b8..0f1ea4a66957 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1198,6 +1198,7 @@ struct task_struct {
 	 * a short time
 	 */
 	unsigned char fpu_counter;
+	s8 oomkilladj; /* OOM kill score adjustment (bit shift). */
 #ifdef CONFIG_BLK_DEV_IO_TRACE
 	unsigned int btrace_seq;
 #endif
diff --git a/kernel/fork.c b/kernel/fork.c
index 021e1138556e..144326b7af50 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -426,7 +426,6 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
 	init_rwsem(&mm->mmap_sem);
 	INIT_LIST_HEAD(&mm->mmlist);
 	mm->flags = (current->mm) ? current->mm->flags : default_dump_filter;
-	mm->oom_adj = (current->mm) ? current->mm->oom_adj : 0;
 	mm->core_state = NULL;
 	mm->nr_ptes = 0;
 	set_mm_counter(mm, file_rss, 0);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 175a67a78a99..a7b2460e922b 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -58,7 +58,6 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
 	unsigned long points, cpu_time, run_time;
 	struct mm_struct *mm;
 	struct task_struct *child;
-	int oom_adj;
 
 	task_lock(p);
 	mm = p->mm;
@@ -66,11 +65,6 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
 		task_unlock(p);
 		return 0;
 	}
-	oom_adj = mm->oom_adj;
-	if (oom_adj == OOM_DISABLE) {
-		task_unlock(p);
-		return 0;
-	}
 
 	/*
 	 * The memory size of the process is the basis for the badness.
@@ -154,15 +148,15 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
 		points /= 8;
 
 	/*
-	 * Adjust the score by oom_adj.
+	 * Adjust the score by oomkilladj.
 	 */
-	if (oom_adj) {
-		if (oom_adj > 0) {
+	if (p->oomkilladj) {
+		if (p->oomkilladj > 0) {
 			if (!points)
 				points = 1;
-			points <<= oom_adj;
+			points <<= p->oomkilladj;
 		} else
-			points >>= -(oom_adj);
+			points >>= -(p->oomkilladj);
 	}
 
 #ifdef DEBUG
@@ -257,8 +251,11 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
 			*ppoints = ULONG_MAX;
 		}
 
+		if (p->oomkilladj == OOM_DISABLE)
+			continue;
+
 		points = badness(p, uptime.tv_sec);
-		if (points > *ppoints) {
+		if (points > *ppoints || !chosen) {
 			chosen = p;
 			*ppoints = points;
 		}
@@ -307,7 +304,8 @@ static void dump_tasks(const struct mem_cgroup *mem)
 		}
 		printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d     %3d %s\n",
 		       p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm,
-		       get_mm_rss(mm), (int)task_cpu(p), mm->oom_adj, p->comm);
+		       get_mm_rss(mm), (int)task_cpu(p), p->oomkilladj,
+		       p->comm);
 		task_unlock(p);
 	} while_each_thread(g, p);
 }
@@ -325,8 +323,11 @@ static void __oom_kill_task(struct task_struct *p, int verbose)
 		return;
 	}
 
-	if (!p->mm)
+	if (!p->mm) {
+		WARN_ON(1);
+		printk(KERN_WARNING "tried to kill an mm-less task!\n");
 		return;
+	}
 
 	if (verbose)
 		printk(KERN_ERR "Killed process %d (%s)\n",
@@ -348,13 +349,28 @@ static int oom_kill_task(struct task_struct *p)
 	struct mm_struct *mm;
 	struct task_struct *g, *q;
 
-	task_lock(p);
 	mm = p->mm;
-	if (!mm || mm->oom_adj == OOM_DISABLE) {
-		task_unlock(p);
+
+	/* WARNING: mm may not be dereferenced since we did not obtain its
+	 * value from get_task_mm(p).  This is OK since all we need to do is
+	 * compare mm to q->mm below.
+	 *
+	 * Furthermore, even if mm contains a non-NULL value, p->mm may
+	 * change to NULL at any time since we do not hold task_lock(p).
+	 * However, this is of no concern to us.
+	 */
+
+	if (mm == NULL)
 		return 1;
-	}
-	task_unlock(p);
+
+	/*
+	 * Don't kill the process if any threads are set to OOM_DISABLE
+	 */
+	do_each_thread(g, q) {
+		if (q->mm == mm && q->oomkilladj == OOM_DISABLE)
+			return 1;
+	} while_each_thread(g, q);
+
 	__oom_kill_task(p, 1);
 
 	/*
@@ -377,11 +393,10 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
 	struct task_struct *c;
 
 	if (printk_ratelimit()) {
-		task_lock(current);
 		printk(KERN_WARNING "%s invoked oom-killer: "
-			"gfp_mask=0x%x, order=%d, oom_adj=%d\n",
-			current->comm, gfp_mask, order,
-			current->mm ? current->mm->oom_adj : OOM_DISABLE);
+			"gfp_mask=0x%x, order=%d, oomkilladj=%d\n",
+			current->comm, gfp_mask, order, current->oomkilladj);
+		task_lock(current);
 		cpuset_print_task_mems_allowed(current);
 		task_unlock(current);
 		dump_stack();
@@ -394,9 +409,8 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
 	/*
 	 * If the task is already exiting, don't alarm the sysadmin or kill
 	 * its children or threads, just set TIF_MEMDIE so it can die quickly
-	 * if its mm is still attached.
 	 */
-	if (p->mm && (p->flags & PF_EXITING)) {
+	if (p->flags & PF_EXITING) {
 		__oom_kill_task(p, 0);
 		return 0;
 	}
-- 
cgit v1.2.3-70-g09d2


From e571cbf1a4f8d8b6cfd4898df718dae84c75a8e1 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 19 Aug 2009 18:12:27 -0400
Subject: NFS: Add a dns resolver for use with NFSv4 referrals and migration

The NFSv4 and NFSv4.1 protocols both allow for the redirection of a client
from one server to another in order to support filesystem migration and
replication. For full protocol support, we need to add the ability to
convert a DNS host name into an IP address that we can feed to the RPC
client.

We'll reuse the sunrpc cache, now that it has been converted to work with
rpc_pipefs.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 Documentation/filesystems/nfs.txt   |  98 +++++++++++
 Documentation/kernel-parameters.txt |   8 +
 fs/nfs/Makefile                     |   3 +-
 fs/nfs/cache_lib.c                  | 140 +++++++++++++++
 fs/nfs/cache_lib.h                  |  27 +++
 fs/nfs/dns_resolve.c                | 335 ++++++++++++++++++++++++++++++++++++
 fs/nfs/dns_resolve.h                |  14 ++
 fs/nfs/inode.c                      |   8 +
 net/sunrpc/rpc_pipe.c               |   7 +
 9 files changed, 639 insertions(+), 1 deletion(-)
 create mode 100644 Documentation/filesystems/nfs.txt
 create mode 100644 fs/nfs/cache_lib.c
 create mode 100644 fs/nfs/cache_lib.h
 create mode 100644 fs/nfs/dns_resolve.c
 create mode 100644 fs/nfs/dns_resolve.h

(limited to 'fs')

diff --git a/Documentation/filesystems/nfs.txt b/Documentation/filesystems/nfs.txt
new file mode 100644
index 000000000000..f50f26ce6cd0
--- /dev/null
+++ b/Documentation/filesystems/nfs.txt
@@ -0,0 +1,98 @@
+
+The NFS client
+==============
+
+The NFS version 2 protocol was first documented in RFC1094 (March 1989).
+Since then two more major releases of NFS have been published, with NFSv3
+being documented in RFC1813 (June 1995), and NFSv4 in RFC3530 (April
+2003).
+
+The Linux NFS client currently supports all the above published versions,
+and work is in progress on adding support for minor version 1 of the NFSv4
+protocol.
+
+The purpose of this document is to provide information on some of the
+upcall interfaces that are used in order to provide the NFS client with
+some of the information that it requires in order to fully comply with
+the NFS spec.
+
+The DNS resolver
+================
+
+NFSv4 allows for one server to refer the NFS client to data that has been
+migrated onto another server by means of the special "fs_locations"
+attribute. See
+	http://tools.ietf.org/html/rfc3530#section-6
+and
+	http://tools.ietf.org/html/draft-ietf-nfsv4-referrals-00
+
+The fs_locations information can take the form of either an ip address and
+a path, or a DNS hostname and a path. The latter requires the NFS client to
+do a DNS lookup in order to mount the new volume, and hence the need for an
+upcall to allow userland to provide this service.
+
+Assuming that the user has the 'rpc_pipefs' filesystem mounted in the usual
+/var/lib/nfs/rpc_pipefs, the upcall consists of the following steps:
+
+   (1) The process checks the dns_resolve cache to see if it contains a
+       valid entry. If so, it returns that entry and exits.
+
+   (2) If no valid entry exists, the helper script '/sbin/nfs_cache_getent'
+       (may be changed using the 'nfs.cache_getent' kernel boot parameter)
+       is run, with two arguments:
+		- the cache name, "dns_resolve"
+		- the hostname to resolve
+
+   (3) After looking up the corresponding ip address, the helper script
+       writes the result into the rpc_pipefs pseudo-file
+       '/var/lib/nfs/rpc_pipefs/cache/dns_resolve/channel'
+       in the following (text) format:
+
+		"<ip address> <hostname> <ttl>\n"
+
+       Where <ip address> is in the usual IPv4 (123.456.78.90) or IPv6
+       (ffee:ddcc:bbaa:9988:7766:5544:3322:1100, ffee::1100, ...) format.
+       <hostname> is identical to the second argument of the helper
+       script, and <ttl> is the 'time to live' of this cache entry (in
+       units of seconds).
+
+       Note: If <ip address> is invalid, say the string "0", then a negative
+       entry is created, which will cause the kernel to treat the hostname
+       as having no valid DNS translation.
+
+
+
+
+A basic sample /sbin/nfs_cache_getent
+=====================================
+
+#!/bin/bash
+#
+ttl=600
+#
+cut=/usr/bin/cut
+getent=/usr/bin/getent
+rpc_pipefs=/var/lib/nfs/rpc_pipefs
+#
+die()
+{
+	echo "Usage: $0 cache_name entry_name"
+	exit 1
+}
+
+[ $# -lt 2 ] && die
+cachename="$1"
+cache_path=${rpc_pipefs}/cache/${cachename}/channel
+
+case "${cachename}" in
+	dns_resolve)
+		name="$2"
+		result="$(${getent} hosts ${name} | ${cut} -f1 -d\ )"
+		[ -z "${result}" ] && result="0"
+		;;
+	*)
+		die
+		;;
+esac
+echo "${result} ${name} ${ttl}" >${cache_path}
+
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index c08813dbfce2..ce8853755814 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1503,6 +1503,14 @@ and is between 256 and 4096 characters. It is defined in the file
 			[NFS] set the TCP port on which the NFSv4 callback
 			channel should listen.
 
+	nfs.cache_getent=
+			[NFS] sets the pathname to the program which is used
+			to update the NFS client cache entries.
+
+	nfs.cache_getent_timeout=
+			[NFS] sets the timeout after which an attempt to
+			update a cache entry is deemed to have failed.
+
 	nfs.idmap_cache_timeout=
 			[NFS] set the maximum lifetime for idmapper cache
 			entries.
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index 845159814de2..da7fda639eac 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -6,7 +6,8 @@ obj-$(CONFIG_NFS_FS) += nfs.o
 
 nfs-y 			:= client.o dir.o file.o getroot.o inode.o super.o nfs2xdr.o \
 			   direct.o pagelist.o proc.o read.o symlink.o unlink.o \
-			   write.o namespace.o mount_clnt.o
+			   write.o namespace.o mount_clnt.o \
+			   dns_resolve.o cache_lib.o
 nfs-$(CONFIG_ROOT_NFS)	+= nfsroot.o
 nfs-$(CONFIG_NFS_V3)	+= nfs3proc.o nfs3xdr.o
 nfs-$(CONFIG_NFS_V3_ACL)	+= nfs3acl.o
diff --git a/fs/nfs/cache_lib.c b/fs/nfs/cache_lib.c
new file mode 100644
index 000000000000..b4ffd0146ea6
--- /dev/null
+++ b/fs/nfs/cache_lib.c
@@ -0,0 +1,140 @@
+/*
+ * linux/fs/nfs/cache_lib.c
+ *
+ * Helper routines for the NFS client caches
+ *
+ * Copyright (c) 2009 Trond Myklebust <Trond.Myklebust@netapp.com>
+ */
+#include <linux/kmod.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/sunrpc/cache.h>
+#include <linux/sunrpc/rpc_pipe_fs.h>
+
+#include "cache_lib.h"
+
+#define NFS_CACHE_UPCALL_PATHLEN 256
+#define NFS_CACHE_UPCALL_TIMEOUT 15
+
+static char nfs_cache_getent_prog[NFS_CACHE_UPCALL_PATHLEN] =
+				"/sbin/nfs_cache_getent";
+static unsigned long nfs_cache_getent_timeout = NFS_CACHE_UPCALL_TIMEOUT;
+
+module_param_string(cache_getent, nfs_cache_getent_prog,
+		sizeof(nfs_cache_getent_prog), 0600);
+MODULE_PARM_DESC(cache_getent, "Path to the client cache upcall program");
+module_param_named(cache_getent_timeout, nfs_cache_getent_timeout, ulong, 0600);
+MODULE_PARM_DESC(cache_getent_timeout, "Timeout (in seconds) after which "
+		"the cache upcall is assumed to have failed");
+
+int nfs_cache_upcall(struct cache_detail *cd, char *entry_name)
+{
+	static char *envp[] = { "HOME=/",
+		"TERM=linux",
+		"PATH=/sbin:/usr/sbin:/bin:/usr/bin",
+		NULL
+	};
+	char *argv[] = {
+		nfs_cache_getent_prog,
+		cd->name,
+		entry_name,
+		NULL
+	};
+	int ret = -EACCES;
+
+	if (nfs_cache_getent_prog[0] == '\0')
+		goto out;
+	ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
+	/*
+	 * Disable the upcall mechanism if we're getting an ENOENT or
+	 * EACCES error. The admin can re-enable it on the fly by using
+	 * sysfs to set the 'cache_getent' parameter once the problem
+	 * has been fixed.
+	 */
+	if (ret == -ENOENT || ret == -EACCES)
+		nfs_cache_getent_prog[0] = '\0';
+out:
+	return ret > 0 ? 0 : ret;
+}
+
+/*
+ * Deferred request handling
+ */
+void nfs_cache_defer_req_put(struct nfs_cache_defer_req *dreq)
+{
+	if (atomic_dec_and_test(&dreq->count))
+		kfree(dreq);
+}
+
+static void nfs_dns_cache_revisit(struct cache_deferred_req *d, int toomany)
+{
+	struct nfs_cache_defer_req *dreq;
+
+	dreq = container_of(d, struct nfs_cache_defer_req, deferred_req);
+
+	complete_all(&dreq->completion);
+	nfs_cache_defer_req_put(dreq);
+}
+
+static struct cache_deferred_req *nfs_dns_cache_defer(struct cache_req *req)
+{
+	struct nfs_cache_defer_req *dreq;
+
+	dreq = container_of(req, struct nfs_cache_defer_req, req);
+	dreq->deferred_req.revisit = nfs_dns_cache_revisit;
+	atomic_inc(&dreq->count);
+
+	return &dreq->deferred_req;
+}
+
+struct nfs_cache_defer_req *nfs_cache_defer_req_alloc(void)
+{
+	struct nfs_cache_defer_req *dreq;
+
+	dreq = kzalloc(sizeof(*dreq), GFP_KERNEL);
+	if (dreq) {
+		init_completion(&dreq->completion);
+		atomic_set(&dreq->count, 1);
+		dreq->req.defer = nfs_dns_cache_defer;
+	}
+	return dreq;
+}
+
+int nfs_cache_wait_for_upcall(struct nfs_cache_defer_req *dreq)
+{
+	if (wait_for_completion_timeout(&dreq->completion,
+			nfs_cache_getent_timeout * HZ) == 0)
+		return -ETIMEDOUT;
+	return 0;
+}
+
+int nfs_cache_register(struct cache_detail *cd)
+{
+	struct nameidata nd;
+	struct vfsmount *mnt;
+	int ret;
+
+	mnt = rpc_get_mount();
+	if (IS_ERR(mnt))
+		return PTR_ERR(mnt);
+	ret = vfs_path_lookup(mnt->mnt_root, mnt, "/cache", 0, &nd);
+	if (ret)
+		goto err;
+	ret = sunrpc_cache_register_pipefs(nd.path.dentry,
+			cd->name, 0600, cd);
+	path_put(&nd.path);
+	if (!ret)
+		return ret;
+err:
+	rpc_put_mount();
+	return ret;
+}
+
+void nfs_cache_unregister(struct cache_detail *cd)
+{
+	sunrpc_cache_unregister_pipefs(cd);
+	rpc_put_mount();
+}
+
diff --git a/fs/nfs/cache_lib.h b/fs/nfs/cache_lib.h
new file mode 100644
index 000000000000..76f856e284e4
--- /dev/null
+++ b/fs/nfs/cache_lib.h
@@ -0,0 +1,27 @@
+/*
+ * Helper routines for the NFS client caches
+ *
+ * Copyright (c) 2009 Trond Myklebust <Trond.Myklebust@netapp.com>
+ */
+
+#include <linux/completion.h>
+#include <linux/sunrpc/cache.h>
+#include <asm/atomic.h>
+
+/*
+ * Deferred request handling
+ */
+struct nfs_cache_defer_req {
+	struct cache_req req;
+	struct cache_deferred_req deferred_req;
+	struct completion completion;
+	atomic_t count;
+};
+
+extern int nfs_cache_upcall(struct cache_detail *cd, char *entry_name);
+extern struct nfs_cache_defer_req *nfs_cache_defer_req_alloc(void);
+extern void nfs_cache_defer_req_put(struct nfs_cache_defer_req *dreq);
+extern int nfs_cache_wait_for_upcall(struct nfs_cache_defer_req *dreq);
+
+extern int nfs_cache_register(struct cache_detail *cd);
+extern void nfs_cache_unregister(struct cache_detail *cd);
diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c
new file mode 100644
index 000000000000..f4d54ba97cc6
--- /dev/null
+++ b/fs/nfs/dns_resolve.c
@@ -0,0 +1,335 @@
+/*
+ * linux/fs/nfs/dns_resolve.c
+ *
+ * Copyright (c) 2009 Trond Myklebust <Trond.Myklebust@netapp.com>
+ *
+ * Resolves DNS hostnames into valid ip addresses
+ */
+
+#include <linux/hash.h>
+#include <linux/string.h>
+#include <linux/kmod.h>
+#include <linux/module.h>
+#include <linux/socket.h>
+#include <linux/seq_file.h>
+#include <linux/inet.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/cache.h>
+#include <linux/sunrpc/svcauth.h>
+
+#include "dns_resolve.h"
+#include "cache_lib.h"
+
+#define NFS_DNS_HASHBITS 4
+#define NFS_DNS_HASHTBL_SIZE (1 << NFS_DNS_HASHBITS)
+
+static struct cache_head *nfs_dns_table[NFS_DNS_HASHTBL_SIZE];
+
+struct nfs_dns_ent {
+	struct cache_head h;
+
+	char *hostname;
+	size_t namelen;
+
+	struct sockaddr_storage addr;
+	size_t addrlen;
+};
+
+
+static void nfs_dns_ent_init(struct cache_head *cnew,
+		struct cache_head *ckey)
+{
+	struct nfs_dns_ent *new;
+	struct nfs_dns_ent *key;
+
+	new = container_of(cnew, struct nfs_dns_ent, h);
+	key = container_of(ckey, struct nfs_dns_ent, h);
+
+	kfree(new->hostname);
+	new->hostname = kstrndup(key->hostname, key->namelen, GFP_KERNEL);
+	if (new->hostname) {
+		new->namelen = key->namelen;
+		memcpy(&new->addr, &key->addr, key->addrlen);
+		new->addrlen = key->addrlen;
+	} else {
+		new->namelen = 0;
+		new->addrlen = 0;
+	}
+}
+
+static void nfs_dns_ent_put(struct kref *ref)
+{
+	struct nfs_dns_ent *item;
+
+	item = container_of(ref, struct nfs_dns_ent, h.ref);
+	kfree(item->hostname);
+	kfree(item);
+}
+
+static struct cache_head *nfs_dns_ent_alloc(void)
+{
+	struct nfs_dns_ent *item = kmalloc(sizeof(*item), GFP_KERNEL);
+
+	if (item != NULL) {
+		item->hostname = NULL;
+		item->namelen = 0;
+		item->addrlen = 0;
+		return &item->h;
+	}
+	return NULL;
+};
+
+static unsigned int nfs_dns_hash(const struct nfs_dns_ent *key)
+{
+	return hash_str(key->hostname, NFS_DNS_HASHBITS);
+}
+
+static void nfs_dns_request(struct cache_detail *cd,
+		struct cache_head *ch,
+		char **bpp, int *blen)
+{
+	struct nfs_dns_ent *key = container_of(ch, struct nfs_dns_ent, h);
+
+	qword_add(bpp, blen, key->hostname);
+	(*bpp)[-1] = '\n';
+}
+
+static int nfs_dns_upcall(struct cache_detail *cd,
+		struct cache_head *ch)
+{
+	struct nfs_dns_ent *key = container_of(ch, struct nfs_dns_ent, h);
+	int ret;
+
+	ret = nfs_cache_upcall(cd, key->hostname);
+	if (ret)
+		ret = sunrpc_cache_pipe_upcall(cd, ch, nfs_dns_request);
+	return ret;
+}
+
+static int nfs_dns_match(struct cache_head *ca,
+		struct cache_head *cb)
+{
+	struct nfs_dns_ent *a;
+	struct nfs_dns_ent *b;
+
+	a = container_of(ca, struct nfs_dns_ent, h);
+	b = container_of(cb, struct nfs_dns_ent, h);
+
+	if (a->namelen == 0 || a->namelen != b->namelen)
+		return 0;
+	return memcmp(a->hostname, b->hostname, a->namelen) == 0;
+}
+
+static int nfs_dns_show(struct seq_file *m, struct cache_detail *cd,
+		struct cache_head *h)
+{
+	struct nfs_dns_ent *item;
+	long ttl;
+
+	if (h == NULL) {
+		seq_puts(m, "# ip address      hostname        ttl\n");
+		return 0;
+	}
+	item = container_of(h, struct nfs_dns_ent, h);
+	ttl = (long)item->h.expiry_time - (long)get_seconds();
+	if (ttl < 0)
+		ttl = 0;
+
+	if (!test_bit(CACHE_NEGATIVE, &h->flags)) {
+		char buf[INET6_ADDRSTRLEN+IPV6_SCOPE_ID_LEN+1];
+
+		rpc_ntop((struct sockaddr *)&item->addr, buf, sizeof(buf));
+		seq_printf(m, "%15s ", buf);
+	} else
+		seq_puts(m, "<none>          ");
+	seq_printf(m, "%15s %ld\n", item->hostname, ttl);
+	return 0;
+}
+
+struct nfs_dns_ent *nfs_dns_lookup(struct cache_detail *cd,
+		struct nfs_dns_ent *key)
+{
+	struct cache_head *ch;
+
+	ch = sunrpc_cache_lookup(cd,
+			&key->h,
+			nfs_dns_hash(key));
+	if (!ch)
+		return NULL;
+	return container_of(ch, struct nfs_dns_ent, h);
+}
+
+struct nfs_dns_ent *nfs_dns_update(struct cache_detail *cd,
+		struct nfs_dns_ent *new,
+		struct nfs_dns_ent *key)
+{
+	struct cache_head *ch;
+
+	ch = sunrpc_cache_update(cd,
+			&new->h, &key->h,
+			nfs_dns_hash(key));
+	if (!ch)
+		return NULL;
+	return container_of(ch, struct nfs_dns_ent, h);
+}
+
+static int nfs_dns_parse(struct cache_detail *cd, char *buf, int buflen)
+{
+	char buf1[NFS_DNS_HOSTNAME_MAXLEN+1];
+	struct nfs_dns_ent key, *item;
+	unsigned long ttl;
+	ssize_t len;
+	int ret = -EINVAL;
+
+	if (buf[buflen-1] != '\n')
+		goto out;
+	buf[buflen-1] = '\0';
+
+	len = qword_get(&buf, buf1, sizeof(buf1));
+	if (len <= 0)
+		goto out;
+	key.addrlen = rpc_pton(buf1, len,
+			(struct sockaddr *)&key.addr,
+			sizeof(key.addr));
+
+	len = qword_get(&buf, buf1, sizeof(buf1));
+	if (len <= 0)
+		goto out;
+
+	key.hostname = buf1;
+	key.namelen = len;
+	memset(&key.h, 0, sizeof(key.h));
+
+	ttl = get_expiry(&buf);
+	if (ttl == 0)
+		goto out;
+	key.h.expiry_time = ttl + get_seconds();
+
+	ret = -ENOMEM;
+	item = nfs_dns_lookup(cd, &key);
+	if (item == NULL)
+		goto out;
+
+	if (key.addrlen == 0)
+		set_bit(CACHE_NEGATIVE, &key.h.flags);
+
+	item = nfs_dns_update(cd, &key, item);
+	if (item == NULL)
+		goto out;
+
+	ret = 0;
+	cache_put(&item->h, cd);
+out:
+	return ret;
+}
+
+static struct cache_detail nfs_dns_resolve = {
+	.owner = THIS_MODULE,
+	.hash_size = NFS_DNS_HASHTBL_SIZE,
+	.hash_table = nfs_dns_table,
+	.name = "dns_resolve",
+	.cache_put = nfs_dns_ent_put,
+	.cache_upcall = nfs_dns_upcall,
+	.cache_parse = nfs_dns_parse,
+	.cache_show = nfs_dns_show,
+	.match = nfs_dns_match,
+	.init = nfs_dns_ent_init,
+	.update = nfs_dns_ent_init,
+	.alloc = nfs_dns_ent_alloc,
+};
+
+static int do_cache_lookup(struct cache_detail *cd,
+		struct nfs_dns_ent *key,
+		struct nfs_dns_ent **item,
+		struct nfs_cache_defer_req *dreq)
+{
+	int ret = -ENOMEM;
+
+	*item = nfs_dns_lookup(cd, key);
+	if (*item) {
+		ret = cache_check(cd, &(*item)->h, &dreq->req);
+		if (ret)
+			*item = NULL;
+	}
+	return ret;
+}
+
+static int do_cache_lookup_nowait(struct cache_detail *cd,
+		struct nfs_dns_ent *key,
+		struct nfs_dns_ent **item)
+{
+	int ret = -ENOMEM;
+
+	*item = nfs_dns_lookup(cd, key);
+	if (!*item)
+		goto out_err;
+	ret = -ETIMEDOUT;
+	if (!test_bit(CACHE_VALID, &(*item)->h.flags)
+			|| (*item)->h.expiry_time < get_seconds()
+			|| cd->flush_time > (*item)->h.last_refresh)
+		goto out_put;
+	ret = -ENOENT;
+	if (test_bit(CACHE_NEGATIVE, &(*item)->h.flags))
+		goto out_put;
+	return 0;
+out_put:
+	cache_put(&(*item)->h, cd);
+out_err:
+	*item = NULL;
+	return ret;
+}
+
+static int do_cache_lookup_wait(struct cache_detail *cd,
+		struct nfs_dns_ent *key,
+		struct nfs_dns_ent **item)
+{
+	struct nfs_cache_defer_req *dreq;
+	int ret = -ENOMEM;
+
+	dreq = nfs_cache_defer_req_alloc();
+	if (!dreq)
+		goto out;
+	ret = do_cache_lookup(cd, key, item, dreq);
+	if (ret == -EAGAIN) {
+		ret = nfs_cache_wait_for_upcall(dreq);
+		if (!ret)
+			ret = do_cache_lookup_nowait(cd, key, item);
+	}
+	nfs_cache_defer_req_put(dreq);
+out:
+	return ret;
+}
+
+ssize_t nfs_dns_resolve_name(char *name, size_t namelen,
+		struct sockaddr *sa, size_t salen)
+{
+	struct nfs_dns_ent key = {
+		.hostname = name,
+		.namelen = namelen,
+	};
+	struct nfs_dns_ent *item = NULL;
+	ssize_t ret;
+
+	ret = do_cache_lookup_wait(&nfs_dns_resolve, &key, &item);
+	if (ret == 0) {
+		if (salen >= item->addrlen) {
+			memcpy(sa, &item->addr, item->addrlen);
+			ret = item->addrlen;
+		} else
+			ret = -EOVERFLOW;
+		cache_put(&item->h, &nfs_dns_resolve);
+	} else if (ret == -ENOENT)
+		ret = -ESRCH;
+	return ret;
+}
+
+int nfs_dns_resolver_init(void)
+{
+	return nfs_cache_register(&nfs_dns_resolve);
+}
+
+void nfs_dns_resolver_destroy(void)
+{
+	nfs_cache_unregister(&nfs_dns_resolve);
+}
+
diff --git a/fs/nfs/dns_resolve.h b/fs/nfs/dns_resolve.h
new file mode 100644
index 000000000000..a3f0938babf7
--- /dev/null
+++ b/fs/nfs/dns_resolve.h
@@ -0,0 +1,14 @@
+/*
+ * Resolve DNS hostnames into valid ip addresses
+ */
+#ifndef __LINUX_FS_NFS_DNS_RESOLVE_H
+#define __LINUX_FS_NFS_DNS_RESOLVE_H
+
+#define NFS_DNS_HOSTNAME_MAXLEN	(128)
+
+extern int nfs_dns_resolver_init(void);
+extern void nfs_dns_resolver_destroy(void);
+extern ssize_t nfs_dns_resolve_name(char *name, size_t namelen,
+		struct sockaddr *sa, size_t salen);
+
+#endif
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index fe5a8b45d867..060022b4651c 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -46,6 +46,7 @@
 #include "iostat.h"
 #include "internal.h"
 #include "fscache.h"
+#include "dns_resolve.h"
 
 #define NFSDBG_FACILITY		NFSDBG_VFS
 
@@ -1506,6 +1507,10 @@ static int __init init_nfs_fs(void)
 {
 	int err;
 
+	err = nfs_dns_resolver_init();
+	if (err < 0)
+		goto out8;
+
 	err = nfs_fscache_register();
 	if (err < 0)
 		goto out7;
@@ -1564,6 +1569,8 @@ out5:
 out6:
 	nfs_fscache_unregister();
 out7:
+	nfs_dns_resolver_destroy();
+out8:
 	return err;
 }
 
@@ -1575,6 +1582,7 @@ static void __exit exit_nfs_fs(void)
 	nfs_destroy_inodecache();
 	nfs_destroy_nfspagecache();
 	nfs_fscache_unregister();
+	nfs_dns_resolver_destroy();
 #ifdef CONFIG_PROC_FS
 	rpc_proc_unregister("nfs");
 #endif
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index 3fdacaf5c708..7f676bdf70d3 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -416,11 +416,13 @@ struct vfsmount *rpc_get_mount(void)
 		return ERR_PTR(err);
 	return rpc_mount;
 }
+EXPORT_SYMBOL_GPL(rpc_get_mount);
 
 void rpc_put_mount(void)
 {
 	simple_release_fs(&rpc_mount, &rpc_mount_count);
 }
+EXPORT_SYMBOL_GPL(rpc_put_mount);
 
 static int rpc_delete_dentry(struct dentry *dentry)
 {
@@ -946,6 +948,7 @@ enum {
 	RPCAUTH_portmap,
 	RPCAUTH_statd,
 	RPCAUTH_nfsd4_cb,
+	RPCAUTH_cache,
 	RPCAUTH_RootEOF
 };
 
@@ -974,6 +977,10 @@ static const struct rpc_filelist files[] = {
 		.name = "nfsd4_cb",
 		.mode = S_IFDIR | S_IRUGO | S_IXUGO,
 	},
+	[RPCAUTH_cache] = {
+		.name = "cache",
+		.mode = S_IFDIR | S_IRUGO | S_IXUGO,
+	},
 };
 
 static int
-- 
cgit v1.2.3-70-g09d2


From 7d7ea882898f23989209d0359691b356f73240dc Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 19 Aug 2009 18:12:34 -0400
Subject: NFS: Use the DNS resolver in the mount code.

In the referral code, use it to look up the new server's ip address if the
fs_locations attribute contains a hostname.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4namespace.c | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index ef22ee89aa77..2636c26d56fa 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -17,6 +17,7 @@
 #include <linux/inet.h>
 #include "internal.h"
 #include "nfs4_fs.h"
+#include "dns_resolve.h"
 
 #define NFSDBG_FACILITY		NFSDBG_VFS
 
@@ -95,6 +96,20 @@ static int nfs4_validate_fspath(const struct vfsmount *mnt_parent,
 	return 0;
 }
 
+static size_t nfs_parse_server_name(char *string, size_t len,
+		struct sockaddr *sa, size_t salen)
+{
+	ssize_t ret;
+
+	ret = rpc_pton(string, len, sa, salen);
+	if (ret == 0) {
+		ret = nfs_dns_resolve_name(string, len, sa, salen);
+		if (ret < 0)
+			ret = 0;
+	}
+	return ret;
+}
+
 static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
 				     char *page, char *page2,
 				     const struct nfs4_fs_location *location)
@@ -121,7 +136,8 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
 
 		if (memchr(buf->data, IPV6_SCOPE_DELIMITER, buf->len))
 			continue;
-		mountdata->addrlen = rpc_pton(buf->data, buf->len,
+		mountdata->addrlen = nfs_parse_server_name(buf->data,
+				buf->len,
 				mountdata->addr, mountdata->addrlen);
 		if (mountdata->addrlen == 0)
 			continue;
-- 
cgit v1.2.3-70-g09d2


From e1af88a1ad8f4dea3a2d6c5637d94a3fc3c62994 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 19 Aug 2009 18:04:43 +0200
Subject: nfs: Remove reference to generic_osync_inode from a comment

generic_file_direct_write() no longer calls generic_osync_inode() so remove the
comment.

CC: linux-nfs@vger.kernel.org
CC: Neil Brown <neilb@suse.de>
CC: "J. Bruce Fields" <bfields@fieldses.org>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/direct.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index e4e089a8f294..6c3210099d51 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -934,9 +934,6 @@ out:
  * back into its cache.  We let the server do generic write
  * parameter checking and report problems.
  *
- * We also avoid an unnecessary invocation of generic_osync_inode(),
- * as it is fairly meaningless to sync the metadata of an NFS file.
- *
  * We eliminate local atime updates, see direct read above.
  *
  * We avoid unnecessary page cache invalidations for normal cached
-- 
cgit v1.2.3-70-g09d2


From a8b88d3d49623ac701b5dc996cbd61219c793c7c Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 20 Aug 2009 18:26:52 +0200
Subject: ocfs2: Add missing lock name

There is missing name for NFSSync cluster lock. This makes lockdep unhappy
because we end up passing NULL to lockdep when initializing lock key. Fix it.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/ocfs2_lockid.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
index fcdba091af3d..c212cf5a2bdf 100644
--- a/fs/ocfs2/ocfs2_lockid.h
+++ b/fs/ocfs2/ocfs2_lockid.h
@@ -108,6 +108,7 @@ static char *ocfs2_lock_type_strings[] = {
 	[OCFS2_LOCK_TYPE_OPEN] = "Open",
 	[OCFS2_LOCK_TYPE_FLOCK] = "Flock",
 	[OCFS2_LOCK_TYPE_QINFO] = "Quota",
+	[OCFS2_LOCK_TYPE_NFS_SYNC] = "NFSSync",
 	[OCFS2_LOCK_TYPE_ORPHAN_SCAN] = "OrphanScan",
 };
 
-- 
cgit v1.2.3-70-g09d2


From c795b33ba171e41563ab7e25105c0cd4edd81cd7 Mon Sep 17 00:00:00 2001
From: Goldwyn Rodrigues <rgoldwyn@gmail.com>
Date: Thu, 20 Aug 2009 13:43:19 -0500
Subject: ocfs2/dlm: Wait on lockres instead of erroring cancel requests

In case a downconvert is queued, and a flock receives a signal,
BUG_ON(lockres->l_action != OCFS2_AST_INVALID) is triggered
because a lock cancel triggers a dlmunlock while an AST is
scheduled.

To avoid this, allow a LKM_CANCEL to pass through, and let it
wait on __dlm_wait_on_lockres().

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.de>
Acked-off-by: Mark Fasheh <mfasheh@suse.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/dlm/dlmunlock.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index fcf879ed6930..756f5b0998e0 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -122,7 +122,7 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
 	 * that still has AST's pending... */
 	in_use = !list_empty(&lock->ast_list);
 	spin_unlock(&dlm->ast_lock);
-	if (in_use) {
+	if (in_use && !(flags & LKM_CANCEL)) {
 	       mlog(ML_ERROR, "lockres %.*s: Someone is calling dlmunlock "
 		    "while waiting for an ast!", res->lockname.len,
 		    res->lockname.name);
@@ -131,7 +131,7 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
 
 	spin_lock(&res->spinlock);
 	if (res->state & DLM_LOCK_RES_IN_PROGRESS) {
-		if (master_node) {
+		if (master_node && !(flags & LKM_CANCEL)) {
 			mlog(ML_ERROR, "lockres in progress!\n");
 			spin_unlock(&res->spinlock);
 			return DLM_FORWARD;
-- 
cgit v1.2.3-70-g09d2


From 939a9421eb53d3ea83188ae13802779041caefdb Mon Sep 17 00:00:00 2001
From: Amerigo Wang <amwang@redhat.com>
Date: Thu, 20 Aug 2009 19:29:03 -0700
Subject: vfs: allow file truncations when both suid and write permissions set

When suid is set and the non-owner user has write permission, any writing
into this file should be allowed and suid should be removed after that.

However, current kernel only allows writing without truncations, when we
do truncations on that file, we get EPERM.  This is a bug.

Steps to reproduce this bug:

% ls -l rootdir/file1
-rwsrwsrwx 1 root root 3 Jun 25 15:42 rootdir/file1
% echo h > rootdir/file1
zsh: operation not permitted: rootdir/file1
% ls -l rootdir/file1
-rwsrwsrwx 1 root root 3 Jun 25 15:42 rootdir/file1
% echo h >> rootdir/file1
% ls -l rootdir/file1
-rwxrwxrwx 1 root root 5 Jun 25 16:34 rootdir/file1

Signed-off-by: WANG Cong <amwang@redhat.com>
Cc: Eric Sandeen <esandeen@redhat.com>
Acked-by: Eric Paris <eparis@redhat.com>
Cc: Eugene Teo <eteo@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Stephen Smalley <sds@tycho.nsa.gov>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: James Morris <jmorris@namei.org>
---
 fs/open.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/open.c b/fs/open.c
index dd98e8076024..40d1fa25f5aa 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -199,7 +199,7 @@ out:
 int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
 	struct file *filp)
 {
-	int err;
+	int ret;
 	struct iattr newattrs;
 
 	/* Not pretty: "inode->i_size" shouldn't really be signed. But it is. */
@@ -214,12 +214,14 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
 	}
 
 	/* Remove suid/sgid on truncate too */
-	newattrs.ia_valid |= should_remove_suid(dentry);
+	ret = should_remove_suid(dentry);
+	if (ret)
+		newattrs.ia_valid |= ret | ATTR_FORCE;
 
 	mutex_lock(&dentry->d_inode->i_mutex);
-	err = notify_change(dentry, &newattrs);
+	ret = notify_change(dentry, &newattrs);
 	mutex_unlock(&dentry->d_inode->i_mutex);
-	return err;
+	return ret;
 }
 
 static long do_sys_truncate(const char __user *pathname, loff_t length)
-- 
cgit v1.2.3-70-g09d2


From 03e860bd9f6a3cca747b0795bed26279a8b420a0 Mon Sep 17 00:00:00 2001
From: "From: Nick Piggin" <npiggin@suse.de>
Date: Fri, 21 Aug 2009 10:09:44 +0200
Subject: btrfs: fix inode rbtree corruption

Node may not be inserted over existing node. This causes inode tree
corruption and I was seeing crashes in inode_tree_del which I can not
reproduce after this patch.

The other way to fix this would be to tie inode lifetime in the rbtree
with inode while not in freeing state. I had a look at this but it is
not so trivial at this point. At least this patch gets things working again.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Chris Mason <chris.mason@oracle.com>
Acked-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/btrfs/inode.c | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 272b9b2bea86..59cba180fe83 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3099,8 +3099,12 @@ static void inode_tree_add(struct inode *inode)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_inode *entry;
-	struct rb_node **p = &root->inode_tree.rb_node;
-	struct rb_node *parent = NULL;
+	struct rb_node **p;
+	struct rb_node *parent;
+
+again:
+	p = &root->inode_tree.rb_node;
+	parent = NULL;
 
 	spin_lock(&root->inode_lock);
 	while (*p) {
@@ -3108,13 +3112,16 @@ static void inode_tree_add(struct inode *inode)
 		entry = rb_entry(parent, struct btrfs_inode, rb_node);
 
 		if (inode->i_ino < entry->vfs_inode.i_ino)
-			p = &(*p)->rb_left;
+			p = &parent->rb_left;
 		else if (inode->i_ino > entry->vfs_inode.i_ino)
-			p = &(*p)->rb_right;
+			p = &parent->rb_right;
 		else {
 			WARN_ON(!(entry->vfs_inode.i_state &
 				  (I_WILL_FREE | I_FREEING | I_CLEAR)));
-			break;
+			rb_erase(parent, &root->inode_tree);
+			RB_CLEAR_NODE(parent);
+			spin_unlock(&root->inode_lock);
+			goto again;
 		}
 	}
 	rb_link_node(&BTRFS_I(inode)->rb_node, parent, p);
@@ -3126,12 +3133,12 @@ static void inode_tree_del(struct inode *inode)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 
+	spin_lock(&root->inode_lock);
 	if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) {
-		spin_lock(&root->inode_lock);
 		rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree);
-		spin_unlock(&root->inode_lock);
 		RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
 	}
+	spin_unlock(&root->inode_lock);
 }
 
 static noinline void init_btrfs_i(struct inode *inode)
-- 
cgit v1.2.3-70-g09d2


From 8e9d78edea3ce5c0036f85b93091483f2f15443a Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 21 Aug 2009 17:40:08 -0700
Subject: Re-introduce page mapping check in mark_buffer_dirty()

In commit a8e7d49aa7be728c4ae241a75a2a124cdcabc0c5 ("Fix race in
create_empty_buffers() vs __set_page_dirty_buffers()"), I removed a test
for a NULL page mapping unintentionally when some of the code inside
__set_page_dirty() was moved to the callers.

That removal generally didn't matter, since a filesystem would serialize
truncation (which clears the page mapping) against writing (which marks
the buffer dirty), so locking at a higher level (either per-page or an
inode at a time) should mean that the buffer page would be stable.  And
indeed, nothing bad seemed to happen.

Except it turns out that apparently reiserfs does something odd when
under load and writing out the journal, and we have a number of bugzilla
entries that look similar:

	http://bugzilla.kernel.org/show_bug.cgi?id=13556
	http://bugzilla.kernel.org/show_bug.cgi?id=13756
	http://bugzilla.kernel.org/show_bug.cgi?id=13876

and it looks like reiserfs depended on that check (the common theme
seems to be "data=journal", and a journal writeback during a truncate).

I suspect reiserfs should have some additional locking, but in the
meantime this should get us back to the pre-2.6.29 behavior.

Pattern-pointed-out-by: Roland Kletzing <devzero@web.de>
Cc: stable@kernel.org (2.6.29 and 2.6.30)
Cc: Jeff Mahoney <jeffm@suse.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/buffer.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/buffer.c b/fs/buffer.c
index a3ef091a45bd..28f320fac4d4 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1165,8 +1165,11 @@ void mark_buffer_dirty(struct buffer_head *bh)
 
 	if (!test_set_buffer_dirty(bh)) {
 		struct page *page = bh->b_page;
-		if (!TestSetPageDirty(page))
-			__set_page_dirty(page, page_mapping(page), 0);
+		if (!TestSetPageDirty(page)) {
+			struct address_space *mapping = page_mapping(page);
+			if (mapping)
+				__set_page_dirty(page, mapping, 0);
+		}
 	}
 }
 
-- 
cgit v1.2.3-70-g09d2


From 5eecfde615894dc1c2e3f85b515a96ae2e408fb5 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 21 Aug 2009 17:50:30 -0400
Subject: NFS: Handle a zero-length auth flavor list

Some releases of Linux rpc.mountd (nfs-utils 1.1.4 and later) return an
empty auth flavor list if no sec= was specified for the export.  This is
notably broken server behavior.

The new auth flavor list checking added in a recent commit rejects this
case.  The OpenSolaris client does too.

The broken mountd implementation is already widely deployed.  To avoid
a behavioral regression, the kernel's mount client skips flavor checking
(ie reverts to the pre-2.6.32 behavior) if mountd returns an empty
flavor list.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/super.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'fs')

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 9c85cdb353aa..f3a95df4b95f 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1337,6 +1337,16 @@ static int nfs_walk_authlist(struct nfs_parsed_mount_data *args,
 {
 	unsigned int i, j, server_authlist_len = *(request->auth_flav_len);
 
+	/*
+	 * Certain releases of Linux's mountd return an empty
+	 * flavor list.  To prevent behavioral regression with
+	 * these servers (ie. rejecting mounts that used to
+	 * succeed), revert to pre-2.6.32 behavior (no checking)
+	 * if the returned flavor list is empty.
+	 */
+	if (server_authlist_len == 0)
+		return 0;
+
 	/*
 	 * We avoid sophisticated negotiating here, as there are
 	 * plenty of cases where we can get it wrong, providing
-- 
cgit v1.2.3-70-g09d2


From 6777d773a463ac045d333b989d4e44660f8d92ad Mon Sep 17 00:00:00 2001
From: Mimi Zohar <zohar@linux.vnet.ibm.com>
Date: Fri, 21 Aug 2009 14:32:48 -0400
Subject: kernel_read: redefine offset type

vfs_read() offset is defined as loff_t, but kernel_read()
offset is only defined as unsigned long. Redefine
kernel_read() offset as loff_t.

Cc: stable@kernel.org
Signed-off-by: Mimi Zohar <zohar@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 fs/exec.c          | 4 ++--
 include/linux/fs.h | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/exec.c b/fs/exec.c
index 4a8849e45b21..fb4f3cdda78c 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -678,8 +678,8 @@ exit:
 }
 EXPORT_SYMBOL(open_exec);
 
-int kernel_read(struct file *file, unsigned long offset,
-	char *addr, unsigned long count)
+int kernel_read(struct file *file, loff_t offset,
+		char *addr, unsigned long count)
 {
 	mm_segment_t old_fs;
 	loff_t pos = offset;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 67888a9e0655..73e9b643e455 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2123,7 +2123,7 @@ extern struct file *do_filp_open(int dfd, const char *pathname,
 		int open_flag, int mode, int acc_mode);
 extern int may_open(struct path *, int, int);
 
-extern int kernel_read(struct file *, unsigned long, char *, unsigned long);
+extern int kernel_read(struct file *, loff_t, char *, unsigned long);
 extern struct file * open_exec(const char *);
  
 /* fs/dcache.c -- generic fs support functions */
-- 
cgit v1.2.3-70-g09d2


From cd0120751d631bc4b99f180c1c22de2caca98207 Mon Sep 17 00:00:00 2001
From: Roel Kluin <roel.kluin@gmail.com>
Date: Sat, 22 Aug 2009 19:26:42 +0200
Subject: GFS2: jumping to wrong label?

Also a gfs2_glock_dq() is required here.

Signed-off-by: Roel Kluin <roel.kluin@gmail.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/ops_inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index f8bd20baf99c..ce551f85524f 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -349,7 +349,7 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry)
 
 	error = gfs2_trans_begin(sdp, 2*RES_DINODE + RES_LEAF + RES_RG_BIT, 0);
 	if (error)
-		goto out_rgrp;
+		goto out_gunlock;
 
 	error = gfs2_dir_del(dip, &dentry->d_name);
         if (error)
-- 
cgit v1.2.3-70-g09d2


From d34843d0c4a20872f3f3bfb510328bd043b939ff Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Mon, 24 Aug 2009 10:44:18 +0100
Subject: GFS2: Add "-o errors=panic|withdraw" mount options

This patch adds "-o errors=panic" and "-o errors=withdraw" to the
gfs2 mount options.  The "errors=withdraw" option is today's
current behaviour, meaning to withdraw from the file system if a
non-serious gfs2 error occurs.  The new "errors=panic" option
tells gfs2 to force a kernel panic if a non-serious gfs2 file
system error occurs.  This may be useful, for example, where
fabric-level fencing is used that has no way to reboot (such as
fence_scsi).

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/incore.h     |  7 +++++++
 fs/gfs2/ops_fstype.c |  1 +
 fs/gfs2/super.c      | 36 ++++++++++++++++++++++++++++++++++++
 fs/gfs2/util.c       | 41 +++++++++++++++++++++++++++--------------
 4 files changed, 71 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 61801ada36f0..1d11e6e0373e 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -406,6 +406,12 @@ struct gfs2_statfs_change_host {
 #define GFS2_DATA_WRITEBACK	1
 #define GFS2_DATA_ORDERED	2
 
+#define GFS2_ERRORS_DEFAULT     GFS2_ERRORS_WITHDRAW
+#define GFS2_ERRORS_WITHDRAW    0
+#define GFS2_ERRORS_CONTINUE    1 /* place holder for future feature */
+#define GFS2_ERRORS_RO          2 /* place holder for future feature */
+#define GFS2_ERRORS_PANIC       3
+
 struct gfs2_args {
 	char ar_lockproto[GFS2_LOCKNAME_LEN];	/* Name of the Lock Protocol */
 	char ar_locktable[GFS2_LOCKNAME_LEN];	/* Name of the Lock Table */
@@ -422,6 +428,7 @@ struct gfs2_args {
 	unsigned int ar_data:2;			/* ordered/writeback */
 	unsigned int ar_meta:1;			/* mount metafs */
 	unsigned int ar_discard:1;		/* discard requests */
+	unsigned int ar_errors:2;               /* errors=withdraw | panic */
 	int ar_commit;				/* Commit interval */
 };
 
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 39021c020176..165518a1041e 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -1168,6 +1168,7 @@ static int fill_super(struct super_block *sb, void *data, int silent)
 	sdp->sd_args.ar_quota = GFS2_QUOTA_DEFAULT;
 	sdp->sd_args.ar_data = GFS2_DATA_DEFAULT;
 	sdp->sd_args.ar_commit = 60;
+	sdp->sd_args.ar_errors = GFS2_ERRORS_DEFAULT;
 
 	error = gfs2_mount_args(sdp, &sdp->sd_args, data);
 	if (error) {
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 85bd2bc8c1de..7a5c128e8776 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -68,6 +68,8 @@ enum {
 	Opt_discard,
 	Opt_nodiscard,
 	Opt_commit,
+	Opt_err_withdraw,
+	Opt_err_panic,
 	Opt_error,
 };
 
@@ -97,6 +99,8 @@ static const match_table_t tokens = {
 	{Opt_discard, "discard"},
 	{Opt_nodiscard, "nodiscard"},
 	{Opt_commit, "commit=%d"},
+	{Opt_err_withdraw, "errors=withdraw"},
+	{Opt_err_panic, "errors=panic"},
 	{Opt_error, NULL}
 };
 
@@ -152,6 +156,11 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options)
 			args->ar_localcaching = 1;
 			break;
 		case Opt_debug:
+			if (args->ar_errors == GFS2_ERRORS_PANIC) {
+				fs_info(sdp, "-o debug and -o errors=panic "
+				       "are mutually exclusive.\n");
+				return -EINVAL;
+			}
 			args->ar_debug = 1;
 			break;
 		case Opt_nodebug:
@@ -205,6 +214,17 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options)
 				return rv ? rv : -EINVAL;
 			}
 			break;
+		case Opt_err_withdraw:
+			args->ar_errors = GFS2_ERRORS_WITHDRAW;
+			break;
+		case Opt_err_panic:
+			if (args->ar_debug) {
+				fs_info(sdp, "-o debug and -o errors=panic "
+					"are mutually exclusive.\n");
+				return -EINVAL;
+			}
+			args->ar_errors = GFS2_ERRORS_PANIC;
+			break;
 		case Opt_error:
 		default:
 			fs_info(sdp, "invalid mount option: %s\n", o);
@@ -1226,6 +1246,22 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
 	lfsecs = sdp->sd_tune.gt_log_flush_secs;
 	if (lfsecs != 60)
 		seq_printf(s, ",commit=%d", lfsecs);
+	if (args->ar_errors != GFS2_ERRORS_DEFAULT) {
+		const char *state;
+
+		switch (args->ar_errors) {
+		case GFS2_ERRORS_WITHDRAW:
+			state = "withdraw";
+			break;
+		case GFS2_ERRORS_PANIC:
+			state = "panic";
+			break;
+		default:
+			state = "unknown";
+			break;
+		}
+		seq_printf(s, ",errors=%s", state);
+	}
 	return 0;
 }
 
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index 9d12b1118ba0..f6a7efa34eb9 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -38,24 +38,30 @@ int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...)
 	const struct lm_lockops *lm = ls->ls_ops;
 	va_list args;
 
-	if (test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags))
+	if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW &&
+	    test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags))
 		return 0;
 
 	va_start(args, fmt);
 	vprintk(fmt, args);
 	va_end(args);
 
-	fs_err(sdp, "about to withdraw this file system\n");
-	BUG_ON(sdp->sd_args.ar_debug);
+	if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW) {
+		fs_err(sdp, "about to withdraw this file system\n");
+		BUG_ON(sdp->sd_args.ar_debug);
 
-	kobject_uevent(&sdp->sd_kobj, KOBJ_OFFLINE);
+		kobject_uevent(&sdp->sd_kobj, KOBJ_OFFLINE);
 
-	if (lm->lm_unmount) {
-		fs_err(sdp, "telling LM to unmount\n");
-		lm->lm_unmount(sdp);
+		if (lm->lm_unmount) {
+			fs_err(sdp, "telling LM to unmount\n");
+			lm->lm_unmount(sdp);
+		}
+		fs_err(sdp, "withdrawn\n");
+		dump_stack();
 	}
-	fs_err(sdp, "withdrawn\n");
-	dump_stack();
+
+	if (sdp->sd_args.ar_errors == GFS2_ERRORS_PANIC)
+		panic("GFS2: fsid=%s: panic requested.\n", sdp->sd_fsname);
 
 	return -1;
 }
@@ -93,17 +99,24 @@ int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion,
 			gfs2_tune_get(sdp, gt_complain_secs) * HZ))
 		return -2;
 
-	printk(KERN_WARNING
-	       "GFS2: fsid=%s: warning: assertion \"%s\" failed\n"
-	       "GFS2: fsid=%s:   function = %s, file = %s, line = %u\n",
-	       sdp->sd_fsname, assertion,
-	       sdp->sd_fsname, function, file, line);
+	if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW)
+		printk(KERN_WARNING
+		       "GFS2: fsid=%s: warning: assertion \"%s\" failed\n"
+		       "GFS2: fsid=%s:   function = %s, file = %s, line = %u\n",
+		       sdp->sd_fsname, assertion,
+		       sdp->sd_fsname, function, file, line);
 
 	if (sdp->sd_args.ar_debug)
 		BUG();
 	else
 		dump_stack();
 
+	if (sdp->sd_args.ar_errors == GFS2_ERRORS_PANIC)
+		panic("GFS2: fsid=%s: warning: assertion \"%s\" failed\n"
+		      "GFS2: fsid=%s:   function = %s, file = %s, line = %u\n",
+		      sdp->sd_fsname, assertion,
+		      sdp->sd_fsname, function, file, line);
+
 	sdp->sd_last_warning = jiffies;
 
 	return -1;
-- 
cgit v1.2.3-70-g09d2


From 6d41807614151829ae17a3a58bff8572af5e407e Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 10 Aug 2009 16:03:43 -0400
Subject: ext3: Update Kconfig description of EXT3_DEFAULTS_TO_ORDERED

The old description for this configuration option was perhaps not
completely balanced in terms of describing the tradeoffs of using a
default of data=writeback vs. data=ordered.  Despite the fact that old
description very strongly recomended disabling this feature, all of
the major distributions have elected to preserve the existing 'legacy'
default, which is a strong hint that it perhaps wasn't telling the
whole story.

This revised description has been vetted by a number of ext3
developers as being better at informing the user about the tradeoffs
of enabling or disabling this configuration feature.

Cc: linux-ext4@vger.kernel.org
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext3/Kconfig | 32 +++++++++++++++++---------------
 1 file changed, 17 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/ext3/Kconfig b/fs/ext3/Kconfig
index fb3c1a21b135..522b15498f45 100644
--- a/fs/ext3/Kconfig
+++ b/fs/ext3/Kconfig
@@ -29,23 +29,25 @@ config EXT3_FS
 	  module will be called ext3.
 
 config EXT3_DEFAULTS_TO_ORDERED
-	bool "Default to 'data=ordered' in ext3 (legacy option)"
+	bool "Default to 'data=ordered' in ext3"
 	depends on EXT3_FS
 	help
-	  If a filesystem does not explicitly specify a data ordering
-	  mode, and the journal capability allowed it, ext3 used to
-	  historically default to 'data=ordered'.
-
-	  That was a rather unfortunate choice, because it leads to all
-	  kinds of latency problems, and the 'data=writeback' mode is more
-	  appropriate these days.
-
-	  You should probably always answer 'n' here, and if you really
-	  want to use 'data=ordered' mode, set it in the filesystem itself
-	  with 'tune2fs -o journal_data_ordered'.
-
-	  But if you really want to enable the legacy default, you can do
-	  so by answering 'y' to this question.
+	  The journal mode options for ext3 have different tradeoffs
+	  between when data is guaranteed to be on disk and
+	  performance.	The use of "data=writeback" can cause
+	  unwritten data to appear in files after an system crash or
+	  power failure, which can be a security issue.	 However,
+	  "data=ordered" mode can also result in major performance
+	  problems, including seconds-long delays before an fsync()
+	  call returns.	 For details, see:
+
+	  http://ext4.wiki.kernel.org/index.php/Ext3_data_mode_tradeoffs
+
+	  If you have been historically happy with ext3's performance,
+	  data=ordered mode will be a safe choice and you should
+	  answer 'y' here.  If you understand the reliability and data
+	  privacy issues of data=writeback and are willing to make
+	  that trade off, answer 'n'.
 
 config EXT3_FS_XATTR
 	bool "Ext3 extended attributes"
-- 
cgit v1.2.3-70-g09d2


From 3c4cec65274481ec6332b0a91f19b4c8c5394801 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 24 Aug 2009 16:38:43 +0200
Subject: ext3: Improve error message that changing journaling mode on remount
 is not possible

This patch makes the error message about changing journaling mode on remount
more descriptive. Some people are going to hit this error now due to commit
bbae8bcc49bc4d002221dab52c79a50a82e7cd1f if they configure a kernel to default
to data=writeback mode. The problem happens if they have data=ordered set for
the root filesystem in /etc/fstab but not in the kernel command line (and they
don't use initrd). Their filesystem then gets mounted as data=writeback by
kernel but then their boot fails because init scripts won't be able to remount
the filesystem rw. Better error message will hopefully make it easier for them
to find the error in their setup and bother us less with error reports :).

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext3/super.c | 40 +++++++++++++++++++++++++++-------------
 1 file changed, 27 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 524b349c6299..a8d80a7f1105 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -543,6 +543,19 @@ static inline void ext3_show_quota_options(struct seq_file *seq, struct super_bl
 #endif
 }
 
+static char *data_mode_string(unsigned long mode)
+{
+	switch (mode) {
+	case EXT3_MOUNT_JOURNAL_DATA:
+		return "journal";
+	case EXT3_MOUNT_ORDERED_DATA:
+		return "ordered";
+	case EXT3_MOUNT_WRITEBACK_DATA:
+		return "writeback";
+	}
+	return "unknown";
+}
+
 /*
  * Show an option if
  *  - it's set to a non-default value OR
@@ -616,13 +629,8 @@ static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs)
 	if (test_opt(sb, NOBH))
 		seq_puts(seq, ",nobh");
 
-	if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA)
-		seq_puts(seq, ",data=journal");
-	else if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA)
-		seq_puts(seq, ",data=ordered");
-	else if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA)
-		seq_puts(seq, ",data=writeback");
-
+	seq_printf(seq, ",data=%s", data_mode_string(sbi->s_mount_opt &
+						     EXT3_MOUNT_DATA_FLAGS));
 	if (test_opt(sb, DATA_ERR_ABORT))
 		seq_puts(seq, ",data_err=abort");
 
@@ -1024,12 +1032,18 @@ static int parse_options (char *options, struct super_block *sb,
 		datacheck:
 			if (is_remount) {
 				if ((sbi->s_mount_opt & EXT3_MOUNT_DATA_FLAGS)
-						!= data_opt) {
-					printk(KERN_ERR
-						"EXT3-fs: cannot change data "
-						"mode on remount\n");
-					return 0;
-				}
+						== data_opt)
+					break;
+				printk(KERN_ERR
+					"EXT3-fs (device %s): Cannot change "
+					"data mode on remount. The filesystem "
+					"is mounted in data=%s mode and you "
+					"try to remount it in data=%s mode.\n",
+					sb->s_id,
+					data_mode_string(sbi->s_mount_opt &
+							EXT3_MOUNT_DATA_FLAGS),
+					data_mode_string(data_opt));
+				return 0;
 			} else {
 				sbi->s_mount_opt &= ~EXT3_MOUNT_DATA_FLAGS;
 				sbi->s_mount_opt |= data_opt;
-- 
cgit v1.2.3-70-g09d2


From 063c4c99630c0b06afad080d2a18bda64172c1a2 Mon Sep 17 00:00:00 2001
From: Lars Marowsky-Bree <lmb@suse.de>
Date: Tue, 11 Aug 2009 16:18:23 -0500
Subject: dlm: fix connection close handling

Closing a connection to a node can create problems if there are
outstanding messages for that node.  The problems include dlm_send
spinning attempting to reconnect, or BUG from tcp_connect_to_sock()
attempting to use a partially closed connection.

To cleanly close a connection, we now first attempt to send any pending
messages, cancel any remaining workqueue work, and flag the connection
as closed to avoid reconnect attempts.

Signed-off-by: Lars Marowsky-Bree <lmb@suse.de>
Signed-off-by: Christine Caulfield <ccaulfie@redhat.com>
Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/lowcomms.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 210d52c48808..bda690cd3640 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -106,6 +106,7 @@ struct connection {
 #define CF_CONNECT_PENDING 3
 #define CF_INIT_PENDING 4
 #define CF_IS_OTHERCON 5
+#define CF_CLOSE 6
 	struct list_head writequeue;  /* List of outgoing writequeue_entries */
 	spinlock_t writequeue_lock;
 	int (*rx_action) (struct connection *);	/* What to do when active */
@@ -299,6 +300,8 @@ static void lowcomms_write_space(struct sock *sk)
 
 static inline void lowcomms_connect_sock(struct connection *con)
 {
+	if (test_bit(CF_CLOSE, &con->flags))
+		return;
 	if (!test_and_set_bit(CF_CONNECT_PENDING, &con->flags))
 		queue_work(send_workqueue, &con->swork);
 }
@@ -1368,6 +1371,13 @@ int dlm_lowcomms_close(int nodeid)
 	log_print("closing connection to node %d", nodeid);
 	con = nodeid2con(nodeid, 0);
 	if (con) {
+		clear_bit(CF_CONNECT_PENDING, &con->flags);
+		clear_bit(CF_WRITE_PENDING, &con->flags);
+		set_bit(CF_CLOSE, &con->flags);
+		if (cancel_work_sync(&con->swork))
+			log_print("canceled swork for node %d", nodeid);
+		if (cancel_work_sync(&con->rwork))
+			log_print("canceled rwork for node %d", nodeid);
 		clean_one_writequeue(con);
 		close_connection(con, true);
 	}
@@ -1393,9 +1403,10 @@ static void process_send_sockets(struct work_struct *work)
 
 	if (test_and_clear_bit(CF_CONNECT_PENDING, &con->flags)) {
 		con->connect_action(con);
+		set_bit(CF_WRITE_PENDING, &con->flags);
 	}
-	clear_bit(CF_WRITE_PENDING, &con->flags);
-	send_to_sock(con);
+	if (test_and_clear_bit(CF_WRITE_PENDING, &con->flags))
+		send_to_sock(con);
 }
 
 
-- 
cgit v1.2.3-70-g09d2


From 1329e3f2c898cfabb6ed236d3fb8c1725197af53 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <bonzini@gnu.org>
Date: Mon, 24 Aug 2009 13:18:04 -0500
Subject: dlm: use kernel_sendpage

Using kernel_sendpage() is cleaner and safer than following
sock->ops ourselves.

Signed-off-by: Paolo Bonzini <bonzini@gnu.org>
Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/lowcomms.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index bda690cd3640..240cef14fe58 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -1285,7 +1285,6 @@ out:
 static void send_to_sock(struct connection *con)
 {
 	int ret = 0;
-	ssize_t(*sendpage) (struct socket *, struct page *, int, size_t, int);
 	const int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
 	struct writequeue_entry *e;
 	int len, offset;
@@ -1294,8 +1293,6 @@ static void send_to_sock(struct connection *con)
 	if (con->sock == NULL)
 		goto out_connect;
 
-	sendpage = con->sock->ops->sendpage;
-
 	spin_lock(&con->writequeue_lock);
 	for (;;) {
 		e = list_entry(con->writequeue.next, struct writequeue_entry,
@@ -1310,8 +1307,8 @@ static void send_to_sock(struct connection *con)
 
 		ret = 0;
 		if (len) {
-			ret = sendpage(con->sock, e->page, offset, len,
-				       msg_flags);
+			ret = kernel_sendpage(con->sock, e->page, offset, len,
+					      msg_flags);
 			if (ret == -EAGAIN || ret == 0) {
 				cond_resched();
 				goto out;
-- 
cgit v1.2.3-70-g09d2


From 353d5c30c666580347515da609dd74a2b8e9b828 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Date: Mon, 24 Aug 2009 16:30:28 +0100
Subject: mm: fix hugetlb bug due to user_shm_unlock call

2.6.30's commit 8a0bdec194c21c8fdef840989d0d7b742bb5d4bc removed
user_shm_lock() calls in hugetlb_file_setup() but left the
user_shm_unlock call in shm_destroy().

In detail:
Assume that can_do_hugetlb_shm() returns true and hence user_shm_lock()
is not called in hugetlb_file_setup(). However, user_shm_unlock() is
called in any case in shm_destroy() and in the following
atomic_dec_and_lock(&up->__count) in free_uid() is executed and if
up->__count gets zero, also cleanup_user_struct() is scheduled.

Note that sched_destroy_user() is empty if CONFIG_USER_SCHED is not set.
However, the ref counter up->__count gets unexpectedly non-positive and
the corresponding structs are freed even though there are live
references to them, resulting in a kernel oops after a lots of
shmget(SHM_HUGETLB)/shmctl(IPC_RMID) cycles and CONFIG_USER_SCHED set.

Hugh changed Stefan's suggested patch: can_do_hugetlb_shm() at the
time of shm_destroy() may give a different answer from at the time
of hugetlb_file_setup().  And fixed newseg()'s no_id error path,
which has missed user_shm_unlock() ever since it came in 2.6.9.

Reported-by: Stefan Huber <shuber2@gmail.com>
Signed-off-by: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Tested-by: Stefan Huber <shuber2@gmail.com>
Cc: stable@kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/hugetlbfs/inode.c    | 20 ++++++++++++--------
 include/linux/hugetlb.h |  6 ++++--
 ipc/shm.c               |  8 +++++---
 3 files changed, 21 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 941c8425c10b..cb88dac8ccaa 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -935,26 +935,28 @@ static int can_do_hugetlb_shm(void)
 	return capable(CAP_IPC_LOCK) || in_group_p(sysctl_hugetlb_shm_group);
 }
 
-struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag)
+struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag,
+						struct user_struct **user)
 {
 	int error = -ENOMEM;
-	int unlock_shm = 0;
 	struct file *file;
 	struct inode *inode;
 	struct dentry *dentry, *root;
 	struct qstr quick_string;
-	struct user_struct *user = current_user();
 
+	*user = NULL;
 	if (!hugetlbfs_vfsmount)
 		return ERR_PTR(-ENOENT);
 
 	if (!can_do_hugetlb_shm()) {
-		if (user_shm_lock(size, user)) {
-			unlock_shm = 1;
+		*user = current_user();
+		if (user_shm_lock(size, *user)) {
 			WARN_ONCE(1,
 			  "Using mlock ulimits for SHM_HUGETLB deprecated\n");
-		} else
+		} else {
+			*user = NULL;
 			return ERR_PTR(-EPERM);
+		}
 	}
 
 	root = hugetlbfs_vfsmount->mnt_root;
@@ -996,8 +998,10 @@ out_inode:
 out_dentry:
 	dput(dentry);
 out_shm_unlock:
-	if (unlock_shm)
-		user_shm_unlock(size, user);
+	if (*user) {
+		user_shm_unlock(size, *user);
+		*user = NULL;
+	}
 	return ERR_PTR(error);
 }
 
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 2723513a5651..5cbc620bdfe0 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -10,6 +10,7 @@
 #include <asm/tlbflush.h>
 
 struct ctl_table;
+struct user_struct;
 
 int PageHuge(struct page *page);
 
@@ -146,7 +147,8 @@ static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb)
 
 extern const struct file_operations hugetlbfs_file_operations;
 extern struct vm_operations_struct hugetlb_vm_ops;
-struct file *hugetlb_file_setup(const char *name, size_t, int);
+struct file *hugetlb_file_setup(const char *name, size_t size, int acct,
+						struct user_struct **user);
 int hugetlb_get_quota(struct address_space *mapping, long delta);
 void hugetlb_put_quota(struct address_space *mapping, long delta);
 
@@ -168,7 +170,7 @@ static inline void set_file_hugepages(struct file *file)
 
 #define is_file_hugepages(file)			0
 #define set_file_hugepages(file)		BUG()
-#define hugetlb_file_setup(name,size,acctflag)	ERR_PTR(-ENOSYS)
+#define hugetlb_file_setup(name,size,acct,user)	ERR_PTR(-ENOSYS)
 
 #endif /* !CONFIG_HUGETLBFS */
 
diff --git a/ipc/shm.c b/ipc/shm.c
index 15dd238e5338..1bc4701ef4f0 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -174,7 +174,7 @@ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
 	shm_unlock(shp);
 	if (!is_file_hugepages(shp->shm_file))
 		shmem_lock(shp->shm_file, 0, shp->mlock_user);
-	else
+	else if (shp->mlock_user)
 		user_shm_unlock(shp->shm_file->f_path.dentry->d_inode->i_size,
 						shp->mlock_user);
 	fput (shp->shm_file);
@@ -369,8 +369,8 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
 		/* hugetlb_file_setup applies strict accounting */
 		if (shmflg & SHM_NORESERVE)
 			acctflag = VM_NORESERVE;
-		file = hugetlb_file_setup(name, size, acctflag);
-		shp->mlock_user = current_user();
+		file = hugetlb_file_setup(name, size, acctflag,
+							&shp->mlock_user);
 	} else {
 		/*
 		 * Do not allow no accounting for OVERCOMMIT_NEVER, even
@@ -410,6 +410,8 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
 	return error;
 
 no_id:
+	if (shp->mlock_user)	/* shmflg & SHM_HUGETLB case */
+		user_shm_unlock(size, shp->mlock_user);
 	fput(file);
 no_file:
 	security_shm_free(shp);
-- 
cgit v1.2.3-70-g09d2


From 7111dc73923e9737b38a3ef5b5f236109000ff28 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 24 Aug 2009 19:21:29 -0400
Subject: NFSv4: Fix an infinite looping problem with the nfs4_state_manager

Commit 76db6d9500caeaa774a3e32a997eba30bbdc176b (nfs41: add session setup
to the state manager) introduces an infinite loop possibility in the NFSv4
state manager. By first checking nfs4_has_session() before clearing the
NFS4CLNT_SESSION_SETUP flag, it allows for a situation where someone sets
that flag, but it never gets cleared, and so the state manager loops.

In fact commit c3fad1b1aaf850bf692642642ace7cd0d64af0a3 (nfs41: add session
reset to state manager) causes this to happen every time we get a network
partition error.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Tested-by: Daniel J Blueman <daniel.blueman@gmail.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/nfs/nfs4state.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 65ca8c18476f..1434080aefeb 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1250,8 +1250,8 @@ static void nfs4_state_manager(struct nfs_client *clp)
 				continue;
 		}
 		/* Initialize or reset the session */
-		if (nfs4_has_session(clp) &&
-		   test_and_clear_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state)) {
+		if (test_and_clear_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state)
+		   && nfs4_has_session(clp)) {
 			if (clp->cl_cons_state == NFS_CS_SESSION_INITING)
 				status = nfs4_initialize_session(clp);
 			else
-- 
cgit v1.2.3-70-g09d2


From a8526e84ac758ac6da45cf273aa1538a6a7aa3de Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Tue, 25 Aug 2009 22:36:05 -0400
Subject: ext4: Add missing unlock_new_inode() call in extent migration code

We need to unlock the new inode before iput.  This patch fixes the
following warning when calling chattr +e to migrate a file to use
extents.  It also fixes problems in when e4defrag attempts to
defragment an inode.

[  470.400044] ------------[ cut here ]------------
[  470.400065] WARNING: at fs/inode.c:1210 generic_delete_inode+0x65/0x16a()
[  470.400072] Hardware name: N/A
.....
...
[  470.400353] Pid: 4451, comm: chattr Not tainted 2.6.31-rc7-red-debug #4
[  470.400359] Call Trace:
[  470.400372]  [<ffffffff81037771>] warn_slowpath_common+0x77/0x8f
[  470.400385]  [<ffffffff81037798>] warn_slowpath_null+0xf/0x11
[  470.400395]  [<ffffffff810b7f28>] generic_delete_inode+0x65/0x16a
[  470.400405]  [<ffffffff810b8044>] generic_drop_inode+0x17/0x1bd
[  470.400413]  [<ffffffff810b7083>] iput+0x61/0x65
[  470.400455]  [<ffffffffa003b229>] ext4_ext_migrate+0x5eb/0x66a [ext4]
[  470.400492]  [<ffffffffa002b1f8>] ext4_ioctl+0x340/0x756 [ext4]
[  470.400507]  [<ffffffff810b1a91>] vfs_ioctl+0x1d/0x82
[  470.400517]  [<ffffffff810b1ff0>] do_vfs_ioctl+0x483/0x4c9
[  470.400527]  [<ffffffff81059c30>] ? trace_hardirqs_on+0xd/0xf
[  470.400537]  [<ffffffff810b2087>] sys_ioctl+0x51/0x74
[  470.400549]  [<ffffffff8100ba6b>] system_call_fastpath+0x16/0x1b
[  470.400557] ---[ end trace ab85723542352dac ]---

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/migrate.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 313a50b39741..05361ad5b80a 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -618,7 +618,7 @@ err_out:
 	tmp_inode->i_nlink = 0;
 
 	ext4_journal_stop(handle);
-
+	unlock_new_inode(tmp_inode);
 	iput(tmp_inode);
 
 	return retval;
-- 
cgit v1.2.3-70-g09d2


From 1927805e6599d8602d2c0af6a0155c85acc0b214 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Tue, 25 Aug 2009 22:36:25 -0400
Subject: ext4: use variables not types in sizeofs() for allocations

Precursor to changing some types; to keep things in sync, it
seems better to allocate/memset based on the size of the
variables we are using rather than on some disconnected
basic type like "unsigned short"

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
---
 fs/ext4/mballoc.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 1a9cd7de04dd..081eaf7fce89 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -868,7 +868,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
 			grinfo = ext4_get_group_info(sb, group);
 			grinfo->bb_fragments = 0;
 			memset(grinfo->bb_counters, 0,
-			       sizeof(unsigned short)*(sb->s_blocksize_bits+2));
+			       sizeof(*grinfo->bb_counters) *
+				(sb->s_blocksize_bits+2));
 			/*
 			 * incore got set to the group block bitmap below
 			 */
@@ -2640,14 +2641,14 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
 	unsigned max;
 	int ret;
 
-	i = (sb->s_blocksize_bits + 2) * sizeof(unsigned short);
+	i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets);
 
 	sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
 	if (sbi->s_mb_offsets == NULL) {
 		return -ENOMEM;
 	}
 
-	i = (sb->s_blocksize_bits + 2) * sizeof(unsigned int);
+	i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs);
 	sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
 	if (sbi->s_mb_maxs == NULL) {
 		kfree(sbi->s_mb_offsets);
-- 
cgit v1.2.3-70-g09d2


From a36b44988cef1fc007535107013571fa691a2d7f Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Tue, 25 Aug 2009 22:36:45 -0400
Subject: ext4: use ext4_grpblk_t more extensively

unsigned  short is potentially too small to track blocks within
a group; today it is safe due to restrictions in e2fsprogs but
we have _lo / _hi bits for group blocks with the intent to go
up to 32 bits, so clean this up now.

There are many more places where we use unsigned/int/unsigned int
to contain a group block but this should at least fix all the
short types.

I added a few comments to the struct ext4_group_info definition
as well.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h    | 11 +++++++----
 fs/ext4/mballoc.c | 18 +++++++++---------
 fs/ext4/mballoc.h |  6 +++---
 3 files changed, 19 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 02b22885eb02..41a76e163b99 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1577,15 +1577,18 @@ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
 struct ext4_group_info {
 	unsigned long   bb_state;
 	struct rb_root  bb_free_root;
-	unsigned short  bb_first_free;
-	unsigned short  bb_free;
-	unsigned short  bb_fragments;
+	ext4_grpblk_t	bb_first_free;	/* first free block */
+	ext4_grpblk_t	bb_free;	/* total free blocks */
+	ext4_grpblk_t	bb_fragments;	/* nr of freespace fragments */
 	struct          list_head bb_prealloc_list;
 #ifdef DOUBLE_CHECK
 	void            *bb_bitmap;
 #endif
 	struct rw_semaphore alloc_sem;
-	unsigned short  bb_counters[];
+	ext4_grpblk_t	bb_counters[];	/* Nr of free power-of-two-block
+					 * regions, index is order.
+					 * bb_counters[3] = 5 means
+					 * 5 free 8-block regions. */
 };
 
 #define EXT4_GROUP_INFO_NEED_INIT_BIT	0
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 081eaf7fce89..3086b3c65adc 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -623,13 +623,13 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
 
 /* FIXME!! need more doc */
 static void ext4_mb_mark_free_simple(struct super_block *sb,
-				void *buddy, unsigned first, int len,
+				void *buddy, ext4_grpblk_t first, ext4_grpblk_t len,
 					struct ext4_group_info *grp)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	unsigned short min;
-	unsigned short max;
-	unsigned short chunk;
+	ext4_grpblk_t min;
+	ext4_grpblk_t max;
+	ext4_grpblk_t chunk;
 	unsigned short border;
 
 	BUG_ON(len > EXT4_BLOCKS_PER_GROUP(sb));
@@ -663,10 +663,10 @@ void ext4_mb_generate_buddy(struct super_block *sb,
 				void *buddy, void *bitmap, ext4_group_t group)
 {
 	struct ext4_group_info *grp = ext4_get_group_info(sb, group);
-	unsigned short max = EXT4_BLOCKS_PER_GROUP(sb);
-	unsigned short i = 0;
-	unsigned short first;
-	unsigned short len;
+	ext4_grpblk_t max = EXT4_BLOCKS_PER_GROUP(sb);
+	ext4_grpblk_t i = 0;
+	ext4_grpblk_t first;
+	ext4_grpblk_t len;
 	unsigned free = 0;
 	unsigned fragments = 0;
 	unsigned long long period = get_cycles();
@@ -2325,7 +2325,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
 	struct ext4_buddy e4b;
 	struct sg {
 		struct ext4_group_info info;
-		unsigned short counters[16];
+		ext4_grpblk_t counters[16];
 	} sg;
 
 	group--;
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 9db890d4d275..188d3d709b24 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -136,8 +136,8 @@ struct ext4_prealloc_space {
 	unsigned		pa_deleted;
 	ext4_fsblk_t		pa_pstart;	/* phys. block */
 	ext4_lblk_t		pa_lstart;	/* log. block */
-	unsigned short		pa_len;		/* len of preallocated chunk */
-	unsigned short		pa_free;	/* how many blocks are free */
+	ext4_grpblk_t		pa_len;		/* len of preallocated chunk */
+	ext4_grpblk_t		pa_free;	/* how many blocks are free */
 	unsigned short		pa_type;	/* pa type. inode or group */
 	spinlock_t		*pa_obj_lock;
 	struct inode		*pa_inode;	/* hack, for history only */
@@ -152,7 +152,7 @@ struct ext4_free_extent {
 	ext4_lblk_t fe_logical;
 	ext4_grpblk_t fe_start;
 	ext4_group_t fe_group;
-	int fe_len;
+	ext4_grpblk_t fe_len;
 };
 
 /*
-- 
cgit v1.2.3-70-g09d2


From 40b78a322365aa1d87770200f7fc7de3b361c11a Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 26 Aug 2009 18:41:32 +0100
Subject: GFS2: Clean up of extended attribute support

This has been on my list for some time. We need to change the way
in which we handle extended attributes to allow faster file creation
times (by reducing the number of transactions required) and the
extended attribute code is the main obstacle to this.

In addition to that, the VFS provides a way to demultiplex the xattr
calls which we ought to be using, rather than rolling our own. This
patch changes the GFS2 code to use that VFS feature and as a result
the code shrinks by a couple of hundred lines or so, and becomes
easier to read.

I'm planning on doing further clean up work in this area, but this
patch is a good start. The cleaned up code also uses the more usual
"xattr" shorthand, I plan to eliminate the use of "eattr" eventually
and in the mean time it serves as a flag as to which bits of the code
have been updated.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/Makefile     |   2 +-
 fs/gfs2/acl.c        | 104 ++++++-------
 fs/gfs2/eaops.c      | 157 -------------------
 fs/gfs2/eaops.h      |  30 ----
 fs/gfs2/eattr.c      | 421 +++++++++++++++++++++++++++++----------------------
 fs/gfs2/eattr.h      |  52 ++-----
 fs/gfs2/file.c       |   1 -
 fs/gfs2/inode.c      |  12 +-
 fs/gfs2/ops_fstype.c |   1 +
 fs/gfs2/ops_inode.c  |  78 +++++-----
 fs/gfs2/super.h      |   1 +
 11 files changed, 333 insertions(+), 526 deletions(-)
 delete mode 100644 fs/gfs2/eaops.c
 delete mode 100644 fs/gfs2/eaops.h

(limited to 'fs')

diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile
index 3da2f1f4f738..2a4fd894858a 100644
--- a/fs/gfs2/Makefile
+++ b/fs/gfs2/Makefile
@@ -1,6 +1,6 @@
 EXTRA_CFLAGS := -I$(src)
 obj-$(CONFIG_GFS2_FS) += gfs2.o
-gfs2-y := acl.o bmap.o dir.o eaops.o eattr.o glock.o \
+gfs2-y := acl.o bmap.o dir.o eattr.o glock.o \
 	glops.o inode.o log.o lops.o main.o meta_io.o \
 	aops.o dentry.o export.o file.o \
 	ops_fstype.o ops_inode.o quota.o \
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index fa881bdc3d85..f6777f167260 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -19,7 +19,6 @@
 #include "gfs2.h"
 #include "incore.h"
 #include "acl.h"
-#include "eaops.h"
 #include "eattr.h"
 #include "glock.h"
 #include "inode.h"
@@ -31,8 +30,7 @@
 #define ACL_DEFAULT 0
 
 int gfs2_acl_validate_set(struct gfs2_inode *ip, int access,
-		      struct gfs2_ea_request *er,
-		      int *remove, mode_t *mode)
+			  struct gfs2_ea_request *er, int *remove, mode_t *mode)
 {
 	struct posix_acl *acl;
 	int error;
@@ -83,30 +81,20 @@ int gfs2_acl_validate_remove(struct gfs2_inode *ip, int access)
 	return 0;
 }
 
-static int acl_get(struct gfs2_inode *ip, int access, struct posix_acl **acl,
-		   struct gfs2_ea_location *el, char **data, unsigned int *len)
+static int acl_get(struct gfs2_inode *ip, const char *name,
+		   struct posix_acl **acl, struct gfs2_ea_location *el,
+		   char **datap, unsigned int *lenp)
 {
-	struct gfs2_ea_request er;
-	struct gfs2_ea_location el_this;
+	char *data;
+	unsigned int len;
 	int error;
 
+	el->el_bh = NULL;
+
 	if (!ip->i_eattr)
 		return 0;
 
-	memset(&er, 0, sizeof(struct gfs2_ea_request));
-	if (access) {
-		er.er_name = GFS2_POSIX_ACL_ACCESS;
-		er.er_name_len = GFS2_POSIX_ACL_ACCESS_LEN;
-	} else {
-		er.er_name = GFS2_POSIX_ACL_DEFAULT;
-		er.er_name_len = GFS2_POSIX_ACL_DEFAULT_LEN;
-	}
-	er.er_type = GFS2_EATYPE_SYS;
-
-	if (!el)
-		el = &el_this;
-
-	error = gfs2_ea_find(ip, &er, el);
+	error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, name, el);
 	if (error)
 		return error;
 	if (!el->el_ea)
@@ -114,32 +102,31 @@ static int acl_get(struct gfs2_inode *ip, int access, struct posix_acl **acl,
 	if (!GFS2_EA_DATA_LEN(el->el_ea))
 		goto out;
 
-	er.er_data_len = GFS2_EA_DATA_LEN(el->el_ea);
-	er.er_data = kmalloc(er.er_data_len, GFP_NOFS);
+	len = GFS2_EA_DATA_LEN(el->el_ea);
+	data = kmalloc(len, GFP_NOFS);
 	error = -ENOMEM;
-	if (!er.er_data)
+	if (!data)
 		goto out;
 
-	error = gfs2_ea_get_copy(ip, el, er.er_data);
-	if (error)
+	error = gfs2_ea_get_copy(ip, el, data, len);
+	if (error < 0)
 		goto out_kfree;
+	error = 0;
 
 	if (acl) {
-		*acl = posix_acl_from_xattr(er.er_data, er.er_data_len);
+		*acl = posix_acl_from_xattr(data, len);
 		if (IS_ERR(*acl))
 			error = PTR_ERR(*acl);
 	}
 
 out_kfree:
-	if (error || !data)
-		kfree(er.er_data);
-	else {
-		*data = er.er_data;
-		*len = er.er_data_len;
+	if (error || !datap) {
+		kfree(data);
+	} else {
+		*datap = data;
+		*lenp = len;
 	}
 out:
-	if (error || el == &el_this)
-		brelse(el->el_bh);
 	return error;
 }
 
@@ -153,10 +140,12 @@ out:
 
 int gfs2_check_acl(struct inode *inode, int mask)
 {
+	struct gfs2_ea_location el;
 	struct posix_acl *acl = NULL;
 	int error;
 
-	error = acl_get(GFS2_I(inode), ACL_ACCESS, &acl, NULL, NULL, NULL);
+	error = acl_get(GFS2_I(inode), GFS2_POSIX_ACL_ACCESS, &acl, &el, NULL, NULL);
+	brelse(el.el_bh);
 	if (error)
 		return error;
 
@@ -196,10 +185,12 @@ static int munge_mode(struct gfs2_inode *ip, mode_t mode)
 
 int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip)
 {
+	struct gfs2_ea_location el;
 	struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
 	struct posix_acl *acl = NULL, *clone;
-	struct gfs2_ea_request er;
 	mode_t mode = ip->i_inode.i_mode;
+	char *data = NULL;
+	unsigned int len;
 	int error;
 
 	if (!sdp->sd_args.ar_posix_acl)
@@ -207,11 +198,8 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip)
 	if (S_ISLNK(ip->i_inode.i_mode))
 		return 0;
 
-	memset(&er, 0, sizeof(struct gfs2_ea_request));
-	er.er_type = GFS2_EATYPE_SYS;
-
-	error = acl_get(dip, ACL_DEFAULT, &acl, NULL,
-			&er.er_data, &er.er_data_len);
+	error = acl_get(dip, GFS2_POSIX_ACL_DEFAULT, &acl, &el, &data, &len);
+	brelse(el.el_bh);
 	if (error)
 		return error;
 	if (!acl) {
@@ -229,9 +217,8 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip)
 	acl = clone;
 
 	if (S_ISDIR(ip->i_inode.i_mode)) {
-		er.er_name = GFS2_POSIX_ACL_DEFAULT;
-		er.er_name_len = GFS2_POSIX_ACL_DEFAULT_LEN;
-		error = gfs2_system_eaops.eo_set(ip, &er);
+		error = gfs2_xattr_set(&ip->i_inode, GFS2_EATYPE_SYS,
+				       GFS2_POSIX_ACL_DEFAULT, data, len, 0);
 		if (error)
 			goto out;
 	}
@@ -239,21 +226,19 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip)
 	error = posix_acl_create_masq(acl, &mode);
 	if (error < 0)
 		goto out;
-	if (error > 0) {
-		er.er_name = GFS2_POSIX_ACL_ACCESS;
-		er.er_name_len = GFS2_POSIX_ACL_ACCESS_LEN;
-		posix_acl_to_xattr(acl, er.er_data, er.er_data_len);
-		er.er_mode = mode;
-		er.er_flags = GFS2_ERF_MODE;
-		error = gfs2_system_eaops.eo_set(ip, &er);
-		if (error)
-			goto out;
-	} else
-		munge_mode(ip, mode);
+	if (error == 0)
+		goto munge;
 
+	posix_acl_to_xattr(acl, data, len);
+	error = gfs2_xattr_set(&ip->i_inode, GFS2_EATYPE_SYS,
+			       GFS2_POSIX_ACL_ACCESS, data, len, 0);
+	if (error)
+		goto out;
+munge:
+	error = munge_mode(ip, mode);
 out:
 	posix_acl_release(acl);
-	kfree(er.er_data);
+	kfree(data);
 	return error;
 }
 
@@ -265,9 +250,9 @@ int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr)
 	unsigned int len;
 	int error;
 
-	error = acl_get(ip, ACL_ACCESS, &acl, &el, &data, &len);
+	error = acl_get(ip, GFS2_POSIX_ACL_ACCESS, &acl, &el, &data, &len);
 	if (error)
-		return error;
+		goto out_brelse;
 	if (!acl)
 		return gfs2_setattr_simple(ip, attr);
 
@@ -286,8 +271,9 @@ int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr)
 
 out:
 	posix_acl_release(acl);
-	brelse(el.el_bh);
 	kfree(data);
+out_brelse:
+	brelse(el.el_bh);
 	return error;
 }
 
diff --git a/fs/gfs2/eaops.c b/fs/gfs2/eaops.c
deleted file mode 100644
index dee9b03e5b37..000000000000
--- a/fs/gfs2/eaops.c
+++ /dev/null
@@ -1,157 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-#include <linux/completion.h>
-#include <linux/buffer_head.h>
-#include <linux/capability.h>
-#include <linux/xattr.h>
-#include <linux/gfs2_ondisk.h>
-#include <asm/uaccess.h>
-
-#include "gfs2.h"
-#include "incore.h"
-#include "acl.h"
-#include "eaops.h"
-#include "eattr.h"
-#include "util.h"
-
-/**
- * gfs2_ea_name2type - get the type of the ea, and truncate type from the name
- * @namep: ea name, possibly with type appended
- *
- * Returns: GFS2_EATYPE_XXX
- */
-
-unsigned int gfs2_ea_name2type(const char *name, const char **truncated_name)
-{
-	unsigned int type;
-
-	if (strncmp(name, "system.", 7) == 0) {
-		type = GFS2_EATYPE_SYS;
-		if (truncated_name)
-			*truncated_name = name + sizeof("system.") - 1;
-	} else if (strncmp(name, "user.", 5) == 0) {
-		type = GFS2_EATYPE_USR;
-		if (truncated_name)
-			*truncated_name = name + sizeof("user.") - 1;
-	} else if (strncmp(name, "security.", 9) == 0) {
-		type = GFS2_EATYPE_SECURITY;
-		if (truncated_name)
-			*truncated_name = name + sizeof("security.") - 1;
-	} else {
-		type = GFS2_EATYPE_UNUSED;
-		if (truncated_name)
-			*truncated_name = NULL;
-	}
-
-	return type;
-}
-
-static int system_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
-{
-	if (!GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len) &&
-	    !GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len) &&
-	    !capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
-	if (GFS2_SB(&ip->i_inode)->sd_args.ar_posix_acl == 0 &&
-	    (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len) ||
-	     GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)))
-		return -EOPNOTSUPP;
-
-	return gfs2_ea_get_i(ip, er);
-}
-
-static int system_eo_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
-{
-	int remove = 0;
-	int error;
-
-	if (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len)) {
-		if (!(er->er_flags & GFS2_ERF_MODE)) {
-			er->er_mode = ip->i_inode.i_mode;
-			er->er_flags |= GFS2_ERF_MODE;
-		}
-		error = gfs2_acl_validate_set(ip, 1, er,
-					      &remove, &er->er_mode);
-		if (error)
-			return error;
-		error = gfs2_ea_set_i(ip, er);
-		if (error)
-			return error;
-		if (remove)
-			gfs2_ea_remove_i(ip, er);
-		return 0;
-
-	} else if (GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)) {
-		error = gfs2_acl_validate_set(ip, 0, er,
-					      &remove, NULL);
-		if (error)
-			return error;
-		if (!remove)
-			error = gfs2_ea_set_i(ip, er);
-		else {
-			error = gfs2_ea_remove_i(ip, er);
-			if (error == -ENODATA)
-				error = 0;
-		}
-		return error;
-	}
-
-	return -EPERM;
-}
-
-static int system_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
-{
-	if (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len)) {
-		int error = gfs2_acl_validate_remove(ip, 1);
-		if (error)
-			return error;
-
-	} else if (GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)) {
-		int error = gfs2_acl_validate_remove(ip, 0);
-		if (error)
-			return error;
-
-	} else
-		return -EPERM;
-
-	return gfs2_ea_remove_i(ip, er);
-}
-
-static const struct gfs2_eattr_operations gfs2_user_eaops = {
-	.eo_get = gfs2_ea_get_i,
-	.eo_set = gfs2_ea_set_i,
-	.eo_remove = gfs2_ea_remove_i,
-	.eo_name = "user",
-};
-
-const struct gfs2_eattr_operations gfs2_system_eaops = {
-	.eo_get = system_eo_get,
-	.eo_set = system_eo_set,
-	.eo_remove = system_eo_remove,
-	.eo_name = "system",
-};
-
-static const struct gfs2_eattr_operations gfs2_security_eaops = {
-	.eo_get = gfs2_ea_get_i,
-	.eo_set = gfs2_ea_set_i,
-	.eo_remove = gfs2_ea_remove_i,
-	.eo_name = "security",
-};
-
-const struct gfs2_eattr_operations *gfs2_ea_ops[] = {
-	NULL,
-	&gfs2_user_eaops,
-	&gfs2_system_eaops,
-	&gfs2_security_eaops,
-};
-
diff --git a/fs/gfs2/eaops.h b/fs/gfs2/eaops.h
deleted file mode 100644
index da2f7fbbb40d..000000000000
--- a/fs/gfs2/eaops.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-
-#ifndef __EAOPS_DOT_H__
-#define __EAOPS_DOT_H__
-
-struct gfs2_ea_request;
-struct gfs2_inode;
-
-struct gfs2_eattr_operations {
-	int (*eo_get) (struct gfs2_inode *ip, struct gfs2_ea_request *er);
-	int (*eo_set) (struct gfs2_inode *ip, struct gfs2_ea_request *er);
-	int (*eo_remove) (struct gfs2_inode *ip, struct gfs2_ea_request *er);
-	char *eo_name;
-};
-
-unsigned int gfs2_ea_name2type(const char *name, const char **truncated_name);
-
-extern const struct gfs2_eattr_operations gfs2_system_eaops;
-
-extern const struct gfs2_eattr_operations *gfs2_ea_ops[];
-
-#endif /* __EAOPS_DOT_H__ */
-
diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c
index 07ea9529adda..d9a9e260d5ac 100644
--- a/fs/gfs2/eattr.c
+++ b/fs/gfs2/eattr.c
@@ -18,7 +18,6 @@
 #include "gfs2.h"
 #include "incore.h"
 #include "acl.h"
-#include "eaops.h"
 #include "eattr.h"
 #include "glock.h"
 #include "inode.h"
@@ -38,26 +37,32 @@
  * Returns: 1 if the EA should be stuffed
  */
 
-static int ea_calc_size(struct gfs2_sbd *sdp, struct gfs2_ea_request *er,
+static int ea_calc_size(struct gfs2_sbd *sdp, unsigned int nsize, size_t dsize,
 			unsigned int *size)
 {
-	*size = GFS2_EAREQ_SIZE_STUFFED(er);
-	if (*size <= sdp->sd_jbsize)
+	unsigned int jbsize = sdp->sd_jbsize;
+
+	/* Stuffed */
+	*size = ALIGN(sizeof(struct gfs2_ea_header) + nsize + dsize, 8);
+
+	if (*size <= jbsize)
 		return 1;
 
-	*size = GFS2_EAREQ_SIZE_UNSTUFFED(sdp, er);
+	/* Unstuffed */
+	*size = ALIGN(sizeof(struct gfs2_ea_header) + nsize +
+		      (sizeof(__be64) * DIV_ROUND_UP(dsize, jbsize)), 8);
 
 	return 0;
 }
 
-static int ea_check_size(struct gfs2_sbd *sdp, struct gfs2_ea_request *er)
+static int ea_check_size(struct gfs2_sbd *sdp, unsigned int nsize, size_t dsize)
 {
 	unsigned int size;
 
-	if (er->er_data_len > GFS2_EA_MAX_DATA_LEN)
+	if (dsize > GFS2_EA_MAX_DATA_LEN)
 		return -ERANGE;
 
-	ea_calc_size(sdp, er, &size);
+	ea_calc_size(sdp, nsize, dsize, &size);
 
 	/* This can only happen with 512 byte blocks */
 	if (size > sdp->sd_jbsize)
@@ -151,7 +156,9 @@ out:
 }
 
 struct ea_find {
-	struct gfs2_ea_request *ef_er;
+	int type;
+	const char *name;
+	size_t namel;
 	struct gfs2_ea_location *ef_el;
 };
 
@@ -160,14 +167,13 @@ static int ea_find_i(struct gfs2_inode *ip, struct buffer_head *bh,
 		     void *private)
 {
 	struct ea_find *ef = private;
-	struct gfs2_ea_request *er = ef->ef_er;
 
 	if (ea->ea_type == GFS2_EATYPE_UNUSED)
 		return 0;
 
-	if (ea->ea_type == er->er_type) {
-		if (ea->ea_name_len == er->er_name_len &&
-		    !memcmp(GFS2_EA2NAME(ea), er->er_name, ea->ea_name_len)) {
+	if (ea->ea_type == ef->type) {
+		if (ea->ea_name_len == ef->namel &&
+		    !memcmp(GFS2_EA2NAME(ea), ef->name, ea->ea_name_len)) {
 			struct gfs2_ea_location *el = ef->ef_el;
 			get_bh(bh);
 			el->el_bh = bh;
@@ -180,13 +186,15 @@ static int ea_find_i(struct gfs2_inode *ip, struct buffer_head *bh,
 	return 0;
 }
 
-int gfs2_ea_find(struct gfs2_inode *ip, struct gfs2_ea_request *er,
+int gfs2_ea_find(struct gfs2_inode *ip, int type, const char *name,
 		 struct gfs2_ea_location *el)
 {
 	struct ea_find ef;
 	int error;
 
-	ef.ef_er = er;
+	ef.type = type;
+	ef.name = name;
+	ef.namel = strlen(name);
 	ef.ef_el = el;
 
 	memset(el, 0, sizeof(struct gfs2_ea_location));
@@ -344,6 +352,20 @@ struct ea_list {
 	unsigned int ei_size;
 };
 
+static inline unsigned int gfs2_ea_strlen(struct gfs2_ea_header *ea)
+{
+	switch (ea->ea_type) {
+	case GFS2_EATYPE_USR:
+		return 5 + ea->ea_name_len + 1;
+	case GFS2_EATYPE_SYS:
+		return 7 + ea->ea_name_len + 1;
+	case GFS2_EATYPE_SECURITY:
+		return 9 + ea->ea_name_len + 1;
+	default:
+		return 0;
+	}
+}
+
 static int ea_list_i(struct gfs2_inode *ip, struct buffer_head *bh,
 		     struct gfs2_ea_header *ea, struct gfs2_ea_header *prev,
 		     void *private)
@@ -392,21 +414,25 @@ static int ea_list_i(struct gfs2_inode *ip, struct buffer_head *bh,
 }
 
 /**
- * gfs2_ea_list -
- * @ip:
- * @er:
+ * gfs2_listxattr - List gfs2 extended attributes
+ * @dentry: The dentry whose inode we are interested in
+ * @buffer: The buffer to write the results
+ * @size: The size of the buffer
  *
  * Returns: actual size of data on success, -errno on error
  */
 
-int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size)
 {
+	struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
+	struct gfs2_ea_request er;
 	struct gfs2_holder i_gh;
 	int error;
 
-	if (!er->er_data || !er->er_data_len) {
-		er->er_data = NULL;
-		er->er_data_len = 0;
+	memset(&er, 0, sizeof(struct gfs2_ea_request));
+	if (size) {
+		er.er_data = buffer;
+		er.er_data_len = size;
 	}
 
 	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
@@ -414,7 +440,7 @@ int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er)
 		return error;
 
 	if (ip->i_eattr) {
-		struct ea_list ei = { .ei_er = er, .ei_size = 0 };
+		struct ea_list ei = { .ei_er = &er, .ei_size = 0 };
 
 		error = ea_foreach(ip, ea_list_i, &ei);
 		if (!error)
@@ -491,83 +517,60 @@ out:
 }
 
 int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el,
-		     char *data)
+		     char *data, size_t size)
 {
+	int ret;
+	size_t len = GFS2_EA_DATA_LEN(el->el_ea);
+	if (len > size)
+		return -ERANGE;
+
 	if (GFS2_EA_IS_STUFFED(el->el_ea)) {
-		memcpy(data, GFS2_EA2DATA(el->el_ea), GFS2_EA_DATA_LEN(el->el_ea));
-		return 0;
-	} else
-		return ea_get_unstuffed(ip, el->el_ea, data);
+		memcpy(data, GFS2_EA2DATA(el->el_ea), len);
+		return len;
+	}
+	ret = ea_get_unstuffed(ip, el->el_ea, data);
+	if (ret < 0)
+		return ret;
+	return len;
 }
 
 /**
- * gfs2_ea_get_i -
- * @ip: The GFS2 inode
- * @er: The request structure
+ * gfs2_xattr_get - Get a GFS2 extended attribute
+ * @inode: The inode
+ * @type: The type of extended attribute
+ * @name: The name of the extended attribute
+ * @buffer: The buffer to write the result into
+ * @size: The size of the buffer
  *
  * Returns: actual size of data on success, -errno on error
  */
 
-int gfs2_ea_get_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+int gfs2_xattr_get(struct inode *inode, int type, const char *name,
+		   void *buffer, size_t size)
 {
+	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_ea_location el;
 	int error;
 
 	if (!ip->i_eattr)
 		return -ENODATA;
+	if (strlen(name) > GFS2_EA_MAX_NAME_LEN)
+		return -EINVAL;
 
-	error = gfs2_ea_find(ip, er, &el);
+	error = gfs2_ea_find(ip, type, name, &el);
 	if (error)
 		return error;
 	if (!el.el_ea)
 		return -ENODATA;
-
-	if (er->er_data_len) {
-		if (GFS2_EA_DATA_LEN(el.el_ea) > er->er_data_len)
-			error =  -ERANGE;
-		else
-			error = gfs2_ea_get_copy(ip, &el, er->er_data);
-	}
-	if (!error)
+	if (size) 
+		error = gfs2_ea_get_copy(ip, &el, buffer, size);
+	else
 		error = GFS2_EA_DATA_LEN(el.el_ea);
-
 	brelse(el.el_bh);
 
 	return error;
 }
 
-/**
- * gfs2_ea_get -
- * @ip: The GFS2 inode
- * @er: The request structure
- *
- * Returns: actual size of data on success, -errno on error
- */
-
-int gfs2_ea_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
-{
-	struct gfs2_holder i_gh;
-	int error;
-
-	if (!er->er_name_len ||
-	    er->er_name_len > GFS2_EA_MAX_NAME_LEN)
-		return -EINVAL;
-	if (!er->er_data || !er->er_data_len) {
-		er->er_data = NULL;
-		er->er_data_len = 0;
-	}
-
-	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
-	if (error)
-		return error;
-
-	error = gfs2_ea_ops[er->er_type]->eo_get(ip, er);
-
-	gfs2_glock_dq_uninit(&i_gh);
-
-	return error;
-}
-
 /**
  * ea_alloc_blk - allocates a new block for extended attributes.
  * @ip: A pointer to the inode that's getting extended attributes
@@ -713,12 +716,6 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
 
 	error = gfs2_meta_inode_buffer(ip, &dibh);
 	if (!error) {
-		if (er->er_flags & GFS2_ERF_MODE) {
-			gfs2_assert_withdraw(GFS2_SB(&ip->i_inode),
-					    (ip->i_inode.i_mode & S_IFMT) ==
-					    (er->er_mode & S_IFMT));
-			ip->i_inode.i_mode = er->er_mode;
-		}
 		ip->i_inode.i_ctime = CURRENT_TIME;
 		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
 		gfs2_dinode_out(ip, dibh->b_data);
@@ -762,15 +759,23 @@ static int ea_init_i(struct gfs2_inode *ip, struct gfs2_ea_request *er,
  * Returns: errno
  */
 
-static int ea_init(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+static int ea_init(struct gfs2_inode *ip, int type, const char *name,
+		   const void *data, size_t size)
 {
+	struct gfs2_ea_request er;
 	unsigned int jbsize = GFS2_SB(&ip->i_inode)->sd_jbsize;
 	unsigned int blks = 1;
 
-	if (GFS2_EAREQ_SIZE_STUFFED(er) > jbsize)
-		blks += DIV_ROUND_UP(er->er_data_len, jbsize);
+	er.er_type = type;
+	er.er_name = name;
+	er.er_name_len = strlen(name);
+	er.er_data = (void *)data;
+	er.er_data_len = size;
+
+	if (GFS2_EAREQ_SIZE_STUFFED(&er) > jbsize)
+		blks += DIV_ROUND_UP(er.er_data_len, jbsize);
 
-	return ea_alloc_skeleton(ip, er, blks, ea_init_i, NULL);
+	return ea_alloc_skeleton(ip, &er, blks, ea_init_i, NULL);
 }
 
 static struct gfs2_ea_header *ea_split_ea(struct gfs2_ea_header *ea)
@@ -848,12 +853,6 @@ static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh,
 	error = gfs2_meta_inode_buffer(ip, &dibh);
 	if (error)
 		goto out;
-
-	if (er->er_flags & GFS2_ERF_MODE) {
-		gfs2_assert_withdraw(GFS2_SB(&ip->i_inode),
-			(ip->i_inode.i_mode & S_IFMT) == (er->er_mode & S_IFMT));
-		ip->i_inode.i_mode = er->er_mode;
-	}
 	ip->i_inode.i_ctime = CURRENT_TIME;
 	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
 	gfs2_dinode_out(ip, dibh->b_data);
@@ -894,7 +893,8 @@ static int ea_set_simple(struct gfs2_inode *ip, struct buffer_head *bh,
 	int stuffed;
 	int error;
 
-	stuffed = ea_calc_size(GFS2_SB(&ip->i_inode), es->es_er, &size);
+	stuffed = ea_calc_size(GFS2_SB(&ip->i_inode), es->es_er->er_name_len,
+			       es->es_er->er_data_len, &size);
 
 	if (ea->ea_type == GFS2_EATYPE_UNUSED) {
 		if (GFS2_EA_REC_LEN(ea) < size)
@@ -1005,15 +1005,22 @@ out:
 	return error;
 }
 
-static int ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er,
-		    struct gfs2_ea_location *el)
+static int ea_set_i(struct gfs2_inode *ip, int type, const char *name,
+		    const void *value, size_t size, struct gfs2_ea_location *el)
 {
+	struct gfs2_ea_request er;
 	struct ea_set es;
 	unsigned int blks = 2;
 	int error;
 
+	er.er_type = type;
+	er.er_name = name;
+	er.er_data = (void *)value;
+	er.er_name_len = strlen(name);
+	er.er_data_len = size;
+
 	memset(&es, 0, sizeof(struct ea_set));
-	es.es_er = er;
+	es.es_er = &er;
 	es.es_el = el;
 
 	error = ea_foreach(ip, ea_set_simple, &es);
@@ -1024,10 +1031,10 @@ static int ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er,
 
 	if (!(ip->i_diskflags & GFS2_DIF_EA_INDIRECT))
 		blks++;
-	if (GFS2_EAREQ_SIZE_STUFFED(er) > GFS2_SB(&ip->i_inode)->sd_jbsize)
-		blks += DIV_ROUND_UP(er->er_data_len, GFS2_SB(&ip->i_inode)->sd_jbsize);
+	if (GFS2_EAREQ_SIZE_STUFFED(&er) > GFS2_SB(&ip->i_inode)->sd_jbsize)
+		blks += DIV_ROUND_UP(er.er_data_len, GFS2_SB(&ip->i_inode)->sd_jbsize);
 
-	return ea_alloc_skeleton(ip, er, blks, ea_set_block, el);
+	return ea_alloc_skeleton(ip, &er, blks, ea_set_block, el);
 }
 
 static int ea_set_remove_unstuffed(struct gfs2_inode *ip,
@@ -1042,74 +1049,6 @@ static int ea_set_remove_unstuffed(struct gfs2_inode *ip,
 	return ea_remove_unstuffed(ip, el->el_bh, el->el_ea, el->el_prev,0);
 }
 
-int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
-{
-	struct gfs2_ea_location el;
-	int error;
-
-	if (!ip->i_eattr) {
-		if (er->er_flags & XATTR_REPLACE)
-			return -ENODATA;
-		return ea_init(ip, er);
-	}
-
-	error = gfs2_ea_find(ip, er, &el);
-	if (error)
-		return error;
-
-	if (el.el_ea) {
-		if (ip->i_diskflags & GFS2_DIF_APPENDONLY) {
-			brelse(el.el_bh);
-			return -EPERM;
-		}
-
-		error = -EEXIST;
-		if (!(er->er_flags & XATTR_CREATE)) {
-			int unstuffed = !GFS2_EA_IS_STUFFED(el.el_ea);
-			error = ea_set_i(ip, er, &el);
-			if (!error && unstuffed)
-				ea_set_remove_unstuffed(ip, &el);
-		}
-
-		brelse(el.el_bh);
-	} else {
-		error = -ENODATA;
-		if (!(er->er_flags & XATTR_REPLACE))
-			error = ea_set_i(ip, er, NULL);
-	}
-
-	return error;
-}
-
-int gfs2_ea_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
-{
-	struct gfs2_holder i_gh;
-	int error;
-
-	if (!er->er_name_len || er->er_name_len > GFS2_EA_MAX_NAME_LEN)
-		return -EINVAL;
-	if (!er->er_data || !er->er_data_len) {
-		er->er_data = NULL;
-		er->er_data_len = 0;
-	}
-	error = ea_check_size(GFS2_SB(&ip->i_inode), er);
-	if (error)
-		return error;
-
-	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
-	if (error)
-		return error;
-
-	if (IS_IMMUTABLE(&ip->i_inode))
-		error = -EPERM;
-	else
-		error = gfs2_ea_ops[er->er_type]->eo_set(ip, er);
-
-	gfs2_glock_dq_uninit(&i_gh);
-
-	return error;
-}
-
 static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el)
 {
 	struct gfs2_ea_header *ea = el->el_ea;
@@ -1131,8 +1070,9 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el)
 
 		if (GFS2_EA_IS_LAST(ea))
 			prev->ea_flags |= GFS2_EAFLAG_LAST;
-	} else
+	} else {
 		ea->ea_type = GFS2_EATYPE_UNUSED;
+	}
 
 	error = gfs2_meta_inode_buffer(ip, &dibh);
 	if (!error) {
@@ -1147,15 +1087,29 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el)
 	return error;
 }
 
-int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+/**
+ * gfs2_xattr_remove - Remove a GFS2 extended attribute
+ * @inode: The inode
+ * @type: The type of the extended attribute
+ * @name: The name of the extended attribute
+ *
+ * This is not called directly by the VFS since we use the (common)
+ * scheme of making a "set with NULL data" mean a remove request. Note
+ * that this is different from a set with zero length data.
+ *
+ * Returns: 0, or errno on failure
+ */
+
+static int gfs2_xattr_remove(struct inode *inode, int type, const char *name)
 {
+	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_ea_location el;
 	int error;
 
 	if (!ip->i_eattr)
 		return -ENODATA;
 
-	error = gfs2_ea_find(ip, er, &el);
+	error = gfs2_ea_find(ip, type, name, &el);
 	if (error)
 		return error;
 	if (!el.el_ea)
@@ -1164,8 +1118,7 @@ int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
 	if (GFS2_EA_IS_STUFFED(el.el_ea))
 		error = ea_remove_stuffed(ip, &el);
 	else
-		error = ea_remove_unstuffed(ip, el.el_bh, el.el_ea, el.el_prev,
-					    0);
+		error = ea_remove_unstuffed(ip, el.el_bh, el.el_ea, el.el_prev, 0);
 
 	brelse(el.el_bh);
 
@@ -1173,31 +1126,70 @@ int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
 }
 
 /**
- * gfs2_ea_remove - sets (or creates or replaces) an extended attribute
- * @ip: pointer to the inode of the target file
- * @er: request information
+ * gfs2_xattr_set - Set (or remove) a GFS2 extended attribute
+ * @inode: The inode
+ * @type: The type of the extended attribute
+ * @name: The name of the extended attribute
+ * @value: The value of the extended attribute (NULL for remove)
+ * @size: The size of the @value argument
+ * @flags: Create or Replace
  *
- * Returns: errno
+ * See gfs2_xattr_remove() for details of the removal of xattrs.
+ *
+ * Returns: 0 or errno on failure
  */
 
-int gfs2_ea_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+int gfs2_xattr_set(struct inode *inode, int type, const char *name,
+		   const void *value, size_t size, int flags)
 {
-	struct gfs2_holder i_gh;
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_ea_location el;
+	unsigned int namel = strlen(name);
 	int error;
 
-	if (!er->er_name_len || er->er_name_len > GFS2_EA_MAX_NAME_LEN)
-		return -EINVAL;
+	if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
+		return -EPERM;
+	if (namel > GFS2_EA_MAX_NAME_LEN)
+		return -ERANGE;
 
-	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
+	if (value == NULL)
+		return gfs2_xattr_remove(inode, type, name);
+
+	if (ea_check_size(sdp, namel, size))
+		return -ERANGE;
+
+	if (!ip->i_eattr) {
+		if (flags & XATTR_REPLACE)
+			return -ENODATA;
+		return ea_init(ip, type, name, value, size);
+	}
+
+	error = gfs2_ea_find(ip, type, name, &el);
 	if (error)
 		return error;
 
-	if (IS_IMMUTABLE(&ip->i_inode) || IS_APPEND(&ip->i_inode))
-		error = -EPERM;
-	else
-		error = gfs2_ea_ops[er->er_type]->eo_remove(ip, er);
+	if (el.el_ea) {
+		if (ip->i_diskflags & GFS2_DIF_APPENDONLY) {
+			brelse(el.el_bh);
+			return -EPERM;
+		}
 
-	gfs2_glock_dq_uninit(&i_gh);
+		error = -EEXIST;
+		if (!(flags & XATTR_CREATE)) {
+			int unstuffed = !GFS2_EA_IS_STUFFED(el.el_ea);
+			error = ea_set_i(ip, type, name, value, size, &el);
+			if (!error && unstuffed)
+				ea_set_remove_unstuffed(ip, &el);
+		}
+
+		brelse(el.el_bh);
+		return error;
+	}
+
+	error = -ENODATA;
+	if (!(flags & XATTR_REPLACE))
+		error = ea_set_i(ip, type, name, value, size, NULL);
 
 	return error;
 }
@@ -1503,3 +1495,64 @@ out_alloc:
 	return error;
 }
 
+static int gfs2_xattr_user_get(struct inode *inode, const char *name,
+			       void *buffer, size_t size)
+{
+	return gfs2_xattr_get(inode, GFS2_EATYPE_USR, name, buffer, size);
+}
+
+static int gfs2_xattr_user_set(struct inode *inode, const char *name,
+			       const void *value, size_t size, int flags)
+{
+	return gfs2_xattr_set(inode, GFS2_EATYPE_USR, name, value, size, flags);
+}
+
+static int gfs2_xattr_system_get(struct inode *inode, const char *name,
+				 void *buffer, size_t size)
+{
+	return gfs2_xattr_get(inode, GFS2_EATYPE_SYS, name, buffer, size);
+}
+
+static int gfs2_xattr_system_set(struct inode *inode, const char *name,
+				 const void *value, size_t size, int flags)
+{
+	return gfs2_xattr_set(inode, GFS2_EATYPE_SYS, name, value, size, flags);
+}
+
+static int gfs2_xattr_security_get(struct inode *inode, const char *name,
+				   void *buffer, size_t size)
+{
+	return gfs2_xattr_get(inode, GFS2_EATYPE_SECURITY, name, buffer, size);
+}
+
+static int gfs2_xattr_security_set(struct inode *inode, const char *name,
+				   const void *value, size_t size, int flags)
+{
+	return gfs2_xattr_set(inode, GFS2_EATYPE_SECURITY, name, value, size, flags);
+}
+
+static struct xattr_handler gfs2_xattr_user_handler = {
+	.prefix = XATTR_USER_PREFIX,
+	.get    = gfs2_xattr_user_get,
+	.set    = gfs2_xattr_user_set,
+};
+
+static struct xattr_handler gfs2_xattr_security_handler = {
+	.prefix = XATTR_SECURITY_PREFIX,
+	.get    = gfs2_xattr_security_get,
+	.set    = gfs2_xattr_security_set,
+};
+
+static struct xattr_handler gfs2_xattr_system_handler = {
+	.prefix = XATTR_SYSTEM_PREFIX,
+	.get    = gfs2_xattr_system_get,
+	.set    = gfs2_xattr_system_set,
+};
+
+struct xattr_handler *gfs2_xattr_handlers[] = {
+	&gfs2_xattr_user_handler,
+	&gfs2_xattr_security_handler,
+	&gfs2_xattr_system_handler,
+	NULL,
+};
+
diff --git a/fs/gfs2/eattr.h b/fs/gfs2/eattr.h
index c82dbe01d713..4040a188f63b 100644
--- a/fs/gfs2/eattr.h
+++ b/fs/gfs2/eattr.h
@@ -27,10 +27,6 @@ ALIGN(sizeof(struct gfs2_ea_header) + (ea)->ea_name_len + \
 #define GFS2_EAREQ_SIZE_STUFFED(er) \
 ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + (er)->er_data_len, 8)
 
-#define GFS2_EAREQ_SIZE_UNSTUFFED(sdp, er) \
-ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + \
-      sizeof(__be64) * DIV_ROUND_UP((er)->er_data_len, (sdp)->sd_jbsize), 8)
-
 #define GFS2_EA2NAME(ea) ((char *)((struct gfs2_ea_header *)(ea) + 1))
 #define GFS2_EA2DATA(ea) (GFS2_EA2NAME(ea) + (ea)->ea_name_len)
 
@@ -43,16 +39,12 @@ ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + \
 #define GFS2_EA_BH2FIRST(bh) \
 ((struct gfs2_ea_header *)((bh)->b_data + sizeof(struct gfs2_meta_header)))
 
-#define GFS2_ERF_MODE 0x80000000
-
 struct gfs2_ea_request {
 	const char *er_name;
 	char *er_data;
 	unsigned int er_name_len;
 	unsigned int er_data_len;
 	unsigned int er_type; /* GFS2_EATYPE_... */
-	int er_flags;
-	mode_t er_mode;
 };
 
 struct gfs2_ea_location {
@@ -61,40 +53,20 @@ struct gfs2_ea_location {
 	struct gfs2_ea_header *el_prev;
 };
 
-int gfs2_ea_get_i(struct gfs2_inode *ip, struct gfs2_ea_request *er);
-int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er);
-int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er);
-
-int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er);
-int gfs2_ea_get(struct gfs2_inode *ip, struct gfs2_ea_request *er);
-int gfs2_ea_set(struct gfs2_inode *ip, struct gfs2_ea_request *er);
-int gfs2_ea_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er);
-
-int gfs2_ea_dealloc(struct gfs2_inode *ip);
+extern int gfs2_xattr_get(struct inode *inode, int type, const char *name,
+			  void *buffer, size_t size);
+extern int gfs2_xattr_set(struct inode *inode, int type, const char *name,
+			  const void *value, size_t size, int flags);
+extern ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size);
+extern int gfs2_ea_dealloc(struct gfs2_inode *ip);
 
 /* Exported to acl.c */
 
-int gfs2_ea_find(struct gfs2_inode *ip,
-		 struct gfs2_ea_request *er,
-		 struct gfs2_ea_location *el);
-int gfs2_ea_get_copy(struct gfs2_inode *ip,
-		     struct gfs2_ea_location *el,
-		     char *data);
-int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el,
-		      struct iattr *attr, char *data);
-
-static inline unsigned int gfs2_ea_strlen(struct gfs2_ea_header *ea)
-{
-	switch (ea->ea_type) {
-	case GFS2_EATYPE_USR:
-		return 5 + ea->ea_name_len + 1;
-	case GFS2_EATYPE_SYS:
-		return 7 + ea->ea_name_len + 1;
-	case GFS2_EATYPE_SECURITY:
-		return 9 + ea->ea_name_len + 1;
-	default:
-		return 0;
-	}
-}
+extern int gfs2_ea_find(struct gfs2_inode *ip, int type, const char *name,
+			struct gfs2_ea_location *el);
+extern int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el,
+			    char *data, size_t size);
+extern int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el,
+			     struct iattr *attr, char *data);
 
 #endif /* __EATTR_DOT_H__ */
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 73318a3ce6f1..166f38fbd246 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -38,7 +38,6 @@
 #include "rgrp.h"
 #include "trans.h"
 #include "util.h"
-#include "eaops.h"
 
 /**
  * gfs2_llseek - seek to a location in a file
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index f9b4fe886540..f936d2d68138 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -924,7 +924,6 @@ static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip)
 	size_t len;
 	void *value;
 	char *name;
-	struct gfs2_ea_request er;
 
 	err = security_inode_init_security(&ip->i_inode, &dip->i_inode,
 					   &name, &value, &len);
@@ -935,16 +934,7 @@ static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip)
 		return err;
 	}
 
-	memset(&er, 0, sizeof(struct gfs2_ea_request));
-
-	er.er_type = GFS2_EATYPE_SECURITY;
-	er.er_name = name;
-	er.er_data = value;
-	er.er_name_len = strlen(name);
-	er.er_data_len = len;
-
-	err = gfs2_ea_set_i(ip, &er);
-
+	err = gfs2_xattr_set(&ip->i_inode, GFS2_EATYPE_SECURITY, name, value, len, 0);
 	kfree(value);
 	kfree(name);
 
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 165518a1041e..1ec69e63c694 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -1186,6 +1186,7 @@ static int fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_magic = GFS2_MAGIC;
 	sb->s_op = &gfs2_super_ops;
 	sb->s_export_op = &gfs2_export_ops;
+	sb->s_xattr = gfs2_xattr_handlers;
 	sb->s_time_gran = 1;
 	sb->s_maxbytes = MAX_LFS_FILESIZE;
 
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index ce551f85524f..1ee0c2657467 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -26,7 +26,6 @@
 #include "acl.h"
 #include "bmap.h"
 #include "dir.h"
-#include "eaops.h"
 #include "eattr.h"
 #include "glock.h"
 #include "inode.h"
@@ -1302,60 +1301,53 @@ static int gfs2_setxattr(struct dentry *dentry, const char *name,
 			 const void *data, size_t size, int flags)
 {
 	struct inode *inode = dentry->d_inode;
-	struct gfs2_ea_request er;
-
-	memset(&er, 0, sizeof(struct gfs2_ea_request));
-	er.er_type = gfs2_ea_name2type(name, &er.er_name);
-	if (er.er_type == GFS2_EATYPE_UNUSED)
-		return -EOPNOTSUPP;
-	er.er_data = (char *)data;
-	er.er_name_len = strlen(er.er_name);
-	er.er_data_len = size;
-	er.er_flags = flags;
-
-	gfs2_assert_warn(GFS2_SB(inode), !(er.er_flags & GFS2_ERF_MODE));
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_holder gh;
+	int ret;
 
-	return gfs2_ea_set(GFS2_I(inode), &er);
+	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
+	ret = gfs2_glock_nq(&gh);
+	if (ret == 0) {
+		ret = generic_setxattr(dentry, name, data, size, flags);
+		gfs2_glock_dq(&gh);
+	}
+	gfs2_holder_uninit(&gh);
+	return ret;
 }
 
 static ssize_t gfs2_getxattr(struct dentry *dentry, const char *name,
 			     void *data, size_t size)
 {
-	struct gfs2_ea_request er;
-
-	memset(&er, 0, sizeof(struct gfs2_ea_request));
-	er.er_type = gfs2_ea_name2type(name, &er.er_name);
-	if (er.er_type == GFS2_EATYPE_UNUSED)
-		return -EOPNOTSUPP;
-	er.er_data = data;
-	er.er_name_len = strlen(er.er_name);
-	er.er_data_len = size;
-
-	return gfs2_ea_get(GFS2_I(dentry->d_inode), &er);
-}
-
-static ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size)
-{
-	struct gfs2_ea_request er;
-
-	memset(&er, 0, sizeof(struct gfs2_ea_request));
-	er.er_data = (size) ? buffer : NULL;
-	er.er_data_len = size;
+	struct inode *inode = dentry->d_inode;
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_holder gh;
+	int ret;
 
-	return gfs2_ea_list(GFS2_I(dentry->d_inode), &er);
+	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
+	ret = gfs2_glock_nq(&gh);
+	if (ret == 0) {
+		ret = generic_getxattr(dentry, name, data, size);
+		gfs2_glock_dq(&gh);
+	}
+	gfs2_holder_uninit(&gh);
+	return ret;
 }
 
 static int gfs2_removexattr(struct dentry *dentry, const char *name)
 {
-	struct gfs2_ea_request er;
-
-	memset(&er, 0, sizeof(struct gfs2_ea_request));
-	er.er_type = gfs2_ea_name2type(name, &er.er_name);
-	if (er.er_type == GFS2_EATYPE_UNUSED)
-		return -EOPNOTSUPP;
-	er.er_name_len = strlen(er.er_name);
+	struct inode *inode = dentry->d_inode;
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_holder gh;
+	int ret;
 
-	return gfs2_ea_remove(GFS2_I(dentry->d_inode), &er);
+	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
+	ret = gfs2_glock_nq(&gh);
+	if (ret == 0) {
+		ret = generic_removexattr(dentry, name);
+		gfs2_glock_dq(&gh);
+	}
+	gfs2_holder_uninit(&gh);
+	return ret;
 }
 
 static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index 911c954cefbd..235db3682885 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -54,6 +54,7 @@ extern struct file_system_type gfs2meta_fs_type;
 extern const struct export_operations gfs2_export_ops;
 extern const struct super_operations gfs2_super_ops;
 extern const struct dentry_operations gfs2_dops;
+extern struct xattr_handler *gfs2_xattr_handlers[];
 
 #endif /* __SUPER_DOT_H__ */
 
-- 
cgit v1.2.3-70-g09d2


From 307cf6e63cfa5025589ea1a06db44439a43819ff Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 26 Aug 2009 18:51:04 +0100
Subject: GFS2: Rename eattr.[ch] as xattr.[ch]

Use the more conventional name for the extended attribute
support code. Update all the places which care.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/Makefile    |    2 +-
 fs/gfs2/acl.c       |    2 +-
 fs/gfs2/eattr.c     | 1558 ---------------------------------------------------
 fs/gfs2/eattr.h     |   72 ---
 fs/gfs2/inode.c     |    2 +-
 fs/gfs2/ops_inode.c |    2 +-
 fs/gfs2/super.c     |    2 +-
 fs/gfs2/xattr.c     | 1558 +++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/gfs2/xattr.h     |   72 +++
 9 files changed, 1635 insertions(+), 1635 deletions(-)
 delete mode 100644 fs/gfs2/eattr.c
 delete mode 100644 fs/gfs2/eattr.h
 create mode 100644 fs/gfs2/xattr.c
 create mode 100644 fs/gfs2/xattr.h

(limited to 'fs')

diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile
index 2a4fd894858a..21f7e46da4c0 100644
--- a/fs/gfs2/Makefile
+++ b/fs/gfs2/Makefile
@@ -1,6 +1,6 @@
 EXTRA_CFLAGS := -I$(src)
 obj-$(CONFIG_GFS2_FS) += gfs2.o
-gfs2-y := acl.o bmap.o dir.o eattr.o glock.o \
+gfs2-y := acl.o bmap.o dir.o xattr.o glock.o \
 	glops.o inode.o log.o lops.o main.o meta_io.o \
 	aops.o dentry.o export.o file.o \
 	ops_fstype.o ops_inode.o quota.o \
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index f6777f167260..3fc4e3ac7d84 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -19,7 +19,7 @@
 #include "gfs2.h"
 #include "incore.h"
 #include "acl.h"
-#include "eattr.h"
+#include "xattr.h"
 #include "glock.h"
 #include "inode.h"
 #include "meta_io.h"
diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c
deleted file mode 100644
index d9a9e260d5ac..000000000000
--- a/fs/gfs2/eattr.c
+++ /dev/null
@@ -1,1558 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-#include <linux/completion.h>
-#include <linux/buffer_head.h>
-#include <linux/xattr.h>
-#include <linux/gfs2_ondisk.h>
-#include <asm/uaccess.h>
-
-#include "gfs2.h"
-#include "incore.h"
-#include "acl.h"
-#include "eattr.h"
-#include "glock.h"
-#include "inode.h"
-#include "meta_io.h"
-#include "quota.h"
-#include "rgrp.h"
-#include "trans.h"
-#include "util.h"
-
-/**
- * ea_calc_size - returns the acutal number of bytes the request will take up
- *                (not counting any unstuffed data blocks)
- * @sdp:
- * @er:
- * @size:
- *
- * Returns: 1 if the EA should be stuffed
- */
-
-static int ea_calc_size(struct gfs2_sbd *sdp, unsigned int nsize, size_t dsize,
-			unsigned int *size)
-{
-	unsigned int jbsize = sdp->sd_jbsize;
-
-	/* Stuffed */
-	*size = ALIGN(sizeof(struct gfs2_ea_header) + nsize + dsize, 8);
-
-	if (*size <= jbsize)
-		return 1;
-
-	/* Unstuffed */
-	*size = ALIGN(sizeof(struct gfs2_ea_header) + nsize +
-		      (sizeof(__be64) * DIV_ROUND_UP(dsize, jbsize)), 8);
-
-	return 0;
-}
-
-static int ea_check_size(struct gfs2_sbd *sdp, unsigned int nsize, size_t dsize)
-{
-	unsigned int size;
-
-	if (dsize > GFS2_EA_MAX_DATA_LEN)
-		return -ERANGE;
-
-	ea_calc_size(sdp, nsize, dsize, &size);
-
-	/* This can only happen with 512 byte blocks */
-	if (size > sdp->sd_jbsize)
-		return -ERANGE;
-
-	return 0;
-}
-
-typedef int (*ea_call_t) (struct gfs2_inode *ip, struct buffer_head *bh,
-			  struct gfs2_ea_header *ea,
-			  struct gfs2_ea_header *prev, void *private);
-
-static int ea_foreach_i(struct gfs2_inode *ip, struct buffer_head *bh,
-			ea_call_t ea_call, void *data)
-{
-	struct gfs2_ea_header *ea, *prev = NULL;
-	int error = 0;
-
-	if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh, GFS2_METATYPE_EA))
-		return -EIO;
-
-	for (ea = GFS2_EA_BH2FIRST(bh);; prev = ea, ea = GFS2_EA2NEXT(ea)) {
-		if (!GFS2_EA_REC_LEN(ea))
-			goto fail;
-		if (!(bh->b_data <= (char *)ea && (char *)GFS2_EA2NEXT(ea) <=
-						  bh->b_data + bh->b_size))
-			goto fail;
-		if (!GFS2_EATYPE_VALID(ea->ea_type))
-			goto fail;
-
-		error = ea_call(ip, bh, ea, prev, data);
-		if (error)
-			return error;
-
-		if (GFS2_EA_IS_LAST(ea)) {
-			if ((char *)GFS2_EA2NEXT(ea) !=
-			    bh->b_data + bh->b_size)
-				goto fail;
-			break;
-		}
-	}
-
-	return error;
-
-fail:
-	gfs2_consist_inode(ip);
-	return -EIO;
-}
-
-static int ea_foreach(struct gfs2_inode *ip, ea_call_t ea_call, void *data)
-{
-	struct buffer_head *bh, *eabh;
-	__be64 *eablk, *end;
-	int error;
-
-	error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, &bh);
-	if (error)
-		return error;
-
-	if (!(ip->i_diskflags & GFS2_DIF_EA_INDIRECT)) {
-		error = ea_foreach_i(ip, bh, ea_call, data);
-		goto out;
-	}
-
-	if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh, GFS2_METATYPE_IN)) {
-		error = -EIO;
-		goto out;
-	}
-
-	eablk = (__be64 *)(bh->b_data + sizeof(struct gfs2_meta_header));
-	end = eablk + GFS2_SB(&ip->i_inode)->sd_inptrs;
-
-	for (; eablk < end; eablk++) {
-		u64 bn;
-
-		if (!*eablk)
-			break;
-		bn = be64_to_cpu(*eablk);
-
-		error = gfs2_meta_read(ip->i_gl, bn, DIO_WAIT, &eabh);
-		if (error)
-			break;
-		error = ea_foreach_i(ip, eabh, ea_call, data);
-		brelse(eabh);
-		if (error)
-			break;
-	}
-out:
-	brelse(bh);
-	return error;
-}
-
-struct ea_find {
-	int type;
-	const char *name;
-	size_t namel;
-	struct gfs2_ea_location *ef_el;
-};
-
-static int ea_find_i(struct gfs2_inode *ip, struct buffer_head *bh,
-		     struct gfs2_ea_header *ea, struct gfs2_ea_header *prev,
-		     void *private)
-{
-	struct ea_find *ef = private;
-
-	if (ea->ea_type == GFS2_EATYPE_UNUSED)
-		return 0;
-
-	if (ea->ea_type == ef->type) {
-		if (ea->ea_name_len == ef->namel &&
-		    !memcmp(GFS2_EA2NAME(ea), ef->name, ea->ea_name_len)) {
-			struct gfs2_ea_location *el = ef->ef_el;
-			get_bh(bh);
-			el->el_bh = bh;
-			el->el_ea = ea;
-			el->el_prev = prev;
-			return 1;
-		}
-	}
-
-	return 0;
-}
-
-int gfs2_ea_find(struct gfs2_inode *ip, int type, const char *name,
-		 struct gfs2_ea_location *el)
-{
-	struct ea_find ef;
-	int error;
-
-	ef.type = type;
-	ef.name = name;
-	ef.namel = strlen(name);
-	ef.ef_el = el;
-
-	memset(el, 0, sizeof(struct gfs2_ea_location));
-
-	error = ea_foreach(ip, ea_find_i, &ef);
-	if (error > 0)
-		return 0;
-
-	return error;
-}
-
-/**
- * ea_dealloc_unstuffed -
- * @ip:
- * @bh:
- * @ea:
- * @prev:
- * @private:
- *
- * Take advantage of the fact that all unstuffed blocks are
- * allocated from the same RG.  But watch, this may not always
- * be true.
- *
- * Returns: errno
- */
-
-static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
-				struct gfs2_ea_header *ea,
-				struct gfs2_ea_header *prev, void *private)
-{
-	int *leave = private;
-	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-	struct gfs2_rgrpd *rgd;
-	struct gfs2_holder rg_gh;
-	struct buffer_head *dibh;
-	__be64 *dataptrs;
-	u64 bn = 0;
-	u64 bstart = 0;
-	unsigned int blen = 0;
-	unsigned int blks = 0;
-	unsigned int x;
-	int error;
-
-	if (GFS2_EA_IS_STUFFED(ea))
-		return 0;
-
-	dataptrs = GFS2_EA2DATAPTRS(ea);
-	for (x = 0; x < ea->ea_num_ptrs; x++, dataptrs++) {
-		if (*dataptrs) {
-			blks++;
-			bn = be64_to_cpu(*dataptrs);
-		}
-	}
-	if (!blks)
-		return 0;
-
-	rgd = gfs2_blk2rgrpd(sdp, bn);
-	if (!rgd) {
-		gfs2_consist_inode(ip);
-		return -EIO;
-	}
-
-	error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &rg_gh);
-	if (error)
-		return error;
-
-	error = gfs2_trans_begin(sdp, rgd->rd_length + RES_DINODE +
-				 RES_EATTR + RES_STATFS + RES_QUOTA, blks);
-	if (error)
-		goto out_gunlock;
-
-	gfs2_trans_add_bh(ip->i_gl, bh, 1);
-
-	dataptrs = GFS2_EA2DATAPTRS(ea);
-	for (x = 0; x < ea->ea_num_ptrs; x++, dataptrs++) {
-		if (!*dataptrs)
-			break;
-		bn = be64_to_cpu(*dataptrs);
-
-		if (bstart + blen == bn)
-			blen++;
-		else {
-			if (bstart)
-				gfs2_free_meta(ip, bstart, blen);
-			bstart = bn;
-			blen = 1;
-		}
-
-		*dataptrs = 0;
-		gfs2_add_inode_blocks(&ip->i_inode, -1);
-	}
-	if (bstart)
-		gfs2_free_meta(ip, bstart, blen);
-
-	if (prev && !leave) {
-		u32 len;
-
-		len = GFS2_EA_REC_LEN(prev) + GFS2_EA_REC_LEN(ea);
-		prev->ea_rec_len = cpu_to_be32(len);
-
-		if (GFS2_EA_IS_LAST(ea))
-			prev->ea_flags |= GFS2_EAFLAG_LAST;
-	} else {
-		ea->ea_type = GFS2_EATYPE_UNUSED;
-		ea->ea_num_ptrs = 0;
-	}
-
-	error = gfs2_meta_inode_buffer(ip, &dibh);
-	if (!error) {
-		ip->i_inode.i_ctime = CURRENT_TIME;
-		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-		gfs2_dinode_out(ip, dibh->b_data);
-		brelse(dibh);
-	}
-
-	gfs2_trans_end(sdp);
-
-out_gunlock:
-	gfs2_glock_dq_uninit(&rg_gh);
-	return error;
-}
-
-static int ea_remove_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
-			       struct gfs2_ea_header *ea,
-			       struct gfs2_ea_header *prev, int leave)
-{
-	struct gfs2_alloc *al;
-	int error;
-
-	al = gfs2_alloc_get(ip);
-	if (!al)
-		return -ENOMEM;
-
-	error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
-	if (error)
-		goto out_alloc;
-
-	error = gfs2_rindex_hold(GFS2_SB(&ip->i_inode), &al->al_ri_gh);
-	if (error)
-		goto out_quota;
-
-	error = ea_dealloc_unstuffed(ip, bh, ea, prev, (leave) ? &error : NULL);
-
-	gfs2_glock_dq_uninit(&al->al_ri_gh);
-
-out_quota:
-	gfs2_quota_unhold(ip);
-out_alloc:
-	gfs2_alloc_put(ip);
-	return error;
-}
-
-struct ea_list {
-	struct gfs2_ea_request *ei_er;
-	unsigned int ei_size;
-};
-
-static inline unsigned int gfs2_ea_strlen(struct gfs2_ea_header *ea)
-{
-	switch (ea->ea_type) {
-	case GFS2_EATYPE_USR:
-		return 5 + ea->ea_name_len + 1;
-	case GFS2_EATYPE_SYS:
-		return 7 + ea->ea_name_len + 1;
-	case GFS2_EATYPE_SECURITY:
-		return 9 + ea->ea_name_len + 1;
-	default:
-		return 0;
-	}
-}
-
-static int ea_list_i(struct gfs2_inode *ip, struct buffer_head *bh,
-		     struct gfs2_ea_header *ea, struct gfs2_ea_header *prev,
-		     void *private)
-{
-	struct ea_list *ei = private;
-	struct gfs2_ea_request *er = ei->ei_er;
-	unsigned int ea_size = gfs2_ea_strlen(ea);
-
-	if (ea->ea_type == GFS2_EATYPE_UNUSED)
-		return 0;
-
-	if (er->er_data_len) {
-		char *prefix = NULL;
-		unsigned int l = 0;
-		char c = 0;
-
-		if (ei->ei_size + ea_size > er->er_data_len)
-			return -ERANGE;
-
-		switch (ea->ea_type) {
-		case GFS2_EATYPE_USR:
-			prefix = "user.";
-			l = 5;
-			break;
-		case GFS2_EATYPE_SYS:
-			prefix = "system.";
-			l = 7;
-			break;
-		case GFS2_EATYPE_SECURITY:
-			prefix = "security.";
-			l = 9;
-			break;
-		}
-
-		BUG_ON(l == 0);
-
-		memcpy(er->er_data + ei->ei_size, prefix, l);
-		memcpy(er->er_data + ei->ei_size + l, GFS2_EA2NAME(ea),
-		       ea->ea_name_len);
-		memcpy(er->er_data + ei->ei_size + ea_size - 1, &c, 1);
-	}
-
-	ei->ei_size += ea_size;
-
-	return 0;
-}
-
-/**
- * gfs2_listxattr - List gfs2 extended attributes
- * @dentry: The dentry whose inode we are interested in
- * @buffer: The buffer to write the results
- * @size: The size of the buffer
- *
- * Returns: actual size of data on success, -errno on error
- */
-
-ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size)
-{
-	struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
-	struct gfs2_ea_request er;
-	struct gfs2_holder i_gh;
-	int error;
-
-	memset(&er, 0, sizeof(struct gfs2_ea_request));
-	if (size) {
-		er.er_data = buffer;
-		er.er_data_len = size;
-	}
-
-	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
-	if (error)
-		return error;
-
-	if (ip->i_eattr) {
-		struct ea_list ei = { .ei_er = &er, .ei_size = 0 };
-
-		error = ea_foreach(ip, ea_list_i, &ei);
-		if (!error)
-			error = ei.ei_size;
-	}
-
-	gfs2_glock_dq_uninit(&i_gh);
-
-	return error;
-}
-
-/**
- * ea_get_unstuffed - actually copies the unstuffed data into the
- *                    request buffer
- * @ip: The GFS2 inode
- * @ea: The extended attribute header structure
- * @data: The data to be copied
- *
- * Returns: errno
- */
-
-static int ea_get_unstuffed(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
-			    char *data)
-{
-	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-	struct buffer_head **bh;
-	unsigned int amount = GFS2_EA_DATA_LEN(ea);
-	unsigned int nptrs = DIV_ROUND_UP(amount, sdp->sd_jbsize);
-	__be64 *dataptrs = GFS2_EA2DATAPTRS(ea);
-	unsigned int x;
-	int error = 0;
-
-	bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_NOFS);
-	if (!bh)
-		return -ENOMEM;
-
-	for (x = 0; x < nptrs; x++) {
-		error = gfs2_meta_read(ip->i_gl, be64_to_cpu(*dataptrs), 0,
-				       bh + x);
-		if (error) {
-			while (x--)
-				brelse(bh[x]);
-			goto out;
-		}
-		dataptrs++;
-	}
-
-	for (x = 0; x < nptrs; x++) {
-		error = gfs2_meta_wait(sdp, bh[x]);
-		if (error) {
-			for (; x < nptrs; x++)
-				brelse(bh[x]);
-			goto out;
-		}
-		if (gfs2_metatype_check(sdp, bh[x], GFS2_METATYPE_ED)) {
-			for (; x < nptrs; x++)
-				brelse(bh[x]);
-			error = -EIO;
-			goto out;
-		}
-
-		memcpy(data, bh[x]->b_data + sizeof(struct gfs2_meta_header),
-		       (sdp->sd_jbsize > amount) ? amount : sdp->sd_jbsize);
-
-		amount -= sdp->sd_jbsize;
-		data += sdp->sd_jbsize;
-
-		brelse(bh[x]);
-	}
-
-out:
-	kfree(bh);
-	return error;
-}
-
-int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el,
-		     char *data, size_t size)
-{
-	int ret;
-	size_t len = GFS2_EA_DATA_LEN(el->el_ea);
-	if (len > size)
-		return -ERANGE;
-
-	if (GFS2_EA_IS_STUFFED(el->el_ea)) {
-		memcpy(data, GFS2_EA2DATA(el->el_ea), len);
-		return len;
-	}
-	ret = ea_get_unstuffed(ip, el->el_ea, data);
-	if (ret < 0)
-		return ret;
-	return len;
-}
-
-/**
- * gfs2_xattr_get - Get a GFS2 extended attribute
- * @inode: The inode
- * @type: The type of extended attribute
- * @name: The name of the extended attribute
- * @buffer: The buffer to write the result into
- * @size: The size of the buffer
- *
- * Returns: actual size of data on success, -errno on error
- */
-
-int gfs2_xattr_get(struct inode *inode, int type, const char *name,
-		   void *buffer, size_t size)
-{
-	struct gfs2_inode *ip = GFS2_I(inode);
-	struct gfs2_ea_location el;
-	int error;
-
-	if (!ip->i_eattr)
-		return -ENODATA;
-	if (strlen(name) > GFS2_EA_MAX_NAME_LEN)
-		return -EINVAL;
-
-	error = gfs2_ea_find(ip, type, name, &el);
-	if (error)
-		return error;
-	if (!el.el_ea)
-		return -ENODATA;
-	if (size) 
-		error = gfs2_ea_get_copy(ip, &el, buffer, size);
-	else
-		error = GFS2_EA_DATA_LEN(el.el_ea);
-	brelse(el.el_bh);
-
-	return error;
-}
-
-/**
- * ea_alloc_blk - allocates a new block for extended attributes.
- * @ip: A pointer to the inode that's getting extended attributes
- * @bhp: Pointer to pointer to a struct buffer_head
- *
- * Returns: errno
- */
-
-static int ea_alloc_blk(struct gfs2_inode *ip, struct buffer_head **bhp)
-{
-	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-	struct gfs2_ea_header *ea;
-	unsigned int n = 1;
-	u64 block;
-	int error;
-
-	error = gfs2_alloc_block(ip, &block, &n);
-	if (error)
-		return error;
-	gfs2_trans_add_unrevoke(sdp, block, 1);
-	*bhp = gfs2_meta_new(ip->i_gl, block);
-	gfs2_trans_add_bh(ip->i_gl, *bhp, 1);
-	gfs2_metatype_set(*bhp, GFS2_METATYPE_EA, GFS2_FORMAT_EA);
-	gfs2_buffer_clear_tail(*bhp, sizeof(struct gfs2_meta_header));
-
-	ea = GFS2_EA_BH2FIRST(*bhp);
-	ea->ea_rec_len = cpu_to_be32(sdp->sd_jbsize);
-	ea->ea_type = GFS2_EATYPE_UNUSED;
-	ea->ea_flags = GFS2_EAFLAG_LAST;
-	ea->ea_num_ptrs = 0;
-
-	gfs2_add_inode_blocks(&ip->i_inode, 1);
-
-	return 0;
-}
-
-/**
- * ea_write - writes the request info to an ea, creating new blocks if
- *            necessary
- * @ip: inode that is being modified
- * @ea: the location of the new ea in a block
- * @er: the write request
- *
- * Note: does not update ea_rec_len or the GFS2_EAFLAG_LAST bin of ea_flags
- *
- * returns : errno
- */
-
-static int ea_write(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
-		    struct gfs2_ea_request *er)
-{
-	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-	int error;
-
-	ea->ea_data_len = cpu_to_be32(er->er_data_len);
-	ea->ea_name_len = er->er_name_len;
-	ea->ea_type = er->er_type;
-	ea->__pad = 0;
-
-	memcpy(GFS2_EA2NAME(ea), er->er_name, er->er_name_len);
-
-	if (GFS2_EAREQ_SIZE_STUFFED(er) <= sdp->sd_jbsize) {
-		ea->ea_num_ptrs = 0;
-		memcpy(GFS2_EA2DATA(ea), er->er_data, er->er_data_len);
-	} else {
-		__be64 *dataptr = GFS2_EA2DATAPTRS(ea);
-		const char *data = er->er_data;
-		unsigned int data_len = er->er_data_len;
-		unsigned int copy;
-		unsigned int x;
-
-		ea->ea_num_ptrs = DIV_ROUND_UP(er->er_data_len, sdp->sd_jbsize);
-		for (x = 0; x < ea->ea_num_ptrs; x++) {
-			struct buffer_head *bh;
-			u64 block;
-			int mh_size = sizeof(struct gfs2_meta_header);
-			unsigned int n = 1;
-
-			error = gfs2_alloc_block(ip, &block, &n);
-			if (error)
-				return error;
-			gfs2_trans_add_unrevoke(sdp, block, 1);
-			bh = gfs2_meta_new(ip->i_gl, block);
-			gfs2_trans_add_bh(ip->i_gl, bh, 1);
-			gfs2_metatype_set(bh, GFS2_METATYPE_ED, GFS2_FORMAT_ED);
-
-			gfs2_add_inode_blocks(&ip->i_inode, 1);
-
-			copy = data_len > sdp->sd_jbsize ? sdp->sd_jbsize :
-							   data_len;
-			memcpy(bh->b_data + mh_size, data, copy);
-			if (copy < sdp->sd_jbsize)
-				memset(bh->b_data + mh_size + copy, 0,
-				       sdp->sd_jbsize - copy);
-
-			*dataptr++ = cpu_to_be64(bh->b_blocknr);
-			data += copy;
-			data_len -= copy;
-
-			brelse(bh);
-		}
-
-		gfs2_assert_withdraw(sdp, !data_len);
-	}
-
-	return 0;
-}
-
-typedef int (*ea_skeleton_call_t) (struct gfs2_inode *ip,
-				   struct gfs2_ea_request *er, void *private);
-
-static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
-			     unsigned int blks,
-			     ea_skeleton_call_t skeleton_call, void *private)
-{
-	struct gfs2_alloc *al;
-	struct buffer_head *dibh;
-	int error;
-
-	al = gfs2_alloc_get(ip);
-	if (!al)
-		return -ENOMEM;
-
-	error = gfs2_quota_lock_check(ip);
-	if (error)
-		goto out;
-
-	al->al_requested = blks;
-
-	error = gfs2_inplace_reserve(ip);
-	if (error)
-		goto out_gunlock_q;
-
-	error = gfs2_trans_begin(GFS2_SB(&ip->i_inode),
-				 blks + al->al_rgd->rd_length +
-				 RES_DINODE + RES_STATFS + RES_QUOTA, 0);
-	if (error)
-		goto out_ipres;
-
-	error = skeleton_call(ip, er, private);
-	if (error)
-		goto out_end_trans;
-
-	error = gfs2_meta_inode_buffer(ip, &dibh);
-	if (!error) {
-		ip->i_inode.i_ctime = CURRENT_TIME;
-		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-		gfs2_dinode_out(ip, dibh->b_data);
-		brelse(dibh);
-	}
-
-out_end_trans:
-	gfs2_trans_end(GFS2_SB(&ip->i_inode));
-out_ipres:
-	gfs2_inplace_release(ip);
-out_gunlock_q:
-	gfs2_quota_unlock(ip);
-out:
-	gfs2_alloc_put(ip);
-	return error;
-}
-
-static int ea_init_i(struct gfs2_inode *ip, struct gfs2_ea_request *er,
-		     void *private)
-{
-	struct buffer_head *bh;
-	int error;
-
-	error = ea_alloc_blk(ip, &bh);
-	if (error)
-		return error;
-
-	ip->i_eattr = bh->b_blocknr;
-	error = ea_write(ip, GFS2_EA_BH2FIRST(bh), er);
-
-	brelse(bh);
-
-	return error;
-}
-
-/**
- * ea_init - initializes a new eattr block
- * @ip:
- * @er:
- *
- * Returns: errno
- */
-
-static int ea_init(struct gfs2_inode *ip, int type, const char *name,
-		   const void *data, size_t size)
-{
-	struct gfs2_ea_request er;
-	unsigned int jbsize = GFS2_SB(&ip->i_inode)->sd_jbsize;
-	unsigned int blks = 1;
-
-	er.er_type = type;
-	er.er_name = name;
-	er.er_name_len = strlen(name);
-	er.er_data = (void *)data;
-	er.er_data_len = size;
-
-	if (GFS2_EAREQ_SIZE_STUFFED(&er) > jbsize)
-		blks += DIV_ROUND_UP(er.er_data_len, jbsize);
-
-	return ea_alloc_skeleton(ip, &er, blks, ea_init_i, NULL);
-}
-
-static struct gfs2_ea_header *ea_split_ea(struct gfs2_ea_header *ea)
-{
-	u32 ea_size = GFS2_EA_SIZE(ea);
-	struct gfs2_ea_header *new = (struct gfs2_ea_header *)((char *)ea +
-				     ea_size);
-	u32 new_size = GFS2_EA_REC_LEN(ea) - ea_size;
-	int last = ea->ea_flags & GFS2_EAFLAG_LAST;
-
-	ea->ea_rec_len = cpu_to_be32(ea_size);
-	ea->ea_flags ^= last;
-
-	new->ea_rec_len = cpu_to_be32(new_size);
-	new->ea_flags = last;
-
-	return new;
-}
-
-static void ea_set_remove_stuffed(struct gfs2_inode *ip,
-				  struct gfs2_ea_location *el)
-{
-	struct gfs2_ea_header *ea = el->el_ea;
-	struct gfs2_ea_header *prev = el->el_prev;
-	u32 len;
-
-	gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1);
-
-	if (!prev || !GFS2_EA_IS_STUFFED(ea)) {
-		ea->ea_type = GFS2_EATYPE_UNUSED;
-		return;
-	} else if (GFS2_EA2NEXT(prev) != ea) {
-		prev = GFS2_EA2NEXT(prev);
-		gfs2_assert_withdraw(GFS2_SB(&ip->i_inode), GFS2_EA2NEXT(prev) == ea);
-	}
-
-	len = GFS2_EA_REC_LEN(prev) + GFS2_EA_REC_LEN(ea);
-	prev->ea_rec_len = cpu_to_be32(len);
-
-	if (GFS2_EA_IS_LAST(ea))
-		prev->ea_flags |= GFS2_EAFLAG_LAST;
-}
-
-struct ea_set {
-	int ea_split;
-
-	struct gfs2_ea_request *es_er;
-	struct gfs2_ea_location *es_el;
-
-	struct buffer_head *es_bh;
-	struct gfs2_ea_header *es_ea;
-};
-
-static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh,
-				 struct gfs2_ea_header *ea, struct ea_set *es)
-{
-	struct gfs2_ea_request *er = es->es_er;
-	struct buffer_head *dibh;
-	int error;
-
-	error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + 2 * RES_EATTR, 0);
-	if (error)
-		return error;
-
-	gfs2_trans_add_bh(ip->i_gl, bh, 1);
-
-	if (es->ea_split)
-		ea = ea_split_ea(ea);
-
-	ea_write(ip, ea, er);
-
-	if (es->es_el)
-		ea_set_remove_stuffed(ip, es->es_el);
-
-	error = gfs2_meta_inode_buffer(ip, &dibh);
-	if (error)
-		goto out;
-	ip->i_inode.i_ctime = CURRENT_TIME;
-	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-	gfs2_dinode_out(ip, dibh->b_data);
-	brelse(dibh);
-out:
-	gfs2_trans_end(GFS2_SB(&ip->i_inode));
-	return error;
-}
-
-static int ea_set_simple_alloc(struct gfs2_inode *ip,
-			       struct gfs2_ea_request *er, void *private)
-{
-	struct ea_set *es = private;
-	struct gfs2_ea_header *ea = es->es_ea;
-	int error;
-
-	gfs2_trans_add_bh(ip->i_gl, es->es_bh, 1);
-
-	if (es->ea_split)
-		ea = ea_split_ea(ea);
-
-	error = ea_write(ip, ea, er);
-	if (error)
-		return error;
-
-	if (es->es_el)
-		ea_set_remove_stuffed(ip, es->es_el);
-
-	return 0;
-}
-
-static int ea_set_simple(struct gfs2_inode *ip, struct buffer_head *bh,
-			 struct gfs2_ea_header *ea, struct gfs2_ea_header *prev,
-			 void *private)
-{
-	struct ea_set *es = private;
-	unsigned int size;
-	int stuffed;
-	int error;
-
-	stuffed = ea_calc_size(GFS2_SB(&ip->i_inode), es->es_er->er_name_len,
-			       es->es_er->er_data_len, &size);
-
-	if (ea->ea_type == GFS2_EATYPE_UNUSED) {
-		if (GFS2_EA_REC_LEN(ea) < size)
-			return 0;
-		if (!GFS2_EA_IS_STUFFED(ea)) {
-			error = ea_remove_unstuffed(ip, bh, ea, prev, 1);
-			if (error)
-				return error;
-		}
-		es->ea_split = 0;
-	} else if (GFS2_EA_REC_LEN(ea) - GFS2_EA_SIZE(ea) >= size)
-		es->ea_split = 1;
-	else
-		return 0;
-
-	if (stuffed) {
-		error = ea_set_simple_noalloc(ip, bh, ea, es);
-		if (error)
-			return error;
-	} else {
-		unsigned int blks;
-
-		es->es_bh = bh;
-		es->es_ea = ea;
-		blks = 2 + DIV_ROUND_UP(es->es_er->er_data_len,
-					GFS2_SB(&ip->i_inode)->sd_jbsize);
-
-		error = ea_alloc_skeleton(ip, es->es_er, blks,
-					  ea_set_simple_alloc, es);
-		if (error)
-			return error;
-	}
-
-	return 1;
-}
-
-static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er,
-			void *private)
-{
-	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-	struct buffer_head *indbh, *newbh;
-	__be64 *eablk;
-	int error;
-	int mh_size = sizeof(struct gfs2_meta_header);
-
-	if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) {
-		__be64 *end;
-
-		error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT,
-				       &indbh);
-		if (error)
-			return error;
-
-		if (gfs2_metatype_check(sdp, indbh, GFS2_METATYPE_IN)) {
-			error = -EIO;
-			goto out;
-		}
-
-		eablk = (__be64 *)(indbh->b_data + mh_size);
-		end = eablk + sdp->sd_inptrs;
-
-		for (; eablk < end; eablk++)
-			if (!*eablk)
-				break;
-
-		if (eablk == end) {
-			error = -ENOSPC;
-			goto out;
-		}
-
-		gfs2_trans_add_bh(ip->i_gl, indbh, 1);
-	} else {
-		u64 blk;
-		unsigned int n = 1;
-		error = gfs2_alloc_block(ip, &blk, &n);
-		if (error)
-			return error;
-		gfs2_trans_add_unrevoke(sdp, blk, 1);
-		indbh = gfs2_meta_new(ip->i_gl, blk);
-		gfs2_trans_add_bh(ip->i_gl, indbh, 1);
-		gfs2_metatype_set(indbh, GFS2_METATYPE_IN, GFS2_FORMAT_IN);
-		gfs2_buffer_clear_tail(indbh, mh_size);
-
-		eablk = (__be64 *)(indbh->b_data + mh_size);
-		*eablk = cpu_to_be64(ip->i_eattr);
-		ip->i_eattr = blk;
-		ip->i_diskflags |= GFS2_DIF_EA_INDIRECT;
-		gfs2_add_inode_blocks(&ip->i_inode, 1);
-
-		eablk++;
-	}
-
-	error = ea_alloc_blk(ip, &newbh);
-	if (error)
-		goto out;
-
-	*eablk = cpu_to_be64((u64)newbh->b_blocknr);
-	error = ea_write(ip, GFS2_EA_BH2FIRST(newbh), er);
-	brelse(newbh);
-	if (error)
-		goto out;
-
-	if (private)
-		ea_set_remove_stuffed(ip, private);
-
-out:
-	brelse(indbh);
-	return error;
-}
-
-static int ea_set_i(struct gfs2_inode *ip, int type, const char *name,
-		    const void *value, size_t size, struct gfs2_ea_location *el)
-{
-	struct gfs2_ea_request er;
-	struct ea_set es;
-	unsigned int blks = 2;
-	int error;
-
-	er.er_type = type;
-	er.er_name = name;
-	er.er_data = (void *)value;
-	er.er_name_len = strlen(name);
-	er.er_data_len = size;
-
-	memset(&es, 0, sizeof(struct ea_set));
-	es.es_er = &er;
-	es.es_el = el;
-
-	error = ea_foreach(ip, ea_set_simple, &es);
-	if (error > 0)
-		return 0;
-	if (error)
-		return error;
-
-	if (!(ip->i_diskflags & GFS2_DIF_EA_INDIRECT))
-		blks++;
-	if (GFS2_EAREQ_SIZE_STUFFED(&er) > GFS2_SB(&ip->i_inode)->sd_jbsize)
-		blks += DIV_ROUND_UP(er.er_data_len, GFS2_SB(&ip->i_inode)->sd_jbsize);
-
-	return ea_alloc_skeleton(ip, &er, blks, ea_set_block, el);
-}
-
-static int ea_set_remove_unstuffed(struct gfs2_inode *ip,
-				   struct gfs2_ea_location *el)
-{
-	if (el->el_prev && GFS2_EA2NEXT(el->el_prev) != el->el_ea) {
-		el->el_prev = GFS2_EA2NEXT(el->el_prev);
-		gfs2_assert_withdraw(GFS2_SB(&ip->i_inode),
-				     GFS2_EA2NEXT(el->el_prev) == el->el_ea);
-	}
-
-	return ea_remove_unstuffed(ip, el->el_bh, el->el_ea, el->el_prev,0);
-}
-
-static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el)
-{
-	struct gfs2_ea_header *ea = el->el_ea;
-	struct gfs2_ea_header *prev = el->el_prev;
-	struct buffer_head *dibh;
-	int error;
-
-	error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + RES_EATTR, 0);
-	if (error)
-		return error;
-
-	gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1);
-
-	if (prev) {
-		u32 len;
-
-		len = GFS2_EA_REC_LEN(prev) + GFS2_EA_REC_LEN(ea);
-		prev->ea_rec_len = cpu_to_be32(len);
-
-		if (GFS2_EA_IS_LAST(ea))
-			prev->ea_flags |= GFS2_EAFLAG_LAST;
-	} else {
-		ea->ea_type = GFS2_EATYPE_UNUSED;
-	}
-
-	error = gfs2_meta_inode_buffer(ip, &dibh);
-	if (!error) {
-		ip->i_inode.i_ctime = CURRENT_TIME;
-		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-		gfs2_dinode_out(ip, dibh->b_data);
-		brelse(dibh);
-	}
-
-	gfs2_trans_end(GFS2_SB(&ip->i_inode));
-
-	return error;
-}
-
-/**
- * gfs2_xattr_remove - Remove a GFS2 extended attribute
- * @inode: The inode
- * @type: The type of the extended attribute
- * @name: The name of the extended attribute
- *
- * This is not called directly by the VFS since we use the (common)
- * scheme of making a "set with NULL data" mean a remove request. Note
- * that this is different from a set with zero length data.
- *
- * Returns: 0, or errno on failure
- */
-
-static int gfs2_xattr_remove(struct inode *inode, int type, const char *name)
-{
-	struct gfs2_inode *ip = GFS2_I(inode);
-	struct gfs2_ea_location el;
-	int error;
-
-	if (!ip->i_eattr)
-		return -ENODATA;
-
-	error = gfs2_ea_find(ip, type, name, &el);
-	if (error)
-		return error;
-	if (!el.el_ea)
-		return -ENODATA;
-
-	if (GFS2_EA_IS_STUFFED(el.el_ea))
-		error = ea_remove_stuffed(ip, &el);
-	else
-		error = ea_remove_unstuffed(ip, el.el_bh, el.el_ea, el.el_prev, 0);
-
-	brelse(el.el_bh);
-
-	return error;
-}
-
-/**
- * gfs2_xattr_set - Set (or remove) a GFS2 extended attribute
- * @inode: The inode
- * @type: The type of the extended attribute
- * @name: The name of the extended attribute
- * @value: The value of the extended attribute (NULL for remove)
- * @size: The size of the @value argument
- * @flags: Create or Replace
- *
- * See gfs2_xattr_remove() for details of the removal of xattrs.
- *
- * Returns: 0 or errno on failure
- */
-
-int gfs2_xattr_set(struct inode *inode, int type, const char *name,
-		   const void *value, size_t size, int flags)
-{
-	struct gfs2_sbd *sdp = GFS2_SB(inode);
-	struct gfs2_inode *ip = GFS2_I(inode);
-	struct gfs2_ea_location el;
-	unsigned int namel = strlen(name);
-	int error;
-
-	if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
-		return -EPERM;
-	if (namel > GFS2_EA_MAX_NAME_LEN)
-		return -ERANGE;
-
-	if (value == NULL)
-		return gfs2_xattr_remove(inode, type, name);
-
-	if (ea_check_size(sdp, namel, size))
-		return -ERANGE;
-
-	if (!ip->i_eattr) {
-		if (flags & XATTR_REPLACE)
-			return -ENODATA;
-		return ea_init(ip, type, name, value, size);
-	}
-
-	error = gfs2_ea_find(ip, type, name, &el);
-	if (error)
-		return error;
-
-	if (el.el_ea) {
-		if (ip->i_diskflags & GFS2_DIF_APPENDONLY) {
-			brelse(el.el_bh);
-			return -EPERM;
-		}
-
-		error = -EEXIST;
-		if (!(flags & XATTR_CREATE)) {
-			int unstuffed = !GFS2_EA_IS_STUFFED(el.el_ea);
-			error = ea_set_i(ip, type, name, value, size, &el);
-			if (!error && unstuffed)
-				ea_set_remove_unstuffed(ip, &el);
-		}
-
-		brelse(el.el_bh);
-		return error;
-	}
-
-	error = -ENODATA;
-	if (!(flags & XATTR_REPLACE))
-		error = ea_set_i(ip, type, name, value, size, NULL);
-
-	return error;
-}
-
-static int ea_acl_chmod_unstuffed(struct gfs2_inode *ip,
-				  struct gfs2_ea_header *ea, char *data)
-{
-	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-	struct buffer_head **bh;
-	unsigned int amount = GFS2_EA_DATA_LEN(ea);
-	unsigned int nptrs = DIV_ROUND_UP(amount, sdp->sd_jbsize);
-	__be64 *dataptrs = GFS2_EA2DATAPTRS(ea);
-	unsigned int x;
-	int error;
-
-	bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_NOFS);
-	if (!bh)
-		return -ENOMEM;
-
-	error = gfs2_trans_begin(sdp, nptrs + RES_DINODE, 0);
-	if (error)
-		goto out;
-
-	for (x = 0; x < nptrs; x++) {
-		error = gfs2_meta_read(ip->i_gl, be64_to_cpu(*dataptrs), 0,
-				       bh + x);
-		if (error) {
-			while (x--)
-				brelse(bh[x]);
-			goto fail;
-		}
-		dataptrs++;
-	}
-
-	for (x = 0; x < nptrs; x++) {
-		error = gfs2_meta_wait(sdp, bh[x]);
-		if (error) {
-			for (; x < nptrs; x++)
-				brelse(bh[x]);
-			goto fail;
-		}
-		if (gfs2_metatype_check(sdp, bh[x], GFS2_METATYPE_ED)) {
-			for (; x < nptrs; x++)
-				brelse(bh[x]);
-			error = -EIO;
-			goto fail;
-		}
-
-		gfs2_trans_add_bh(ip->i_gl, bh[x], 1);
-
-		memcpy(bh[x]->b_data + sizeof(struct gfs2_meta_header), data,
-		       (sdp->sd_jbsize > amount) ? amount : sdp->sd_jbsize);
-
-		amount -= sdp->sd_jbsize;
-		data += sdp->sd_jbsize;
-
-		brelse(bh[x]);
-	}
-
-out:
-	kfree(bh);
-	return error;
-
-fail:
-	gfs2_trans_end(sdp);
-	kfree(bh);
-	return error;
-}
-
-int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el,
-		      struct iattr *attr, char *data)
-{
-	struct buffer_head *dibh;
-	int error;
-
-	if (GFS2_EA_IS_STUFFED(el->el_ea)) {
-		error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + RES_EATTR, 0);
-		if (error)
-			return error;
-
-		gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1);
-		memcpy(GFS2_EA2DATA(el->el_ea), data,
-		       GFS2_EA_DATA_LEN(el->el_ea));
-	} else
-		error = ea_acl_chmod_unstuffed(ip, el->el_ea, data);
-
-	if (error)
-		return error;
-
-	error = gfs2_meta_inode_buffer(ip, &dibh);
-	if (!error) {
-		error = inode_setattr(&ip->i_inode, attr);
-		gfs2_assert_warn(GFS2_SB(&ip->i_inode), !error);
-		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-		gfs2_dinode_out(ip, dibh->b_data);
-		brelse(dibh);
-	}
-
-	gfs2_trans_end(GFS2_SB(&ip->i_inode));
-
-	return error;
-}
-
-static int ea_dealloc_indirect(struct gfs2_inode *ip)
-{
-	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-	struct gfs2_rgrp_list rlist;
-	struct buffer_head *indbh, *dibh;
-	__be64 *eablk, *end;
-	unsigned int rg_blocks = 0;
-	u64 bstart = 0;
-	unsigned int blen = 0;
-	unsigned int blks = 0;
-	unsigned int x;
-	int error;
-
-	memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
-
-	error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, &indbh);
-	if (error)
-		return error;
-
-	if (gfs2_metatype_check(sdp, indbh, GFS2_METATYPE_IN)) {
-		error = -EIO;
-		goto out;
-	}
-
-	eablk = (__be64 *)(indbh->b_data + sizeof(struct gfs2_meta_header));
-	end = eablk + sdp->sd_inptrs;
-
-	for (; eablk < end; eablk++) {
-		u64 bn;
-
-		if (!*eablk)
-			break;
-		bn = be64_to_cpu(*eablk);
-
-		if (bstart + blen == bn)
-			blen++;
-		else {
-			if (bstart)
-				gfs2_rlist_add(sdp, &rlist, bstart);
-			bstart = bn;
-			blen = 1;
-		}
-		blks++;
-	}
-	if (bstart)
-		gfs2_rlist_add(sdp, &rlist, bstart);
-	else
-		goto out;
-
-	gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE);
-
-	for (x = 0; x < rlist.rl_rgrps; x++) {
-		struct gfs2_rgrpd *rgd;
-		rgd = rlist.rl_ghs[x].gh_gl->gl_object;
-		rg_blocks += rgd->rd_length;
-	}
-
-	error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs);
-	if (error)
-		goto out_rlist_free;
-
-	error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE + RES_INDIRECT +
-				 RES_STATFS + RES_QUOTA, blks);
-	if (error)
-		goto out_gunlock;
-
-	gfs2_trans_add_bh(ip->i_gl, indbh, 1);
-
-	eablk = (__be64 *)(indbh->b_data + sizeof(struct gfs2_meta_header));
-	bstart = 0;
-	blen = 0;
-
-	for (; eablk < end; eablk++) {
-		u64 bn;
-
-		if (!*eablk)
-			break;
-		bn = be64_to_cpu(*eablk);
-
-		if (bstart + blen == bn)
-			blen++;
-		else {
-			if (bstart)
-				gfs2_free_meta(ip, bstart, blen);
-			bstart = bn;
-			blen = 1;
-		}
-
-		*eablk = 0;
-		gfs2_add_inode_blocks(&ip->i_inode, -1);
-	}
-	if (bstart)
-		gfs2_free_meta(ip, bstart, blen);
-
-	ip->i_diskflags &= ~GFS2_DIF_EA_INDIRECT;
-
-	error = gfs2_meta_inode_buffer(ip, &dibh);
-	if (!error) {
-		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-		gfs2_dinode_out(ip, dibh->b_data);
-		brelse(dibh);
-	}
-
-	gfs2_trans_end(sdp);
-
-out_gunlock:
-	gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
-out_rlist_free:
-	gfs2_rlist_free(&rlist);
-out:
-	brelse(indbh);
-	return error;
-}
-
-static int ea_dealloc_block(struct gfs2_inode *ip)
-{
-	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-	struct gfs2_alloc *al = ip->i_alloc;
-	struct gfs2_rgrpd *rgd;
-	struct buffer_head *dibh;
-	int error;
-
-	rgd = gfs2_blk2rgrpd(sdp, ip->i_eattr);
-	if (!rgd) {
-		gfs2_consist_inode(ip);
-		return -EIO;
-	}
-
-	error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0,
-				   &al->al_rgd_gh);
-	if (error)
-		return error;
-
-	error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_DINODE + RES_STATFS +
-				 RES_QUOTA, 1);
-	if (error)
-		goto out_gunlock;
-
-	gfs2_free_meta(ip, ip->i_eattr, 1);
-
-	ip->i_eattr = 0;
-	gfs2_add_inode_blocks(&ip->i_inode, -1);
-
-	error = gfs2_meta_inode_buffer(ip, &dibh);
-	if (!error) {
-		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-		gfs2_dinode_out(ip, dibh->b_data);
-		brelse(dibh);
-	}
-
-	gfs2_trans_end(sdp);
-
-out_gunlock:
-	gfs2_glock_dq_uninit(&al->al_rgd_gh);
-	return error;
-}
-
-/**
- * gfs2_ea_dealloc - deallocate the extended attribute fork
- * @ip: the inode
- *
- * Returns: errno
- */
-
-int gfs2_ea_dealloc(struct gfs2_inode *ip)
-{
-	struct gfs2_alloc *al;
-	int error;
-
-	al = gfs2_alloc_get(ip);
-	if (!al)
-		return -ENOMEM;
-
-	error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
-	if (error)
-		goto out_alloc;
-
-	error = gfs2_rindex_hold(GFS2_SB(&ip->i_inode), &al->al_ri_gh);
-	if (error)
-		goto out_quota;
-
-	error = ea_foreach(ip, ea_dealloc_unstuffed, NULL);
-	if (error)
-		goto out_rindex;
-
-	if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) {
-		error = ea_dealloc_indirect(ip);
-		if (error)
-			goto out_rindex;
-	}
-
-	error = ea_dealloc_block(ip);
-
-out_rindex:
-	gfs2_glock_dq_uninit(&al->al_ri_gh);
-out_quota:
-	gfs2_quota_unhold(ip);
-out_alloc:
-	gfs2_alloc_put(ip);
-	return error;
-}
-
-static int gfs2_xattr_user_get(struct inode *inode, const char *name,
-			       void *buffer, size_t size)
-{
-	return gfs2_xattr_get(inode, GFS2_EATYPE_USR, name, buffer, size);
-}
-
-static int gfs2_xattr_user_set(struct inode *inode, const char *name,
-			       const void *value, size_t size, int flags)
-{
-	return gfs2_xattr_set(inode, GFS2_EATYPE_USR, name, value, size, flags);
-}
-
-static int gfs2_xattr_system_get(struct inode *inode, const char *name,
-				 void *buffer, size_t size)
-{
-	return gfs2_xattr_get(inode, GFS2_EATYPE_SYS, name, buffer, size);
-}
-
-static int gfs2_xattr_system_set(struct inode *inode, const char *name,
-				 const void *value, size_t size, int flags)
-{
-	return gfs2_xattr_set(inode, GFS2_EATYPE_SYS, name, value, size, flags);
-}
-
-static int gfs2_xattr_security_get(struct inode *inode, const char *name,
-				   void *buffer, size_t size)
-{
-	return gfs2_xattr_get(inode, GFS2_EATYPE_SECURITY, name, buffer, size);
-}
-
-static int gfs2_xattr_security_set(struct inode *inode, const char *name,
-				   const void *value, size_t size, int flags)
-{
-	return gfs2_xattr_set(inode, GFS2_EATYPE_SECURITY, name, value, size, flags);
-}
-
-static struct xattr_handler gfs2_xattr_user_handler = {
-	.prefix = XATTR_USER_PREFIX,
-	.get    = gfs2_xattr_user_get,
-	.set    = gfs2_xattr_user_set,
-};
-
-static struct xattr_handler gfs2_xattr_security_handler = {
-	.prefix = XATTR_SECURITY_PREFIX,
-	.get    = gfs2_xattr_security_get,
-	.set    = gfs2_xattr_security_set,
-};
-
-static struct xattr_handler gfs2_xattr_system_handler = {
-	.prefix = XATTR_SYSTEM_PREFIX,
-	.get    = gfs2_xattr_system_get,
-	.set    = gfs2_xattr_system_set,
-};
-
-struct xattr_handler *gfs2_xattr_handlers[] = {
-	&gfs2_xattr_user_handler,
-	&gfs2_xattr_security_handler,
-	&gfs2_xattr_system_handler,
-	NULL,
-};
-
diff --git a/fs/gfs2/eattr.h b/fs/gfs2/eattr.h
deleted file mode 100644
index 4040a188f63b..000000000000
--- a/fs/gfs2/eattr.h
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-
-#ifndef __EATTR_DOT_H__
-#define __EATTR_DOT_H__
-
-struct gfs2_inode;
-struct iattr;
-
-#define GFS2_EA_REC_LEN(ea) be32_to_cpu((ea)->ea_rec_len)
-#define GFS2_EA_DATA_LEN(ea) be32_to_cpu((ea)->ea_data_len)
-
-#define GFS2_EA_SIZE(ea) \
-ALIGN(sizeof(struct gfs2_ea_header) + (ea)->ea_name_len + \
-      ((GFS2_EA_IS_STUFFED(ea)) ? GFS2_EA_DATA_LEN(ea) : \
-                                  (sizeof(__be64) * (ea)->ea_num_ptrs)), 8)
-
-#define GFS2_EA_IS_STUFFED(ea) (!(ea)->ea_num_ptrs)
-#define GFS2_EA_IS_LAST(ea) ((ea)->ea_flags & GFS2_EAFLAG_LAST)
-
-#define GFS2_EAREQ_SIZE_STUFFED(er) \
-ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + (er)->er_data_len, 8)
-
-#define GFS2_EA2NAME(ea) ((char *)((struct gfs2_ea_header *)(ea) + 1))
-#define GFS2_EA2DATA(ea) (GFS2_EA2NAME(ea) + (ea)->ea_name_len)
-
-#define GFS2_EA2DATAPTRS(ea) \
-((__be64 *)(GFS2_EA2NAME(ea) + ALIGN((ea)->ea_name_len, 8)))
-
-#define GFS2_EA2NEXT(ea) \
-((struct gfs2_ea_header *)((char *)(ea) + GFS2_EA_REC_LEN(ea)))
-
-#define GFS2_EA_BH2FIRST(bh) \
-((struct gfs2_ea_header *)((bh)->b_data + sizeof(struct gfs2_meta_header)))
-
-struct gfs2_ea_request {
-	const char *er_name;
-	char *er_data;
-	unsigned int er_name_len;
-	unsigned int er_data_len;
-	unsigned int er_type; /* GFS2_EATYPE_... */
-};
-
-struct gfs2_ea_location {
-	struct buffer_head *el_bh;
-	struct gfs2_ea_header *el_ea;
-	struct gfs2_ea_header *el_prev;
-};
-
-extern int gfs2_xattr_get(struct inode *inode, int type, const char *name,
-			  void *buffer, size_t size);
-extern int gfs2_xattr_set(struct inode *inode, int type, const char *name,
-			  const void *value, size_t size, int flags);
-extern ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size);
-extern int gfs2_ea_dealloc(struct gfs2_inode *ip);
-
-/* Exported to acl.c */
-
-extern int gfs2_ea_find(struct gfs2_inode *ip, int type, const char *name,
-			struct gfs2_ea_location *el);
-extern int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el,
-			    char *data, size_t size);
-extern int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el,
-			     struct iattr *attr, char *data);
-
-#endif /* __EATTR_DOT_H__ */
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index f936d2d68138..4f5e442d60d8 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -24,7 +24,7 @@
 #include "acl.h"
 #include "bmap.h"
 #include "dir.h"
-#include "eattr.h"
+#include "xattr.h"
 #include "glock.h"
 #include "glops.h"
 #include "inode.h"
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 1ee0c2657467..c3ac18054057 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -26,7 +26,7 @@
 #include "acl.h"
 #include "bmap.h"
 #include "dir.h"
-#include "eattr.h"
+#include "xattr.h"
 #include "glock.h"
 #include "inode.h"
 #include "meta_io.h"
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 7a5c128e8776..2e78a3f68a76 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -38,7 +38,7 @@
 #include "trans.h"
 #include "util.h"
 #include "sys.h"
-#include "eattr.h"
+#include "xattr.h"
 
 #define args_neq(a1, a2, x) ((a1)->ar_##x != (a2)->ar_##x)
 
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
new file mode 100644
index 000000000000..34ae9ba4c9fe
--- /dev/null
+++ b/fs/gfs2/xattr.c
@@ -0,0 +1,1558 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/xattr.h>
+#include <linux/gfs2_ondisk.h>
+#include <asm/uaccess.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "acl.h"
+#include "xattr.h"
+#include "glock.h"
+#include "inode.h"
+#include "meta_io.h"
+#include "quota.h"
+#include "rgrp.h"
+#include "trans.h"
+#include "util.h"
+
+/**
+ * ea_calc_size - returns the acutal number of bytes the request will take up
+ *                (not counting any unstuffed data blocks)
+ * @sdp:
+ * @er:
+ * @size:
+ *
+ * Returns: 1 if the EA should be stuffed
+ */
+
+static int ea_calc_size(struct gfs2_sbd *sdp, unsigned int nsize, size_t dsize,
+			unsigned int *size)
+{
+	unsigned int jbsize = sdp->sd_jbsize;
+
+	/* Stuffed */
+	*size = ALIGN(sizeof(struct gfs2_ea_header) + nsize + dsize, 8);
+
+	if (*size <= jbsize)
+		return 1;
+
+	/* Unstuffed */
+	*size = ALIGN(sizeof(struct gfs2_ea_header) + nsize +
+		      (sizeof(__be64) * DIV_ROUND_UP(dsize, jbsize)), 8);
+
+	return 0;
+}
+
+static int ea_check_size(struct gfs2_sbd *sdp, unsigned int nsize, size_t dsize)
+{
+	unsigned int size;
+
+	if (dsize > GFS2_EA_MAX_DATA_LEN)
+		return -ERANGE;
+
+	ea_calc_size(sdp, nsize, dsize, &size);
+
+	/* This can only happen with 512 byte blocks */
+	if (size > sdp->sd_jbsize)
+		return -ERANGE;
+
+	return 0;
+}
+
+typedef int (*ea_call_t) (struct gfs2_inode *ip, struct buffer_head *bh,
+			  struct gfs2_ea_header *ea,
+			  struct gfs2_ea_header *prev, void *private);
+
+static int ea_foreach_i(struct gfs2_inode *ip, struct buffer_head *bh,
+			ea_call_t ea_call, void *data)
+{
+	struct gfs2_ea_header *ea, *prev = NULL;
+	int error = 0;
+
+	if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh, GFS2_METATYPE_EA))
+		return -EIO;
+
+	for (ea = GFS2_EA_BH2FIRST(bh);; prev = ea, ea = GFS2_EA2NEXT(ea)) {
+		if (!GFS2_EA_REC_LEN(ea))
+			goto fail;
+		if (!(bh->b_data <= (char *)ea && (char *)GFS2_EA2NEXT(ea) <=
+						  bh->b_data + bh->b_size))
+			goto fail;
+		if (!GFS2_EATYPE_VALID(ea->ea_type))
+			goto fail;
+
+		error = ea_call(ip, bh, ea, prev, data);
+		if (error)
+			return error;
+
+		if (GFS2_EA_IS_LAST(ea)) {
+			if ((char *)GFS2_EA2NEXT(ea) !=
+			    bh->b_data + bh->b_size)
+				goto fail;
+			break;
+		}
+	}
+
+	return error;
+
+fail:
+	gfs2_consist_inode(ip);
+	return -EIO;
+}
+
+static int ea_foreach(struct gfs2_inode *ip, ea_call_t ea_call, void *data)
+{
+	struct buffer_head *bh, *eabh;
+	__be64 *eablk, *end;
+	int error;
+
+	error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, &bh);
+	if (error)
+		return error;
+
+	if (!(ip->i_diskflags & GFS2_DIF_EA_INDIRECT)) {
+		error = ea_foreach_i(ip, bh, ea_call, data);
+		goto out;
+	}
+
+	if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh, GFS2_METATYPE_IN)) {
+		error = -EIO;
+		goto out;
+	}
+
+	eablk = (__be64 *)(bh->b_data + sizeof(struct gfs2_meta_header));
+	end = eablk + GFS2_SB(&ip->i_inode)->sd_inptrs;
+
+	for (; eablk < end; eablk++) {
+		u64 bn;
+
+		if (!*eablk)
+			break;
+		bn = be64_to_cpu(*eablk);
+
+		error = gfs2_meta_read(ip->i_gl, bn, DIO_WAIT, &eabh);
+		if (error)
+			break;
+		error = ea_foreach_i(ip, eabh, ea_call, data);
+		brelse(eabh);
+		if (error)
+			break;
+	}
+out:
+	brelse(bh);
+	return error;
+}
+
+struct ea_find {
+	int type;
+	const char *name;
+	size_t namel;
+	struct gfs2_ea_location *ef_el;
+};
+
+static int ea_find_i(struct gfs2_inode *ip, struct buffer_head *bh,
+		     struct gfs2_ea_header *ea, struct gfs2_ea_header *prev,
+		     void *private)
+{
+	struct ea_find *ef = private;
+
+	if (ea->ea_type == GFS2_EATYPE_UNUSED)
+		return 0;
+
+	if (ea->ea_type == ef->type) {
+		if (ea->ea_name_len == ef->namel &&
+		    !memcmp(GFS2_EA2NAME(ea), ef->name, ea->ea_name_len)) {
+			struct gfs2_ea_location *el = ef->ef_el;
+			get_bh(bh);
+			el->el_bh = bh;
+			el->el_ea = ea;
+			el->el_prev = prev;
+			return 1;
+		}
+	}
+
+	return 0;
+}
+
+int gfs2_ea_find(struct gfs2_inode *ip, int type, const char *name,
+		 struct gfs2_ea_location *el)
+{
+	struct ea_find ef;
+	int error;
+
+	ef.type = type;
+	ef.name = name;
+	ef.namel = strlen(name);
+	ef.ef_el = el;
+
+	memset(el, 0, sizeof(struct gfs2_ea_location));
+
+	error = ea_foreach(ip, ea_find_i, &ef);
+	if (error > 0)
+		return 0;
+
+	return error;
+}
+
+/**
+ * ea_dealloc_unstuffed -
+ * @ip:
+ * @bh:
+ * @ea:
+ * @prev:
+ * @private:
+ *
+ * Take advantage of the fact that all unstuffed blocks are
+ * allocated from the same RG.  But watch, this may not always
+ * be true.
+ *
+ * Returns: errno
+ */
+
+static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
+				struct gfs2_ea_header *ea,
+				struct gfs2_ea_header *prev, void *private)
+{
+	int *leave = private;
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct gfs2_rgrpd *rgd;
+	struct gfs2_holder rg_gh;
+	struct buffer_head *dibh;
+	__be64 *dataptrs;
+	u64 bn = 0;
+	u64 bstart = 0;
+	unsigned int blen = 0;
+	unsigned int blks = 0;
+	unsigned int x;
+	int error;
+
+	if (GFS2_EA_IS_STUFFED(ea))
+		return 0;
+
+	dataptrs = GFS2_EA2DATAPTRS(ea);
+	for (x = 0; x < ea->ea_num_ptrs; x++, dataptrs++) {
+		if (*dataptrs) {
+			blks++;
+			bn = be64_to_cpu(*dataptrs);
+		}
+	}
+	if (!blks)
+		return 0;
+
+	rgd = gfs2_blk2rgrpd(sdp, bn);
+	if (!rgd) {
+		gfs2_consist_inode(ip);
+		return -EIO;
+	}
+
+	error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &rg_gh);
+	if (error)
+		return error;
+
+	error = gfs2_trans_begin(sdp, rgd->rd_length + RES_DINODE +
+				 RES_EATTR + RES_STATFS + RES_QUOTA, blks);
+	if (error)
+		goto out_gunlock;
+
+	gfs2_trans_add_bh(ip->i_gl, bh, 1);
+
+	dataptrs = GFS2_EA2DATAPTRS(ea);
+	for (x = 0; x < ea->ea_num_ptrs; x++, dataptrs++) {
+		if (!*dataptrs)
+			break;
+		bn = be64_to_cpu(*dataptrs);
+
+		if (bstart + blen == bn)
+			blen++;
+		else {
+			if (bstart)
+				gfs2_free_meta(ip, bstart, blen);
+			bstart = bn;
+			blen = 1;
+		}
+
+		*dataptrs = 0;
+		gfs2_add_inode_blocks(&ip->i_inode, -1);
+	}
+	if (bstart)
+		gfs2_free_meta(ip, bstart, blen);
+
+	if (prev && !leave) {
+		u32 len;
+
+		len = GFS2_EA_REC_LEN(prev) + GFS2_EA_REC_LEN(ea);
+		prev->ea_rec_len = cpu_to_be32(len);
+
+		if (GFS2_EA_IS_LAST(ea))
+			prev->ea_flags |= GFS2_EAFLAG_LAST;
+	} else {
+		ea->ea_type = GFS2_EATYPE_UNUSED;
+		ea->ea_num_ptrs = 0;
+	}
+
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (!error) {
+		ip->i_inode.i_ctime = CURRENT_TIME;
+		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+		gfs2_dinode_out(ip, dibh->b_data);
+		brelse(dibh);
+	}
+
+	gfs2_trans_end(sdp);
+
+out_gunlock:
+	gfs2_glock_dq_uninit(&rg_gh);
+	return error;
+}
+
+static int ea_remove_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
+			       struct gfs2_ea_header *ea,
+			       struct gfs2_ea_header *prev, int leave)
+{
+	struct gfs2_alloc *al;
+	int error;
+
+	al = gfs2_alloc_get(ip);
+	if (!al)
+		return -ENOMEM;
+
+	error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+	if (error)
+		goto out_alloc;
+
+	error = gfs2_rindex_hold(GFS2_SB(&ip->i_inode), &al->al_ri_gh);
+	if (error)
+		goto out_quota;
+
+	error = ea_dealloc_unstuffed(ip, bh, ea, prev, (leave) ? &error : NULL);
+
+	gfs2_glock_dq_uninit(&al->al_ri_gh);
+
+out_quota:
+	gfs2_quota_unhold(ip);
+out_alloc:
+	gfs2_alloc_put(ip);
+	return error;
+}
+
+struct ea_list {
+	struct gfs2_ea_request *ei_er;
+	unsigned int ei_size;
+};
+
+static inline unsigned int gfs2_ea_strlen(struct gfs2_ea_header *ea)
+{
+	switch (ea->ea_type) {
+	case GFS2_EATYPE_USR:
+		return 5 + ea->ea_name_len + 1;
+	case GFS2_EATYPE_SYS:
+		return 7 + ea->ea_name_len + 1;
+	case GFS2_EATYPE_SECURITY:
+		return 9 + ea->ea_name_len + 1;
+	default:
+		return 0;
+	}
+}
+
+static int ea_list_i(struct gfs2_inode *ip, struct buffer_head *bh,
+		     struct gfs2_ea_header *ea, struct gfs2_ea_header *prev,
+		     void *private)
+{
+	struct ea_list *ei = private;
+	struct gfs2_ea_request *er = ei->ei_er;
+	unsigned int ea_size = gfs2_ea_strlen(ea);
+
+	if (ea->ea_type == GFS2_EATYPE_UNUSED)
+		return 0;
+
+	if (er->er_data_len) {
+		char *prefix = NULL;
+		unsigned int l = 0;
+		char c = 0;
+
+		if (ei->ei_size + ea_size > er->er_data_len)
+			return -ERANGE;
+
+		switch (ea->ea_type) {
+		case GFS2_EATYPE_USR:
+			prefix = "user.";
+			l = 5;
+			break;
+		case GFS2_EATYPE_SYS:
+			prefix = "system.";
+			l = 7;
+			break;
+		case GFS2_EATYPE_SECURITY:
+			prefix = "security.";
+			l = 9;
+			break;
+		}
+
+		BUG_ON(l == 0);
+
+		memcpy(er->er_data + ei->ei_size, prefix, l);
+		memcpy(er->er_data + ei->ei_size + l, GFS2_EA2NAME(ea),
+		       ea->ea_name_len);
+		memcpy(er->er_data + ei->ei_size + ea_size - 1, &c, 1);
+	}
+
+	ei->ei_size += ea_size;
+
+	return 0;
+}
+
+/**
+ * gfs2_listxattr - List gfs2 extended attributes
+ * @dentry: The dentry whose inode we are interested in
+ * @buffer: The buffer to write the results
+ * @size: The size of the buffer
+ *
+ * Returns: actual size of data on success, -errno on error
+ */
+
+ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size)
+{
+	struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
+	struct gfs2_ea_request er;
+	struct gfs2_holder i_gh;
+	int error;
+
+	memset(&er, 0, sizeof(struct gfs2_ea_request));
+	if (size) {
+		er.er_data = buffer;
+		er.er_data_len = size;
+	}
+
+	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
+	if (error)
+		return error;
+
+	if (ip->i_eattr) {
+		struct ea_list ei = { .ei_er = &er, .ei_size = 0 };
+
+		error = ea_foreach(ip, ea_list_i, &ei);
+		if (!error)
+			error = ei.ei_size;
+	}
+
+	gfs2_glock_dq_uninit(&i_gh);
+
+	return error;
+}
+
+/**
+ * ea_get_unstuffed - actually copies the unstuffed data into the
+ *                    request buffer
+ * @ip: The GFS2 inode
+ * @ea: The extended attribute header structure
+ * @data: The data to be copied
+ *
+ * Returns: errno
+ */
+
+static int ea_get_unstuffed(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
+			    char *data)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct buffer_head **bh;
+	unsigned int amount = GFS2_EA_DATA_LEN(ea);
+	unsigned int nptrs = DIV_ROUND_UP(amount, sdp->sd_jbsize);
+	__be64 *dataptrs = GFS2_EA2DATAPTRS(ea);
+	unsigned int x;
+	int error = 0;
+
+	bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_NOFS);
+	if (!bh)
+		return -ENOMEM;
+
+	for (x = 0; x < nptrs; x++) {
+		error = gfs2_meta_read(ip->i_gl, be64_to_cpu(*dataptrs), 0,
+				       bh + x);
+		if (error) {
+			while (x--)
+				brelse(bh[x]);
+			goto out;
+		}
+		dataptrs++;
+	}
+
+	for (x = 0; x < nptrs; x++) {
+		error = gfs2_meta_wait(sdp, bh[x]);
+		if (error) {
+			for (; x < nptrs; x++)
+				brelse(bh[x]);
+			goto out;
+		}
+		if (gfs2_metatype_check(sdp, bh[x], GFS2_METATYPE_ED)) {
+			for (; x < nptrs; x++)
+				brelse(bh[x]);
+			error = -EIO;
+			goto out;
+		}
+
+		memcpy(data, bh[x]->b_data + sizeof(struct gfs2_meta_header),
+		       (sdp->sd_jbsize > amount) ? amount : sdp->sd_jbsize);
+
+		amount -= sdp->sd_jbsize;
+		data += sdp->sd_jbsize;
+
+		brelse(bh[x]);
+	}
+
+out:
+	kfree(bh);
+	return error;
+}
+
+int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el,
+		     char *data, size_t size)
+{
+	int ret;
+	size_t len = GFS2_EA_DATA_LEN(el->el_ea);
+	if (len > size)
+		return -ERANGE;
+
+	if (GFS2_EA_IS_STUFFED(el->el_ea)) {
+		memcpy(data, GFS2_EA2DATA(el->el_ea), len);
+		return len;
+	}
+	ret = ea_get_unstuffed(ip, el->el_ea, data);
+	if (ret < 0)
+		return ret;
+	return len;
+}
+
+/**
+ * gfs2_xattr_get - Get a GFS2 extended attribute
+ * @inode: The inode
+ * @type: The type of extended attribute
+ * @name: The name of the extended attribute
+ * @buffer: The buffer to write the result into
+ * @size: The size of the buffer
+ *
+ * Returns: actual size of data on success, -errno on error
+ */
+
+int gfs2_xattr_get(struct inode *inode, int type, const char *name,
+		   void *buffer, size_t size)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_ea_location el;
+	int error;
+
+	if (!ip->i_eattr)
+		return -ENODATA;
+	if (strlen(name) > GFS2_EA_MAX_NAME_LEN)
+		return -EINVAL;
+
+	error = gfs2_ea_find(ip, type, name, &el);
+	if (error)
+		return error;
+	if (!el.el_ea)
+		return -ENODATA;
+	if (size) 
+		error = gfs2_ea_get_copy(ip, &el, buffer, size);
+	else
+		error = GFS2_EA_DATA_LEN(el.el_ea);
+	brelse(el.el_bh);
+
+	return error;
+}
+
+/**
+ * ea_alloc_blk - allocates a new block for extended attributes.
+ * @ip: A pointer to the inode that's getting extended attributes
+ * @bhp: Pointer to pointer to a struct buffer_head
+ *
+ * Returns: errno
+ */
+
+static int ea_alloc_blk(struct gfs2_inode *ip, struct buffer_head **bhp)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct gfs2_ea_header *ea;
+	unsigned int n = 1;
+	u64 block;
+	int error;
+
+	error = gfs2_alloc_block(ip, &block, &n);
+	if (error)
+		return error;
+	gfs2_trans_add_unrevoke(sdp, block, 1);
+	*bhp = gfs2_meta_new(ip->i_gl, block);
+	gfs2_trans_add_bh(ip->i_gl, *bhp, 1);
+	gfs2_metatype_set(*bhp, GFS2_METATYPE_EA, GFS2_FORMAT_EA);
+	gfs2_buffer_clear_tail(*bhp, sizeof(struct gfs2_meta_header));
+
+	ea = GFS2_EA_BH2FIRST(*bhp);
+	ea->ea_rec_len = cpu_to_be32(sdp->sd_jbsize);
+	ea->ea_type = GFS2_EATYPE_UNUSED;
+	ea->ea_flags = GFS2_EAFLAG_LAST;
+	ea->ea_num_ptrs = 0;
+
+	gfs2_add_inode_blocks(&ip->i_inode, 1);
+
+	return 0;
+}
+
+/**
+ * ea_write - writes the request info to an ea, creating new blocks if
+ *            necessary
+ * @ip: inode that is being modified
+ * @ea: the location of the new ea in a block
+ * @er: the write request
+ *
+ * Note: does not update ea_rec_len or the GFS2_EAFLAG_LAST bin of ea_flags
+ *
+ * returns : errno
+ */
+
+static int ea_write(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
+		    struct gfs2_ea_request *er)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	int error;
+
+	ea->ea_data_len = cpu_to_be32(er->er_data_len);
+	ea->ea_name_len = er->er_name_len;
+	ea->ea_type = er->er_type;
+	ea->__pad = 0;
+
+	memcpy(GFS2_EA2NAME(ea), er->er_name, er->er_name_len);
+
+	if (GFS2_EAREQ_SIZE_STUFFED(er) <= sdp->sd_jbsize) {
+		ea->ea_num_ptrs = 0;
+		memcpy(GFS2_EA2DATA(ea), er->er_data, er->er_data_len);
+	} else {
+		__be64 *dataptr = GFS2_EA2DATAPTRS(ea);
+		const char *data = er->er_data;
+		unsigned int data_len = er->er_data_len;
+		unsigned int copy;
+		unsigned int x;
+
+		ea->ea_num_ptrs = DIV_ROUND_UP(er->er_data_len, sdp->sd_jbsize);
+		for (x = 0; x < ea->ea_num_ptrs; x++) {
+			struct buffer_head *bh;
+			u64 block;
+			int mh_size = sizeof(struct gfs2_meta_header);
+			unsigned int n = 1;
+
+			error = gfs2_alloc_block(ip, &block, &n);
+			if (error)
+				return error;
+			gfs2_trans_add_unrevoke(sdp, block, 1);
+			bh = gfs2_meta_new(ip->i_gl, block);
+			gfs2_trans_add_bh(ip->i_gl, bh, 1);
+			gfs2_metatype_set(bh, GFS2_METATYPE_ED, GFS2_FORMAT_ED);
+
+			gfs2_add_inode_blocks(&ip->i_inode, 1);
+
+			copy = data_len > sdp->sd_jbsize ? sdp->sd_jbsize :
+							   data_len;
+			memcpy(bh->b_data + mh_size, data, copy);
+			if (copy < sdp->sd_jbsize)
+				memset(bh->b_data + mh_size + copy, 0,
+				       sdp->sd_jbsize - copy);
+
+			*dataptr++ = cpu_to_be64(bh->b_blocknr);
+			data += copy;
+			data_len -= copy;
+
+			brelse(bh);
+		}
+
+		gfs2_assert_withdraw(sdp, !data_len);
+	}
+
+	return 0;
+}
+
+typedef int (*ea_skeleton_call_t) (struct gfs2_inode *ip,
+				   struct gfs2_ea_request *er, void *private);
+
+static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
+			     unsigned int blks,
+			     ea_skeleton_call_t skeleton_call, void *private)
+{
+	struct gfs2_alloc *al;
+	struct buffer_head *dibh;
+	int error;
+
+	al = gfs2_alloc_get(ip);
+	if (!al)
+		return -ENOMEM;
+
+	error = gfs2_quota_lock_check(ip);
+	if (error)
+		goto out;
+
+	al->al_requested = blks;
+
+	error = gfs2_inplace_reserve(ip);
+	if (error)
+		goto out_gunlock_q;
+
+	error = gfs2_trans_begin(GFS2_SB(&ip->i_inode),
+				 blks + al->al_rgd->rd_length +
+				 RES_DINODE + RES_STATFS + RES_QUOTA, 0);
+	if (error)
+		goto out_ipres;
+
+	error = skeleton_call(ip, er, private);
+	if (error)
+		goto out_end_trans;
+
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (!error) {
+		ip->i_inode.i_ctime = CURRENT_TIME;
+		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+		gfs2_dinode_out(ip, dibh->b_data);
+		brelse(dibh);
+	}
+
+out_end_trans:
+	gfs2_trans_end(GFS2_SB(&ip->i_inode));
+out_ipres:
+	gfs2_inplace_release(ip);
+out_gunlock_q:
+	gfs2_quota_unlock(ip);
+out:
+	gfs2_alloc_put(ip);
+	return error;
+}
+
+static int ea_init_i(struct gfs2_inode *ip, struct gfs2_ea_request *er,
+		     void *private)
+{
+	struct buffer_head *bh;
+	int error;
+
+	error = ea_alloc_blk(ip, &bh);
+	if (error)
+		return error;
+
+	ip->i_eattr = bh->b_blocknr;
+	error = ea_write(ip, GFS2_EA_BH2FIRST(bh), er);
+
+	brelse(bh);
+
+	return error;
+}
+
+/**
+ * ea_init - initializes a new eattr block
+ * @ip:
+ * @er:
+ *
+ * Returns: errno
+ */
+
+static int ea_init(struct gfs2_inode *ip, int type, const char *name,
+		   const void *data, size_t size)
+{
+	struct gfs2_ea_request er;
+	unsigned int jbsize = GFS2_SB(&ip->i_inode)->sd_jbsize;
+	unsigned int blks = 1;
+
+	er.er_type = type;
+	er.er_name = name;
+	er.er_name_len = strlen(name);
+	er.er_data = (void *)data;
+	er.er_data_len = size;
+
+	if (GFS2_EAREQ_SIZE_STUFFED(&er) > jbsize)
+		blks += DIV_ROUND_UP(er.er_data_len, jbsize);
+
+	return ea_alloc_skeleton(ip, &er, blks, ea_init_i, NULL);
+}
+
+static struct gfs2_ea_header *ea_split_ea(struct gfs2_ea_header *ea)
+{
+	u32 ea_size = GFS2_EA_SIZE(ea);
+	struct gfs2_ea_header *new = (struct gfs2_ea_header *)((char *)ea +
+				     ea_size);
+	u32 new_size = GFS2_EA_REC_LEN(ea) - ea_size;
+	int last = ea->ea_flags & GFS2_EAFLAG_LAST;
+
+	ea->ea_rec_len = cpu_to_be32(ea_size);
+	ea->ea_flags ^= last;
+
+	new->ea_rec_len = cpu_to_be32(new_size);
+	new->ea_flags = last;
+
+	return new;
+}
+
+static void ea_set_remove_stuffed(struct gfs2_inode *ip,
+				  struct gfs2_ea_location *el)
+{
+	struct gfs2_ea_header *ea = el->el_ea;
+	struct gfs2_ea_header *prev = el->el_prev;
+	u32 len;
+
+	gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1);
+
+	if (!prev || !GFS2_EA_IS_STUFFED(ea)) {
+		ea->ea_type = GFS2_EATYPE_UNUSED;
+		return;
+	} else if (GFS2_EA2NEXT(prev) != ea) {
+		prev = GFS2_EA2NEXT(prev);
+		gfs2_assert_withdraw(GFS2_SB(&ip->i_inode), GFS2_EA2NEXT(prev) == ea);
+	}
+
+	len = GFS2_EA_REC_LEN(prev) + GFS2_EA_REC_LEN(ea);
+	prev->ea_rec_len = cpu_to_be32(len);
+
+	if (GFS2_EA_IS_LAST(ea))
+		prev->ea_flags |= GFS2_EAFLAG_LAST;
+}
+
+struct ea_set {
+	int ea_split;
+
+	struct gfs2_ea_request *es_er;
+	struct gfs2_ea_location *es_el;
+
+	struct buffer_head *es_bh;
+	struct gfs2_ea_header *es_ea;
+};
+
+static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh,
+				 struct gfs2_ea_header *ea, struct ea_set *es)
+{
+	struct gfs2_ea_request *er = es->es_er;
+	struct buffer_head *dibh;
+	int error;
+
+	error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + 2 * RES_EATTR, 0);
+	if (error)
+		return error;
+
+	gfs2_trans_add_bh(ip->i_gl, bh, 1);
+
+	if (es->ea_split)
+		ea = ea_split_ea(ea);
+
+	ea_write(ip, ea, er);
+
+	if (es->es_el)
+		ea_set_remove_stuffed(ip, es->es_el);
+
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (error)
+		goto out;
+	ip->i_inode.i_ctime = CURRENT_TIME;
+	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+	gfs2_dinode_out(ip, dibh->b_data);
+	brelse(dibh);
+out:
+	gfs2_trans_end(GFS2_SB(&ip->i_inode));
+	return error;
+}
+
+static int ea_set_simple_alloc(struct gfs2_inode *ip,
+			       struct gfs2_ea_request *er, void *private)
+{
+	struct ea_set *es = private;
+	struct gfs2_ea_header *ea = es->es_ea;
+	int error;
+
+	gfs2_trans_add_bh(ip->i_gl, es->es_bh, 1);
+
+	if (es->ea_split)
+		ea = ea_split_ea(ea);
+
+	error = ea_write(ip, ea, er);
+	if (error)
+		return error;
+
+	if (es->es_el)
+		ea_set_remove_stuffed(ip, es->es_el);
+
+	return 0;
+}
+
+static int ea_set_simple(struct gfs2_inode *ip, struct buffer_head *bh,
+			 struct gfs2_ea_header *ea, struct gfs2_ea_header *prev,
+			 void *private)
+{
+	struct ea_set *es = private;
+	unsigned int size;
+	int stuffed;
+	int error;
+
+	stuffed = ea_calc_size(GFS2_SB(&ip->i_inode), es->es_er->er_name_len,
+			       es->es_er->er_data_len, &size);
+
+	if (ea->ea_type == GFS2_EATYPE_UNUSED) {
+		if (GFS2_EA_REC_LEN(ea) < size)
+			return 0;
+		if (!GFS2_EA_IS_STUFFED(ea)) {
+			error = ea_remove_unstuffed(ip, bh, ea, prev, 1);
+			if (error)
+				return error;
+		}
+		es->ea_split = 0;
+	} else if (GFS2_EA_REC_LEN(ea) - GFS2_EA_SIZE(ea) >= size)
+		es->ea_split = 1;
+	else
+		return 0;
+
+	if (stuffed) {
+		error = ea_set_simple_noalloc(ip, bh, ea, es);
+		if (error)
+			return error;
+	} else {
+		unsigned int blks;
+
+		es->es_bh = bh;
+		es->es_ea = ea;
+		blks = 2 + DIV_ROUND_UP(es->es_er->er_data_len,
+					GFS2_SB(&ip->i_inode)->sd_jbsize);
+
+		error = ea_alloc_skeleton(ip, es->es_er, blks,
+					  ea_set_simple_alloc, es);
+		if (error)
+			return error;
+	}
+
+	return 1;
+}
+
+static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er,
+			void *private)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct buffer_head *indbh, *newbh;
+	__be64 *eablk;
+	int error;
+	int mh_size = sizeof(struct gfs2_meta_header);
+
+	if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) {
+		__be64 *end;
+
+		error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT,
+				       &indbh);
+		if (error)
+			return error;
+
+		if (gfs2_metatype_check(sdp, indbh, GFS2_METATYPE_IN)) {
+			error = -EIO;
+			goto out;
+		}
+
+		eablk = (__be64 *)(indbh->b_data + mh_size);
+		end = eablk + sdp->sd_inptrs;
+
+		for (; eablk < end; eablk++)
+			if (!*eablk)
+				break;
+
+		if (eablk == end) {
+			error = -ENOSPC;
+			goto out;
+		}
+
+		gfs2_trans_add_bh(ip->i_gl, indbh, 1);
+	} else {
+		u64 blk;
+		unsigned int n = 1;
+		error = gfs2_alloc_block(ip, &blk, &n);
+		if (error)
+			return error;
+		gfs2_trans_add_unrevoke(sdp, blk, 1);
+		indbh = gfs2_meta_new(ip->i_gl, blk);
+		gfs2_trans_add_bh(ip->i_gl, indbh, 1);
+		gfs2_metatype_set(indbh, GFS2_METATYPE_IN, GFS2_FORMAT_IN);
+		gfs2_buffer_clear_tail(indbh, mh_size);
+
+		eablk = (__be64 *)(indbh->b_data + mh_size);
+		*eablk = cpu_to_be64(ip->i_eattr);
+		ip->i_eattr = blk;
+		ip->i_diskflags |= GFS2_DIF_EA_INDIRECT;
+		gfs2_add_inode_blocks(&ip->i_inode, 1);
+
+		eablk++;
+	}
+
+	error = ea_alloc_blk(ip, &newbh);
+	if (error)
+		goto out;
+
+	*eablk = cpu_to_be64((u64)newbh->b_blocknr);
+	error = ea_write(ip, GFS2_EA_BH2FIRST(newbh), er);
+	brelse(newbh);
+	if (error)
+		goto out;
+
+	if (private)
+		ea_set_remove_stuffed(ip, private);
+
+out:
+	brelse(indbh);
+	return error;
+}
+
+static int ea_set_i(struct gfs2_inode *ip, int type, const char *name,
+		    const void *value, size_t size, struct gfs2_ea_location *el)
+{
+	struct gfs2_ea_request er;
+	struct ea_set es;
+	unsigned int blks = 2;
+	int error;
+
+	er.er_type = type;
+	er.er_name = name;
+	er.er_data = (void *)value;
+	er.er_name_len = strlen(name);
+	er.er_data_len = size;
+
+	memset(&es, 0, sizeof(struct ea_set));
+	es.es_er = &er;
+	es.es_el = el;
+
+	error = ea_foreach(ip, ea_set_simple, &es);
+	if (error > 0)
+		return 0;
+	if (error)
+		return error;
+
+	if (!(ip->i_diskflags & GFS2_DIF_EA_INDIRECT))
+		blks++;
+	if (GFS2_EAREQ_SIZE_STUFFED(&er) > GFS2_SB(&ip->i_inode)->sd_jbsize)
+		blks += DIV_ROUND_UP(er.er_data_len, GFS2_SB(&ip->i_inode)->sd_jbsize);
+
+	return ea_alloc_skeleton(ip, &er, blks, ea_set_block, el);
+}
+
+static int ea_set_remove_unstuffed(struct gfs2_inode *ip,
+				   struct gfs2_ea_location *el)
+{
+	if (el->el_prev && GFS2_EA2NEXT(el->el_prev) != el->el_ea) {
+		el->el_prev = GFS2_EA2NEXT(el->el_prev);
+		gfs2_assert_withdraw(GFS2_SB(&ip->i_inode),
+				     GFS2_EA2NEXT(el->el_prev) == el->el_ea);
+	}
+
+	return ea_remove_unstuffed(ip, el->el_bh, el->el_ea, el->el_prev,0);
+}
+
+static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el)
+{
+	struct gfs2_ea_header *ea = el->el_ea;
+	struct gfs2_ea_header *prev = el->el_prev;
+	struct buffer_head *dibh;
+	int error;
+
+	error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + RES_EATTR, 0);
+	if (error)
+		return error;
+
+	gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1);
+
+	if (prev) {
+		u32 len;
+
+		len = GFS2_EA_REC_LEN(prev) + GFS2_EA_REC_LEN(ea);
+		prev->ea_rec_len = cpu_to_be32(len);
+
+		if (GFS2_EA_IS_LAST(ea))
+			prev->ea_flags |= GFS2_EAFLAG_LAST;
+	} else {
+		ea->ea_type = GFS2_EATYPE_UNUSED;
+	}
+
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (!error) {
+		ip->i_inode.i_ctime = CURRENT_TIME;
+		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+		gfs2_dinode_out(ip, dibh->b_data);
+		brelse(dibh);
+	}
+
+	gfs2_trans_end(GFS2_SB(&ip->i_inode));
+
+	return error;
+}
+
+/**
+ * gfs2_xattr_remove - Remove a GFS2 extended attribute
+ * @inode: The inode
+ * @type: The type of the extended attribute
+ * @name: The name of the extended attribute
+ *
+ * This is not called directly by the VFS since we use the (common)
+ * scheme of making a "set with NULL data" mean a remove request. Note
+ * that this is different from a set with zero length data.
+ *
+ * Returns: 0, or errno on failure
+ */
+
+static int gfs2_xattr_remove(struct inode *inode, int type, const char *name)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_ea_location el;
+	int error;
+
+	if (!ip->i_eattr)
+		return -ENODATA;
+
+	error = gfs2_ea_find(ip, type, name, &el);
+	if (error)
+		return error;
+	if (!el.el_ea)
+		return -ENODATA;
+
+	if (GFS2_EA_IS_STUFFED(el.el_ea))
+		error = ea_remove_stuffed(ip, &el);
+	else
+		error = ea_remove_unstuffed(ip, el.el_bh, el.el_ea, el.el_prev, 0);
+
+	brelse(el.el_bh);
+
+	return error;
+}
+
+/**
+ * gfs2_xattr_set - Set (or remove) a GFS2 extended attribute
+ * @inode: The inode
+ * @type: The type of the extended attribute
+ * @name: The name of the extended attribute
+ * @value: The value of the extended attribute (NULL for remove)
+ * @size: The size of the @value argument
+ * @flags: Create or Replace
+ *
+ * See gfs2_xattr_remove() for details of the removal of xattrs.
+ *
+ * Returns: 0 or errno on failure
+ */
+
+int gfs2_xattr_set(struct inode *inode, int type, const char *name,
+		   const void *value, size_t size, int flags)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_ea_location el;
+	unsigned int namel = strlen(name);
+	int error;
+
+	if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
+		return -EPERM;
+	if (namel > GFS2_EA_MAX_NAME_LEN)
+		return -ERANGE;
+
+	if (value == NULL)
+		return gfs2_xattr_remove(inode, type, name);
+
+	if (ea_check_size(sdp, namel, size))
+		return -ERANGE;
+
+	if (!ip->i_eattr) {
+		if (flags & XATTR_REPLACE)
+			return -ENODATA;
+		return ea_init(ip, type, name, value, size);
+	}
+
+	error = gfs2_ea_find(ip, type, name, &el);
+	if (error)
+		return error;
+
+	if (el.el_ea) {
+		if (ip->i_diskflags & GFS2_DIF_APPENDONLY) {
+			brelse(el.el_bh);
+			return -EPERM;
+		}
+
+		error = -EEXIST;
+		if (!(flags & XATTR_CREATE)) {
+			int unstuffed = !GFS2_EA_IS_STUFFED(el.el_ea);
+			error = ea_set_i(ip, type, name, value, size, &el);
+			if (!error && unstuffed)
+				ea_set_remove_unstuffed(ip, &el);
+		}
+
+		brelse(el.el_bh);
+		return error;
+	}
+
+	error = -ENODATA;
+	if (!(flags & XATTR_REPLACE))
+		error = ea_set_i(ip, type, name, value, size, NULL);
+
+	return error;
+}
+
+static int ea_acl_chmod_unstuffed(struct gfs2_inode *ip,
+				  struct gfs2_ea_header *ea, char *data)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct buffer_head **bh;
+	unsigned int amount = GFS2_EA_DATA_LEN(ea);
+	unsigned int nptrs = DIV_ROUND_UP(amount, sdp->sd_jbsize);
+	__be64 *dataptrs = GFS2_EA2DATAPTRS(ea);
+	unsigned int x;
+	int error;
+
+	bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_NOFS);
+	if (!bh)
+		return -ENOMEM;
+
+	error = gfs2_trans_begin(sdp, nptrs + RES_DINODE, 0);
+	if (error)
+		goto out;
+
+	for (x = 0; x < nptrs; x++) {
+		error = gfs2_meta_read(ip->i_gl, be64_to_cpu(*dataptrs), 0,
+				       bh + x);
+		if (error) {
+			while (x--)
+				brelse(bh[x]);
+			goto fail;
+		}
+		dataptrs++;
+	}
+
+	for (x = 0; x < nptrs; x++) {
+		error = gfs2_meta_wait(sdp, bh[x]);
+		if (error) {
+			for (; x < nptrs; x++)
+				brelse(bh[x]);
+			goto fail;
+		}
+		if (gfs2_metatype_check(sdp, bh[x], GFS2_METATYPE_ED)) {
+			for (; x < nptrs; x++)
+				brelse(bh[x]);
+			error = -EIO;
+			goto fail;
+		}
+
+		gfs2_trans_add_bh(ip->i_gl, bh[x], 1);
+
+		memcpy(bh[x]->b_data + sizeof(struct gfs2_meta_header), data,
+		       (sdp->sd_jbsize > amount) ? amount : sdp->sd_jbsize);
+
+		amount -= sdp->sd_jbsize;
+		data += sdp->sd_jbsize;
+
+		brelse(bh[x]);
+	}
+
+out:
+	kfree(bh);
+	return error;
+
+fail:
+	gfs2_trans_end(sdp);
+	kfree(bh);
+	return error;
+}
+
+int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el,
+		      struct iattr *attr, char *data)
+{
+	struct buffer_head *dibh;
+	int error;
+
+	if (GFS2_EA_IS_STUFFED(el->el_ea)) {
+		error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + RES_EATTR, 0);
+		if (error)
+			return error;
+
+		gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1);
+		memcpy(GFS2_EA2DATA(el->el_ea), data,
+		       GFS2_EA_DATA_LEN(el->el_ea));
+	} else
+		error = ea_acl_chmod_unstuffed(ip, el->el_ea, data);
+
+	if (error)
+		return error;
+
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (!error) {
+		error = inode_setattr(&ip->i_inode, attr);
+		gfs2_assert_warn(GFS2_SB(&ip->i_inode), !error);
+		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+		gfs2_dinode_out(ip, dibh->b_data);
+		brelse(dibh);
+	}
+
+	gfs2_trans_end(GFS2_SB(&ip->i_inode));
+
+	return error;
+}
+
+static int ea_dealloc_indirect(struct gfs2_inode *ip)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct gfs2_rgrp_list rlist;
+	struct buffer_head *indbh, *dibh;
+	__be64 *eablk, *end;
+	unsigned int rg_blocks = 0;
+	u64 bstart = 0;
+	unsigned int blen = 0;
+	unsigned int blks = 0;
+	unsigned int x;
+	int error;
+
+	memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
+
+	error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, &indbh);
+	if (error)
+		return error;
+
+	if (gfs2_metatype_check(sdp, indbh, GFS2_METATYPE_IN)) {
+		error = -EIO;
+		goto out;
+	}
+
+	eablk = (__be64 *)(indbh->b_data + sizeof(struct gfs2_meta_header));
+	end = eablk + sdp->sd_inptrs;
+
+	for (; eablk < end; eablk++) {
+		u64 bn;
+
+		if (!*eablk)
+			break;
+		bn = be64_to_cpu(*eablk);
+
+		if (bstart + blen == bn)
+			blen++;
+		else {
+			if (bstart)
+				gfs2_rlist_add(sdp, &rlist, bstart);
+			bstart = bn;
+			blen = 1;
+		}
+		blks++;
+	}
+	if (bstart)
+		gfs2_rlist_add(sdp, &rlist, bstart);
+	else
+		goto out;
+
+	gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE);
+
+	for (x = 0; x < rlist.rl_rgrps; x++) {
+		struct gfs2_rgrpd *rgd;
+		rgd = rlist.rl_ghs[x].gh_gl->gl_object;
+		rg_blocks += rgd->rd_length;
+	}
+
+	error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs);
+	if (error)
+		goto out_rlist_free;
+
+	error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE + RES_INDIRECT +
+				 RES_STATFS + RES_QUOTA, blks);
+	if (error)
+		goto out_gunlock;
+
+	gfs2_trans_add_bh(ip->i_gl, indbh, 1);
+
+	eablk = (__be64 *)(indbh->b_data + sizeof(struct gfs2_meta_header));
+	bstart = 0;
+	blen = 0;
+
+	for (; eablk < end; eablk++) {
+		u64 bn;
+
+		if (!*eablk)
+			break;
+		bn = be64_to_cpu(*eablk);
+
+		if (bstart + blen == bn)
+			blen++;
+		else {
+			if (bstart)
+				gfs2_free_meta(ip, bstart, blen);
+			bstart = bn;
+			blen = 1;
+		}
+
+		*eablk = 0;
+		gfs2_add_inode_blocks(&ip->i_inode, -1);
+	}
+	if (bstart)
+		gfs2_free_meta(ip, bstart, blen);
+
+	ip->i_diskflags &= ~GFS2_DIF_EA_INDIRECT;
+
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (!error) {
+		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+		gfs2_dinode_out(ip, dibh->b_data);
+		brelse(dibh);
+	}
+
+	gfs2_trans_end(sdp);
+
+out_gunlock:
+	gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
+out_rlist_free:
+	gfs2_rlist_free(&rlist);
+out:
+	brelse(indbh);
+	return error;
+}
+
+static int ea_dealloc_block(struct gfs2_inode *ip)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct gfs2_alloc *al = ip->i_alloc;
+	struct gfs2_rgrpd *rgd;
+	struct buffer_head *dibh;
+	int error;
+
+	rgd = gfs2_blk2rgrpd(sdp, ip->i_eattr);
+	if (!rgd) {
+		gfs2_consist_inode(ip);
+		return -EIO;
+	}
+
+	error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0,
+				   &al->al_rgd_gh);
+	if (error)
+		return error;
+
+	error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_DINODE + RES_STATFS +
+				 RES_QUOTA, 1);
+	if (error)
+		goto out_gunlock;
+
+	gfs2_free_meta(ip, ip->i_eattr, 1);
+
+	ip->i_eattr = 0;
+	gfs2_add_inode_blocks(&ip->i_inode, -1);
+
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (!error) {
+		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+		gfs2_dinode_out(ip, dibh->b_data);
+		brelse(dibh);
+	}
+
+	gfs2_trans_end(sdp);
+
+out_gunlock:
+	gfs2_glock_dq_uninit(&al->al_rgd_gh);
+	return error;
+}
+
+/**
+ * gfs2_ea_dealloc - deallocate the extended attribute fork
+ * @ip: the inode
+ *
+ * Returns: errno
+ */
+
+int gfs2_ea_dealloc(struct gfs2_inode *ip)
+{
+	struct gfs2_alloc *al;
+	int error;
+
+	al = gfs2_alloc_get(ip);
+	if (!al)
+		return -ENOMEM;
+
+	error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+	if (error)
+		goto out_alloc;
+
+	error = gfs2_rindex_hold(GFS2_SB(&ip->i_inode), &al->al_ri_gh);
+	if (error)
+		goto out_quota;
+
+	error = ea_foreach(ip, ea_dealloc_unstuffed, NULL);
+	if (error)
+		goto out_rindex;
+
+	if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) {
+		error = ea_dealloc_indirect(ip);
+		if (error)
+			goto out_rindex;
+	}
+
+	error = ea_dealloc_block(ip);
+
+out_rindex:
+	gfs2_glock_dq_uninit(&al->al_ri_gh);
+out_quota:
+	gfs2_quota_unhold(ip);
+out_alloc:
+	gfs2_alloc_put(ip);
+	return error;
+}
+
+static int gfs2_xattr_user_get(struct inode *inode, const char *name,
+			       void *buffer, size_t size)
+{
+	return gfs2_xattr_get(inode, GFS2_EATYPE_USR, name, buffer, size);
+}
+
+static int gfs2_xattr_user_set(struct inode *inode, const char *name,
+			       const void *value, size_t size, int flags)
+{
+	return gfs2_xattr_set(inode, GFS2_EATYPE_USR, name, value, size, flags);
+}
+
+static int gfs2_xattr_system_get(struct inode *inode, const char *name,
+				 void *buffer, size_t size)
+{
+	return gfs2_xattr_get(inode, GFS2_EATYPE_SYS, name, buffer, size);
+}
+
+static int gfs2_xattr_system_set(struct inode *inode, const char *name,
+				 const void *value, size_t size, int flags)
+{
+	return gfs2_xattr_set(inode, GFS2_EATYPE_SYS, name, value, size, flags);
+}
+
+static int gfs2_xattr_security_get(struct inode *inode, const char *name,
+				   void *buffer, size_t size)
+{
+	return gfs2_xattr_get(inode, GFS2_EATYPE_SECURITY, name, buffer, size);
+}
+
+static int gfs2_xattr_security_set(struct inode *inode, const char *name,
+				   const void *value, size_t size, int flags)
+{
+	return gfs2_xattr_set(inode, GFS2_EATYPE_SECURITY, name, value, size, flags);
+}
+
+static struct xattr_handler gfs2_xattr_user_handler = {
+	.prefix = XATTR_USER_PREFIX,
+	.get    = gfs2_xattr_user_get,
+	.set    = gfs2_xattr_user_set,
+};
+
+static struct xattr_handler gfs2_xattr_security_handler = {
+	.prefix = XATTR_SECURITY_PREFIX,
+	.get    = gfs2_xattr_security_get,
+	.set    = gfs2_xattr_security_set,
+};
+
+static struct xattr_handler gfs2_xattr_system_handler = {
+	.prefix = XATTR_SYSTEM_PREFIX,
+	.get    = gfs2_xattr_system_get,
+	.set    = gfs2_xattr_system_set,
+};
+
+struct xattr_handler *gfs2_xattr_handlers[] = {
+	&gfs2_xattr_user_handler,
+	&gfs2_xattr_security_handler,
+	&gfs2_xattr_system_handler,
+	NULL,
+};
+
diff --git a/fs/gfs2/xattr.h b/fs/gfs2/xattr.h
new file mode 100644
index 000000000000..4040a188f63b
--- /dev/null
+++ b/fs/gfs2/xattr.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __EATTR_DOT_H__
+#define __EATTR_DOT_H__
+
+struct gfs2_inode;
+struct iattr;
+
+#define GFS2_EA_REC_LEN(ea) be32_to_cpu((ea)->ea_rec_len)
+#define GFS2_EA_DATA_LEN(ea) be32_to_cpu((ea)->ea_data_len)
+
+#define GFS2_EA_SIZE(ea) \
+ALIGN(sizeof(struct gfs2_ea_header) + (ea)->ea_name_len + \
+      ((GFS2_EA_IS_STUFFED(ea)) ? GFS2_EA_DATA_LEN(ea) : \
+                                  (sizeof(__be64) * (ea)->ea_num_ptrs)), 8)
+
+#define GFS2_EA_IS_STUFFED(ea) (!(ea)->ea_num_ptrs)
+#define GFS2_EA_IS_LAST(ea) ((ea)->ea_flags & GFS2_EAFLAG_LAST)
+
+#define GFS2_EAREQ_SIZE_STUFFED(er) \
+ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + (er)->er_data_len, 8)
+
+#define GFS2_EA2NAME(ea) ((char *)((struct gfs2_ea_header *)(ea) + 1))
+#define GFS2_EA2DATA(ea) (GFS2_EA2NAME(ea) + (ea)->ea_name_len)
+
+#define GFS2_EA2DATAPTRS(ea) \
+((__be64 *)(GFS2_EA2NAME(ea) + ALIGN((ea)->ea_name_len, 8)))
+
+#define GFS2_EA2NEXT(ea) \
+((struct gfs2_ea_header *)((char *)(ea) + GFS2_EA_REC_LEN(ea)))
+
+#define GFS2_EA_BH2FIRST(bh) \
+((struct gfs2_ea_header *)((bh)->b_data + sizeof(struct gfs2_meta_header)))
+
+struct gfs2_ea_request {
+	const char *er_name;
+	char *er_data;
+	unsigned int er_name_len;
+	unsigned int er_data_len;
+	unsigned int er_type; /* GFS2_EATYPE_... */
+};
+
+struct gfs2_ea_location {
+	struct buffer_head *el_bh;
+	struct gfs2_ea_header *el_ea;
+	struct gfs2_ea_header *el_prev;
+};
+
+extern int gfs2_xattr_get(struct inode *inode, int type, const char *name,
+			  void *buffer, size_t size);
+extern int gfs2_xattr_set(struct inode *inode, int type, const char *name,
+			  const void *value, size_t size, int flags);
+extern ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size);
+extern int gfs2_ea_dealloc(struct gfs2_inode *ip);
+
+/* Exported to acl.c */
+
+extern int gfs2_ea_find(struct gfs2_inode *ip, int type, const char *name,
+			struct gfs2_ea_location *el);
+extern int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el,
+			    char *data, size_t size);
+extern int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el,
+			     struct iattr *attr, char *data);
+
+#endif /* __EATTR_DOT_H__ */
-- 
cgit v1.2.3-70-g09d2


From 52cef7555adf5ca09b3b7283097466759120d901 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Mon, 24 Aug 2009 16:03:35 -0400
Subject: inotify: seperate new watch creation updating existing watches

There is nothing known wrong with the inotify watch addition/modification
but this patch seperates the two code paths to make them each easy to
verify as correct.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/inotify/inotify_user.c | 172 +++++++++++++++++++++++----------------
 1 file changed, 103 insertions(+), 69 deletions(-)

(limited to 'fs')

diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index dc32ed8323ba..d8f73c253073 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -431,80 +431,29 @@ static void inotify_free_mark(struct fsnotify_mark_entry *entry)
 	kmem_cache_free(inotify_inode_mark_cachep, ientry);
 }
 
-static int inotify_update_watch(struct fsnotify_group *group, struct inode *inode, u32 arg)
+static int inotify_update_existing_watch(struct fsnotify_group *group,
+					 struct inode *inode,
+					 u32 arg)
 {
-	struct fsnotify_mark_entry *entry = NULL;
+	struct fsnotify_mark_entry *entry;
 	struct inotify_inode_mark_entry *ientry;
-	struct inotify_inode_mark_entry *tmp_ientry;
-	int ret = 0;
-	int add = (arg & IN_MASK_ADD);
-	__u32 mask;
 	__u32 old_mask, new_mask;
+	__u32 mask;
+	int add = (arg & IN_MASK_ADD);
+	int ret;
 
 	/* don't allow invalid bits: we don't want flags set */
 	mask = inotify_arg_to_mask(arg);
 	if (unlikely(!mask))
 		return -EINVAL;
 
-	tmp_ientry = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL);
-	if (unlikely(!tmp_ientry))
-		return -ENOMEM;
-	/* we set the mask at the end after attaching it */
-	fsnotify_init_mark(&tmp_ientry->fsn_entry, inotify_free_mark);
-	tmp_ientry->wd = -1;
-
-find_entry:
 	spin_lock(&inode->i_lock);
 	entry = fsnotify_find_mark_entry(group, inode);
 	spin_unlock(&inode->i_lock);
-	if (entry) {
-		ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry);
-	} else {
-		ret = -ENOSPC;
-		if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches)
-			goto out_err;
-retry:
-		ret = -ENOMEM;
-		if (unlikely(!idr_pre_get(&group->inotify_data.idr, GFP_KERNEL)))
-			goto out_err;
-
-		spin_lock(&group->inotify_data.idr_lock);
-		ret = idr_get_new_above(&group->inotify_data.idr, &tmp_ientry->fsn_entry,
-					group->inotify_data.last_wd,
-					&tmp_ientry->wd);
-		spin_unlock(&group->inotify_data.idr_lock);
-		if (ret) {
-			if (ret == -EAGAIN)
-				goto retry;
-			goto out_err;
-		}
+	if (!entry)
+		return -ENOENT;
 
-		ret = fsnotify_add_mark(&tmp_ientry->fsn_entry, group, inode);
-		if (ret) {
-			inotify_remove_from_idr(group, tmp_ientry);
-			if (ret == -EEXIST)
-				goto find_entry;
-			goto out_err;
-		}
-
-		/* tmp_ientry has been added to the inode, so we are all set up.
-		 * now we just need to make sure tmp_ientry doesn't get freed and
-		 * we need to set up entry and ientry so the generic code can
-		 * do its thing. */
-		ientry = tmp_ientry;
-		entry = &ientry->fsn_entry;
-		tmp_ientry = NULL;
-
-		atomic_inc(&group->inotify_data.user->inotify_watches);
-
-		/* update the idr hint */
-		group->inotify_data.last_wd = ientry->wd;
-
-		/* we put the mark on the idr, take a reference */
-		fsnotify_get_mark(entry);
-	}
-
-	ret = ientry->wd;
+	ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry);
 
 	spin_lock(&entry->lock);
 
@@ -536,18 +485,103 @@ retry:
 			fsnotify_recalc_group_mask(group);
 	}
 
-	/* this either matches fsnotify_find_mark_entry, or init_mark_entry
-	 * depending on which path we took... */
+	/* return the wd */
+	ret = ientry->wd;
+
+	/* match the get from fsnotify_find_mark_entry() */
 	fsnotify_put_mark(entry);
 
+	return ret;
+}
+
+static int inotify_new_watch(struct fsnotify_group *group,
+			     struct inode *inode,
+			     u32 arg)
+{
+	struct inotify_inode_mark_entry *tmp_ientry;
+	__u32 mask;
+	int ret;
+
+	/* don't allow invalid bits: we don't want flags set */
+	mask = inotify_arg_to_mask(arg);
+	if (unlikely(!mask))
+		return -EINVAL;
+
+	tmp_ientry = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL);
+	if (unlikely(!tmp_ientry))
+		return -ENOMEM;
+
+	fsnotify_init_mark(&tmp_ientry->fsn_entry, inotify_free_mark);
+	tmp_ientry->fsn_entry.mask = mask;
+	tmp_ientry->wd = -1;
+
+	ret = -ENOSPC;
+	if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches)
+		goto out_err;
+retry:
+	ret = -ENOMEM;
+	if (unlikely(!idr_pre_get(&group->inotify_data.idr, GFP_KERNEL)))
+		goto out_err;
+
+	spin_lock(&group->inotify_data.idr_lock);
+	ret = idr_get_new_above(&group->inotify_data.idr, &tmp_ientry->fsn_entry,
+				group->inotify_data.last_wd,
+				&tmp_ientry->wd);
+	spin_unlock(&group->inotify_data.idr_lock);
+	if (ret) {
+		/* idr was out of memory allocate and try again */
+		if (ret == -EAGAIN)
+			goto retry;
+		goto out_err;
+	}
+
+	/* we are on the idr, now get on the inode */
+	ret = fsnotify_add_mark(&tmp_ientry->fsn_entry, group, inode);
+	if (ret) {
+		/* we failed to get on the inode, get off the idr */
+		inotify_remove_from_idr(group, tmp_ientry);
+		goto out_err;
+	}
+
+	/* we put the mark on the idr, take a reference */
+	fsnotify_get_mark(&tmp_ientry->fsn_entry);
+
+	/* update the idr hint, who cares about races, it's just a hint */
+	group->inotify_data.last_wd = tmp_ientry->wd;
+
+	/* increment the number of watches the user has */
+	atomic_inc(&group->inotify_data.user->inotify_watches);
+
+	/* return the watch descriptor for this new entry */
+	ret = tmp_ientry->wd;
+
+	/* match the ref from fsnotify_init_markentry() */
+	fsnotify_put_mark(&tmp_ientry->fsn_entry);
+
 out_err:
-	/* could be an error, could be that we found an existing mark */
-	if (tmp_ientry) {
-		/* on the idr but didn't make it on the inode */
-		if (tmp_ientry->wd != -1)
-			inotify_remove_from_idr(group, tmp_ientry);
+	if (ret < 0)
 		kmem_cache_free(inotify_inode_mark_cachep, tmp_ientry);
-	}
+
+	return ret;
+}
+
+static int inotify_update_watch(struct fsnotify_group *group, struct inode *inode, u32 arg)
+{
+	int ret = 0;
+
+retry:
+	/* try to update and existing watch with the new arg */
+	ret = inotify_update_existing_watch(group, inode, arg);
+	/* no mark present, try to add a new one */
+	if (ret == -ENOENT)
+		ret = inotify_new_watch(group, inode, arg);
+	/*
+	 * inotify_new_watch could race with another thread which did an
+	 * inotify_new_watch between the update_existing and the add watch
+	 * here, go back and try to update an existing mark again.
+	 */
+	if (ret == -EEXIST)
+		goto retry;
 
 	return ret;
 }
-- 
cgit v1.2.3-70-g09d2


From cf4374267fbe966e8e4e7db68f5dc7b267439780 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Mon, 24 Aug 2009 16:03:35 -0400
Subject: inotify: do not BUG on idr entries at inotify destruction

If an inotify watch is left in the idr when an fsnotify group is destroyed
this will lead to a BUG.  This is not a dangerous situation and really
indicates a programming bug and leak of memory.  This patch changes it to
use a WARN and a printk rather than killing people's boxes.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/inotify/inotify_fsnotify.c | 33 +++++++++++++++++++++++++++++++--
 1 file changed, 31 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 5dcbafe72d71..c9ee67b442e1 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -105,16 +105,45 @@ static bool inotify_should_send_event(struct fsnotify_group *group, struct inode
 	return send;
 }
 
+/*
+ * This is NEVER supposed to be called.  Inotify marks should either have been
+ * removed from the idr when the watch was removed or in the
+ * fsnotify_destroy_mark_by_group() call when the inotify instance was being
+ * torn down.  This is only called if the idr is about to be freed but there
+ * are still marks in it.
+ */
 static int idr_callback(int id, void *p, void *data)
 {
-	BUG();
+	struct fsnotify_mark_entry *entry;
+	struct inotify_inode_mark_entry *ientry;
+	static bool warned = false;
+
+	if (warned)
+		return 0;
+
+	warned = false;
+	entry = p;
+	ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry);
+
+	WARN(1, "inotify closing but id=%d for entry=%p in group=%p still in "
+		"idr.  Probably leaking memory\n", id, p, data);
+
+	/*
+	 * I'm taking the liberty of assuming that the mark in question is a
+	 * valid address and I'm dereferencing it.  This might help to figure
+	 * out why we got here and the panic is no worse than the original
+	 * BUG() that was here.
+	 */
+	if (entry)
+		printk(KERN_WARNING "entry->group=%p inode=%p wd=%d\n",
+			entry->group, entry->inode, ientry->wd);
 	return 0;
 }
 
 static void inotify_free_group_priv(struct fsnotify_group *group)
 {
 	/* ideally the idr is empty and we won't hit the BUG in teh callback */
-	idr_for_each(&group->inotify_data.idr, idr_callback, NULL);
+	idr_for_each(&group->inotify_data.idr, idr_callback, group);
 	idr_remove_all(&group->inotify_data.idr);
 	idr_destroy(&group->inotify_data.idr);
 }
-- 
cgit v1.2.3-70-g09d2


From dead537dd8a1c9495322c1d6f7c780697f474af0 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Mon, 24 Aug 2009 16:03:35 -0400
Subject: inotify: fix locking around inotify watching in the idr

The are races around the idr storage of inotify watches.  It's possible
that a watch could be found from sys_inotify_rm_watch() in the idr, but it
could be removed from the idr before that code does it's removal.  Move the
locking and the refcnt'ing so that these have to happen atomically.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/inotify/inotify_user.c | 50 ++++++++++++++++++++++++++++++++--------
 1 file changed, 40 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index d8f73c253073..ce1f5823e2c2 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -364,20 +364,53 @@ static int inotify_find_inode(const char __user *dirname, struct path *path, uns
 	return error;
 }
 
+/*
+ * Remove the mark from the idr (if present) and drop the reference
+ * on the mark because it was in the idr.
+ */
 static void inotify_remove_from_idr(struct fsnotify_group *group,
 				    struct inotify_inode_mark_entry *ientry)
 {
 	struct idr *idr;
+	struct fsnotify_mark_entry *entry;
+	struct inotify_inode_mark_entry *found_ientry;
+	int wd;
 
 	spin_lock(&group->inotify_data.idr_lock);
 	idr = &group->inotify_data.idr;
-	idr_remove(idr, ientry->wd);
-	spin_unlock(&group->inotify_data.idr_lock);
+	wd = ientry->wd;
+
+	if (wd == -1)
+		goto out;
+
+	entry = idr_find(&group->inotify_data.idr, wd);
+	if (unlikely(!entry))
+		goto out;
+
+	found_ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry);
+	if (unlikely(found_ientry != ientry)) {
+		/* We found an entry in the idr with the right wd, but it's
+		 * not the entry we were told to remove.  eparis seriously
+		 * fucked up somewhere. */
+		WARN_ON(1);
+		ientry->wd = -1;
+		goto out;
+	}
+
+	/* One ref for being in the idr, one ref held by the caller */
+	BUG_ON(atomic_read(&entry->refcnt) < 2);
+
+	idr_remove(idr, wd);
 	ientry->wd = -1;
+
+	/* removed from the idr, drop that ref */
+	fsnotify_put_mark(entry);
+out:
+	spin_unlock(&group->inotify_data.idr_lock);
 }
+
 /*
- * Send IN_IGNORED for this wd, remove this wd from the idr, and drop the
- * internal reference help on the mark because it is in the idr.
+ * Send IN_IGNORED for this wd, remove this wd from the idr.
  */
 void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry,
 				    struct fsnotify_group *group)
@@ -417,9 +450,6 @@ skip_send_ignore:
 	/* remove this entry from the idr */
 	inotify_remove_from_idr(group, ientry);
 
-	/* removed from idr, drop that reference */
-	fsnotify_put_mark(entry);
-
 	atomic_dec(&group->inotify_data.user->inotify_watches);
 }
 
@@ -535,6 +565,9 @@ retry:
 		goto out_err;
 	}
 
+	/* we put the mark on the idr, take a reference */
+	fsnotify_get_mark(&tmp_ientry->fsn_entry);
+
 	/* we are on the idr, now get on the inode */
 	ret = fsnotify_add_mark(&tmp_ientry->fsn_entry, group, inode);
 	if (ret) {
@@ -543,9 +576,6 @@ retry:
 		goto out_err;
 	}
 
-	/* we put the mark on the idr, take a reference */
-	fsnotify_get_mark(&tmp_ientry->fsn_entry);
-
 	/* update the idr hint, who cares about races, it's just a hint */
 	group->inotify_data.last_wd = tmp_ientry->wd;
 
-- 
cgit v1.2.3-70-g09d2


From 0db501bd0610ee0c0aca84d927f90bcccd09e2bd Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 27 Aug 2009 03:20:04 -0700
Subject: inotify: Ensure we alwasy write the terminating NULL.

Before the rewrite copy_event_to_user always wrote a terqminating '\0'
byte to user space after the filename.  Since the rewrite that
terminating byte was skipped if your filename is exactly a multiple of
event_size.  Ouch!

So add one byte to name_size before we round up and use clear_user to
set userspace to zero like /dev/zero does instead of copying the
strange nul_inotify_event.  I can't quite convince myself len_to_zero
will never exceed 16 and even if it doesn't clear_user should be more
efficient and a more accurate reflection of what the code is trying to
do.

Signed-off-by: Eric W. Biederman <ebiederm@aristanetworks.com>
Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/inotify/inotify_user.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index ce1f5823e2c2..0e781bc88d1e 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -47,9 +47,6 @@
 
 static struct vfsmount *inotify_mnt __read_mostly;
 
-/* this just sits here and wastes global memory.  used to just pad userspace messages with zeros */
-static struct inotify_event nul_inotify_event;
-
 /* these are configurable via /proc/sys/fs/inotify/ */
 static int inotify_max_user_instances __read_mostly;
 static int inotify_max_queued_events __read_mostly;
@@ -199,8 +196,10 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
 		inotify_free_event_priv(fsn_priv);
 	}
 
-	/* round up event->name_len so it is a multiple of event_size */
-	name_len = roundup(event->name_len, event_size);
+	/* round up event->name_len so it is a multiple of event_size
+	 * plus an extra byte for the terminating '\0'.
+	 */
+	name_len = roundup(event->name_len + 1, event_size);
 	inotify_event.len = name_len;
 
 	inotify_event.mask = inotify_mask_to_arg(event->mask);
@@ -224,8 +223,8 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
 			return -EFAULT;
 		buf += event->name_len;
 
-		/* fill userspace with 0's from nul_inotify_event */
-		if (copy_to_user(buf, &nul_inotify_event, len_to_zero))
+		/* fill userspace with 0's */
+		if (clear_user(buf, len_to_zero))
 			return -EFAULT;
 		buf += len_to_zero;
 		event_size += name_len;
-- 
cgit v1.2.3-70-g09d2


From 8d8291ae93ecb4a246e87e452d55cca412373300 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Thu, 27 Aug 2009 15:51:07 +0100
Subject: GFS2: Remove no_formal_ino generating code

The inum structure used throughout GFS2 has two fields. One
no_addr is the disk block number of the inode in question and
is used everywhere as the inode number. The other, no_formal_ino,
is used only as the generation number for NFS.

Historically the no_formal_ino field was set using a complicated
system of one global and one per-node file containing inode numbers
in order to ensure that each no_formal_ino was unique. Also this
code made no provision for what would happen when eventually the
(64 bit) numbers ran out. Now I know that is pretty unlikely to
happen given the large space of numbers, but it is possible
nevertheless.

The only guarantee required for no_formal_ino is that, for any
single inode, the same number doesn't get reused too quickly.

We already have a generation number which is kept in the inode
and initialised from a counter in the resource group (almost
no overhead, since we have to touch the resource group anyway
in order to allocate an inode in the first place). Aside from
ensuring that we never use the value 0 in the no_formal_ino
field, we can use that counter directly.

As a result of that change, we lose about 200 lines of code and
also gain about 10 creates/sec on the postmark benchmark (on
my test machine).

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/incore.h     |   7 ---
 fs/gfs2/inode.c      | 143 ++-------------------------------------------------
 fs/gfs2/ops_fstype.c |  45 ++--------------
 fs/gfs2/rgrp.c       |   2 +
 fs/gfs2/super.c      |   3 --
 5 files changed, 10 insertions(+), 190 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 1d11e6e0373e..52436abc3b4d 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -548,18 +548,12 @@ struct gfs2_sbd {
 	struct dentry *sd_root_dir;
 
 	struct inode *sd_jindex;
-	struct inode *sd_inum_inode;
 	struct inode *sd_statfs_inode;
-	struct inode *sd_ir_inode;
 	struct inode *sd_sc_inode;
 	struct inode *sd_qc_inode;
 	struct inode *sd_rindex;
 	struct inode *sd_quota_inode;
 
-	/* Inum stuff */
-
-	struct mutex sd_inum_mutex;
-
 	/* StatFS stuff */
 
 	spinlock_t sd_statfs_spin;
@@ -587,7 +581,6 @@ struct gfs2_sbd {
 	struct gfs2_holder sd_journal_gh;
 	struct gfs2_holder sd_jinode_gh;
 
-	struct gfs2_holder sd_ir_gh;
 	struct gfs2_holder sd_sc_gh;
 	struct gfs2_holder sd_qc_gh;
 
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 4f5e442d60d8..fb15d3b1f409 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -519,139 +519,6 @@ out:
 	return inode ? inode : ERR_PTR(error);
 }
 
-static void gfs2_inum_range_in(struct gfs2_inum_range_host *ir, const void *buf)
-{
-	const struct gfs2_inum_range *str = buf;
-
-	ir->ir_start = be64_to_cpu(str->ir_start);
-	ir->ir_length = be64_to_cpu(str->ir_length);
-}
-
-static void gfs2_inum_range_out(const struct gfs2_inum_range_host *ir, void *buf)
-{
-	struct gfs2_inum_range *str = buf;
-
-	str->ir_start = cpu_to_be64(ir->ir_start);
-	str->ir_length = cpu_to_be64(ir->ir_length);
-}
-
-static int pick_formal_ino_1(struct gfs2_sbd *sdp, u64 *formal_ino)
-{
-	struct gfs2_inode *ip = GFS2_I(sdp->sd_ir_inode);
-	struct buffer_head *bh;
-	struct gfs2_inum_range_host ir;
-	int error;
-
-	error = gfs2_trans_begin(sdp, RES_DINODE, 0);
-	if (error)
-		return error;
-	mutex_lock(&sdp->sd_inum_mutex);
-
-	error = gfs2_meta_inode_buffer(ip, &bh);
-	if (error) {
-		mutex_unlock(&sdp->sd_inum_mutex);
-		gfs2_trans_end(sdp);
-		return error;
-	}
-
-	gfs2_inum_range_in(&ir, bh->b_data + sizeof(struct gfs2_dinode));
-
-	if (ir.ir_length) {
-		*formal_ino = ir.ir_start++;
-		ir.ir_length--;
-		gfs2_trans_add_bh(ip->i_gl, bh, 1);
-		gfs2_inum_range_out(&ir,
-				    bh->b_data + sizeof(struct gfs2_dinode));
-		brelse(bh);
-		mutex_unlock(&sdp->sd_inum_mutex);
-		gfs2_trans_end(sdp);
-		return 0;
-	}
-
-	brelse(bh);
-
-	mutex_unlock(&sdp->sd_inum_mutex);
-	gfs2_trans_end(sdp);
-
-	return 1;
-}
-
-static int pick_formal_ino_2(struct gfs2_sbd *sdp, u64 *formal_ino)
-{
-	struct gfs2_inode *ip = GFS2_I(sdp->sd_ir_inode);
-	struct gfs2_inode *m_ip = GFS2_I(sdp->sd_inum_inode);
-	struct gfs2_holder gh;
-	struct buffer_head *bh;
-	struct gfs2_inum_range_host ir;
-	int error;
-
-	error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
-	if (error)
-		return error;
-
-	error = gfs2_trans_begin(sdp, 2 * RES_DINODE, 0);
-	if (error)
-		goto out;
-	mutex_lock(&sdp->sd_inum_mutex);
-
-	error = gfs2_meta_inode_buffer(ip, &bh);
-	if (error)
-		goto out_end_trans;
-
-	gfs2_inum_range_in(&ir, bh->b_data + sizeof(struct gfs2_dinode));
-
-	if (!ir.ir_length) {
-		struct buffer_head *m_bh;
-		u64 x, y;
-		__be64 z;
-
-		error = gfs2_meta_inode_buffer(m_ip, &m_bh);
-		if (error)
-			goto out_brelse;
-
-		z = *(__be64 *)(m_bh->b_data + sizeof(struct gfs2_dinode));
-		x = y = be64_to_cpu(z);
-		ir.ir_start = x;
-		ir.ir_length = GFS2_INUM_QUANTUM;
-		x += GFS2_INUM_QUANTUM;
-		if (x < y)
-			gfs2_consist_inode(m_ip);
-		z = cpu_to_be64(x);
-		gfs2_trans_add_bh(m_ip->i_gl, m_bh, 1);
-		*(__be64 *)(m_bh->b_data + sizeof(struct gfs2_dinode)) = z;
-
-		brelse(m_bh);
-	}
-
-	*formal_ino = ir.ir_start++;
-	ir.ir_length--;
-
-	gfs2_trans_add_bh(ip->i_gl, bh, 1);
-	gfs2_inum_range_out(&ir, bh->b_data + sizeof(struct gfs2_dinode));
-
-out_brelse:
-	brelse(bh);
-out_end_trans:
-	mutex_unlock(&sdp->sd_inum_mutex);
-	gfs2_trans_end(sdp);
-out:
-	gfs2_glock_dq_uninit(&gh);
-	return error;
-}
-
-static int pick_formal_ino(struct gfs2_sbd *sdp, u64 *inum)
-{
-	int error;
-
-	error = pick_formal_ino_1(sdp, inum);
-	if (error <= 0)
-		return error;
-
-	error = pick_formal_ino_2(sdp, inum);
-
-	return error;
-}
-
 /**
  * create_ok - OK to create a new on-disk inode here?
  * @dip:  Directory in which dinode is to be created
@@ -981,13 +848,10 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
 	if (error)
 		goto fail_gunlock;
 
-	error = pick_formal_ino(sdp, &inum.no_formal_ino);
-	if (error)
-		goto fail_gunlock;
-
 	error = alloc_dinode(dip, &inum.no_addr, &generation);
 	if (error)
 		goto fail_gunlock;
+	inum.no_formal_ino = generation;
 
 	error = gfs2_glock_nq_num(sdp, inum.no_addr, &gfs2_inode_glops,
 				  LM_ST_EXCLUSIVE, GL_SKIP, ghs + 1);
@@ -998,9 +862,8 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
 	if (error)
 		goto fail_gunlock2;
 
-	inode = gfs2_inode_lookup(dir->i_sb, IF2DT(mode),
-					inum.no_addr,
-					inum.no_formal_ino, 0);
+	inode = gfs2_inode_lookup(dir->i_sb, IF2DT(mode), inum.no_addr,
+				  inum.no_formal_ino, 0);
 	if (IS_ERR(inode))
 		goto fail_gunlock2;
 
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 1ec69e63c694..8e3f00e1430d 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -84,7 +84,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
 
 	gfs2_tune_init(&sdp->sd_tune);
 
-	mutex_init(&sdp->sd_inum_mutex);
 	spin_lock_init(&sdp->sd_statfs_spin);
 
 	spin_lock_init(&sdp->sd_rindex_spin);
@@ -833,21 +832,12 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo)
 	if (error)
 		goto fail;
 
-	/* Read in the master inode number inode */
-	sdp->sd_inum_inode = gfs2_lookup_simple(master, "inum");
-	if (IS_ERR(sdp->sd_inum_inode)) {
-		error = PTR_ERR(sdp->sd_inum_inode);
-		fs_err(sdp, "can't read in inum inode: %d\n", error);
-		goto fail_journal;
-	}
-
-
 	/* Read in the master statfs inode */
 	sdp->sd_statfs_inode = gfs2_lookup_simple(master, "statfs");
 	if (IS_ERR(sdp->sd_statfs_inode)) {
 		error = PTR_ERR(sdp->sd_statfs_inode);
 		fs_err(sdp, "can't read in statfs inode: %d\n", error);
-		goto fail_inum;
+		goto fail_journal;
 	}
 
 	/* Read in the resource index inode */
@@ -876,8 +866,6 @@ fail_rindex:
 	iput(sdp->sd_rindex);
 fail_statfs:
 	iput(sdp->sd_statfs_inode);
-fail_inum:
-	iput(sdp->sd_inum_inode);
 fail_journal:
 	init_journal(sdp, UNDO);
 fail:
@@ -905,20 +893,12 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo)
 		return error;
 	}
 
-	sprintf(buf, "inum_range%u", sdp->sd_jdesc->jd_jid);
-	sdp->sd_ir_inode = gfs2_lookup_simple(pn, buf);
-	if (IS_ERR(sdp->sd_ir_inode)) {
-		error = PTR_ERR(sdp->sd_ir_inode);
-		fs_err(sdp, "can't find local \"ir\" file: %d\n", error);
-		goto fail;
-	}
-
 	sprintf(buf, "statfs_change%u", sdp->sd_jdesc->jd_jid);
 	sdp->sd_sc_inode = gfs2_lookup_simple(pn, buf);
 	if (IS_ERR(sdp->sd_sc_inode)) {
 		error = PTR_ERR(sdp->sd_sc_inode);
 		fs_err(sdp, "can't find local \"sc\" file: %d\n", error);
-		goto fail_ir_i;
+		goto fail;
 	}
 
 	sprintf(buf, "quota_change%u", sdp->sd_jdesc->jd_jid);
@@ -932,27 +912,16 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo)
 	iput(pn);
 	pn = NULL;
 
-	ip = GFS2_I(sdp->sd_ir_inode);
-	error = gfs2_glock_nq_init(ip->i_gl,
-				   LM_ST_EXCLUSIVE, 0,
-				   &sdp->sd_ir_gh);
-	if (error) {
-		fs_err(sdp, "can't lock local \"ir\" file: %d\n", error);
-		goto fail_qc_i;
-	}
-
 	ip = GFS2_I(sdp->sd_sc_inode);
-	error = gfs2_glock_nq_init(ip->i_gl,
-				   LM_ST_EXCLUSIVE, 0,
+	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0,
 				   &sdp->sd_sc_gh);
 	if (error) {
 		fs_err(sdp, "can't lock local \"sc\" file: %d\n", error);
-		goto fail_ir_gh;
+		goto fail_qc_i;
 	}
 
 	ip = GFS2_I(sdp->sd_qc_inode);
-	error = gfs2_glock_nq_init(ip->i_gl,
-				   LM_ST_EXCLUSIVE, 0,
+	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0,
 				   &sdp->sd_qc_gh);
 	if (error) {
 		fs_err(sdp, "can't lock local \"qc\" file: %d\n", error);
@@ -965,14 +934,10 @@ fail_qc_gh:
 	gfs2_glock_dq_uninit(&sdp->sd_qc_gh);
 fail_ut_gh:
 	gfs2_glock_dq_uninit(&sdp->sd_sc_gh);
-fail_ir_gh:
-	gfs2_glock_dq_uninit(&sdp->sd_ir_gh);
 fail_qc_i:
 	iput(sdp->sd_qc_inode);
 fail_ut_i:
 	iput(sdp->sd_sc_inode);
-fail_ir_i:
-	iput(sdp->sd_ir_inode);
 fail:
 	if (pn)
 		iput(pn);
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index c681c54fbd60..388a61d12fc8 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1567,6 +1567,8 @@ int gfs2_alloc_di(struct gfs2_inode *dip, u64 *bn, u64 *generation)
 	rgd->rd_free--;
 	rgd->rd_dinodes++;
 	*generation = rgd->rd_igeneration++;
+	if (*generation == 0)
+		*generation = rgd->rd_igeneration++;
 	gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
 	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
 
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 2e78a3f68a76..d95cf777d244 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -788,7 +788,6 @@ restart:
 	/*  Release stuff  */
 
 	iput(sdp->sd_jindex);
-	iput(sdp->sd_inum_inode);
 	iput(sdp->sd_statfs_inode);
 	iput(sdp->sd_rindex);
 	iput(sdp->sd_quota_inode);
@@ -799,10 +798,8 @@ restart:
 	if (!sdp->sd_args.ar_spectator) {
 		gfs2_glock_dq_uninit(&sdp->sd_journal_gh);
 		gfs2_glock_dq_uninit(&sdp->sd_jinode_gh);
-		gfs2_glock_dq_uninit(&sdp->sd_ir_gh);
 		gfs2_glock_dq_uninit(&sdp->sd_sc_gh);
 		gfs2_glock_dq_uninit(&sdp->sd_qc_gh);
-		iput(sdp->sd_ir_inode);
 		iput(sdp->sd_sc_inode);
 		iput(sdp->sd_qc_inode);
 	}
-- 
cgit v1.2.3-70-g09d2


From 9886e836a6a5dbd273dc55b17e713f0a188d137f Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 27 Aug 2009 13:09:06 +0100
Subject: AFS: Stop readlink() on AFS crashing due to NULL 'file' ptr

kAFS crashes when asked to read a symbolic link because page_getlink()
passes a NULL file pointer to read_mapping_page(), but afs_readpage()
expects a file pointer from which to extract a key.

Modify afs_readpage() to request the appropriate key from the calling
process's keyrings if a file struct is not supplied with one attached.

Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/afs/file.c | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/afs/file.c b/fs/afs/file.c
index 0149dab365e7..681c2a7b013f 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -134,9 +134,16 @@ static int afs_readpage(struct file *file, struct page *page)
 
 	inode = page->mapping->host;
 
-	ASSERT(file != NULL);
-	key = file->private_data;
-	ASSERT(key != NULL);
+	if (file) {
+		key = file->private_data;
+		ASSERT(key != NULL);
+	} else {
+		key = afs_request_key(AFS_FS_S(inode->i_sb)->volume->cell);
+		if (IS_ERR(key)) {
+			ret = PTR_ERR(key);
+			goto error_nokey;
+		}
+	}
 
 	_enter("{%x},{%lu},{%lu}", key_serial(key), inode->i_ino, page->index);
 
@@ -207,12 +214,17 @@ static int afs_readpage(struct file *file, struct page *page)
 		unlock_page(page);
 	}
 
+	if (!file)
+		key_put(key);
 	_leave(" = 0");
 	return 0;
 
 error:
 	SetPageError(page);
 	unlock_page(page);
+	if (!file)
+		key_put(key);
+error_nokey:
 	_leave(" = %d", ret);
 	return ret;
 }
-- 
cgit v1.2.3-70-g09d2


From a1b08e75dff3dc18a88444803753e667bb1d126e Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Thu, 27 Aug 2009 14:46:56 +0800
Subject: ocfs2: invalidate dentry if its dentry_lock isn't initialized.

In commit a5a0a630922a2f6a774b6dac19f70cb5abd86bb0, when
ocfs2_attch_dentry_lock fails, we call an extra iput and reset
dentry->d_fsdata to NULL. This resolve a bug, but it isn't
completed and the dentry is still there. When we want to use
it again, ocfs2_dentry_revalidate doesn't catch it and return
true. That make future ocfs2_dentry_lock panic out.
One bug is http://oss.oracle.com/bugzilla/show_bug.cgi?id=1162.

The resolution is to add a check for dentry->d_fsdata in
revalidate process and return false if dentry->d_fsdata is NULL,
so that a new ocfs2_lookup will be called again.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/dcache.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index 2f28b7de2c8d..b4957c7d9fe2 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -85,6 +85,17 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry,
 		goto bail;
 	}
 
+	/*
+	 * If the last lookup failed to create dentry lock, let us
+	 * redo it.
+	 */
+	if (!dentry->d_fsdata) {
+		mlog(0, "Inode %llu doesn't have dentry lock, "
+		     "returning false\n",
+		     (unsigned long long)OCFS2_I(inode)->ip_blkno);
+		goto bail;
+	}
+
 	ret = 1;
 
 bail:
-- 
cgit v1.2.3-70-g09d2


From b962e7312ae87006aed6f68ceee94bdf8db08338 Mon Sep 17 00:00:00 2001
From: Brian Rogers <brian@xyzw.org>
Date: Fri, 28 Aug 2009 10:00:05 -0400
Subject: inotify: do not send a block of zeros when no pathname is available

When an event has no pathname, there's no need to pad it with a null byte and
therefore generate an inotify_event sized block of zeros. This fixes a
regression introduced by commit 0db501bd0610ee0c0aca84d927f90bcccd09e2bd where
my system wouldn't finish booting because some process was being confused by
this.

Signed-off-by: Brian Rogers <brian@xyzw.org>
Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/inotify/inotify_user.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 0e781bc88d1e..b547ae17b461 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -180,7 +180,7 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
 	struct fsnotify_event_private_data *fsn_priv;
 	struct inotify_event_private_data *priv;
 	size_t event_size = sizeof(struct inotify_event);
-	size_t name_len;
+	size_t name_len = 0;
 
 	/* we get the inotify watch descriptor from the event private data */
 	spin_lock(&event->lock);
@@ -196,10 +196,12 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
 		inotify_free_event_priv(fsn_priv);
 	}
 
-	/* round up event->name_len so it is a multiple of event_size
+	/*
+	 * round up event->name_len so it is a multiple of event_size
 	 * plus an extra byte for the terminating '\0'.
 	 */
-	name_len = roundup(event->name_len + 1, event_size);
+	if (event->name_len)
+		name_len = roundup(event->name_len + 1, event_size);
 	inotify_event.len = name_len;
 
 	inotify_event.mask = inotify_mask_to_arg(event->mask);
-- 
cgit v1.2.3-70-g09d2


From 55ad63bf3a30936aced50f13452735c2f58b234c Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Fri, 28 Aug 2009 10:40:33 -0400
Subject: ext4: fix extent sanity checking code with AGGRESSIVE_TEST

The extents sanity-checking code depends on the ext4_ext_space_*()
functions returning the maximum alloable size for eh_max; however,
when the debugging #ifdef AGGRESSIVE_TEST is enabled to test the
extent tree handling code, this prevents a normally created ext4
filesystem from being mounted with the errors:

Aug 26 15:43:50 bsd086 kernel: [   96.070277] EXT4-fs error (device sda8): ext4_ext_check_inode: bad header/extent in inode #8: too large eh_max - magic f30a, entries 1, max 4(3), depth 0(0)
Aug 26 15:43:50 bsd086 kernel: [   96.070526] EXT4-fs (sda8): no journal found

Bug reported by Akira Fujita.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/extents.c | 60 +++++++++++++++++++++++++++++++------------------------
 1 file changed, 34 insertions(+), 26 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 8c20caf4aa5c..7a3832577923 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -229,57 +229,65 @@ ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
 	return newblock;
 }
 
-static int ext4_ext_space_block(struct inode *inode)
+static inline int ext4_ext_space_block(struct inode *inode, int check)
 {
 	int size;
 
 	size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
 			/ sizeof(struct ext4_extent);
+	if (!check) {
 #ifdef AGGRESSIVE_TEST
-	if (size > 6)
-		size = 6;
+		if (size > 6)
+			size = 6;
 #endif
+	}
 	return size;
 }
 
-static int ext4_ext_space_block_idx(struct inode *inode)
+static inline int ext4_ext_space_block_idx(struct inode *inode, int check)
 {
 	int size;
 
 	size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
 			/ sizeof(struct ext4_extent_idx);
+	if (!check) {
 #ifdef AGGRESSIVE_TEST
-	if (size > 5)
-		size = 5;
+		if (size > 5)
+			size = 5;
 #endif
+	}
 	return size;
 }
 
-static int ext4_ext_space_root(struct inode *inode)
+static inline int ext4_ext_space_root(struct inode *inode, int check)
 {
 	int size;
 
 	size = sizeof(EXT4_I(inode)->i_data);
 	size -= sizeof(struct ext4_extent_header);
 	size /= sizeof(struct ext4_extent);
+	if (!check) {
 #ifdef AGGRESSIVE_TEST
-	if (size > 3)
-		size = 3;
+		if (size > 3)
+			size = 3;
 #endif
+	}
 	return size;
 }
 
-static int ext4_ext_space_root_idx(struct inode *inode)
+static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
 {
 	int size;
 
 	size = sizeof(EXT4_I(inode)->i_data);
 	size -= sizeof(struct ext4_extent_header);
 	size /= sizeof(struct ext4_extent_idx);
+	if (!check) {
 #ifdef AGGRESSIVE_TEST
-	if (size > 4)
-		size = 4;
+		if (size > 4)
+			size = 4;
 #endif
+	}
 	return size;
 }
 
@@ -293,9 +301,9 @@ int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks)
 	int lcap, icap, rcap, leafs, idxs, num;
 	int newextents = blocks;
 
-	rcap = ext4_ext_space_root_idx(inode);
-	lcap = ext4_ext_space_block(inode);
-	icap = ext4_ext_space_block_idx(inode);
+	rcap = ext4_ext_space_root_idx(inode, 0);
+	lcap = ext4_ext_space_block(inode, 0);
+	icap = ext4_ext_space_block_idx(inode, 0);
 
 	/* number of new leaf blocks needed */
 	num = leafs = (newextents + lcap - 1) / lcap;
@@ -320,14 +328,14 @@ ext4_ext_max_entries(struct inode *inode, int depth)
 
 	if (depth == ext_depth(inode)) {
 		if (depth == 0)
-			max = ext4_ext_space_root(inode);
+			max = ext4_ext_space_root(inode, 1);
 		else
-			max = ext4_ext_space_root_idx(inode);
+			max = ext4_ext_space_root_idx(inode, 1);
 	} else {
 		if (depth == 0)
-			max = ext4_ext_space_block(inode);
+			max = ext4_ext_space_block(inode, 1);
 		else
-			max = ext4_ext_space_block_idx(inode);
+			max = ext4_ext_space_block_idx(inode, 1);
 	}
 
 	return max;
@@ -626,7 +634,7 @@ int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
 	eh->eh_depth = 0;
 	eh->eh_entries = 0;
 	eh->eh_magic = EXT4_EXT_MAGIC;
-	eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode));
+	eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0));
 	ext4_mark_inode_dirty(handle, inode);
 	ext4_ext_invalidate_cache(inode);
 	return 0;
@@ -851,7 +859,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 
 	neh = ext_block_hdr(bh);
 	neh->eh_entries = 0;
-	neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode));
+	neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
 	neh->eh_magic = EXT4_EXT_MAGIC;
 	neh->eh_depth = 0;
 	ex = EXT_FIRST_EXTENT(neh);
@@ -927,7 +935,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 		neh = ext_block_hdr(bh);
 		neh->eh_entries = cpu_to_le16(1);
 		neh->eh_magic = EXT4_EXT_MAGIC;
-		neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode));
+		neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
 		neh->eh_depth = cpu_to_le16(depth - i);
 		fidx = EXT_FIRST_INDEX(neh);
 		fidx->ei_block = border;
@@ -1052,9 +1060,9 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
 	/* old root could have indexes or leaves
 	 * so calculate e_max right way */
 	if (ext_depth(inode))
-	  neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode));
+		neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
 	else
-	  neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode));
+		neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
 	neh->eh_magic = EXT4_EXT_MAGIC;
 	set_buffer_uptodate(bh);
 	unlock_buffer(bh);
@@ -1069,7 +1077,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
 		goto out;
 
 	curp->p_hdr->eh_magic = EXT4_EXT_MAGIC;
-	curp->p_hdr->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode));
+	curp->p_hdr->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0));
 	curp->p_hdr->eh_entries = cpu_to_le16(1);
 	curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr);
 
@@ -2348,7 +2356,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
 		if (err == 0) {
 			ext_inode_hdr(inode)->eh_depth = 0;
 			ext_inode_hdr(inode)->eh_max =
-				cpu_to_le16(ext4_ext_space_root(inode));
+				cpu_to_le16(ext4_ext_space_root(inode, 0));
 			err = ext4_ext_dirty(handle, inode, path);
 		}
 	}
-- 
cgit v1.2.3-70-g09d2


From 83cb10f0ef3c96162be92339ccf8c0c9c9f2d13e Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Fri, 28 Aug 2009 11:57:55 -0400
Subject: inotify: fix length reporting and size checking

0db501bd0610ee0c0 introduced a regresion in that it now sends a nul
terminator but the length accounting when checking for space or
reporting to userspace did not take this into account.  This corrects
all of the rounding logic.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/inotify/inotify_user.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index b547ae17b461..6111670b2573 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -154,7 +154,8 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
 
 	event = fsnotify_peek_notify_event(group);
 
-	event_size += roundup(event->name_len, event_size);
+	if (event->name_len)
+		event_size += roundup(event->name_len + 1, event_size);
 
 	if (event_size > count)
 		return ERR_PTR(-EINVAL);
@@ -327,8 +328,9 @@ static long inotify_ioctl(struct file *file, unsigned int cmd,
 		list_for_each_entry(holder, &group->notification_list, event_list) {
 			event = holder->event;
 			send_len += sizeof(struct inotify_event);
-			send_len += roundup(event->name_len,
-					     sizeof(struct inotify_event));
+			if (event->name_len)
+				send_len += roundup(event->name_len + 1,
+						sizeof(struct inotify_event));
 		}
 		mutex_unlock(&group->notification_mutex);
 		ret = put_user(send_len, (int __user *) p);
-- 
cgit v1.2.3-70-g09d2


From 750a8870fe4016ef3091fc97e084d58c613c2cc7 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Fri, 28 Aug 2009 12:50:47 -0400
Subject: inotify: update the group mask on mark addition

Seperating the addition and update of marks in inotify resulted in a
regression in that inotify never gets events.  The inotify group mask is
always 0.  This mask should be updated any time a new mark is added.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/inotify/inotify_user.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 6111670b2573..dcd2040d330c 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -591,6 +591,10 @@ retry:
 	/* match the ref from fsnotify_init_markentry() */
 	fsnotify_put_mark(&tmp_ientry->fsn_entry);
 
+	/* if this mark added a new event update the group mask */
+	if (mask & ~group->mask)
+		fsnotify_recalc_group_mask(group);
+
 out_err:
 	if (ret < 0)
 		kmem_cache_free(inotify_inode_mark_cachep, tmp_ientry);
-- 
cgit v1.2.3-70-g09d2


From 2c94eb86c66e1eaaa1e7d8a2120f4fad5e7e7736 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Fri, 28 Aug 2009 21:43:15 -0400
Subject: ext4: Allow rename to create more than EXT4_LINK_MAX subdirectories

Use EXT4_DIR_LINK_MAX so that rename() can move a directory into new
parent directory without running into the EXT4_LINK_MAX limit.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/namei.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index fea14dbd3c22..ba91bd0f7916 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2421,7 +2421,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
 			goto end_rename;
 		retval = -EMLINK;
 		if (!new_inode && new_dir != old_dir &&
-				new_dir->i_nlink >= EXT4_LINK_MAX)
+		    EXT4_DIR_LINK_MAX(new_dir))
 			goto end_rename;
 	}
 	if (!new_bh) {
-- 
cgit v1.2.3-70-g09d2


From b05ab1dc3795e6f997fb0d34f38fce5012533c3e Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sat, 29 Aug 2009 21:08:08 -0400
Subject: ext4: Limit number of links that can be created by ext4_link()

In ext4_link we need to check using EXT4_LINK_MAX, and not
EXT4_DIR_LINK_MAX(), since ext4_link() is creating hard links of
regular files, and not directories.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/namei.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index ba91bd0f7916..f27c8164c1fe 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2318,7 +2318,7 @@ static int ext4_link(struct dentry *old_dentry,
 	struct inode *inode = old_dentry->d_inode;
 	int err, retries = 0;
 
-	if (EXT4_DIR_LINK_MAX(inode))
+	if (inode->i_nlink >= EXT4_LINK_MAX)
 		return -EMLINK;
 
 	/*
-- 
cgit v1.2.3-70-g09d2


From b1f1b8ce0a1d71cbc72f7540134d52b79bd8f5ac Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Sun, 30 Aug 2009 04:21:41 +0900
Subject: nilfs2: fix preempt count underflow in
 nilfs_btnode_prepare_change_key

This will fix the following preempt count underflow reported from
users with the title "[NILFS users] segctord problem" (Message-ID:
<949415.6494.qm@web58808.mail.re1.yahoo.com> and Message-ID:
<debc30fc0908270825v747c1734xa59126623cfd5b05@mail.gmail.com>):

 WARNING: at kernel/sched.c:4890 sub_preempt_count+0x95/0xa0()
 Hardware name: HP Compaq 6530b (KR980UT#ABC)
 Modules linked in: bridge stp llc bnep rfcomm l2cap xfs exportfs nilfs2 cowloop loop vboxnetadp vboxnetflt vboxdrv btusb bluetooth uvcvideo videodev v4l1_compat v4l2_compat_ioctl32 arc4 snd_hda_codec_analog ecb iwlagn iwlcore rfkill lib80211 mac80211 snd_hda_intel snd_hda_codec ehci_hcd uhci_hcd usbcore snd_hwdep snd_pcm tg3 cfg80211 psmouse snd_timer joydev libphy ohci1394 snd_page_alloc hp_accel lis3lv02d ieee1394 led_class i915 drm i2c_algo_bit video backlight output i2c_core dm_crypt dm_mod
 Pid: 4197, comm: segctord Not tainted 2.6.30-gentoo-r4-64 #7
 Call Trace:
  [<ffffffff8023fa05>] ? sub_preempt_count+0x95/0xa0
  [<ffffffff802470f8>] warn_slowpath_common+0x78/0xd0
  [<ffffffff8024715f>] warn_slowpath_null+0xf/0x20
  [<ffffffff8023fa05>] sub_preempt_count+0x95/0xa0
  [<ffffffffa04ce4db>] nilfs_btnode_prepare_change_key+0x11b/0x190 [nilfs2]
  [<ffffffffa04d01ad>] nilfs_btree_assign_p+0x19d/0x1e0 [nilfs2]
  [<ffffffffa04d10ad>] nilfs_btree_assign+0xbd/0x130 [nilfs2]
  [<ffffffffa04cead7>] nilfs_bmap_assign+0x47/0x70 [nilfs2]
  [<ffffffffa04d9bc6>] nilfs_segctor_do_construct+0x956/0x20f0 [nilfs2]
  [<ffffffff805ac8e2>] ? _spin_unlock_irqrestore+0x12/0x40
  [<ffffffff803c06e0>] ? __up_write+0xe0/0x150
  [<ffffffff80262959>] ? up_write+0x9/0x10
  [<ffffffffa04ce9f3>] ? nilfs_bmap_test_and_clear_dirty+0x43/0x60 [nilfs2]
  [<ffffffffa04cd627>] ? nilfs_mdt_fetch_dirty+0x27/0x60 [nilfs2]
  [<ffffffffa04db5fc>] nilfs_segctor_construct+0x8c/0xd0 [nilfs2]
  [<ffffffffa04dc3dc>] nilfs_segctor_thread+0x15c/0x3a0 [nilfs2]
  [<ffffffffa04dbe20>] ? nilfs_construction_timeout+0x0/0x10 [nilfs2]
  [<ffffffff80252633>] ? add_timer+0x13/0x20
  [<ffffffff802370da>] ? __wake_up_common+0x5a/0x90
  [<ffffffff8025e960>] ? autoremove_wake_function+0x0/0x40
  [<ffffffffa04dc280>] ? nilfs_segctor_thread+0x0/0x3a0 [nilfs2]
  [<ffffffffa04dc280>] ? nilfs_segctor_thread+0x0/0x3a0 [nilfs2]
  [<ffffffff8025e556>] kthread+0x56/0x90
  [<ffffffff8020cdea>] child_rip+0xa/0x20
  [<ffffffff8025e500>] ? kthread+0x0/0x90
  [<ffffffff8020cde0>] ? child_rip+0x0/0x20

This problem was caused due to a missing radix_tree_preload() call in
the retry path of nilfs_btnode_prepare_change_key() function.

Reported-by: Eric A <eric225125@yahoo.com>
Reported-by: Jerome Poulin <jeromepoulin@gmail.com>
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Tested-by: Jerome Poulin <jeromepoulin@gmail.com>
Cc: stable@kernel.org
---
 fs/nilfs2/btnode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index 7e0b61be212e..c668bca579c1 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -209,6 +209,7 @@ int nilfs_btnode_prepare_change_key(struct address_space *btnc,
 		 * We cannot call radix_tree_preload for the kernels older
 		 * than 2.6.23, because it is not exported for modules.
 		 */
+retry:
 		err = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
 		if (err)
 			goto failed_unlock;
@@ -219,7 +220,6 @@ int nilfs_btnode_prepare_change_key(struct address_space *btnc,
 				       (unsigned long long)oldkey,
 				       (unsigned long long)newkey);
 
-retry:
 		spin_lock_irq(&btnc->tree_lock);
 		err = radix_tree_insert(&btnc->page_tree, newkey, obh->b_page);
 		spin_unlock_irq(&btnc->tree_lock);
-- 
cgit v1.2.3-70-g09d2


From 2920ee2b47fc8e6aebe1d1956b2725f48fa93cc5 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Mon, 31 Aug 2009 15:27:26 +0000
Subject: [CIFS] potential NULL dereference in parse_DFS_referrals()

memory allocation may fail, prevent a NULL dereference

Pointed out by Roel Kluin

CC: Roel Kluin <roel.kluin@gmail.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifssmb.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 1866bc2927d4..5f0b80d39923 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -3961,6 +3961,10 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
 		if (is_unicode) {
 			__le16 *tmp = kmalloc(strlen(searchName)*2 + 2,
 						GFP_KERNEL);
+			if (tmp == NULL) {
+				rc = -ENOMEM;
+				goto parse_DFS_referrals_exit;
+			}
 			cifsConvertToUCS((__le16 *) tmp, searchName,
 					PATH_MAX, nls_codepage, remap);
 			node->path_consumed = cifs_ucs2_bytes(tmp,
-- 
cgit v1.2.3-70-g09d2


From d96f8f891f69ac1dc8c7bd82e27525de220c04e1 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@sandeen.net>
Date: Thu, 2 Jul 2009 00:09:33 -0500
Subject: xfs: add more statics & drop some unused functions

A lot more functions could be made static, but they need
forward declarations; this does some easy ones, and also
found a few unused functions in the process.

Signed-off-by: Eric Sandeen <sandeen@sandeen.net>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Felix Blyakher <felixb@sgi.com>
---
 fs/xfs/linux-2.6/xfs_super.c |  2 +-
 fs/xfs/linux-2.6/xfs_sync.c  | 15 ---------------
 fs/xfs/linux-2.6/xfs_sync.h  |  1 -
 fs/xfs/xfs_ag.h              |  3 ---
 fs/xfs/xfs_alloc.c           |  2 +-
 fs/xfs/xfs_bmap.c            |  2 +-
 fs/xfs/xfs_bmap.h            | 11 -----------
 fs/xfs/xfs_bmap_btree.c      | 20 ++++++++++----------
 fs/xfs/xfs_bmap_btree.h      |  1 -
 fs/xfs/xfs_btree.c           | 42 +-----------------------------------------
 fs/xfs/xfs_btree.h           | 15 ---------------
 fs/xfs/xfs_inode.c           |  8 ++++----
 fs/xfs/xfs_inode.h           |  5 -----
 fs/xfs/xfs_itable.c          |  2 +-
 fs/xfs/xfs_itable.h          |  5 -----
 fs/xfs/xfs_log_priv.h        |  2 --
 fs/xfs/xfs_log_recover.c     |  2 +-
 fs/xfs/xfs_mount.c           |  2 +-
 fs/xfs/xfs_mount.h           |  3 ---
 fs/xfs/xfs_mru_cache.c       | 29 -----------------------------
 fs/xfs/xfs_mru_cache.h       |  1 -
 fs/xfs/xfs_rw.h              |  6 ------
 fs/xfs/xfs_vnodeops.c        |  2 +-
 23 files changed, 22 insertions(+), 159 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index a220d36f789b..c709ed61a53e 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -687,7 +687,7 @@ xfs_barrier_test(
 	return error;
 }
 
-void
+STATIC void
 xfs_mountfs_check_barriers(xfs_mount_t *mp)
 {
 	int error;
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 98ef624d9baf..320be6aea492 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -749,21 +749,6 @@ __xfs_inode_clear_reclaim_tag(
 			XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
 }
 
-void
-xfs_inode_clear_reclaim_tag(
-	xfs_inode_t	*ip)
-{
-	xfs_mount_t	*mp = ip->i_mount;
-	xfs_perag_t	*pag = xfs_get_perag(mp, ip->i_ino);
-
-	read_lock(&pag->pag_ici_lock);
-	spin_lock(&ip->i_flags_lock);
-	__xfs_inode_clear_reclaim_tag(mp, pag, ip);
-	spin_unlock(&ip->i_flags_lock);
-	read_unlock(&pag->pag_ici_lock);
-	xfs_put_perag(mp, pag);
-}
-
 STATIC int
 xfs_reclaim_inode_now(
 	struct xfs_inode	*ip,
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index 59120602588a..27920eb7a820 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -49,7 +49,6 @@ int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
 
 void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
 void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip);
-void xfs_inode_clear_reclaim_tag(struct xfs_inode *ip);
 void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
 				struct xfs_inode *ip);
 
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index f24b50b68d03..af3cfeb18b0d 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -91,9 +91,6 @@ typedef struct xfs_agf {
 #define	XFS_AGF_BLOCK(mp)	XFS_HDR_BLOCK(mp, XFS_AGF_DADDR(mp))
 #define	XFS_BUF_TO_AGF(bp)	((xfs_agf_t *)XFS_BUF_PTR(bp))
 
-extern int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp,
-			xfs_agnumber_t agno, int flags, struct xfs_buf **bpp);
-
 /*
  * Size of the unlinked inode hash table in the agi.
  */
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 2cf944eb796d..6316004922d7 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -2248,7 +2248,7 @@ xfs_alloc_put_freelist(
 /*
  * Read in the allocation group header (free/alloc section).
  */
-int					/* error */
+STATIC int				/* error */
 xfs_read_agf(
 	struct xfs_mount	*mp,	/* mount point structure */
 	struct xfs_trans	*tp,	/* transaction pointer */
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 8ee5b5a76a2a..8971fb09d387 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -3713,7 +3713,7 @@ done:
  * entry (null if none).  Else, *lastxp will be set to the index
  * of the found entry; *gotp will contain the entry.
  */
-xfs_bmbt_rec_host_t *			/* pointer to found extent entry */
+STATIC xfs_bmbt_rec_host_t *		/* pointer to found extent entry */
 xfs_bmap_search_multi_extents(
 	xfs_ifork_t	*ifp,		/* inode fork pointer */
 	xfs_fileoff_t	bno,		/* block number searched for */
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 1b8ff9256bd0..56f62d2edc35 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -392,17 +392,6 @@ xfs_bmap_count_blocks(
 	int			whichfork,
 	int			*count);
 
-/*
- * Search the extent records for the entry containing block bno.
- * If bno lies in a hole, point to the next entry.  If bno lies
- * past eof, *eofp will be set, and *prevp will contain the last
- * entry (null if none).  Else, *lastxp will be set to the index
- * of the found entry; *gotp will contain the entry.
- */
-xfs_bmbt_rec_host_t *
-xfs_bmap_search_multi_extents(struct xfs_ifork *, xfs_fileoff_t, int *,
-			xfs_extnum_t *, xfs_bmbt_irec_t *, xfs_bmbt_irec_t *);
-
 #endif	/* __KERNEL__ */
 
 #endif	/* __XFS_BMAP_H__ */
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index 5c1ade06578e..eb7b702d0690 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -202,16 +202,6 @@ xfs_bmbt_get_state(
 				ext_flag);
 }
 
-/* Endian flipping versions of the bmbt extraction functions */
-void
-xfs_bmbt_disk_get_all(
-	xfs_bmbt_rec_t	*r,
-	xfs_bmbt_irec_t *s)
-{
-	__xfs_bmbt_get_all(get_unaligned_be64(&r->l0),
-				get_unaligned_be64(&r->l1), s);
-}
-
 /*
  * Extract the blockcount field from an on disk bmap extent record.
  */
@@ -816,6 +806,16 @@ xfs_bmbt_trace_key(
 	*l1 = 0;
 }
 
+/* Endian flipping versions of the bmbt extraction functions */
+STATIC void
+xfs_bmbt_disk_get_all(
+	xfs_bmbt_rec_t	*r,
+	xfs_bmbt_irec_t *s)
+{
+	__xfs_bmbt_get_all(get_unaligned_be64(&r->l0),
+				get_unaligned_be64(&r->l1), s);
+}
+
 STATIC void
 xfs_bmbt_trace_record(
 	struct xfs_btree_cur	*cur,
diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h
index 0e8df007615e..5549d495947f 100644
--- a/fs/xfs/xfs_bmap_btree.h
+++ b/fs/xfs/xfs_bmap_btree.h
@@ -220,7 +220,6 @@ extern xfs_fsblock_t xfs_bmbt_get_startblock(xfs_bmbt_rec_host_t *r);
 extern xfs_fileoff_t xfs_bmbt_get_startoff(xfs_bmbt_rec_host_t *r);
 extern xfs_exntst_t xfs_bmbt_get_state(xfs_bmbt_rec_host_t *r);
 
-extern void xfs_bmbt_disk_get_all(xfs_bmbt_rec_t *r, xfs_bmbt_irec_t *s);
 extern xfs_filblks_t xfs_bmbt_disk_get_blockcount(xfs_bmbt_rec_t *r);
 extern xfs_fileoff_t xfs_bmbt_disk_get_startoff(xfs_bmbt_rec_t *r);
 
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 26717388acf5..52b5f14d0c32 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -645,46 +645,6 @@ xfs_btree_read_bufl(
 	return 0;
 }
 
-/*
- * Get a buffer for the block, return it read in.
- * Short-form addressing.
- */
-int					/* error */
-xfs_btree_read_bufs(
-	xfs_mount_t	*mp,		/* file system mount point */
-	xfs_trans_t	*tp,		/* transaction pointer */
-	xfs_agnumber_t	agno,		/* allocation group number */
-	xfs_agblock_t	agbno,		/* allocation group block number */
-	uint		lock,		/* lock flags for read_buf */
-	xfs_buf_t	**bpp,		/* buffer for agno/agbno */
-	int		refval)		/* ref count value for buffer */
-{
-	xfs_buf_t	*bp;		/* return value */
-	xfs_daddr_t	d;		/* real disk block address */
-	int		error;
-
-	ASSERT(agno != NULLAGNUMBER);
-	ASSERT(agbno != NULLAGBLOCK);
-	d = XFS_AGB_TO_DADDR(mp, agno, agbno);
-	if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
-					mp->m_bsize, lock, &bp))) {
-		return error;
-	}
-	ASSERT(!bp || !XFS_BUF_GETERROR(bp));
-	if (bp != NULL) {
-		switch (refval) {
-		case XFS_ALLOC_BTREE_REF:
-			XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval);
-			break;
-		case XFS_INO_BTREE_REF:
-			XFS_BUF_SET_VTYPE_REF(bp, B_FS_INOMAP, refval);
-			break;
-		}
-	}
-	*bpp = bp;
-	return 0;
-}
-
 /*
  * Read-ahead the block, don't wait for it, don't return a buffer.
  * Long-form addressing.
@@ -2951,7 +2911,7 @@ error0:
  * inode we have to copy the single block it was pointing to into the
  * inode.
  */
-int
+STATIC int
 xfs_btree_kill_iroot(
 	struct xfs_btree_cur	*cur)
 {
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h
index 4f852b735b96..7fa07062bdda 100644
--- a/fs/xfs/xfs_btree.h
+++ b/fs/xfs/xfs_btree.h
@@ -378,20 +378,6 @@ xfs_btree_read_bufl(
 	struct xfs_buf		**bpp,	/* buffer for fsbno */
 	int			refval);/* ref count value for buffer */
 
-/*
- * Get a buffer for the block, return it read in.
- * Short-form addressing.
- */
-int					/* error */
-xfs_btree_read_bufs(
-	struct xfs_mount	*mp,	/* file system mount point */
-	struct xfs_trans	*tp,	/* transaction pointer */
-	xfs_agnumber_t		agno,	/* allocation group number */
-	xfs_agblock_t		agbno,	/* allocation group block number */
-	uint			lock,	/* lock flags for read_buf */
-	struct xfs_buf		**bpp,	/* buffer for agno/agbno */
-	int			refval);/* ref count value for buffer */
-
 /*
  * Read-ahead the block, don't wait for it, don't return a buffer.
  * Long-form addressing.
@@ -432,7 +418,6 @@ int xfs_btree_decrement(struct xfs_btree_cur *, int, int *);
 int xfs_btree_lookup(struct xfs_btree_cur *, xfs_lookup_t, int *);
 int xfs_btree_update(struct xfs_btree_cur *, union xfs_btree_rec *);
 int xfs_btree_new_iroot(struct xfs_btree_cur *, int *, int *);
-int xfs_btree_kill_iroot(struct xfs_btree_cur *);
 int xfs_btree_insert(struct xfs_btree_cur *, int *);
 int xfs_btree_delete(struct xfs_btree_cur *, int *);
 int xfs_btree_get_rec(struct xfs_btree_cur *, union xfs_btree_rec **, int *);
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index da428b3fe0f5..c1dc7ef5a1d8 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -651,7 +651,7 @@ xfs_iformat_btree(
 	return 0;
 }
 
-void
+STATIC void
 xfs_dinode_from_disk(
 	xfs_icdinode_t		*to,
 	xfs_dinode_t		*from)
@@ -1247,7 +1247,7 @@ xfs_isize_check(
  * In that case the pages will still be in memory, but the inode size
  * will never have been updated.
  */
-xfs_fsize_t
+STATIC xfs_fsize_t
 xfs_file_last_byte(
 	xfs_inode_t	*ip)
 {
@@ -3837,7 +3837,7 @@ xfs_iext_inline_to_direct(
 /*
  * Resize an extent indirection array to new_size bytes.
  */
-void
+STATIC void
 xfs_iext_realloc_indirect(
 	xfs_ifork_t	*ifp,		/* inode fork pointer */
 	int		new_size)	/* new indirection array size */
@@ -3862,7 +3862,7 @@ xfs_iext_realloc_indirect(
 /*
  * Switch from indirection array to linear (direct) extent allocations.
  */
-void
+STATIC void
 xfs_iext_indirect_to_direct(
 	 xfs_ifork_t	*ifp)		/* inode fork pointer */
 {
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 65f24a3cc992..00e8505bde2d 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -504,7 +504,6 @@ void		xfs_ipin(xfs_inode_t *);
 void		xfs_iunpin(xfs_inode_t *);
 int		xfs_iflush(xfs_inode_t *, uint);
 void		xfs_ichgtime(xfs_inode_t *, int);
-xfs_fsize_t	xfs_file_last_byte(xfs_inode_t *);
 void		xfs_lock_inodes(xfs_inode_t **, int, uint);
 void		xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);
 
@@ -572,8 +571,6 @@ int		xfs_itobp(struct xfs_mount *, struct xfs_trans *,
 			  struct xfs_buf **, uint);
 int		xfs_iread(struct xfs_mount *, struct xfs_trans *,
 			  struct xfs_inode *, xfs_daddr_t, uint);
-void		xfs_dinode_from_disk(struct xfs_icdinode *,
-				     struct xfs_dinode *);
 void		xfs_dinode_to_disk(struct xfs_dinode *,
 				   struct xfs_icdinode *);
 void		xfs_idestroy_fork(struct xfs_inode *, int);
@@ -592,8 +589,6 @@ void		xfs_iext_remove_inline(xfs_ifork_t *, xfs_extnum_t, int);
 void		xfs_iext_remove_direct(xfs_ifork_t *, xfs_extnum_t, int);
 void		xfs_iext_remove_indirect(xfs_ifork_t *, xfs_extnum_t, int);
 void		xfs_iext_realloc_direct(xfs_ifork_t *, int);
-void		xfs_iext_realloc_indirect(xfs_ifork_t *, int);
-void		xfs_iext_indirect_to_direct(xfs_ifork_t *);
 void		xfs_iext_direct_to_inline(xfs_ifork_t *, xfs_extnum_t);
 void		xfs_iext_inline_to_direct(xfs_ifork_t *, int);
 void		xfs_iext_destroy(xfs_ifork_t *);
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index aeb2d2221c7d..c471122d2234 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -39,7 +39,7 @@
 #include "xfs_error.h"
 #include "xfs_btree.h"
 
-int
+STATIC int
 xfs_internal_inum(
 	xfs_mount_t	*mp,
 	xfs_ino_t	ino)
diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h
index 1fb04e7deb61..20792bf45946 100644
--- a/fs/xfs/xfs_itable.h
+++ b/fs/xfs/xfs_itable.h
@@ -99,11 +99,6 @@ xfs_bulkstat_one(
 	void			*dibuff,
 	int			*stat);
 
-int
-xfs_internal_inum(
-	xfs_mount_t		*mp,
-	xfs_ino_t		ino);
-
 typedef int (*inumbers_fmt_pf)(
 	void			__user *ubuffer, /* buffer to write to */
 	const xfs_inogrp_t	*buffer,	/* buffer to read from */
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index bcad5f4c1fd1..679c7c4926a2 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -451,8 +451,6 @@ extern int	 xlog_find_tail(xlog_t	*log,
 extern int	 xlog_recover(xlog_t *log);
 extern int	 xlog_recover_finish(xlog_t *log);
 extern void	 xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int);
-extern void	 xlog_recover_process_iunlinks(xlog_t *log);
-
 extern struct xfs_buf *xlog_get_bp(xlog_t *, int);
 extern void	 xlog_put_bp(struct xfs_buf *);
 
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 47da2fb45377..1099395d7d6c 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -3263,7 +3263,7 @@ xlog_recover_process_one_iunlink(
  * freeing of the inode and its removal from the list must be
  * atomic.
  */
-void
+STATIC void
 xlog_recover_process_iunlinks(
 	xlog_t		*log)
 {
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 5c6f092659c1..8b6c9e807efb 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1568,7 +1568,7 @@ xfs_mod_sb(xfs_trans_t *tp, __int64_t fields)
  *
  * The m_sb_lock must be held when this routine is called.
  */
-int
+STATIC int
 xfs_mod_incore_sb_unlocked(
 	xfs_mount_t	*mp,
 	xfs_sb_field_t	field,
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index a5122382afde..a6c023bc0fb2 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -414,13 +414,10 @@ typedef struct xfs_mod_sb {
 
 extern int	xfs_log_sbcount(xfs_mount_t *, uint);
 extern int	xfs_mountfs(xfs_mount_t *mp);
-extern void	xfs_mountfs_check_barriers(xfs_mount_t *mp);
 
 extern void	xfs_unmountfs(xfs_mount_t *);
 extern int	xfs_unmountfs_writesb(xfs_mount_t *);
 extern int	xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int);
-extern int	xfs_mod_incore_sb_unlocked(xfs_mount_t *, xfs_sb_field_t,
-			int64_t, int);
 extern int	xfs_mod_incore_sb_batch(xfs_mount_t *, xfs_mod_sb_t *,
 			uint, int);
 extern int	xfs_mount_log_sb(xfs_mount_t *, __int64_t);
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index afee7eb24323..4b0613d99faa 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -563,35 +563,6 @@ xfs_mru_cache_lookup(
 	return elem ? elem->value : NULL;
 }
 
-/*
- * To look up an element using its key, but leave its location in the internal
- * lists alone, call xfs_mru_cache_peek().  If the element isn't found, this
- * function returns NULL.
- *
- * See the comments above the declaration of the xfs_mru_cache_lookup() function
- * for important locking information pertaining to this call.
- */
-void *
-xfs_mru_cache_peek(
-	xfs_mru_cache_t	*mru,
-	unsigned long	key)
-{
-	xfs_mru_cache_elem_t *elem;
-
-	ASSERT(mru && mru->lists);
-	if (!mru || !mru->lists)
-		return NULL;
-
-	spin_lock(&mru->lock);
-	elem = radix_tree_lookup(&mru->store, key);
-	if (!elem)
-		spin_unlock(&mru->lock);
-	else
-		__release(mru_lock); /* help sparse not be stupid */
-
-	return elem ? elem->value : NULL;
-}
-
 /*
  * To release the internal data structure spinlock after having performed an
  * xfs_mru_cache_lookup() or an xfs_mru_cache_peek(), call xfs_mru_cache_done()
diff --git a/fs/xfs/xfs_mru_cache.h b/fs/xfs/xfs_mru_cache.h
index dd58ea1bbebe..5d439f34b0c9 100644
--- a/fs/xfs/xfs_mru_cache.h
+++ b/fs/xfs/xfs_mru_cache.h
@@ -49,7 +49,6 @@ int xfs_mru_cache_insert(struct xfs_mru_cache *mru, unsigned long key,
 void * xfs_mru_cache_remove(struct xfs_mru_cache *mru, unsigned long key);
 void xfs_mru_cache_delete(struct xfs_mru_cache *mru, unsigned long key);
 void *xfs_mru_cache_lookup(struct xfs_mru_cache *mru, unsigned long key);
-void *xfs_mru_cache_peek(struct xfs_mru_cache *mru, unsigned long key);
 void xfs_mru_cache_done(struct xfs_mru_cache *mru);
 
 #endif /* __XFS_MRU_CACHE_H__ */
diff --git a/fs/xfs/xfs_rw.h b/fs/xfs/xfs_rw.h
index f76c003ec55d..ae65f0df87da 100644
--- a/fs/xfs/xfs_rw.h
+++ b/fs/xfs/xfs_rw.h
@@ -78,10 +78,4 @@ extern int xfs_read_buf(struct xfs_mount *mp, xfs_buftarg_t *btp,
 extern void xfs_ioerror_alert(char *func, struct xfs_mount *mp,
 				xfs_buf_t *bp, xfs_daddr_t blkno);
 
-/*
- * Prototypes for functions in xfs_vnodeops.c.
- */
-extern int xfs_free_eofblocks(struct xfs_mount *mp, struct xfs_inode *ip,
-			int flags);
-
 #endif /* __XFS_RW_H__ */
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 492d75bae2bf..ceecafd1f9c1 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -718,7 +718,7 @@ xfs_fsync(
  * when the link count isn't zero and by xfs_dm_punch_hole() when
  * punching a hole to EOF.
  */
-int
+STATIC int
 xfs_free_eofblocks(
 	xfs_mount_t	*mp,
 	xfs_inode_t	*ip,
-- 
cgit v1.2.3-70-g09d2


From fef1111ecdd2b5198afe91537a5cd4c6be80a255 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@sandeen.net>
Date: Thu, 2 Jul 2009 21:35:43 -0500
Subject: un-static xfs_read_agf

CONFIG_XFS_DEBUG builds still need xfs_read_agf to be
non-static, oops.

Signed-off-by: Eric Sandeen <sandeen@sandeen.net>
Reviewed-by: Felix Blyakher <felixb@sgi.com>
Signed-off-by: Felix Blyakher <felixb@sgi.com>
---
 fs/xfs/xfs_ag.h    | 3 +++
 fs/xfs/xfs_alloc.c | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index af3cfeb18b0d..f24b50b68d03 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -91,6 +91,9 @@ typedef struct xfs_agf {
 #define	XFS_AGF_BLOCK(mp)	XFS_HDR_BLOCK(mp, XFS_AGF_DADDR(mp))
 #define	XFS_BUF_TO_AGF(bp)	((xfs_agf_t *)XFS_BUF_PTR(bp))
 
+extern int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp,
+			xfs_agnumber_t agno, int flags, struct xfs_buf **bpp);
+
 /*
  * Size of the unlinked inode hash table in the agi.
  */
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 6316004922d7..2cf944eb796d 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -2248,7 +2248,7 @@ xfs_alloc_put_freelist(
 /*
  * Read in the allocation group header (free/alloc section).
  */
-STATIC int				/* error */
+int					/* error */
 xfs_read_agf(
 	struct xfs_mount	*mp,	/* mount point structure */
 	struct xfs_trans	*tp,	/* transaction pointer */
-- 
cgit v1.2.3-70-g09d2


From eb00457d624c70a2492c319e7e05ed7e067b2794 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@sandeen.net>
Date: Sun, 5 Jul 2009 12:23:35 -0500
Subject: xfs: remove XFS_INO64_OFFSET

Commit a19d9f887d81106d52cacbc9930207b487e07e0e removed the
ino64 option but left the XFS_INO64_OFFSET define it used
in place - just remove it.

Signed-off-by: Eric Sandeen <sandeen@sandeen.net>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Felix Blyakher <felixb@sgi.com>
Signed-off-by: Felix Blyakher <felixb@sgi.com>
---
 fs/xfs/xfs_inum.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_inum.h b/fs/xfs/xfs_inum.h
index 7a28191cb0de..b8e4ee4e89a4 100644
--- a/fs/xfs/xfs_inum.h
+++ b/fs/xfs/xfs_inum.h
@@ -72,7 +72,6 @@ struct xfs_mount;
 
 #if XFS_BIG_INUMS
 #define	XFS_MAXINUMBER		((xfs_ino_t)((1ULL << 56) - 1ULL))
-#define	XFS_INO64_OFFSET	((xfs_ino_t)(1ULL << 32))
 #else
 #define	XFS_MAXINUMBER		((xfs_ino_t)((1ULL << 32) - 1ULL))
 #endif
-- 
cgit v1.2.3-70-g09d2


From a0f7bfd3429fc48f4d1aa5364b9428d75798ba04 Mon Sep 17 00:00:00 2001
From: Julia Lawall <julia@diku.dk>
Date: Mon, 27 Jul 2009 18:15:25 +0200
Subject: fs/xfs: Correct redundant test

bp was tested for NULL a few lines before, followed by a return, and there
is no intervening modification of its value.

A simplified version of the semantic match that finds this problem is as
follows: (http://www.emn.fr/x-info/coccinelle/)

// <smpl>
@r exists@
local idexpression x;
expression E;
position p1,p2;
@@

if (x == NULL || ...) { ... when forall
   return ...; }
... when != \(x=E\|x--\|x++\|--x\|++x\|x-=E\|x+=E\|x|=E\|x&=E\|&x\)
(
*x == NULL
|
*x != NULL
)
// </smpl>

Signed-off-by: Julia Lawall <julia@diku.dk>
Acked-by: Felix Blyakher <felixb@sgi.com>
Signed-off-by: Felix Blyakher <felixb@sgi.com>
---
 fs/xfs/xfs_trans_buf.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 8ee2f8c8b0a6..218829e6a152 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -307,7 +307,7 @@ xfs_trans_read_buf(
 			return (flags & XFS_BUF_TRYLOCK) ?
 					EAGAIN : XFS_ERROR(ENOMEM);
 
-		if ((bp != NULL) && (XFS_BUF_GETERROR(bp) != 0)) {
+		if (XFS_BUF_GETERROR(bp) != 0) {
 			xfs_ioerror_alert("xfs_trans_read_buf", mp,
 					  bp, blkno);
 			error = XFS_BUF_GETERROR(bp);
@@ -315,7 +315,7 @@ xfs_trans_read_buf(
 			return error;
 		}
 #ifdef DEBUG
-		if (xfs_do_error && (bp != NULL)) {
+		if (xfs_do_error) {
 			if (xfs_error_target == target) {
 				if (((xfs_req_num++) % xfs_error_mod) == 0) {
 					xfs_buf_relse(bp);
-- 
cgit v1.2.3-70-g09d2


From de89de6e0cf4b1eb13f27137cf2aa40d287aabdf Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 31 Aug 2009 17:00:59 -0400
Subject: ext4: Restore wbc->range_start in ext4_da_writepages()

To solve a lock inversion problem, we implement part of the
range_cyclic algorithm in ext4_da_writepages().  (See commit 2acf2c26
for more details.)

As part of that change wbc->range_start was modified by ext4's
writepages function, which causes its callers to get confused since
they aren't expecting the filesystem to modify it.  The simplest fix
is to save and restore wbc->range_start in ext4_da_writepages.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index d61fb523308f..ff659e757578 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2749,6 +2749,7 @@ static int ext4_da_writepages(struct address_space *mapping,
 	long pages_skipped;
 	int range_cyclic, cycled = 1, io_done = 0;
 	int needed_blocks, ret = 0, nr_to_writebump = 0;
+	loff_t range_start = wbc->range_start;
 	struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
 
 	trace_ext4_da_writepages(inode, wbc);
@@ -2917,6 +2918,7 @@ out_writepages:
 	if (!no_nrwrite_index_update)
 		wbc->no_nrwrite_index_update = 0;
 	wbc->nr_to_write -= nr_to_writebump;
+	wbc->range_start = range_start;
 	trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
 	return ret;
 }
-- 
cgit v1.2.3-70-g09d2


From b3a3ca8ca0c3c29abc5b2bfe94bb14f3f4590df9 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 31 Aug 2009 23:13:11 -0400
Subject: ext4: Add new tracepoint: trace_ext4_da_write_pages()

Add a new tracepoint which shows the pages that will be written using
write_cache_pages() by ext4_da_writepages().

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h              | 15 +++++++++++++++
 fs/ext4/inode.c             | 13 +------------
 include/trace/events/ext4.h | 34 ++++++++++++++++++++++++++++++++++
 3 files changed, 50 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 41a76e163b99..81014f4ed22d 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -113,6 +113,21 @@ struct ext4_allocation_request {
 	unsigned int flags;
 };
 
+/*
+ * For delayed allocation tracking
+ */
+struct mpage_da_data {
+	struct inode *inode;
+	sector_t b_blocknr;		/* start block number of extent */
+	size_t b_size;			/* size of extent */
+	unsigned long b_state;		/* state of the extent */
+	unsigned long first_page, next_page;	/* extent of pages */
+	struct writeback_control *wbc;
+	int io_done;
+	int pages_written;
+	int retval;
+};
+
 /*
  * Special inodes numbers
  */
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index ff659e757578..17802a96af9f 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1875,18 +1875,6 @@ static void ext4_da_page_release_reservation(struct page *page,
  * Delayed allocation stuff
  */
 
-struct mpage_da_data {
-	struct inode *inode;
-	sector_t b_blocknr;		/* start block number of extent */
-	size_t b_size;			/* size of extent */
-	unsigned long b_state;		/* state of the extent */
-	unsigned long first_page, next_page;	/* extent of pages */
-	struct writeback_control *wbc;
-	int io_done;
-	int pages_written;
-	int retval;
-};
-
 /*
  * mpage_da_submit_io - walks through extent of pages and try to write
  * them with writepage() call back
@@ -2863,6 +2851,7 @@ retry:
 			mpd.io_done = 1;
 			ret = MPAGE_DA_EXTENT_TAIL;
 		}
+		trace_ext4_da_write_pages(inode, &mpd);
 		wbc->nr_to_write -= mpd.pages_written;
 
 		ext4_journal_stop(handle);
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index 15051d2d1219..dd43399288ea 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -251,6 +251,40 @@ TRACE_EVENT(ext4_da_writepages,
 		  __entry->range_cyclic)
 );
 
+TRACE_EVENT(ext4_da_write_pages,
+	TP_PROTO(struct inode *inode, struct mpage_da_data *mpd),
+
+	TP_ARGS(inode, mpd),
+
+	TP_STRUCT__entry(
+		__field(	dev_t,	dev			)
+		__field(	ino_t,	ino			)
+		__field(	__u64,	b_blocknr		)
+		__field(	__u32,	b_size			)
+		__field(	__u32,	b_state			)
+		__field(	unsigned long,	first_page	)
+		__field(	int,	io_done			)
+		__field(	int,	pages_written		)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= inode->i_sb->s_dev;
+		__entry->ino		= inode->i_ino;
+		__entry->b_blocknr	= mpd->b_blocknr;
+		__entry->b_size		= mpd->b_size;
+		__entry->b_state	= mpd->b_state;
+		__entry->first_page	= mpd->first_page;
+		__entry->io_done	= mpd->io_done;
+		__entry->pages_written	= mpd->pages_written;
+	),
+
+	TP_printk("dev %s ino %lu b_blocknr %llu b_size %u b_state 0x%04x first_page %lu io_done %d pages_written %d",
+		  jbd2_dev_to_name(__entry->dev), __entry->ino,
+		  __entry->b_blocknr, __entry->b_size,
+		  __entry->b_state, __entry->first_page,
+		  __entry->io_done, __entry->pages_written)
+);
+
 TRACE_EVENT(ext4_da_writepages_result,
 	TP_PROTO(struct inode *inode, struct writeback_control *wbc,
 			int ret, int pages_written),
-- 
cgit v1.2.3-70-g09d2


From 37d0892c5a94e208cf863e3b7bac014edee4346d Mon Sep 17 00:00:00 2001
From: Ian Kent <raven@themaw.net>
Date: Tue, 1 Sep 2009 11:26:22 +0800
Subject: autofs4 - fix missed case when changing to use struct path

In the recent change by Al Viro that changes verious subsystems
to use "struct path" one case was missed in the autofs4 module
which causes mounts to no longer expire.

Signed-off-by: Ian Kent <raven@themaw.net>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/autofs4/expire.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index aa39ae83f019..3da18d453488 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -77,7 +77,7 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
 	}
 
 	/* Update the expiry counter if fs is busy */
-	if (!may_umount_tree(mnt)) {
+	if (!may_umount_tree(path.mnt)) {
 		struct autofs_info *ino = autofs4_dentry_ino(top);
 		ino->last_used = jiffies;
 		goto done;
-- 
cgit v1.2.3-70-g09d2


From 1b3859bc9e20d764316346665fc93ecea2d2b176 Mon Sep 17 00:00:00 2001
From: Alexander Strakh <strakh@ispras.ru>
Date: Tue, 1 Sep 2009 17:02:24 +0000
Subject: [CIFS] Memory leak in ntlmv2 hash calculation

 in function calc_ntlmv2_hash memory is not released.
1. If in the line 333 we successfully allocate memory and assign it to
pctxt variable:
       pctxt = kmalloc(sizeof(struct HMACMD5Context), GFP_KERNEL);
then we go to line 376 and exit wihout releasing memory pointed to by pctxt
variable.

Add a memory  releasing for pctxt variable before exit from function
calc_ntlmv2_hash.

Signed-off-by: Alexander Strakh <strakh@ispras.ru>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsencrypt.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 7c9809523f42..7efe1745494d 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -373,6 +373,7 @@ calc_exit_2:
 	   compare with the NTLM example */
 	hmac_md5_final(ses->server->ntlmv2_hash, pctxt);
 
+	kfree(pctxt);
 	return rc;
 }
 
-- 
cgit v1.2.3-70-g09d2


From bdb97adcdf0993adbd2eef44b4533620d43792de Mon Sep 17 00:00:00 2001
From: Suresh Jayaraman <sjayaraman@suse.de>
Date: Thu, 20 Aug 2009 13:03:34 +0530
Subject: PATCH] cifs: fix broken mounts when a SSH tunnel is used (try #4)

One more try..

It seems there is a regression that got introduced while Jeff fixed
all the mount/umount races. While attempting to find whether a tcp
session is already existing, we were not checking whether the "port"
used are the same. When a second mount is attempted with a different
"port=" option, it is being ignored. Because of this the cifs mounts
that uses a SSH tunnel appears to be broken.

Steps to reproduce:

1. create 2 shares
# SSH Tunnel a SMB session
2. ssh -f -L 6111:127.0.0.1:445 root@localhost "sleep 86400"
3. ssh -f -L 6222:127.0.0.1:445 root@localhost "sleep 86400"
4. tcpdump -i lo 6111 &
5. mkdir -p /mnt/mnt1
6. mkdir -p /mnt/mnt2
7. mount.cifs //localhost/a /mnt/mnt1 -o username=guest,ip=127.0.0.1,port=6111
#(shows tcpdump activity on port 6111)
8. mount.cifs //localhost/b /mnt/mnt2 -o username=guest,ip=127.0.0.1,port=6222
#(shows tcpdump activity only on port 6111 and not on 6222

Fix by adding a check to compare the port _only_ if the user tries to
override the tcp port with "port=" option, before deciding that an
existing tcp session is found. Also, clean up a bit by replacing
if-else if by a switch statment while at it as suggested by Jeff.

Reviewed-by: Jeff Layton <jlayton@redhat.com>
Reviewed-by: Shirish Pargaonkar <shirishp@us.ibm.com>
Signed-off-by: Suresh Jayaraman <sjayaraman@suse.de>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/connect.c | 45 +++++++++++++++++++++++++++++++++------------
 1 file changed, 33 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 1f3345d7fa79..4e47a9b800ce 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1377,7 +1377,7 @@ cifs_parse_mount_options(char *options, const char *devname,
 }
 
 static struct TCP_Server_Info *
-cifs_find_tcp_session(struct sockaddr_storage *addr)
+cifs_find_tcp_session(struct sockaddr_storage *addr, unsigned short int port)
 {
 	struct list_head *tmp;
 	struct TCP_Server_Info *server;
@@ -1397,16 +1397,37 @@ cifs_find_tcp_session(struct sockaddr_storage *addr)
 		if (server->tcpStatus == CifsNew)
 			continue;
 
-		if (addr->ss_family == AF_INET &&
-		    (addr4->sin_addr.s_addr !=
-		     server->addr.sockAddr.sin_addr.s_addr))
-			continue;
-		else if (addr->ss_family == AF_INET6 &&
-			 (!ipv6_addr_equal(&server->addr.sockAddr6.sin6_addr,
-					   &addr6->sin6_addr) ||
-			  server->addr.sockAddr6.sin6_scope_id !=
-					   addr6->sin6_scope_id))
-			continue;
+		switch (addr->ss_family) {
+		case AF_INET:
+			if (addr4->sin_addr.s_addr ==
+			    server->addr.sockAddr.sin_addr.s_addr) {
+				addr4->sin_port = htons(port);
+				/* user overrode default port? */
+				if (addr4->sin_port) {
+					if (addr4->sin_port !=
+					    server->addr.sockAddr.sin_port)
+						continue;
+				}
+				break;
+			} else
+				continue;
+
+		case AF_INET6:
+			if (ipv6_addr_equal(&addr6->sin6_addr,
+			    &server->addr.sockAddr6.sin6_addr) &&
+			    (addr6->sin6_scope_id == 
+			    server->addr.sockAddr6.sin6_scope_id)) {
+				addr6->sin6_port = htons(port);
+				/* user overrode default port? */
+				if (addr6->sin6_port) {
+				       if (addr6->sin6_port !=
+					   server->addr.sockAddr6.sin6_port)
+					       continue;
+				}
+				break;
+			} else 
+				continue;
+		}
 
 		++server->srv_count;
 		write_unlock(&cifs_tcp_ses_lock);
@@ -1475,7 +1496,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 	}
 
 	/* see if we already have a matching tcp_ses */
-	tcp_ses = cifs_find_tcp_session(&addr);
+	tcp_ses = cifs_find_tcp_session(&addr, volume_info->port);
 	if (tcp_ses)
 		return tcp_ses;
 
-- 
cgit v1.2.3-70-g09d2


From ca43e3beee38623a3f80691b28623d6ecf214742 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Tue, 1 Sep 2009 17:20:50 +0000
Subject: [CIFS] Fix checkpatch warnings

Also update version number to 1.61

Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/CHANGES   |  5 ++++-
 fs/cifs/cifsfs.h  |  2 +-
 fs/cifs/connect.c | 12 ++++++------
 3 files changed, 11 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index e85b1e4389e0..145540a316ab 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -3,7 +3,10 @@ Version 1.60
 Fix memory leak in reconnect.  Fix oops in DFS mount error path.
 Set s_maxbytes to smaller (the max that vfs can handle) so that
 sendfile will now work over cifs mounts again.  Add noforcegid
-and noforceuid mount parameters.
+and noforceuid mount parameters. Fix small mem leak when using
+ntlmv2. Fix 2nd mount to same server but with different port to
+be allowed (rather than reusing the 1st port) - only when the
+user explicitly overrides the port on the 2nd mount.
 
 Version 1.59
 ------------
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 6c170948300d..094325e3f714 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -113,5 +113,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
 extern const struct export_operations cifs_export_ops;
 #endif /* EXPERIMENTAL */
 
-#define CIFS_VERSION   "1.60"
+#define CIFS_VERSION   "1.61"
 #endif				/* _CIFSFS_H */
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 4e47a9b800ce..d49682433c20 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1415,17 +1415,17 @@ cifs_find_tcp_session(struct sockaddr_storage *addr, unsigned short int port)
 		case AF_INET6:
 			if (ipv6_addr_equal(&addr6->sin6_addr,
 			    &server->addr.sockAddr6.sin6_addr) &&
-			    (addr6->sin6_scope_id == 
+			    (addr6->sin6_scope_id ==
 			    server->addr.sockAddr6.sin6_scope_id)) {
 				addr6->sin6_port = htons(port);
 				/* user overrode default port? */
 				if (addr6->sin6_port) {
-				       if (addr6->sin6_port !=
+					if (addr6->sin6_port !=
 					   server->addr.sockAddr6.sin6_port)
-					       continue;
+						continue;
 				}
 				break;
-			} else 
+			} else
 				continue;
 		}
 
@@ -2657,9 +2657,9 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
 		return -EIO;
 
 	smb_buffer = cifs_buf_get();
-	if (smb_buffer == NULL) {
+	if (smb_buffer == NULL)
 		return -ENOMEM;
-	}
+
 	smb_buffer_response = smb_buffer;
 
 	header_assemble(smb_buffer, SMB_COM_TREE_CONNECT_ANDX,
-- 
cgit v1.2.3-70-g09d2


From 85c0b2ab5e69ca6133380ead1c50e0840d136b39 Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Mon, 31 Aug 2009 20:56:51 -0300
Subject: xfs: factor out inode initialisation

Factor out code to initialize new inode clusters into a function of it's own.
This keeps xfs_ialloc_ag_alloc smaller and better structured and enables a
future inode cluster initialization transaction.  Also initialize the agno
variable earlier in xfs_ialloc_ag_alloc to avoid repeated byte swaps.

[hch:  The original patch is from Dave from his unpublished inode create
 transaction patch series, with some modifcations by me to apply stand-alone]

Signed-off-by: Dave Chinner <david@fromorbit.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Alex Elder <aelder@sgi.com>
Signed-off-by: Felix Blyakher <felixb@sgi.com>
---
 fs/xfs/xfs_ialloc.c | 175 ++++++++++++++++++++++++++++------------------------
 1 file changed, 95 insertions(+), 80 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 3120a3a5e20f..ce9edf7f4cb4 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -152,6 +152,87 @@ xfs_inobt_get_rec(
 	return error;
 }
 
+/*
+ * Initialise a new set of inodes.
+ */
+STATIC void
+xfs_ialloc_inode_init(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	xfs_agnumber_t		agno,
+	xfs_agblock_t		agbno,
+	xfs_agblock_t		length,
+	unsigned int		gen)
+{
+	struct xfs_buf		*fbuf;
+	struct xfs_dinode	*free;
+	int			blks_per_cluster, nbufs, ninodes;
+	int			version;
+	int			i, j;
+	xfs_daddr_t		d;
+
+	/*
+	 * Loop over the new block(s), filling in the inodes.
+	 * For small block sizes, manipulate the inodes in buffers
+	 * which are multiples of the blocks size.
+	 */
+	if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) {
+		blks_per_cluster = 1;
+		nbufs = length;
+		ninodes = mp->m_sb.sb_inopblock;
+	} else {
+		blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) /
+				   mp->m_sb.sb_blocksize;
+		nbufs = length / blks_per_cluster;
+		ninodes = blks_per_cluster * mp->m_sb.sb_inopblock;
+	}
+
+	/*
+	 * Figure out what version number to use in the inodes we create.
+	 * If the superblock version has caught up to the one that supports
+	 * the new inode format, then use the new inode version.  Otherwise
+	 * use the old version so that old kernels will continue to be
+	 * able to use the file system.
+	 */
+	if (xfs_sb_version_hasnlink(&mp->m_sb))
+		version = 2;
+	else
+		version = 1;
+
+	for (j = 0; j < nbufs; j++) {
+		/*
+		 * Get the block.
+		 */
+		d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * blks_per_cluster));
+		fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
+					 mp->m_bsize * blks_per_cluster,
+					 XFS_BUF_LOCK);
+		ASSERT(fbuf);
+		ASSERT(!XFS_BUF_GETERROR(fbuf));
+
+		/*
+		 * Initialize all inodes in this buffer and then log them.
+		 *
+		 * XXX: It would be much better if we had just one transaction
+		 *	to log a whole cluster of inodes instead of all the
+		 *	individual transactions causing a lot of log traffic.
+		 */
+		xfs_biozero(fbuf, 0, ninodes << mp->m_sb.sb_inodelog);
+		for (i = 0; i < ninodes; i++) {
+			int	ioffset = i << mp->m_sb.sb_inodelog;
+			uint	isize = sizeof(struct xfs_dinode);
+
+			free = xfs_make_iptr(mp, fbuf, i);
+			free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
+			free->di_version = version;
+			free->di_gen = cpu_to_be32(gen);
+			free->di_next_unlinked = cpu_to_be32(NULLAGINO);
+			xfs_trans_log_buf(tp, fbuf, ioffset, ioffset + isize - 1);
+		}
+		xfs_trans_inode_alloc_buf(tp, fbuf);
+	}
+}
+
 /*
  * Allocate new inodes in the allocation group specified by agbp.
  * Return 0 for success, else error code.
@@ -164,24 +245,15 @@ xfs_ialloc_ag_alloc(
 {
 	xfs_agi_t	*agi;		/* allocation group header */
 	xfs_alloc_arg_t	args;		/* allocation argument structure */
-	int		blks_per_cluster;  /* fs blocks per inode cluster */
 	xfs_btree_cur_t	*cur;		/* inode btree cursor */
-	xfs_daddr_t	d;		/* disk addr of buffer */
 	xfs_agnumber_t	agno;
 	int		error;
-	xfs_buf_t	*fbuf;		/* new free inodes' buffer */
-	xfs_dinode_t	*free;		/* new free inode structure */
-	int		i;		/* inode counter */
-	int		j;		/* block counter */
-	int		nbufs;		/* num bufs of new inodes */
+	int		i;
 	xfs_agino_t	newino;		/* new first inode's number */
 	xfs_agino_t	newlen;		/* new number of inodes */
-	int		ninodes;	/* num inodes per buf */
 	xfs_agino_t	thisino;	/* current inode number, for loop */
-	int		version;	/* inode version number to use */
 	int		isaligned = 0;	/* inode allocation at stripe unit */
 					/* boundary */
-	unsigned int	gen;
 
 	args.tp = tp;
 	args.mp = tp->t_mountp;
@@ -202,12 +274,12 @@ xfs_ialloc_ag_alloc(
  	 */
 	agi = XFS_BUF_TO_AGI(agbp);
 	newino = be32_to_cpu(agi->agi_newino);
+	agno = be32_to_cpu(agi->agi_seqno);
 	args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) +
 			XFS_IALLOC_BLOCKS(args.mp);
 	if (likely(newino != NULLAGINO &&
 		  (args.agbno < be32_to_cpu(agi->agi_length)))) {
-		args.fsbno = XFS_AGB_TO_FSB(args.mp,
-				be32_to_cpu(agi->agi_seqno), args.agbno);
+		args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
 		args.type = XFS_ALLOCTYPE_THIS_BNO;
 		args.mod = args.total = args.wasdel = args.isfl =
 			args.userdata = args.minalignslop = 0;
@@ -258,8 +330,7 @@ xfs_ialloc_ag_alloc(
 		 * For now, just allocate blocks up front.
 		 */
 		args.agbno = be32_to_cpu(agi->agi_root);
-		args.fsbno = XFS_AGB_TO_FSB(args.mp,
-				be32_to_cpu(agi->agi_seqno), args.agbno);
+		args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
 		/*
 		 * Allocate a fixed-size extent of inodes.
 		 */
@@ -282,8 +353,7 @@ xfs_ialloc_ag_alloc(
 	if (isaligned && args.fsbno == NULLFSBLOCK) {
 		args.type = XFS_ALLOCTYPE_NEAR_BNO;
 		args.agbno = be32_to_cpu(agi->agi_root);
-		args.fsbno = XFS_AGB_TO_FSB(args.mp,
-				be32_to_cpu(agi->agi_seqno), args.agbno);
+		args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
 		args.alignment = xfs_ialloc_cluster_alignment(&args);
 		if ((error = xfs_alloc_vextent(&args)))
 			return error;
@@ -294,85 +364,30 @@ xfs_ialloc_ag_alloc(
 		return 0;
 	}
 	ASSERT(args.len == args.minlen);
-	/*
-	 * Convert the results.
-	 */
-	newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0);
-	/*
-	 * Loop over the new block(s), filling in the inodes.
-	 * For small block sizes, manipulate the inodes in buffers
-	 * which are multiples of the blocks size.
-	 */
-	if (args.mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(args.mp)) {
-		blks_per_cluster = 1;
-		nbufs = (int)args.len;
-		ninodes = args.mp->m_sb.sb_inopblock;
-	} else {
-		blks_per_cluster = XFS_INODE_CLUSTER_SIZE(args.mp) /
-				   args.mp->m_sb.sb_blocksize;
-		nbufs = (int)args.len / blks_per_cluster;
-		ninodes = blks_per_cluster * args.mp->m_sb.sb_inopblock;
-	}
-	/*
-	 * Figure out what version number to use in the inodes we create.
-	 * If the superblock version has caught up to the one that supports
-	 * the new inode format, then use the new inode version.  Otherwise
-	 * use the old version so that old kernels will continue to be
-	 * able to use the file system.
-	 */
-	if (xfs_sb_version_hasnlink(&args.mp->m_sb))
-		version = 2;
-	else
-		version = 1;
 
 	/*
+	 * Stamp and write the inode buffers.
+	 *
 	 * Seed the new inode cluster with a random generation number. This
 	 * prevents short-term reuse of generation numbers if a chunk is
 	 * freed and then immediately reallocated. We use random numbers
 	 * rather than a linear progression to prevent the next generation
 	 * number from being easily guessable.
 	 */
-	gen = random32();
-	for (j = 0; j < nbufs; j++) {
-		/*
-		 * Get the block.
-		 */
-		d = XFS_AGB_TO_DADDR(args.mp, be32_to_cpu(agi->agi_seqno),
-				     args.agbno + (j * blks_per_cluster));
-		fbuf = xfs_trans_get_buf(tp, args.mp->m_ddev_targp, d,
-					 args.mp->m_bsize * blks_per_cluster,
-					 XFS_BUF_LOCK);
-		ASSERT(fbuf);
-		ASSERT(!XFS_BUF_GETERROR(fbuf));
+	xfs_ialloc_inode_init(args.mp, tp, agno, args.agbno, args.len,
+			      random32());
 
-		/*
-		 * Initialize all inodes in this buffer and then log them.
-		 *
-		 * XXX: It would be much better if we had just one transaction to
-		 *      log a whole cluster of inodes instead of all the individual
-		 *      transactions causing a lot of log traffic.
-		 */
-		xfs_biozero(fbuf, 0, ninodes << args.mp->m_sb.sb_inodelog);
-		for (i = 0; i < ninodes; i++) {
-			int	ioffset = i << args.mp->m_sb.sb_inodelog;
-			uint	isize = sizeof(struct xfs_dinode);
-
-			free = xfs_make_iptr(args.mp, fbuf, i);
-			free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
-			free->di_version = version;
-			free->di_gen = cpu_to_be32(gen);
-			free->di_next_unlinked = cpu_to_be32(NULLAGINO);
-			xfs_trans_log_buf(tp, fbuf, ioffset, ioffset + isize - 1);
-		}
-		xfs_trans_inode_alloc_buf(tp, fbuf);
-	}
+	/*
+	 * Convert the results.
+	 */
+	newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0);
 	be32_add_cpu(&agi->agi_count, newlen);
 	be32_add_cpu(&agi->agi_freecount, newlen);
-	agno = be32_to_cpu(agi->agi_seqno);
 	down_read(&args.mp->m_peraglock);
 	args.mp->m_perag[agno].pagi_freecount += newlen;
 	up_read(&args.mp->m_peraglock);
 	agi->agi_newino = cpu_to_be32(newino);
+
 	/*
 	 * Insert records describing the new inode chunk into the btree.
 	 */
-- 
cgit v1.2.3-70-g09d2


From 2e287a731e0607e0371dc6165b7dd3ebc67fa8e1 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 31 Aug 2009 20:56:58 -0300
Subject: xfs: improve xfs_inobt_get_rec prototype

Most callers of xfs_inobt_get_rec need to fill a xfs_inobt_rec_incore_t, and
those who don't yet are fine with a xfs_inobt_rec_incore_t, instead of the
three individual variables, too.  So just change xfs_inobt_get_rec to write
the output into a xfs_inobt_rec_incore_t directly.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Alex Elder <aelder@sgi.com>
Signed-off-by: Felix Blyakher <felixb@sgi.com>
---
 fs/xfs/xfs_ialloc.c | 81 ++++++++++++++++++++-------------------------------
 fs/xfs/xfs_ialloc.h |  4 +--
 fs/xfs/xfs_itable.c | 84 ++++++++++++++++++++++++++++-------------------------
 3 files changed, 77 insertions(+), 92 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index ce9edf7f4cb4..72fa3bfc56eb 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -135,9 +135,7 @@ xfs_inobt_update(
 int					/* error */
 xfs_inobt_get_rec(
 	struct xfs_btree_cur	*cur,	/* btree cursor */
-	xfs_agino_t		*ino,	/* output: starting inode of chunk */
-	__int32_t		*fcnt,	/* output: number of free inodes */
-	xfs_inofree_t		*free,	/* output: free inode mask */
+	xfs_inobt_rec_incore_t	*irec,	/* btree record */
 	int			*stat)	/* output: success/failure */
 {
 	union xfs_btree_rec	*rec;
@@ -145,9 +143,9 @@ xfs_inobt_get_rec(
 
 	error = xfs_btree_get_rec(cur, &rec, stat);
 	if (!error && *stat == 1) {
-		*ino = be32_to_cpu(rec->inobt.ir_startino);
-		*fcnt = be32_to_cpu(rec->inobt.ir_freecount);
-		*free = be64_to_cpu(rec->inobt.ir_free);
+		irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino);
+		irec->ir_freecount = be32_to_cpu(rec->inobt.ir_freecount);
+		irec->ir_free = be64_to_cpu(rec->inobt.ir_free);
 	}
 	return error;
 }
@@ -746,8 +744,8 @@ nextag:
 			goto error0;
 		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
 		do {
-			if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino,
-					&rec.ir_freecount, &rec.ir_free, &i)))
+			error = xfs_inobt_get_rec(cur, &rec, &i);
+			if (error)
 				goto error0;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
 			freecount += rec.ir_freecount;
@@ -766,8 +764,7 @@ nextag:
 		if ((error = xfs_inobt_lookup_le(cur, pagino, 0, 0, &i)))
 			goto error0;
 		if (i != 0 &&
-		    (error = xfs_inobt_get_rec(cur, &rec.ir_startino,
-			    &rec.ir_freecount, &rec.ir_free, &j)) == 0 &&
+		    (error = xfs_inobt_get_rec(cur, &rec, &j)) == 0 &&
 		    j == 1 &&
 		    rec.ir_freecount > 0) {
 			/*
@@ -799,10 +796,8 @@ nextag:
 				goto error1;
 			doneleft = !i;
 			if (!doneleft) {
-				if ((error = xfs_inobt_get_rec(tcur,
-						&trec.ir_startino,
-						&trec.ir_freecount,
-						&trec.ir_free, &i)))
+				error = xfs_inobt_get_rec(tcur, &trec, &i);
+				if (error)
 					goto error1;
 				XFS_WANT_CORRUPTED_GOTO(i == 1, error1);
 			}
@@ -813,10 +808,8 @@ nextag:
 				goto error1;
 			doneright = !i;
 			if (!doneright) {
-				if ((error = xfs_inobt_get_rec(cur,
-						&rec.ir_startino,
-						&rec.ir_freecount,
-						&rec.ir_free, &i)))
+				error = xfs_inobt_get_rec(cur, &rec, &i);
+				if (error)
 					goto error1;
 				XFS_WANT_CORRUPTED_GOTO(i == 1, error1);
 			}
@@ -876,11 +869,9 @@ nextag:
 						goto error1;
 					doneleft = !i;
 					if (!doneleft) {
-						if ((error = xfs_inobt_get_rec(
-							    tcur,
-							    &trec.ir_startino,
-							    &trec.ir_freecount,
-							    &trec.ir_free, &i)))
+						error = xfs_inobt_get_rec(
+							    tcur, &trec, &i);
+						if (error)
 							goto error1;
 						XFS_WANT_CORRUPTED_GOTO(i == 1,
 							error1);
@@ -896,11 +887,9 @@ nextag:
 						goto error1;
 					doneright = !i;
 					if (!doneright) {
-						if ((error = xfs_inobt_get_rec(
-							    cur,
-							    &rec.ir_startino,
-							    &rec.ir_freecount,
-							    &rec.ir_free, &i)))
+						error = xfs_inobt_get_rec(
+							    cur, &rec, &i);
+						if (error)
 							goto error1;
 						XFS_WANT_CORRUPTED_GOTO(i == 1,
 							error1);
@@ -919,8 +908,7 @@ nextag:
 				be32_to_cpu(agi->agi_newino), 0, 0, &i)))
 			goto error0;
 		if (i == 1 &&
-		    (error = xfs_inobt_get_rec(cur, &rec.ir_startino,
-			    &rec.ir_freecount, &rec.ir_free, &j)) == 0 &&
+		    (error = xfs_inobt_get_rec(cur, &rec, &j)) == 0 &&
 		    j == 1 &&
 		    rec.ir_freecount > 0) {
 			/*
@@ -938,10 +926,8 @@ nextag:
 				goto error0;
 			ASSERT(i == 1);
 			for (;;) {
-				if ((error = xfs_inobt_get_rec(cur,
-						&rec.ir_startino,
-						&rec.ir_freecount, &rec.ir_free,
-						&i)))
+				error = xfs_inobt_get_rec(cur, &rec, &i);
+				if (error)
 					goto error0;
 				XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
 				if (rec.ir_freecount > 0)
@@ -975,8 +961,8 @@ nextag:
 		if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i)))
 			goto error0;
 		do {
-			if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino,
-					&rec.ir_freecount, &rec.ir_free, &i)))
+			error = xfs_inobt_get_rec(cur, &rec, &i);
+			if (error)
 				goto error0;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
 			freecount += rec.ir_freecount;
@@ -1084,8 +1070,8 @@ xfs_difree(
 		if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i)))
 			goto error0;
 		do {
-			if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino,
-					&rec.ir_freecount, &rec.ir_free, &i)))
+			error = xfs_inobt_get_rec(cur, &rec, &i);
+			if (error)
 				goto error0;
 			if (i) {
 				freecount += rec.ir_freecount;
@@ -1107,8 +1093,8 @@ xfs_difree(
 		goto error0;
 	}
 	XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-	if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino, &rec.ir_freecount,
-			&rec.ir_free, &i))) {
+	error = xfs_inobt_get_rec(cur, &rec, &i);
+	if (error) {
 		cmn_err(CE_WARN,
 			"xfs_difree: xfs_inobt_get_rec()  returned an error %d on %s.  Returning error.",
 			error, mp->m_fsname);
@@ -1187,10 +1173,8 @@ xfs_difree(
 		if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i)))
 			goto error0;
 		do {
-			if ((error = xfs_inobt_get_rec(cur,
-					&rec.ir_startino,
-					&rec.ir_freecount,
-					&rec.ir_free, &i)))
+			error = xfs_inobt_get_rec(cur, &rec, &i);
+			if (error)
 				goto error0;
 			if (i) {
 				freecount += rec.ir_freecount;
@@ -1312,9 +1296,7 @@ xfs_imap(
 		chunk_agbno = agbno - offset_agbno;
 	} else {
 		xfs_btree_cur_t	*cur;	/* inode btree cursor */
-		xfs_agino_t	chunk_agino; /* first agino in inode chunk */
-		__int32_t	chunk_cnt; /* count of free inodes in chunk */
-		xfs_inofree_t	chunk_free; /* mask of free inodes in chunk */
+		xfs_inobt_rec_incore_t chunk_rec;
 		xfs_buf_t	*agbp;	/* agi buffer */
 		int		i;	/* temp state */
 
@@ -1337,8 +1319,7 @@ xfs_imap(
 			goto error0;
 		}
 
-		error = xfs_inobt_get_rec(cur, &chunk_agino, &chunk_cnt,
-				&chunk_free, &i);
+		error = xfs_inobt_get_rec(cur, &chunk_rec, &i);
 		if (error) {
 			xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
 					"xfs_inobt_get_rec() failed");
@@ -1356,7 +1337,7 @@ xfs_imap(
 		xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
 		if (error)
 			return error;
-		chunk_agbno = XFS_AGINO_TO_AGBNO(mp, chunk_agino);
+		chunk_agbno = XFS_AGINO_TO_AGBNO(mp, chunk_rec.ir_startino);
 		offset_agbno = agbno - chunk_agbno;
 	}
 
diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h
index aeee8278f92c..52e72fe7e411 100644
--- a/fs/xfs/xfs_ialloc.h
+++ b/fs/xfs/xfs_ialloc.h
@@ -166,7 +166,7 @@ int xfs_inobt_lookup_le(struct xfs_btree_cur *cur, xfs_agino_t ino,
 /*
  * Get the data from the pointed-to record.
  */
-extern int xfs_inobt_get_rec(struct xfs_btree_cur *cur, xfs_agino_t *ino,
-			     __int32_t *fcnt, xfs_inofree_t *free, int *stat);
+extern int xfs_inobt_get_rec(struct xfs_btree_cur *cur,
+		xfs_inobt_rec_incore_t *rec, int *stat);
 
 #endif	/* __XFS_IALLOC_H__ */
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index c471122d2234..3ec13e8a8c54 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -353,9 +353,6 @@ xfs_bulkstat(
 	int			end_of_ag; /* set if we've seen the ag end */
 	int			error;	/* error code */
 	int                     fmterror;/* bulkstat formatter result */
-	__int32_t		gcnt;	/* current btree rec's count */
-	xfs_inofree_t		gfree;	/* current btree rec's free mask */
-	xfs_agino_t		gino;	/* current btree rec's start inode */
 	int			i;	/* loop index */
 	int			icount;	/* count of inodes good in irbuf */
 	size_t			irbsize; /* size of irec buffer in bytes */
@@ -442,6 +439,8 @@ xfs_bulkstat(
 		 * we need to get the remainder of the chunk we're in.
 		 */
 		if (agino > 0) {
+			xfs_inobt_rec_incore_t r;
+
 			/*
 			 * Lookup the inode chunk that this inode lives in.
 			 */
@@ -449,33 +448,33 @@ xfs_bulkstat(
 			if (!error &&	/* no I/O error */
 			    tmp &&	/* lookup succeeded */
 					/* got the record, should always work */
-			    !(error = xfs_inobt_get_rec(cur, &gino, &gcnt,
-				    &gfree, &i)) &&
+			    !(error = xfs_inobt_get_rec(cur, &r, &i)) &&
 			    i == 1 &&
 					/* this is the right chunk */
-			    agino < gino + XFS_INODES_PER_CHUNK &&
+			    agino < r.ir_startino + XFS_INODES_PER_CHUNK &&
 					/* lastino was not last in chunk */
-			    (chunkidx = agino - gino + 1) <
+			    (chunkidx = agino - r.ir_startino + 1) <
 				    XFS_INODES_PER_CHUNK &&
 					/* there are some left allocated */
 			    xfs_inobt_maskn(chunkidx,
-				    XFS_INODES_PER_CHUNK - chunkidx) & ~gfree) {
+				    XFS_INODES_PER_CHUNK - chunkidx) &
+				    ~r.ir_free) {
 				/*
 				 * Grab the chunk record.  Mark all the
 				 * uninteresting inodes (because they're
 				 * before our start point) free.
 				 */
 				for (i = 0; i < chunkidx; i++) {
-					if (XFS_INOBT_MASK(i) & ~gfree)
-						gcnt++;
+					if (XFS_INOBT_MASK(i) & ~r.ir_free)
+						r.ir_freecount++;
 				}
-				gfree |= xfs_inobt_maskn(0, chunkidx);
-				irbp->ir_startino = gino;
-				irbp->ir_freecount = gcnt;
-				irbp->ir_free = gfree;
+				r.ir_free |= xfs_inobt_maskn(0, chunkidx);
+				irbp->ir_startino = r.ir_startino;
+				irbp->ir_freecount = r.ir_freecount;
+				irbp->ir_free = r.ir_free;
 				irbp++;
-				agino = gino + XFS_INODES_PER_CHUNK;
-				icount = XFS_INODES_PER_CHUNK - gcnt;
+				agino = r.ir_startino + XFS_INODES_PER_CHUNK;
+				icount = XFS_INODES_PER_CHUNK - r.ir_freecount;
 			} else {
 				/*
 				 * If any of those tests failed, bump the
@@ -501,6 +500,8 @@ xfs_bulkstat(
 		 * until we run out of inodes or space in the buffer.
 		 */
 		while (irbp < irbufend && icount < ubcount) {
+			xfs_inobt_rec_incore_t r;
+
 			/*
 			 * Loop as long as we're unable to read the
 			 * inode btree.
@@ -518,43 +519,47 @@ xfs_bulkstat(
 			 * If ran off the end of the ag either with an error,
 			 * or the normal way, set end and stop collecting.
 			 */
-			if (error ||
-			    (error = xfs_inobt_get_rec(cur, &gino, &gcnt,
-				    &gfree, &i)) ||
-			    i == 0) {
+			if (error) {
+				end_of_ag = 1;
+				break;
+			}
+
+			error = xfs_inobt_get_rec(cur, &r, &i);
+			if (error || i == 0) {
 				end_of_ag = 1;
 				break;
 			}
+
 			/*
 			 * If this chunk has any allocated inodes, save it.
 			 * Also start read-ahead now for this chunk.
 			 */
-			if (gcnt < XFS_INODES_PER_CHUNK) {
+			if (r.ir_freecount < XFS_INODES_PER_CHUNK) {
 				/*
 				 * Loop over all clusters in the next chunk.
 				 * Do a readahead if there are any allocated
 				 * inodes in that cluster.
 				 */
-				for (agbno = XFS_AGINO_TO_AGBNO(mp, gino),
-				     chunkidx = 0;
+				agbno = XFS_AGINO_TO_AGBNO(mp, r.ir_startino);
+				for (chunkidx = 0;
 				     chunkidx < XFS_INODES_PER_CHUNK;
 				     chunkidx += nicluster,
 				     agbno += nbcluster) {
-					if (xfs_inobt_maskn(chunkidx,
-							    nicluster) & ~gfree)
+					if (xfs_inobt_maskn(chunkidx, nicluster)
+							& ~r.ir_free)
 						xfs_btree_reada_bufs(mp, agno,
 							agbno, nbcluster);
 				}
-				irbp->ir_startino = gino;
-				irbp->ir_freecount = gcnt;
-				irbp->ir_free = gfree;
+				irbp->ir_startino = r.ir_startino;
+				irbp->ir_freecount = r.ir_freecount;
+				irbp->ir_free = r.ir_free;
 				irbp++;
-				icount += XFS_INODES_PER_CHUNK - gcnt;
+				icount += XFS_INODES_PER_CHUNK - r.ir_freecount;
 			}
 			/*
 			 * Set agino to after this chunk and bump the cursor.
 			 */
-			agino = gino + XFS_INODES_PER_CHUNK;
+			agino = r.ir_startino + XFS_INODES_PER_CHUNK;
 			error = xfs_btree_increment(cur, 0, &tmp);
 			cond_resched();
 		}
@@ -820,9 +825,7 @@ xfs_inumbers(
 	int		bufidx;
 	xfs_btree_cur_t	*cur;
 	int		error;
-	__int32_t	gcnt;
-	xfs_inofree_t	gfree;
-	xfs_agino_t	gino;
+	xfs_inobt_rec_incore_t r;
 	int		i;
 	xfs_ino_t	ino;
 	int		left;
@@ -870,9 +873,8 @@ xfs_inumbers(
 				continue;
 			}
 		}
-		if ((error = xfs_inobt_get_rec(cur, &gino, &gcnt, &gfree,
-			&i)) ||
-		    i == 0) {
+		error = xfs_inobt_get_rec(cur, &r, &i);
+		if (error || i == 0) {
 			xfs_buf_relse(agbp);
 			agbp = NULL;
 			xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
@@ -881,10 +883,12 @@ xfs_inumbers(
 			agino = 0;
 			continue;
 		}
-		agino = gino + XFS_INODES_PER_CHUNK - 1;
-		buffer[bufidx].xi_startino = XFS_AGINO_TO_INO(mp, agno, gino);
-		buffer[bufidx].xi_alloccount = XFS_INODES_PER_CHUNK - gcnt;
-		buffer[bufidx].xi_allocmask = ~gfree;
+		agino = r.ir_startino + XFS_INODES_PER_CHUNK - 1;
+		buffer[bufidx].xi_startino =
+			XFS_AGINO_TO_INO(mp, agno, r.ir_startino);
+		buffer[bufidx].xi_alloccount =
+			XFS_INODES_PER_CHUNK - r.ir_freecount;
+		buffer[bufidx].xi_allocmask = ~r.ir_free;
 		bufidx++;
 		left--;
 		if (bufidx == bcount) {
-- 
cgit v1.2.3-70-g09d2


From afabc24a73bfee2656724b0a70395f1693eaa62b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 31 Aug 2009 20:57:03 -0300
Subject: xfs: improve xfs_inobt_update prototype

Both callers of xfs_inobt_update have the record in form of a
xfs_inobt_rec_incore_t, so just pass a pointer to it instead of the
individual variables.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Alex Elder <aelder@sgi.com>
Signed-off-by: Felix Blyakher <felixb@sgi.com>
---
 fs/xfs/xfs_ialloc.c | 23 +++++++++++------------
 1 file changed, 11 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 72fa3bfc56eb..8819cdacf702 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -110,22 +110,19 @@ xfs_inobt_lookup_le(
 }
 
 /*
- * Update the record referred to by cur to the value given
- * by [ino, fcnt, free].
+ * Update the record referred to by cur to the value given.
  * This either works (return 0) or gets an EFSCORRUPTED error.
  */
 STATIC int				/* error */
 xfs_inobt_update(
 	struct xfs_btree_cur	*cur,	/* btree cursor */
-	xfs_agino_t		ino,	/* starting inode of chunk */
-	__int32_t		fcnt,	/* free inode count */
-	xfs_inofree_t		free)	/* free inode mask */
+	xfs_inobt_rec_incore_t	*irec)	/* btree record */
 {
 	union xfs_btree_rec	rec;
 
-	rec.inobt.ir_startino = cpu_to_be32(ino);
-	rec.inobt.ir_freecount = cpu_to_be32(fcnt);
-	rec.inobt.ir_free = cpu_to_be64(free);
+	rec.inobt.ir_startino = cpu_to_be32(irec->ir_startino);
+	rec.inobt.ir_freecount = cpu_to_be32(irec->ir_freecount);
+	rec.inobt.ir_free = cpu_to_be64(irec->ir_free);
 	return xfs_btree_update(cur, &rec);
 }
 
@@ -946,8 +943,8 @@ nextag:
 	ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset);
 	rec.ir_free &= ~XFS_INOBT_MASK(offset);
 	rec.ir_freecount--;
-	if ((error = xfs_inobt_update(cur, rec.ir_startino, rec.ir_freecount,
-			rec.ir_free)))
+	error = xfs_inobt_update(cur, &rec);
+	if (error)
 		goto error0;
 	be32_add_cpu(&agi->agi_freecount, -1);
 	xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
@@ -1149,12 +1146,14 @@ xfs_difree(
 	} else {
 		*delete = 0;
 
-		if ((error = xfs_inobt_update(cur, rec.ir_startino, rec.ir_freecount, rec.ir_free))) {
+		error = xfs_inobt_update(cur, &rec);
+		if (error) {
 			cmn_err(CE_WARN,
-				"xfs_difree: xfs_inobt_update()  returned an error %d on %s.  Returning error.",
+	"xfs_difree: xfs_inobt_update returned an error %d on %s.",
 				error, mp->m_fsname);
 			goto error0;
 		}
+
 		/* 
 		 * Change the inode free counts and log the ag/sb changes.
 		 */
-- 
cgit v1.2.3-70-g09d2


From 0b48db80ba689edfd96ed06c3124d6cf1146de3f Mon Sep 17 00:00:00 2001
From: Dave Chinner <dgc@sgi.com>
Date: Mon, 31 Aug 2009 20:57:09 -0300
Subject: xfs: factor out debug checks from xfs_dialloc and xfs_difree

Factor out a common helper from repeated debug checks in xfs_dialloc and
xfs_difree.

[hch: split out from Dave's dynamic allocation policy patches]

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Alex Elder <aelder@sgi.com>
Signed-off-by: Felix Blyakher <felixb@sgi.com>
---
 fs/xfs/xfs_ialloc.c | 131 ++++++++++++++++++++++------------------------------
 1 file changed, 56 insertions(+), 75 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 8819cdacf702..18bf6eece54d 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -147,6 +147,47 @@ xfs_inobt_get_rec(
 	return error;
 }
 
+/*
+ * Verify that the number of free inodes in the AGI is correct.
+ */
+#ifdef DEBUG
+STATIC int
+xfs_check_agi_freecount(
+	struct xfs_btree_cur	*cur,
+	struct xfs_agi		*agi)
+{
+	if (cur->bc_nlevels == 1) {
+		xfs_inobt_rec_incore_t rec;
+		int		freecount = 0;
+		int		error;
+		int		i;
+
+		error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i);
+		if (error)
+			return error;
+
+		do {
+			error = xfs_inobt_get_rec(cur, &rec, &i);
+			if (error)
+				return error;
+
+			if (i) {
+				freecount += rec.ir_freecount;
+				error = xfs_btree_increment(cur, 0, &i);
+				if (error)
+					return error;
+			}
+		} while (i == 1);
+
+		if (!XFS_FORCED_SHUTDOWN(cur->bc_mp))
+			ASSERT(freecount == be32_to_cpu(agi->agi_freecount));
+	}
+	return 0;
+}
+#else
+#define xfs_check_agi_freecount(cur, agi)	0
+#endif
+
 /*
  * Initialise a new set of inodes.
  */
@@ -548,6 +589,7 @@ nextag:
 	}
 }
 
+
 /*
  * Visible inode allocation functions.
  */
@@ -733,27 +775,11 @@ nextag:
 	 */
 	if (!pagino)
 		pagino = be32_to_cpu(agi->agi_newino);
-#ifdef DEBUG
-	if (cur->bc_nlevels == 1) {
-		int	freecount = 0;
 
-		if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i)))
-			goto error0;
-		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-		do {
-			error = xfs_inobt_get_rec(cur, &rec, &i);
-			if (error)
-				goto error0;
-			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-			freecount += rec.ir_freecount;
-			if ((error = xfs_btree_increment(cur, 0, &i)))
-				goto error0;
-		} while (i == 1);
+	error = xfs_check_agi_freecount(cur, agi);
+	if (error)
+		goto error0;
 
-		ASSERT(freecount == be32_to_cpu(agi->agi_freecount) ||
-		       XFS_FORCED_SHUTDOWN(mp));
-	}
-#endif
 	/*
 	 * If in the same a.g. as the parent, try to get near the parent.
 	 */
@@ -951,25 +977,11 @@ nextag:
 	down_read(&mp->m_peraglock);
 	mp->m_perag[tagno].pagi_freecount--;
 	up_read(&mp->m_peraglock);
-#ifdef DEBUG
-	if (cur->bc_nlevels == 1) {
-		int	freecount = 0;
 
-		if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i)))
-			goto error0;
-		do {
-			error = xfs_inobt_get_rec(cur, &rec, &i);
-			if (error)
-				goto error0;
-			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-			freecount += rec.ir_freecount;
-			if ((error = xfs_btree_increment(cur, 0, &i)))
-				goto error0;
-		} while (i == 1);
-		ASSERT(freecount == be32_to_cpu(agi->agi_freecount) ||
-		       XFS_FORCED_SHUTDOWN(mp));
-	}
-#endif
+	error = xfs_check_agi_freecount(cur, agi);
+	if (error)
+		goto error0;
+
 	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
 	xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1);
 	*inop = ino;
@@ -1060,26 +1072,11 @@ xfs_difree(
 	 * Initialize the cursor.
 	 */
 	cur = xfs_inobt_init_cursor(mp, tp, agbp, agno);
-#ifdef DEBUG
-	if (cur->bc_nlevels == 1) {
-		int freecount = 0;
 
-		if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i)))
-			goto error0;
-		do {
-			error = xfs_inobt_get_rec(cur, &rec, &i);
-			if (error)
-				goto error0;
-			if (i) {
-				freecount += rec.ir_freecount;
-				if ((error = xfs_btree_increment(cur, 0, &i)))
-					goto error0;
-			}
-		} while (i == 1);
-		ASSERT(freecount == be32_to_cpu(agi->agi_freecount) ||
-		       XFS_FORCED_SHUTDOWN(mp));
-	}
-#endif
+	error = xfs_check_agi_freecount(cur, agi);
+	if (error)
+		goto error0;
+
 	/*
 	 * Look for the entry describing this inode.
 	 */
@@ -1165,26 +1162,10 @@ xfs_difree(
 		xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1);
 	}
 
-#ifdef DEBUG
-	if (cur->bc_nlevels == 1) {
-		int freecount = 0;
+	error = xfs_check_agi_freecount(cur, agi);
+	if (error)
+		goto error0;
 
-		if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i)))
-			goto error0;
-		do {
-			error = xfs_inobt_get_rec(cur, &rec, &i);
-			if (error)
-				goto error0;
-			if (i) {
-				freecount += rec.ir_freecount;
-				if ((error = xfs_btree_increment(cur, 0, &i)))
-					goto error0;
-			}
-		} while (i == 1);
-		ASSERT(freecount == be32_to_cpu(agi->agi_freecount) ||
-		       XFS_FORCED_SHUTDOWN(mp));
-	}
-#endif
 	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
 	return 0;
 
-- 
cgit v1.2.3-70-g09d2


From 4254b0bbb1c0826b7443ffa593576696bc591aa2 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 31 Aug 2009 20:57:14 -0300
Subject: xfs: untangle xfs_dialloc

Clarify the control flow in xfs_dialloc.  Factor out a helper to go to the
next node from the current one and improve the control flow by expanding
composite if statements and using gotos.

The xfs_ialloc_next_rec helper is borrowed from Dave Chinners dynamic
allocation policy patches.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Alex Elder <aelder@sgi.com>
Signed-off-by: Felix Blyakher <felixb@sgi.com>
---
 fs/xfs/xfs_ialloc.c | 291 +++++++++++++++++++++++++---------------------------
 1 file changed, 138 insertions(+), 153 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 18bf6eece54d..7679c1633053 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -589,6 +589,37 @@ nextag:
 	}
 }
 
+/*
+ * Try to retrieve the next record to the left/right from the current one.
+ */
+STATIC int
+xfs_ialloc_next_rec(
+	struct xfs_btree_cur	*cur,
+	xfs_inobt_rec_incore_t	*rec,
+	int			*done,
+	int			left)
+{
+	int                     error;
+	int			i;
+
+	if (left)
+		error = xfs_btree_decrement(cur, 0, &i);
+	else
+		error = xfs_btree_increment(cur, 0, &i);
+
+	if (error)
+		return error;
+	*done = !i;
+	if (i) {
+		error = xfs_inobt_get_rec(cur, rec, &i);
+		if (error)
+			return error;
+		XFS_WANT_CORRUPTED_RETURN(i == 1);
+	}
+
+	return 0;
+}
+
 
 /*
  * Visible inode allocation functions.
@@ -644,8 +675,8 @@ xfs_dialloc(
 	int		j;		/* result code */
 	xfs_mount_t	*mp;		/* file system mount structure */
 	int		offset;		/* index of inode in chunk */
-	xfs_agino_t	pagino;		/* parent's a.g. relative inode # */
-	xfs_agnumber_t	pagno;		/* parent's allocation group number */
+	xfs_agino_t	pagino;		/* parent's AG relative inode # */
+	xfs_agnumber_t	pagno;		/* parent's AG number */
 	xfs_inobt_rec_incore_t rec;	/* inode allocation record */
 	xfs_agnumber_t	tagno;		/* testing allocation group number */
 	xfs_btree_cur_t	*tcur;		/* temp cursor */
@@ -781,186 +812,140 @@ nextag:
 		goto error0;
 
 	/*
-	 * If in the same a.g. as the parent, try to get near the parent.
+	 * If in the same AG as the parent, try to get near the parent.
 	 */
 	if (pagno == agno) {
-		if ((error = xfs_inobt_lookup_le(cur, pagino, 0, 0, &i)))
+		int		doneleft;	/* done, to the left */
+		int		doneright;	/* done, to the right */
+
+		error = xfs_inobt_lookup_le(cur, pagino, 0, 0, &i);
+		if (error)
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+		error = xfs_inobt_get_rec(cur, &rec, &j);
+		if (error)
 			goto error0;
-		if (i != 0 &&
-		    (error = xfs_inobt_get_rec(cur, &rec, &j)) == 0 &&
-		    j == 1 &&
-		    rec.ir_freecount > 0) {
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+		if (rec.ir_freecount > 0) {
 			/*
 			 * Found a free inode in the same chunk
-			 * as parent, done.
+			 * as the parent, done.
 			 */
+			goto alloc_inode;
 		}
+
+
 		/*
-		 * In the same a.g. as parent, but parent's chunk is full.
+		 * In the same AG as parent, but parent's chunk is full.
 		 */
-		else {
-			int	doneleft;	/* done, to the left */
-			int	doneright;	/* done, to the right */
 
-			if (error)
-				goto error0;
-			ASSERT(i == 1);
-			ASSERT(j == 1);
-			/*
-			 * Duplicate the cursor, search left & right
-			 * simultaneously.
-			 */
-			if ((error = xfs_btree_dup_cursor(cur, &tcur)))
-				goto error0;
-			/*
-			 * Search left with tcur, back up 1 record.
-			 */
-			if ((error = xfs_btree_decrement(tcur, 0, &i)))
-				goto error1;
-			doneleft = !i;
-			if (!doneleft) {
-				error = xfs_inobt_get_rec(tcur, &trec, &i);
-				if (error)
-					goto error1;
-				XFS_WANT_CORRUPTED_GOTO(i == 1, error1);
+		/* duplicate the cursor, search left & right simultaneously */
+		error = xfs_btree_dup_cursor(cur, &tcur);
+		if (error)
+			goto error0;
+
+		/* search left with tcur, back up 1 record */
+		error = xfs_ialloc_next_rec(tcur, &trec, &doneleft, 1);
+		if (error)
+			goto error1;
+
+		/* search right with cur, go forward 1 record. */
+		error = xfs_ialloc_next_rec(cur, &rec, &doneright, 0);
+		if (error)
+			goto error1;
+
+		/*
+		 * Loop until we find an inode chunk with a free inode.
+		 */
+		while (!doneleft || !doneright) {
+			int	useleft;  /* using left inode chunk this time */
+
+			/* figure out the closer block if both are valid. */
+			if (!doneleft && !doneright) {
+				useleft = pagino -
+				 (trec.ir_startino + XFS_INODES_PER_CHUNK - 1) <
+				  rec.ir_startino - pagino;
+			} else {
+				useleft = !doneleft;
 			}
-			/*
-			 * Search right with cur, go forward 1 record.
-			 */
-			if ((error = xfs_btree_increment(cur, 0, &i)))
-				goto error1;
-			doneright = !i;
-			if (!doneright) {
-				error = xfs_inobt_get_rec(cur, &rec, &i);
-				if (error)
-					goto error1;
-				XFS_WANT_CORRUPTED_GOTO(i == 1, error1);
+
+			/* free inodes to the left? */
+			if (useleft && trec.ir_freecount) {
+				rec = trec;
+				xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+				cur = tcur;
+				goto alloc_inode;
 			}
-			/*
-			 * Loop until we find the closest inode chunk
-			 * with a free one.
-			 */
-			while (!doneleft || !doneright) {
-				int	useleft;  /* using left inode
-						     chunk this time */
 
-				/*
-				 * Figure out which block is closer,
-				 * if both are valid.
-				 */
-				if (!doneleft && !doneright)
-					useleft =
-						pagino -
-						(trec.ir_startino +
-						 XFS_INODES_PER_CHUNK - 1) <
-						 rec.ir_startino - pagino;
-				else
-					useleft = !doneleft;
-				/*
-				 * If checking the left, does it have
-				 * free inodes?
-				 */
-				if (useleft && trec.ir_freecount) {
-					/*
-					 * Yes, set it up as the chunk to use.
-					 */
-					rec = trec;
-					xfs_btree_del_cursor(cur,
-						XFS_BTREE_NOERROR);
-					cur = tcur;
-					break;
-				}
-				/*
-				 * If checking the right, does it have
-				 * free inodes?
-				 */
-				if (!useleft && rec.ir_freecount) {
-					/*
-					 * Yes, it's already set up.
-					 */
-					xfs_btree_del_cursor(tcur,
-						XFS_BTREE_NOERROR);
-					break;
-				}
-				/*
-				 * If used the left, get another one
-				 * further left.
-				 */
-				if (useleft) {
-					if ((error = xfs_btree_decrement(tcur, 0,
-							&i)))
-						goto error1;
-					doneleft = !i;
-					if (!doneleft) {
-						error = xfs_inobt_get_rec(
-							    tcur, &trec, &i);
-						if (error)
-							goto error1;
-						XFS_WANT_CORRUPTED_GOTO(i == 1,
-							error1);
-					}
-				}
-				/*
-				 * If used the right, get another one
-				 * further right.
-				 */
-				else {
-					if ((error = xfs_btree_increment(cur, 0,
-							&i)))
-						goto error1;
-					doneright = !i;
-					if (!doneright) {
-						error = xfs_inobt_get_rec(
-							    cur, &rec, &i);
-						if (error)
-							goto error1;
-						XFS_WANT_CORRUPTED_GOTO(i == 1,
-							error1);
-					}
-				}
+			/* free inodes to the right? */
+			if (!useleft && rec.ir_freecount) {
+				xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+				goto alloc_inode;
 			}
-			ASSERT(!doneleft || !doneright);
+
+			/* get next record to check */
+			if (useleft) {
+				error = xfs_ialloc_next_rec(tcur, &trec,
+								 &doneleft, 1);
+			} else {
+				error = xfs_ialloc_next_rec(cur, &rec,
+								 &doneright, 0);
+			}
+			if (error)
+				goto error1;
 		}
+		ASSERT(!doneleft || !doneright);
 	}
+
 	/*
-	 * In a different a.g. from the parent.
+	 * In a different AG from the parent.
 	 * See if the most recently allocated block has any free.
 	 */
 	else if (be32_to_cpu(agi->agi_newino) != NULLAGINO) {
-		if ((error = xfs_inobt_lookup_eq(cur,
-				be32_to_cpu(agi->agi_newino), 0, 0, &i)))
+		error = xfs_inobt_lookup_eq(cur, be32_to_cpu(agi->agi_newino),
+					    0, 0, &i);
+		if (error)
 			goto error0;
-		if (i == 1 &&
-		    (error = xfs_inobt_get_rec(cur, &rec, &j)) == 0 &&
-		    j == 1 &&
-		    rec.ir_freecount > 0) {
-			/*
-			 * The last chunk allocated in the group still has
-			 * a free inode.
-			 */
+
+		if (i == 1) {
+			error = xfs_inobt_get_rec(cur, &rec, &j);
+			if (error)
+				goto error0;
+
+			if (j == 1 && rec.ir_freecount > 0) {
+				/*
+				 * The last chunk allocated in the group
+				 * still has a free inode.
+				 */
+				goto alloc_inode;
+			}
 		}
+
 		/*
-		 * None left in the last group, search the whole a.g.
+		 * None left in the last group, search the whole AG
 		 */
-		else {
+		error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i);
+		if (error)
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+		for (;;) {
+			error = xfs_inobt_get_rec(cur, &rec, &i);
 			if (error)
 				goto error0;
-			if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i)))
+			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+			if (rec.ir_freecount > 0)
+				break;
+			error = xfs_btree_increment(cur, 0, &i);
+			if (error)
 				goto error0;
-			ASSERT(i == 1);
-			for (;;) {
-				error = xfs_inobt_get_rec(cur, &rec, &i);
-				if (error)
-					goto error0;
-				XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-				if (rec.ir_freecount > 0)
-					break;
-				if ((error = xfs_btree_increment(cur, 0, &i)))
-					goto error0;
-				XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-			}
+			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
 		}
 	}
+
+alloc_inode:
 	offset = xfs_ialloc_find_free(&rec.ir_free);
 	ASSERT(offset >= 0);
 	ASSERT(offset < XFS_INODES_PER_CHUNK);
-- 
cgit v1.2.3-70-g09d2


From 2187550525d7bcb8c87689e4eca41b1955bf9ac3 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 31 Aug 2009 20:58:21 -0300
Subject: xfs: rationalize xfs_inobt_lookup*

Currenly we have a xfs_inobt_lookup* variant for each comparism direction,
and all these get all three fields of the inobt records passed, while the
common case is just looking for the inode number and we have only marginally
more callers than xfs_inobt_lookup* variants.

So opencode a direct call to xfs_btree_lookup for the single case where we
need all fields, and replace xfs_inobt_lookup* with a xfs_inobt_looku that
just takes the inode number and the direction for all other callers.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Alex Elder <aelder@sgi.com>
Signed-off-by: Felix Blyakher <felixb@sgi.com>
---
 fs/xfs/xfs_ialloc.c | 77 +++++++++++++++--------------------------------------
 fs/xfs/xfs_ialloc.h | 14 +++-------
 fs/xfs/xfs_itable.c | 12 +++++----
 3 files changed, 32 insertions(+), 71 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 7679c1633053..748637cd70ff 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -57,56 +57,19 @@ xfs_ialloc_cluster_alignment(
 }
 
 /*
- * Lookup the record equal to ino in the btree given by cur.
+ * Lookup a record by ino in the btree given by cur.
  */
 STATIC int				/* error */
-xfs_inobt_lookup_eq(
+xfs_inobt_lookup(
 	struct xfs_btree_cur	*cur,	/* btree cursor */
 	xfs_agino_t		ino,	/* starting inode of chunk */
-	__int32_t		fcnt,	/* free inode count */
-	xfs_inofree_t		free,	/* free inode mask */
+	xfs_lookup_t		dir,	/* <=, >=, == */
 	int			*stat)	/* success/failure */
 {
 	cur->bc_rec.i.ir_startino = ino;
-	cur->bc_rec.i.ir_freecount = fcnt;
-	cur->bc_rec.i.ir_free = free;
-	return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
-}
-
-/*
- * Lookup the first record greater than or equal to ino
- * in the btree given by cur.
- */
-int					/* error */
-xfs_inobt_lookup_ge(
-	struct xfs_btree_cur	*cur,	/* btree cursor */
-	xfs_agino_t		ino,	/* starting inode of chunk */
-	__int32_t		fcnt,	/* free inode count */
-	xfs_inofree_t		free,	/* free inode mask */
-	int			*stat)	/* success/failure */
-{
-	cur->bc_rec.i.ir_startino = ino;
-	cur->bc_rec.i.ir_freecount = fcnt;
-	cur->bc_rec.i.ir_free = free;
-	return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
-}
-
-/*
- * Lookup the first record less than or equal to ino
- * in the btree given by cur.
- */
-int					/* error */
-xfs_inobt_lookup_le(
-	struct xfs_btree_cur	*cur,	/* btree cursor */
-	xfs_agino_t		ino,	/* starting inode of chunk */
-	__int32_t		fcnt,	/* free inode count */
-	xfs_inofree_t		free,	/* free inode mask */
-	int			*stat)	/* success/failure */
-{
-	cur->bc_rec.i.ir_startino = ino;
-	cur->bc_rec.i.ir_freecount = fcnt;
-	cur->bc_rec.i.ir_free = free;
-	return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
+	cur->bc_rec.i.ir_freecount = 0;
+	cur->bc_rec.i.ir_free = 0;
+	return xfs_btree_lookup(cur, dir, stat);
 }
 
 /*
@@ -162,7 +125,7 @@ xfs_check_agi_freecount(
 		int		error;
 		int		i;
 
-		error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i);
+		error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
 		if (error)
 			return error;
 
@@ -431,13 +394,17 @@ xfs_ialloc_ag_alloc(
 	for (thisino = newino;
 	     thisino < newino + newlen;
 	     thisino += XFS_INODES_PER_CHUNK) {
-		if ((error = xfs_inobt_lookup_eq(cur, thisino,
-				XFS_INODES_PER_CHUNK, XFS_INOBT_ALL_FREE, &i))) {
+		cur->bc_rec.i.ir_startino = thisino;
+		cur->bc_rec.i.ir_freecount = XFS_INODES_PER_CHUNK;
+		cur->bc_rec.i.ir_free = XFS_INOBT_ALL_FREE;
+		error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, &i);
+		if (error) {
 			xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
 			return error;
 		}
 		ASSERT(i == 0);
-		if ((error = xfs_btree_insert(cur, &i))) {
+		error = xfs_btree_insert(cur, &i);
+		if (error) {
 			xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
 			return error;
 		}
@@ -818,7 +785,7 @@ nextag:
 		int		doneleft;	/* done, to the left */
 		int		doneright;	/* done, to the right */
 
-		error = xfs_inobt_lookup_le(cur, pagino, 0, 0, &i);
+		error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i);
 		if (error)
 			goto error0;
 		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
@@ -904,8 +871,8 @@ nextag:
 	 * See if the most recently allocated block has any free.
 	 */
 	else if (be32_to_cpu(agi->agi_newino) != NULLAGINO) {
-		error = xfs_inobt_lookup_eq(cur, be32_to_cpu(agi->agi_newino),
-					    0, 0, &i);
+		error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino),
+					 XFS_LOOKUP_EQ, &i);
 		if (error)
 			goto error0;
 
@@ -926,7 +893,7 @@ nextag:
 		/*
 		 * None left in the last group, search the whole AG
 		 */
-		error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i);
+		error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
 		if (error)
 			goto error0;
 		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
@@ -1065,9 +1032,9 @@ xfs_difree(
 	/*
 	 * Look for the entry describing this inode.
 	 */
-	if ((error = xfs_inobt_lookup_le(cur, agino, 0, 0, &i))) {
+	if ((error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i))) {
 		cmn_err(CE_WARN,
-			"xfs_difree: xfs_inobt_lookup_le returned()  an error %d on %s.  Returning error.",
+			"xfs_difree: xfs_inobt_lookup returned()  an error %d on %s.  Returning error.",
 			error, mp->m_fsname);
 		goto error0;
 	}
@@ -1277,10 +1244,10 @@ xfs_imap(
 		}
 
 		cur = xfs_inobt_init_cursor(mp, tp, agbp, agno);
-		error = xfs_inobt_lookup_le(cur, agino, 0, 0, &i);
+		error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i);
 		if (error) {
 			xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
-					"xfs_inobt_lookup_le() failed");
+					"xfs_inobt_lookup() failed");
 			goto error0;
 		}
 
diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h
index 52e72fe7e411..bb5385475e1f 100644
--- a/fs/xfs/xfs_ialloc.h
+++ b/fs/xfs/xfs_ialloc.h
@@ -150,18 +150,10 @@ xfs_ialloc_pagi_init(
         xfs_agnumber_t  agno);		/* allocation group number */
 
 /*
- * Lookup the first record greater than or equal to ino
- * in the btree given by cur.
+ * Lookup a record by ino in the btree given by cur.
  */
-int xfs_inobt_lookup_ge(struct xfs_btree_cur *cur, xfs_agino_t ino,
-		__int32_t fcnt,	xfs_inofree_t free, int *stat);
-
-/*
- * Lookup the first record less than or equal to ino
- * in the btree given by cur.
- */
-int xfs_inobt_lookup_le(struct xfs_btree_cur *cur, xfs_agino_t ino,
-		__int32_t fcnt,	xfs_inofree_t free, int *stat);
+int xfs_inobt_lookup(struct xfs_btree_cur *cur, xfs_agino_t ino,
+		xfs_lookup_t dir, int *stat);
 
 /*
  * Get the data from the pointed-to record.
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 3ec13e8a8c54..b68f9107e26c 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -444,7 +444,8 @@ xfs_bulkstat(
 			/*
 			 * Lookup the inode chunk that this inode lives in.
 			 */
-			error = xfs_inobt_lookup_le(cur, agino, 0, 0, &tmp);
+			error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE,
+						 &tmp);
 			if (!error &&	/* no I/O error */
 			    tmp &&	/* lookup succeeded */
 					/* got the record, should always work */
@@ -492,7 +493,7 @@ xfs_bulkstat(
 			/*
 			 * Start of ag.  Lookup the first inode chunk.
 			 */
-			error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &tmp);
+			error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &tmp);
 			icount = 0;
 		}
 		/*
@@ -511,8 +512,8 @@ xfs_bulkstat(
 				if (XFS_AGINO_TO_AGBNO(mp, agino) >=
 						be32_to_cpu(agi->agi_length))
 					break;
-				error = xfs_inobt_lookup_ge(cur, agino, 0, 0,
-							    &tmp);
+				error = xfs_inobt_lookup(cur, agino,
+							 XFS_LOOKUP_GE, &tmp);
 				cond_resched();
 			}
 			/*
@@ -858,7 +859,8 @@ xfs_inumbers(
 				continue;
 			}
 			cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno);
-			error = xfs_inobt_lookup_ge(cur, agino, 0, 0, &tmp);
+			error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_GE,
+						 &tmp);
 			if (error) {
 				xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
 				cur = NULL;
-- 
cgit v1.2.3-70-g09d2


From bd169565993b39b9b4b102cdac8b13e0a259ce2f Mon Sep 17 00:00:00 2001
From: Dave Chinner <dgc@sgi.com>
Date: Mon, 31 Aug 2009 20:58:28 -0300
Subject: xfs: speed up free inode search

Don't search too far - abort if it is outside a certain radius and simply do
a linear search for the first free inode.  In AGs with a million inodes this
can speed up allocation speed by 3-4x.

[hch: ported to the new xfs_ialloc.c world order]

Signed-off-by: Dave Chinner <dgc@sgi.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Alex Elder <aelder@sgi.com>
Signed-off-by: Felix Blyakher <felixb@sgi.com>
---
 fs/xfs/xfs_ag.h     |   9 ++++
 fs/xfs/xfs_ialloc.c | 133 +++++++++++++++++++++++++++++++++++++++++-----------
 2 files changed, 115 insertions(+), 27 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index f24b50b68d03..a5d54bf4931b 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -198,6 +198,15 @@ typedef struct xfs_perag
 	xfs_agino_t	pagi_count;	/* number of allocated inodes */
 	int		pagb_count;	/* pagb slots in use */
 	xfs_perag_busy_t *pagb_list;	/* unstable blocks */
+
+	/*
+	 * Inode allocation search lookup optimisation.
+	 * If the pagino matches, the search for new inodes
+	 * doesn't need to search the near ones again straight away
+	 */
+	xfs_agino_t	pagl_pagino;
+	xfs_agino_t	pagl_leftrec;
+	xfs_agino_t	pagl_rightrec;
 #ifdef __KERNEL__
 	spinlock_t	pagb_lock;	/* lock for pagb_list */
 
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 748637cd70ff..d12d805f20a5 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -587,6 +587,30 @@ xfs_ialloc_next_rec(
 	return 0;
 }
 
+STATIC int
+xfs_ialloc_get_rec(
+	struct xfs_btree_cur	*cur,
+	xfs_agino_t		agino,
+	xfs_inobt_rec_incore_t	*rec,
+	int			*done,
+	int			left)
+{
+	int                     error;
+	int			i;
+
+	error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_EQ, &i);
+	if (error)
+		return error;
+	*done = !i;
+	if (i) {
+		error = xfs_inobt_get_rec(cur, rec, &i);
+		if (error)
+			return error;
+		XFS_WANT_CORRUPTED_RETURN(i == 1);
+	}
+
+	return 0;
+}
 
 /*
  * Visible inode allocation functions.
@@ -766,6 +790,8 @@ nextag:
 	 */
 	agno = tagno;
 	*IO_agbp = NULL;
+
+ restart_pagno:
 	cur = xfs_inobt_init_cursor(mp, tp, agbp, be32_to_cpu(agi->agi_seqno));
 	/*
 	 * If pagino is 0 (this is the root inode allocation) use newino.
@@ -782,8 +808,10 @@ nextag:
 	 * If in the same AG as the parent, try to get near the parent.
 	 */
 	if (pagno == agno) {
+		xfs_perag_t	*pag = &mp->m_perag[agno];
 		int		doneleft;	/* done, to the left */
 		int		doneright;	/* done, to the right */
+		int		searchdistance = 10;
 
 		error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i);
 		if (error)
@@ -813,15 +841,33 @@ nextag:
 		if (error)
 			goto error0;
 
-		/* search left with tcur, back up 1 record */
-		error = xfs_ialloc_next_rec(tcur, &trec, &doneleft, 1);
-		if (error)
-			goto error1;
+		/*
+		 * Skip to last blocks looked up if same parent inode.
+		 */
+		if (pagino != NULLAGINO &&
+		    pag->pagl_pagino == pagino &&
+		    pag->pagl_leftrec != NULLAGINO &&
+		    pag->pagl_rightrec != NULLAGINO) {
+			error = xfs_ialloc_get_rec(tcur, pag->pagl_leftrec,
+						   &trec, &doneleft, 1);
+			if (error)
+				goto error1;
 
-		/* search right with cur, go forward 1 record. */
-		error = xfs_ialloc_next_rec(cur, &rec, &doneright, 0);
-		if (error)
-			goto error1;
+			error = xfs_ialloc_get_rec(cur, pag->pagl_rightrec,
+						   &rec, &doneright, 0);
+			if (error)
+				goto error1;
+		} else {
+			/* search left with tcur, back up 1 record */
+			error = xfs_ialloc_next_rec(tcur, &trec, &doneleft, 1);
+			if (error)
+				goto error1;
+
+			/* search right with cur, go forward 1 record. */
+			error = xfs_ialloc_next_rec(cur, &rec, &doneright, 0);
+			if (error)
+				goto error1;
+		}
 
 		/*
 		 * Loop until we find an inode chunk with a free inode.
@@ -829,6 +875,17 @@ nextag:
 		while (!doneleft || !doneright) {
 			int	useleft;  /* using left inode chunk this time */
 
+			if (!--searchdistance) {
+				/*
+				 * Not in range - save last search
+				 * location and allocate a new inode
+				 */
+				pag->pagl_leftrec = trec.ir_startino;
+				pag->pagl_rightrec = rec.ir_startino;
+				pag->pagl_pagino = pagino;
+				goto newino;
+			}
+
 			/* figure out the closer block if both are valid. */
 			if (!doneleft && !doneright) {
 				useleft = pagino -
@@ -843,12 +900,20 @@ nextag:
 				rec = trec;
 				xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
 				cur = tcur;
+
+				pag->pagl_leftrec = trec.ir_startino;
+				pag->pagl_rightrec = rec.ir_startino;
+				pag->pagl_pagino = pagino;
 				goto alloc_inode;
 			}
 
 			/* free inodes to the right? */
 			if (!useleft && rec.ir_freecount) {
 				xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+
+				pag->pagl_leftrec = trec.ir_startino;
+				pag->pagl_rightrec = rec.ir_startino;
+				pag->pagl_pagino = pagino;
 				goto alloc_inode;
 			}
 
@@ -863,14 +928,28 @@ nextag:
 			if (error)
 				goto error1;
 		}
-		ASSERT(!doneleft || !doneright);
+
+		/*
+		 * We've reached the end of the btree. because
+		 * we are only searching a small chunk of the
+		 * btree each search, there is obviously free
+		 * inodes closer to the parent inode than we
+		 * are now. restart the search again.
+		 */
+		pag->pagl_pagino = NULLAGINO;
+		pag->pagl_leftrec = NULLAGINO;
+		pag->pagl_rightrec = NULLAGINO;
+		xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+		xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+		goto restart_pagno;
 	}
 
 	/*
 	 * In a different AG from the parent.
 	 * See if the most recently allocated block has any free.
 	 */
-	else if (be32_to_cpu(agi->agi_newino) != NULLAGINO) {
+newino:
+	if (be32_to_cpu(agi->agi_newino) != NULLAGINO) {
 		error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino),
 					 XFS_LOOKUP_EQ, &i);
 		if (error)
@@ -889,27 +968,27 @@ nextag:
 				goto alloc_inode;
 			}
 		}
+	}
 
-		/*
-		 * None left in the last group, search the whole AG
-		 */
-		error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
+	/*
+	 * None left in the last group, search the whole AG
+	 */
+	error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
+	if (error)
+		goto error0;
+	XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+	for (;;) {
+		error = xfs_inobt_get_rec(cur, &rec, &i);
+		if (error)
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		if (rec.ir_freecount > 0)
+			break;
+		error = xfs_btree_increment(cur, 0, &i);
 		if (error)
 			goto error0;
 		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-
-		for (;;) {
-			error = xfs_inobt_get_rec(cur, &rec, &i);
-			if (error)
-				goto error0;
-			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-			if (rec.ir_freecount > 0)
-				break;
-			error = xfs_btree_increment(cur, 0, &i);
-			if (error)
-				goto error0;
-			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-		}
 	}
 
 alloc_inode:
-- 
cgit v1.2.3-70-g09d2


From 13e6d5cdde0e785aa943810f08b801cadd0935df Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 31 Aug 2009 21:00:31 -0300
Subject: xfs: merge fsync and O_SYNC handling

The guarantees for O_SYNC are exactly the same as the ones we need to
make for an fsync call (and given that Linux O_SYNC is O_DSYNC the
equivalent is fdadatasync, but we treat both the same in XFS), except
with a range data writeout.  Jan Kara has started unifying these two
path for filesystems using the generic helpers, and I've started to
look at XFS.

The actual transaction commited by xfs_fsync and xfs_write_sync_logforce
has a different transaction number, but actually is exactly the same.
We'll only use the fsync transaction going forward.  One major difference
is that xfs_write_sync_logforce never issues a cache flush unless we
commit a transaction causing that as a side-effect, which is an obvious
bug in the O_SYNC handling.  Second all the locking and i_update_size
vs i_update_core changes from 978b7237123d007b9fa983af6e0e2fa8f97f9934
never made it to xfs_write_sync_logforce, so we add them back.

To make xfs_fsync easily usable from the O_SYNC path, the filemap_fdatawait
call is moved up to xfs_file_fsync, so that we don't wait on the whole
file after we already waited for our portion in xfs_write.

We'll also use a plain call to filemap_write_and_wait_range instead
of the previous sync_page_rang which did it in two steps including
an half-hearted inode write out that doesn't help us.

Once we're done with this also remove the now useless i_update_size
tracking.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Felix Blyakher <felixb@sgi.com>
Signed-off-by: Felix Blyakher <felixb@sgi.com>
---
 fs/xfs/linux-2.6/xfs_aops.c |  1 -
 fs/xfs/linux-2.6/xfs_file.c | 19 +++++++---
 fs/xfs/linux-2.6/xfs_lrw.c  |  7 ++--
 fs/xfs/xfs_iget.c           |  1 -
 fs/xfs/xfs_inode.h          |  1 -
 fs/xfs/xfs_inode_item.c     |  8 -----
 fs/xfs/xfs_rw.c             | 84 ---------------------------------------------
 fs/xfs/xfs_rw.h             |  1 -
 fs/xfs/xfs_trans.h          |  2 +-
 fs/xfs/xfs_vnodeops.c       | 11 ++----
 10 files changed, 23 insertions(+), 112 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index aecf2519db76..d5e5559e31db 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -216,7 +216,6 @@ xfs_setfilesize(
 	if (ip->i_d.di_size < isize) {
 		ip->i_d.di_size = isize;
 		ip->i_update_core = 1;
-		ip->i_update_size = 1;
 		xfs_mark_inode_dirty_sync(ip);
 	}
 
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index 0542fd507649..988d8f87bc0f 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -172,12 +172,21 @@ xfs_file_release(
  */
 STATIC int
 xfs_file_fsync(
-	struct file	*filp,
-	struct dentry	*dentry,
-	int		datasync)
+	struct file		*file,
+	struct dentry		*dentry,
+	int			datasync)
 {
-	xfs_iflags_clear(XFS_I(dentry->d_inode), XFS_ITRUNCATED);
-	return -xfs_fsync(XFS_I(dentry->d_inode));
+	struct inode		*inode = dentry->d_inode;
+	struct xfs_inode	*ip = XFS_I(inode);
+	int			error;
+
+	/* capture size updates in I/O completion before writing the inode. */
+	error = filemap_fdatawait(inode->i_mapping);
+	if (error)
+		return error;
+
+	xfs_iflags_clear(ip, XFS_ITRUNCATED);
+	return -xfs_fsync(ip);
 }
 
 STATIC int
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 7078974a6eee..49e4a6aea73c 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -812,18 +812,21 @@ write_retry:
 
 	/* Handle various SYNC-type writes */
 	if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) {
+		loff_t end = pos + ret - 1;
 		int error2;
 
 		xfs_iunlock(xip, iolock);
 		if (need_i_mutex)
 			mutex_unlock(&inode->i_mutex);
-		error2 = sync_page_range(inode, mapping, pos, ret);
+
+		error2 = filemap_write_and_wait_range(mapping, pos, end);
 		if (!error)
 			error = error2;
 		if (need_i_mutex)
 			mutex_lock(&inode->i_mutex);
 		xfs_ilock(xip, iolock);
-		error2 = xfs_write_sync_logforce(mp, xip);
+
+		error2 = xfs_fsync(xip);
 		if (!error)
 			error = error2;
 	}
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index ecbf8b4d2e2e..3323826274b3 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -82,7 +82,6 @@ xfs_inode_alloc(
 	memset(&ip->i_df, 0, sizeof(xfs_ifork_t));
 	ip->i_flags = 0;
 	ip->i_update_core = 0;
-	ip->i_update_size = 0;
 	ip->i_delayed_blks = 0;
 	memset(&ip->i_d, 0, sizeof(xfs_icdinode_t));
 	ip->i_size = 0;
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 00e8505bde2d..ed566c248ae4 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -261,7 +261,6 @@ typedef struct xfs_inode {
 	/* Miscellaneous state. */
 	unsigned short		i_flags;	/* see defined flags below */
 	unsigned char		i_update_core;	/* timestamps/size is dirty */
-	unsigned char		i_update_size;	/* di_size field is dirty */
 	unsigned int		i_delayed_blks;	/* count of delay alloc blks */
 
 	xfs_icdinode_t		i_d;		/* most of ondisk inode */
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 977c4aec587e..2e69412195e6 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -262,14 +262,6 @@ xfs_inode_item_format(
 		SYNCHRONIZE();
 	}
 
-	/*
-	 * We don't have to worry about re-ordering here because
-	 * the update_size field is protected by the inode lock
-	 * and we have that held in exclusive mode.
-	 */
-	if (ip->i_update_size)
-		ip->i_update_size = 0;
-
 	/*
 	 * Make sure to get the latest atime from the Linux inode.
 	 */
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
index fea68615ed23..3f816ad7ff19 100644
--- a/fs/xfs/xfs_rw.c
+++ b/fs/xfs/xfs_rw.c
@@ -87,90 +87,6 @@ xfs_write_clear_setuid(
 	return 0;
 }
 
-/*
- * Handle logging requirements of various synchronous types of write.
- */
-int
-xfs_write_sync_logforce(
-	xfs_mount_t	*mp,
-	xfs_inode_t	*ip)
-{
-	int		error = 0;
-
-	/*
-	 * If we're treating this as O_DSYNC and we have not updated the
-	 * size, force the log.
-	 */
-	if (!(mp->m_flags & XFS_MOUNT_OSYNCISOSYNC) &&
-	    !(ip->i_update_size)) {
-		xfs_inode_log_item_t	*iip = ip->i_itemp;
-
-		/*
-		 * If an allocation transaction occurred
-		 * without extending the size, then we have to force
-		 * the log up the proper point to ensure that the
-		 * allocation is permanent.  We can't count on
-		 * the fact that buffered writes lock out direct I/O
-		 * writes - the direct I/O write could have extended
-		 * the size nontransactionally, then finished before
-		 * we started.  xfs_write_file will think that the file
-		 * didn't grow but the update isn't safe unless the
-		 * size change is logged.
-		 *
-		 * Force the log if we've committed a transaction
-		 * against the inode or if someone else has and
-		 * the commit record hasn't gone to disk (e.g.
-		 * the inode is pinned).  This guarantees that
-		 * all changes affecting the inode are permanent
-		 * when we return.
-		 */
-		if (iip && iip->ili_last_lsn) {
-			error = _xfs_log_force(mp, iip->ili_last_lsn,
-					XFS_LOG_FORCE | XFS_LOG_SYNC, NULL);
-		} else if (xfs_ipincount(ip) > 0) {
-			error = _xfs_log_force(mp, (xfs_lsn_t)0,
-					XFS_LOG_FORCE | XFS_LOG_SYNC, NULL);
-		}
-
-	} else {
-		xfs_trans_t	*tp;
-
-		/*
-		 * O_SYNC or O_DSYNC _with_ a size update are handled
-		 * the same way.
-		 *
-		 * If the write was synchronous then we need to make
-		 * sure that the inode modification time is permanent.
-		 * We'll have updated the timestamp above, so here
-		 * we use a synchronous transaction to log the inode.
-		 * It's not fast, but it's necessary.
-		 *
-		 * If this a dsync write and the size got changed
-		 * non-transactionally, then we need to ensure that
-		 * the size change gets logged in a synchronous
-		 * transaction.
-		 */
-		tp = xfs_trans_alloc(mp, XFS_TRANS_WRITE_SYNC);
-		if ((error = xfs_trans_reserve(tp, 0,
-						XFS_SWRITE_LOG_RES(mp),
-						0, 0, 0))) {
-			/* Transaction reserve failed */
-			xfs_trans_cancel(tp, 0);
-		} else {
-			/* Transaction reserve successful */
-			xfs_ilock(ip, XFS_ILOCK_EXCL);
-			xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-			xfs_trans_ihold(tp, ip);
-			xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-			xfs_trans_set_sync(tp);
-			error = xfs_trans_commit(tp, 0);
-			xfs_iunlock(ip, XFS_ILOCK_EXCL);
-		}
-	}
-
-	return error;
-}
-
 /*
  * Force a shutdown of the filesystem instantly while keeping
  * the filesystem consistent. We don't do an unmount here; just shutdown
diff --git a/fs/xfs/xfs_rw.h b/fs/xfs/xfs_rw.h
index ae65f0df87da..f5e4874c37d8 100644
--- a/fs/xfs/xfs_rw.h
+++ b/fs/xfs/xfs_rw.h
@@ -68,7 +68,6 @@ xfs_get_extsz_hint(
  * Prototypes for functions in xfs_rw.c.
  */
 extern int xfs_write_clear_setuid(struct xfs_inode *ip);
-extern int xfs_write_sync_logforce(struct xfs_mount *mp, struct xfs_inode *ip);
 extern int xfs_bwrite(struct xfs_mount *mp, struct xfs_buf *bp);
 extern int xfs_bioerror(struct xfs_buf *bp);
 extern int xfs_bioerror_relse(struct xfs_buf *bp);
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 775249a54f6f..ed47fc77759c 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -68,7 +68,7 @@ typedef struct xfs_trans_header {
 #define XFS_TRANS_GROWFS		14
 #define XFS_TRANS_STRAT_WRITE		15
 #define XFS_TRANS_DIOSTRAT		16
-#define	XFS_TRANS_WRITE_SYNC		17
+/* 17 was XFS_TRANS_WRITE_SYNC */
 #define	XFS_TRANS_WRITEID		18
 #define	XFS_TRANS_ADDAFORK		19
 #define	XFS_TRANS_ATTRINVAL		20
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index ceecafd1f9c1..03d3100559ac 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -611,7 +611,7 @@ xfs_fsync(
 	xfs_inode_t	*ip)
 {
 	xfs_trans_t	*tp;
-	int		error;
+	int		error = 0;
 	int		log_flushed = 0, changed = 1;
 
 	xfs_itrace_entry(ip);
@@ -619,14 +619,9 @@ xfs_fsync(
 	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
 		return XFS_ERROR(EIO);
 
-	/* capture size updates in I/O completion before writing the inode. */
-	error = xfs_wait_on_pages(ip, 0, -1);
-	if (error)
-		return XFS_ERROR(error);
-
 	/*
 	 * We always need to make sure that the required inode state is safe on
-	 * disk.  The vnode might be clean but we still might need to force the
+	 * disk.  The inode might be clean but we still might need to force the
 	 * log because of committed transactions that haven't hit the disk yet.
 	 * Likewise, there could be unflushed non-transactional changes to the
 	 * inode core that have to go to disk and this requires us to issue
@@ -638,7 +633,7 @@ xfs_fsync(
 	 */
 	xfs_ilock(ip, XFS_ILOCK_SHARED);
 
-	if (!(ip->i_update_size || ip->i_update_core)) {
+	if (!ip->i_update_core) {
 		/*
 		 * Timestamps/size haven't changed since last inode flush or
 		 * inode transaction commit.  That means either nothing got
-- 
cgit v1.2.3-70-g09d2


From aa72a5cf00001d0b952c7c755be404b9118ceb2e Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 31 Aug 2009 21:51:52 -0300
Subject: xfs: simplify xfs_trans_iget

xfs_trans_iget is a wrapper for xfs_iget that adds the inode to the
transaction after it is read.  Except when the inode already is in the
inode cache, in which case it returns the existing locked inode with
increment lock recursion counts.

Now, no one in the tree every decrements these lock recursion counts,
so any user of this gets a potential double unlock when both the original
owner of the inode and the xfs_trans_iget caller unlock it.  When looking
back in a git bisect in the historic XFS tree there was only one place
that decremented these counts, xfs_trans_iput.  Introduced in commit
ca25df7a840f426eb566d52667b6950b92bb84b5 by Adam Sweeney in 1993,
and removed in commit 19f899a3ab155ff6a49c0c79b06f2f61059afaf3 by
Steve Lord in 2003.  And as long as it didn't slip through git bisects
cracks never actually used in that time frame.

A quick audit of the callers of xfs_trans_iget shows that no caller
really relies on this behaviour fortunately - xfs_ialloc allows this
inode from disk so it must not be there before, and all the RT allocator
routines only every add each RT bitmap inode once.

In addition to removing lots of code and reducing the size of the inode
item this patch also avoids the double inode cache lookup in each
create/mkdir/mknod transaction.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Alex Elder <aelder@sgi.com>
Signed-off-by: Felix Blyakher <felixb@sgi.com>
---
 fs/xfs/xfs_iget.c        | 26 ---------------
 fs/xfs/xfs_inode.h       |  2 --
 fs/xfs/xfs_inode_item.c  |  2 --
 fs/xfs/xfs_inode_item.h  |  2 --
 fs/xfs/xfs_trans_inode.c | 86 +++---------------------------------------------
 5 files changed, 5 insertions(+), 113 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 3323826274b3..80e526489be5 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -455,32 +455,6 @@ out_error_or_again:
 	return error;
 }
 
-
-/*
- * Look for the inode corresponding to the given ino in the hash table.
- * If it is there and its i_transp pointer matches tp, return it.
- * Otherwise, return NULL.
- */
-xfs_inode_t *
-xfs_inode_incore(xfs_mount_t	*mp,
-		 xfs_ino_t	ino,
-		 xfs_trans_t	*tp)
-{
-	xfs_inode_t	*ip;
-	xfs_perag_t	*pag;
-
-	pag = xfs_get_perag(mp, ino);
-	read_lock(&pag->pag_ici_lock);
-	ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ino));
-	read_unlock(&pag->pag_ici_lock);
-	xfs_put_perag(mp, pag);
-
-	/* the returned inode must match the transaction */
-	if (ip && (ip->i_transp != tp))
-		return NULL;
-	return ip;
-}
-
 /*
  * Decrement reference count of an inode structure and unlock it.
  *
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index ed566c248ae4..0b38b9a869ec 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -467,8 +467,6 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
 /*
  * xfs_iget.c prototypes.
  */
-xfs_inode_t	*xfs_inode_incore(struct xfs_mount *, xfs_ino_t,
-				  struct xfs_trans *);
 int		xfs_iget(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
 			 uint, uint, xfs_inode_t **, xfs_daddr_t);
 void		xfs_iput(xfs_inode_t *, uint);
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 2e69412195e6..47d5b663c37e 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -704,8 +704,6 @@ xfs_inode_item_unlock(
 	 * Clear out the fields of the inode log item particular
 	 * to the current transaction.
 	 */
-	iip->ili_ilock_recur = 0;
-	iip->ili_iolock_recur = 0;
 	iip->ili_flags = 0;
 
 	/*
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index a52ac125f055..65bae4c9b8bf 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -137,8 +137,6 @@ typedef struct xfs_inode_log_item {
 	struct xfs_inode	*ili_inode;	   /* inode ptr */
 	xfs_lsn_t		ili_flush_lsn;	   /* lsn at last flush */
 	xfs_lsn_t		ili_last_lsn;	   /* lsn at last transaction */
-	unsigned short		ili_ilock_recur;   /* lock recursion count */
-	unsigned short		ili_iolock_recur;  /* lock recursion count */
 	unsigned short		ili_flags;	   /* misc flags */
 	unsigned short		ili_logged;	   /* flushed logged data */
 	unsigned int		ili_last_fields;   /* fields when flushed */
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index 23d276af2e0c..785ff101da0a 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -49,30 +49,7 @@ xfs_trans_inode_broot_debug(
 
 
 /*
- * Get and lock the inode for the caller if it is not already
- * locked within the given transaction.  If it is already locked
- * within the transaction, just increment its lock recursion count
- * and return a pointer to it.
- *
- * For an inode to be locked in a transaction, the inode lock, as
- * opposed to the io lock, must be taken exclusively.  This ensures
- * that the inode can be involved in only 1 transaction at a time.
- * Lock recursion is handled on the io lock, but only for lock modes
- * of equal or lesser strength.  That is, you can recur on the io lock
- * held EXCL with a SHARED request but not vice versa.  Also, if
- * the inode is already a part of the transaction then you cannot
- * go from not holding the io lock to having it EXCL or SHARED.
- *
- * Use the inode cache routine xfs_inode_incore() to find the inode
- * if it is already owned by this transaction.
- *
- * If we don't already own the inode, use xfs_iget() to get it.
- * Since the inode log item structure is embedded in the incore
- * inode structure and is initialized when the inode is brought
- * into memory, there is nothing to do with it here.
- *
- * If the given transaction pointer is NULL, just call xfs_iget().
- * This simplifies code which must handle both cases.
+ * Get an inode and join it to the transaction.
  */
 int
 xfs_trans_iget(
@@ -84,62 +61,11 @@ xfs_trans_iget(
 	xfs_inode_t	**ipp)
 {
 	int			error;
-	xfs_inode_t		*ip;
-
-	/*
-	 * If the transaction pointer is NULL, just call the normal
-	 * xfs_iget().
-	 */
-	if (tp == NULL)
-		return xfs_iget(mp, NULL, ino, flags, lock_flags, ipp, 0);
-
-	/*
-	 * If we find the inode in core with this transaction
-	 * pointer in its i_transp field, then we know we already
-	 * have it locked.  In this case we just increment the lock
-	 * recursion count and return the inode to the caller.
-	 * Assert that the inode is already locked in the mode requested
-	 * by the caller.  We cannot do lock promotions yet, so
-	 * die if someone gets this wrong.
-	 */
-	if ((ip = xfs_inode_incore(tp->t_mountp, ino, tp)) != NULL) {
-		/*
-		 * Make sure that the inode lock is held EXCL and
-		 * that the io lock is never upgraded when the inode
-		 * is already a part of the transaction.
-		 */
-		ASSERT(ip->i_itemp != NULL);
-		ASSERT(lock_flags & XFS_ILOCK_EXCL);
-		ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-		ASSERT((!(lock_flags & XFS_IOLOCK_EXCL)) ||
-		       xfs_isilocked(ip, XFS_IOLOCK_EXCL));
-		ASSERT((!(lock_flags & XFS_IOLOCK_EXCL)) ||
-		       (ip->i_itemp->ili_flags & XFS_ILI_IOLOCKED_EXCL));
-		ASSERT((!(lock_flags & XFS_IOLOCK_SHARED)) ||
-		       xfs_isilocked(ip, XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED));
-		ASSERT((!(lock_flags & XFS_IOLOCK_SHARED)) ||
-		       (ip->i_itemp->ili_flags & XFS_ILI_IOLOCKED_ANY));
-
-		if (lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) {
-			ip->i_itemp->ili_iolock_recur++;
-		}
-		if (lock_flags & XFS_ILOCK_EXCL) {
-			ip->i_itemp->ili_ilock_recur++;
-		}
-		*ipp = ip;
-		return 0;
-	}
-
-	ASSERT(lock_flags & XFS_ILOCK_EXCL);
-	error = xfs_iget(tp->t_mountp, tp, ino, flags, lock_flags, &ip, 0);
-	if (error) {
-		return error;
-	}
-	ASSERT(ip != NULL);
 
-	xfs_trans_ijoin(tp, ip, lock_flags);
-	*ipp = ip;
-	return 0;
+	error = xfs_iget(mp, tp, ino, flags, lock_flags, ipp, 0);
+	if (!error && tp)
+		xfs_trans_ijoin(tp, *ipp, lock_flags);
+	return error;
 }
 
 /*
@@ -163,8 +89,6 @@ xfs_trans_ijoin(
 		xfs_inode_item_init(ip, ip->i_mount);
 	iip = ip->i_itemp;
 	ASSERT(iip->ili_flags == 0);
-	ASSERT(iip->ili_ilock_recur == 0);
-	ASSERT(iip->ili_iolock_recur == 0);
 
 	/*
 	 * Get a log_item_desc to point at the new item.
-- 
cgit v1.2.3-70-g09d2


From f4378b6eaf63492c0f9a397d52813e0ae6b49e7b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Tue, 1 Sep 2009 14:03:08 -0400
Subject: xfs: actually enable the swapext compat handler

Fix a small typo in the compat ioctl handler that cause the swapext
compat handler to never be called.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Torsten Kaiser <just.for.lkml@googlemail.com>
Tested-by: Torsten Kaiser <just.for.lkml@googlemail.com>
Reviewed-by: Eric Sandeen <sandeen@sandeen.net>
Reviewed-by: Felix Blyakher <felixb@sgi.com>
Signed-off-by: Felix Blyakher <felixb@sgi.com>
---
 fs/xfs/linux-2.6/xfs_ioctl32.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index 0882d166239a..eafcc7c18706 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -619,7 +619,7 @@ xfs_file_compat_ioctl(
 	case XFS_IOC_GETVERSION_32:
 		cmd = _NATIVE_IOC(cmd, long);
 		return xfs_file_ioctl(filp, cmd, p);
-	case XFS_IOC_SWAPEXT: {
+	case XFS_IOC_SWAPEXT_32: {
 		struct xfs_swapext	  sxp;
 		struct compat_xfs_swapext __user *sxu = arg;
 
-- 
cgit v1.2.3-70-g09d2


From 3725867dccfb83e4b0cff64e916a04258f300591 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Tue, 1 Sep 2009 14:03:08 -0400
Subject: xfs: actually enable the swapext compat handler

Fix a small typo in the compat ioctl handler that cause the swapext
compat handler to never be called.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Torsten Kaiser <just.for.lkml@googlemail.com>
Tested-by: Torsten Kaiser <just.for.lkml@googlemail.com>
Reviewed-by: Eric Sandeen <sandeen@sandeen.net>
Reviewed-by: Felix Blyakher <felixb@sgi.com>
Signed-off-by: Felix Blyakher <felixb@sgi.com>
---
 fs/xfs/linux-2.6/xfs_ioctl32.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index 0882d166239a..eafcc7c18706 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -619,7 +619,7 @@ xfs_file_compat_ioctl(
 	case XFS_IOC_GETVERSION_32:
 		cmd = _NATIVE_IOC(cmd, long);
 		return xfs_file_ioctl(filp, cmd, p);
-	case XFS_IOC_SWAPEXT: {
+	case XFS_IOC_SWAPEXT_32: {
 		struct xfs_swapext	  sxp;
 		struct compat_xfs_swapext __user *sxu = arg;
 
-- 
cgit v1.2.3-70-g09d2


From 8c58b54574d18b3782b2a261b9dae3c4e90b6b04 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Mon, 17 Aug 2009 16:26:49 -0400
Subject: cifs: send IPv6 addr in upcall with colon delimiters

Make it easier on the upcall program by adding ':' delimiters between
each group of hex digits.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifs_spnego.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 051caecf7d67..8ec7736ce954 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -125,7 +125,7 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
 	if (server->addr.sockAddr.sin_family == AF_INET)
 		sprintf(dp, "ip4=%pI4", &server->addr.sockAddr.sin_addr);
 	else if (server->addr.sockAddr.sin_family == AF_INET6)
-		sprintf(dp, "ip6=%pi6", &server->addr.sockAddr6.sin6_addr);
+		sprintf(dp, "ip6=%pI6", &server->addr.sockAddr6.sin6_addr);
 	else
 		goto out;
 
-- 
cgit v1.2.3-70-g09d2


From 8e047d09ee1143058b71f40c5f4d028dde52d883 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 28 Aug 2009 10:11:10 -0400
Subject: cifs: use tcon pointer in cifs_show_options

Minor nit: we already have a tcon pointer so we don't need to
dereference cifs_sb again.

Also initialize the vars in the declaration.

Reported-by: Peter Staubach <staubach@redhat.com>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsfs.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 84b75253b05a..b750aa502ff7 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -361,13 +361,10 @@ cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server)
 static int
 cifs_show_options(struct seq_file *s, struct vfsmount *m)
 {
-	struct cifs_sb_info *cifs_sb;
-	struct cifsTconInfo *tcon;
-
-	cifs_sb = CIFS_SB(m->mnt_sb);
-	tcon = cifs_sb->tcon;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(m->mnt_sb);
+	struct cifsTconInfo *tcon = cifs_sb->tcon;
 
-	seq_printf(s, ",unc=%s", cifs_sb->tcon->treeName);
+	seq_printf(s, ",unc=%s", tcon->treeName);
 	if (tcon->ses->userName)
 		seq_printf(s, ",username=%s", tcon->ses->userName);
 	if (tcon->ses->domainName)
-- 
cgit v1.2.3-70-g09d2


From 1b49c5566136455764a8d17ead25784f534c202d Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 28 Aug 2009 10:11:11 -0400
Subject: cifs: protect GlobalOplock_Q with its own spinlock

Right now, the GlobalOplock_Q is protected by the GlobalMid_Lock. That
lock is also used for completely unrelated purposes (mostly for managing
the global mid queue). Give the list its own dedicated spinlock
(cifs_oplock_lock) and rename the list to cifs_oplock_list to
eliminate the camel-case.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsfs.c    | 13 +++++++------
 fs/cifs/cifsglob.h  |  6 +++++-
 fs/cifs/transport.c | 17 ++++++++---------
 3 files changed, 20 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index b750aa502ff7..3610e9958b4c 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -986,19 +986,19 @@ static int cifs_oplock_thread(void *dummyarg)
 		if (try_to_freeze())
 			continue;
 
-		spin_lock(&GlobalMid_Lock);
-		if (list_empty(&GlobalOplock_Q)) {
-			spin_unlock(&GlobalMid_Lock);
+		spin_lock(&cifs_oplock_lock);
+		if (list_empty(&cifs_oplock_list)) {
+			spin_unlock(&cifs_oplock_lock);
 			set_current_state(TASK_INTERRUPTIBLE);
 			schedule_timeout(39*HZ);
 		} else {
-			oplock_item = list_entry(GlobalOplock_Q.next,
+			oplock_item = list_entry(cifs_oplock_list.next,
 						struct oplock_q_entry, qhead);
 			cFYI(1, ("found oplock item to write out"));
 			pTcon = oplock_item->tcon;
 			inode = oplock_item->pinode;
 			netfid = oplock_item->netfid;
-			spin_unlock(&GlobalMid_Lock);
+			spin_unlock(&cifs_oplock_lock);
 			DeleteOplockQEntry(oplock_item);
 			/* can not grab inode sem here since it would
 				deadlock when oplock received on delete
@@ -1055,7 +1055,7 @@ init_cifs(void)
 	int rc = 0;
 	cifs_proc_init();
 	INIT_LIST_HEAD(&cifs_tcp_ses_list);
-	INIT_LIST_HEAD(&GlobalOplock_Q);
+	INIT_LIST_HEAD(&cifs_oplock_list);
 #ifdef CONFIG_CIFS_EXPERIMENTAL
 	INIT_LIST_HEAD(&GlobalDnotifyReqList);
 	INIT_LIST_HEAD(&GlobalDnotifyRsp_Q);
@@ -1084,6 +1084,7 @@ init_cifs(void)
 	rwlock_init(&GlobalSMBSeslock);
 	rwlock_init(&cifs_tcp_ses_lock);
 	spin_lock_init(&GlobalMid_Lock);
+	spin_lock_init(&cifs_oplock_lock);
 
 	if (cifs_max_pending < 2) {
 		cifs_max_pending = 2;
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 6084d6379c03..f100399ee05e 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -656,7 +656,11 @@ GLOBAL_EXTERN rwlock_t		cifs_tcp_ses_lock;
  */
 GLOBAL_EXTERN rwlock_t GlobalSMBSeslock;
 
-GLOBAL_EXTERN struct list_head GlobalOplock_Q;
+/* Global list of oplocks */
+GLOBAL_EXTERN struct list_head cifs_oplock_list;
+
+/* Protects the cifs_oplock_list */
+GLOBAL_EXTERN spinlock_t cifs_oplock_lock;
 
 /* Outstanding dir notify requests */
 GLOBAL_EXTERN struct list_head GlobalDnotifyReqList;
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 0ad3e2d116a6..1da4ab250eae 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -119,20 +119,19 @@ AllocOplockQEntry(struct inode *pinode, __u16 fid, struct cifsTconInfo *tcon)
 		temp->pinode = pinode;
 		temp->tcon = tcon;
 		temp->netfid = fid;
-		spin_lock(&GlobalMid_Lock);
-		list_add_tail(&temp->qhead, &GlobalOplock_Q);
-		spin_unlock(&GlobalMid_Lock);
+		spin_lock(&cifs_oplock_lock);
+		list_add_tail(&temp->qhead, &cifs_oplock_list);
+		spin_unlock(&cifs_oplock_lock);
 	}
 	return temp;
-
 }
 
 void DeleteOplockQEntry(struct oplock_q_entry *oplockEntry)
 {
-	spin_lock(&GlobalMid_Lock);
+	spin_lock(&cifs_oplock_lock);
     /* should we check if list empty first? */
 	list_del(&oplockEntry->qhead);
-	spin_unlock(&GlobalMid_Lock);
+	spin_unlock(&cifs_oplock_lock);
 	kmem_cache_free(cifs_oplock_cachep, oplockEntry);
 }
 
@@ -144,14 +143,14 @@ void DeleteTconOplockQEntries(struct cifsTconInfo *tcon)
 	if (tcon == NULL)
 		return;
 
-	spin_lock(&GlobalMid_Lock);
-	list_for_each_entry(temp, &GlobalOplock_Q, qhead) {
+	spin_lock(&cifs_oplock_lock);
+	list_for_each_entry(temp, &cifs_oplock_list, qhead) {
 		if ((temp->tcon) && (temp->tcon == tcon)) {
 			list_del(&temp->qhead);
 			kmem_cache_free(cifs_oplock_cachep, temp);
 		}
 	}
-	spin_unlock(&GlobalMid_Lock);
+	spin_unlock(&cifs_oplock_lock);
 }
 
 static int
-- 
cgit v1.2.3-70-g09d2


From 6ab409b53dcaf28f83d518a6702f904b7cee3f41 Mon Sep 17 00:00:00 2001
From: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
Date: Mon, 31 Aug 2009 11:07:12 -0400
Subject: cifs: Replace wrtPending with a real reference count

Currently, cifs_close() tries to wait until all I/O is complete and then
frees the file private data.  If I/O does not completely in a reasonable
amount of time it frees the structure anyway, leaving a potential use-
after-free situation.

This patch changes the wrtPending counter to a complete reference count and
lets the last user free the structure.

Signed-off-by: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
Reviewed-by: Jeff Layton <jlayton@redhat.com>
Tested-by: Shirish Pargaonkar <shirishp@us.ibm.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsacl.c  |  4 ++--
 fs/cifs/cifsglob.h | 15 ++++++++++++++-
 fs/cifs/dir.c      |  2 +-
 fs/cifs/file.c     | 43 +++++++++++--------------------------------
 fs/cifs/inode.c    |  6 +++---
 5 files changed, 31 insertions(+), 39 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 6941c22398a6..7dfe0842a6f6 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -607,7 +607,7 @@ static struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb,
 		return get_cifs_acl_by_path(cifs_sb, path, pacllen);
 
 	pntsd = get_cifs_acl_by_fid(cifs_sb, open_file->netfid, pacllen);
-	atomic_dec(&open_file->wrtPending);
+	cifsFileInfo_put(open_file);
 	return pntsd;
 }
 
@@ -665,7 +665,7 @@ static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
 		return set_cifs_acl_by_path(cifs_sb, path, pnntsd, acllen);
 
 	rc = set_cifs_acl_by_fid(cifs_sb, open_file->netfid, pnntsd, acllen);
-	atomic_dec(&open_file->wrtPending);
+	cifsFileInfo_put(open_file);
 	return rc;
 }
 
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index f100399ee05e..6cfc81a32703 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -351,11 +351,24 @@ struct cifsFileInfo {
 	bool closePend:1;	/* file is marked to close */
 	bool invalidHandle:1;	/* file closed via session abend */
 	bool messageMode:1;	/* for pipes: message vs byte mode */
-	atomic_t wrtPending;   /* handle in use - defer close */
+	atomic_t count;		/* reference count */
 	struct mutex fh_mutex; /* prevents reopen race after dead ses*/
 	struct cifs_search_info srch_inf;
 };
 
+/* Take a reference on the file private data */
+static inline void cifsFileInfo_get(struct cifsFileInfo *cifs_file)
+{
+	atomic_inc(&cifs_file->count);
+}
+
+/* Release a reference on the file private data */
+static inline void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
+{
+	if (atomic_dec_and_test(&cifs_file->count))
+		kfree(cifs_file);
+}
+
 /*
  * One of these for each file inode
  */
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 4326ffd90fa9..a6424cfc0121 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -153,7 +153,7 @@ cifs_fill_fileinfo(struct inode *newinode, __u16 fileHandle,
 	mutex_init(&pCifsFile->fh_mutex);
 	mutex_init(&pCifsFile->lock_mutex);
 	INIT_LIST_HEAD(&pCifsFile->llist);
-	atomic_set(&pCifsFile->wrtPending, 0);
+	atomic_set(&pCifsFile->count, 1);
 
 	/* set the following in open now
 			pCifsFile->pfile = file; */
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index c34b7f8a217b..fa7beac8b80e 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -53,11 +53,9 @@ static inline struct cifsFileInfo *cifs_init_private(
 	private_data->pInode = inode;
 	private_data->invalidHandle = false;
 	private_data->closePend = false;
-	/* we have to track num writers to the inode, since writepages
-	does not tell us which handle the write is for so there can
-	be a close (overlapping with write) of the filehandle that
-	cifs_writepages chose to use */
-	atomic_set(&private_data->wrtPending, 0);
+	/* Initialize reference count to one.  The private data is
+	freed on the release of the last reference */
+	atomic_set(&private_data->count, 1);
 
 	return private_data;
 }
@@ -643,7 +641,7 @@ int cifs_close(struct inode *inode, struct file *file)
 			if (!pTcon->need_reconnect) {
 				write_unlock(&GlobalSMBSeslock);
 				timeout = 2;
-				while ((atomic_read(&pSMBFile->wrtPending) != 0)
+				while ((atomic_read(&pSMBFile->count) != 1)
 					&& (timeout <= 2048)) {
 					/* Give write a better chance to get to
 					server ahead of the close.  We do not
@@ -657,8 +655,6 @@ int cifs_close(struct inode *inode, struct file *file)
 					msleep(timeout);
 					timeout *= 4;
 				}
-				if (atomic_read(&pSMBFile->wrtPending))
-					cERROR(1, ("close with pending write"));
 				if (!pTcon->need_reconnect &&
 				    !pSMBFile->invalidHandle)
 					rc = CIFSSMBClose(xid, pTcon,
@@ -681,24 +677,7 @@ int cifs_close(struct inode *inode, struct file *file)
 		list_del(&pSMBFile->flist);
 		list_del(&pSMBFile->tlist);
 		write_unlock(&GlobalSMBSeslock);
-		timeout = 10;
-		/* We waited above to give the SMBWrite a chance to issue
-		   on the wire (so we do not get SMBWrite returning EBADF
-		   if writepages is racing with close.  Note that writepages
-		   does not specify a file handle, so it is possible for a file
-		   to be opened twice, and the application close the "wrong"
-		   file handle - in these cases we delay long enough to allow
-		   the SMBWrite to get on the wire before the SMB Close.
-		   We allow total wait here over 45 seconds, more than
-		   oplock break time, and more than enough to allow any write
-		   to complete on the server, or to time out on the client */
-		while ((atomic_read(&pSMBFile->wrtPending) != 0)
-				&& (timeout <= 50000)) {
-			cERROR(1, ("writes pending, delay free of handle"));
-			msleep(timeout);
-			timeout *= 8;
-		}
-		kfree(file->private_data);
+		cifsFileInfo_put(file->private_data);
 		file->private_data = NULL;
 	} else
 		rc = -EBADF;
@@ -1236,7 +1215,7 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode)
 			if (!open_file->invalidHandle) {
 				/* found a good file */
 				/* lock it so it will not be closed on us */
-				atomic_inc(&open_file->wrtPending);
+				cifsFileInfo_get(open_file);
 				read_unlock(&GlobalSMBSeslock);
 				return open_file;
 			} /* else might as well continue, and look for
@@ -1276,7 +1255,7 @@ refind_writable:
 		if (open_file->pfile &&
 		    ((open_file->pfile->f_flags & O_RDWR) ||
 		     (open_file->pfile->f_flags & O_WRONLY))) {
-			atomic_inc(&open_file->wrtPending);
+			cifsFileInfo_get(open_file);
 
 			if (!open_file->invalidHandle) {
 				/* found a good writable file */
@@ -1293,7 +1272,7 @@ refind_writable:
 				else { /* start over in case this was deleted */
 				       /* since the list could be modified */
 					read_lock(&GlobalSMBSeslock);
-					atomic_dec(&open_file->wrtPending);
+					cifsFileInfo_put(open_file);
 					goto refind_writable;
 				}
 			}
@@ -1309,7 +1288,7 @@ refind_writable:
 			read_lock(&GlobalSMBSeslock);
 			/* can not use this handle, no write
 			   pending on this one after all */
-			atomic_dec(&open_file->wrtPending);
+			cifsFileInfo_put(open_file);
 
 			if (open_file->closePend) /* list could have changed */
 				goto refind_writable;
@@ -1373,7 +1352,7 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
 	if (open_file) {
 		bytes_written = cifs_write(open_file->pfile, write_data,
 					   to-from, &offset);
-		atomic_dec(&open_file->wrtPending);
+		cifsFileInfo_put(open_file);
 		/* Does mm or vfs already set times? */
 		inode->i_atime = inode->i_mtime = current_fs_time(inode->i_sb);
 		if ((bytes_written > 0) && (offset))
@@ -1562,7 +1541,7 @@ retry:
 						   bytes_to_write, offset,
 						   &bytes_written, iov, n_iov,
 						   long_op);
-				atomic_dec(&open_file->wrtPending);
+				cifsFileInfo_put(open_file);
 				cifs_update_eof(cifsi, offset, bytes_written);
 
 				if (rc || bytes_written < bytes_to_write) {
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 82d83839655e..1f09c7619319 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -800,7 +800,7 @@ set_via_filehandle:
 	if (open_file == NULL)
 		CIFSSMBClose(xid, pTcon, netfid);
 	else
-		atomic_dec(&open_file->wrtPending);
+		cifsFileInfo_put(open_file);
 out:
 	return rc;
 }
@@ -1635,7 +1635,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
 		__u32 npid = open_file->pid;
 		rc = CIFSSMBSetFileSize(xid, pTcon, attrs->ia_size, nfid,
 					npid, false);
-		atomic_dec(&open_file->wrtPending);
+		cifsFileInfo_put(open_file);
 		cFYI(1, ("SetFSize for attrs rc = %d", rc));
 		if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
 			unsigned int bytes_written;
@@ -1790,7 +1790,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
 		u16 nfid = open_file->netfid;
 		u32 npid = open_file->pid;
 		rc = CIFSSMBUnixSetFileInfo(xid, pTcon, args, nfid, npid);
-		atomic_dec(&open_file->wrtPending);
+		cifsFileInfo_put(open_file);
 	} else {
 		rc = CIFSSMBUnixSetPathInfo(xid, pTcon, full_path, args,
 				    cifs_sb->local_nls,
-- 
cgit v1.2.3-70-g09d2


From 81e251766e8f8c9d7abb5db784e58c5c45f82797 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Tue, 1 Sep 2009 19:56:55 -0400
Subject: xfs: un-static xfs_inobt_lookup

xfs_inobt_lookup is also used in xfs_itable.c, remove the STATIC modifier
from it's declaration to fix non-debug builds.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Felix Blyakher <felixb@sgi.com>
Signed-off-by: Felix Blyakher <felixb@sgi.com>
---
 fs/xfs/xfs_ialloc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index d12d805f20a5..ab64f3efb43b 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -59,7 +59,7 @@ xfs_ialloc_cluster_alignment(
 /*
  * Lookup a record by ino in the btree given by cur.
  */
-STATIC int				/* error */
+int					/* error */
 xfs_inobt_lookup(
 	struct xfs_btree_cur	*cur,	/* btree cursor */
 	xfs_agino_t		ino,	/* starting inode of chunk */
-- 
cgit v1.2.3-70-g09d2


From e0e817392b9acf2c98d3be80c233dddb1b52003d Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 2 Sep 2009 09:13:40 +0100
Subject: CRED: Add some configurable debugging [try #6]

Add a config option (CONFIG_DEBUG_CREDENTIALS) to turn on some debug checking
for credential management.  The additional code keeps track of the number of
pointers from task_structs to any given cred struct, and checks to see that
this number never exceeds the usage count of the cred struct (which includes
all references, not just those from task_structs).

Furthermore, if SELinux is enabled, the code also checks that the security
pointer in the cred struct is never seen to be invalid.

This attempts to catch the bug whereby inode_has_perm() faults in an nfsd
kernel thread on seeing cred->security be a NULL pointer (it appears that the
credential struct has been previously released):

	http://www.kerneloops.org/oops.php?number=252883

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 fs/nfsd/auth.c           |   4 +
 fs/nfsd/nfssvc.c         |   2 +
 fs/nfsd/vfs.c            |   3 +
 fs/open.c                |   2 +
 include/linux/cred.h     |  65 +++++++++++-
 kernel/cred.c            | 250 +++++++++++++++++++++++++++++++++++++++++++++--
 kernel/exit.c            |   4 +
 kernel/fork.c            |   6 +-
 kernel/kmod.c            |   1 +
 lib/Kconfig.debug        |  15 +++
 security/selinux/hooks.c |   6 +-
 11 files changed, 346 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c
index 5573508f707f..36fcabbf5186 100644
--- a/fs/nfsd/auth.c
+++ b/fs/nfsd/auth.c
@@ -34,6 +34,8 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
 	int flags = nfsexp_flags(rqstp, exp);
 	int ret;
 
+	validate_process_creds();
+
 	/* discard any old override before preparing the new set */
 	revert_creds(get_cred(current->real_cred));
 	new = prepare_creds();
@@ -86,8 +88,10 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
 	else
 		new->cap_effective = cap_raise_nfsd_set(new->cap_effective,
 							new->cap_permitted);
+	validate_process_creds();
 	put_cred(override_creds(new));
 	put_cred(new);
+	validate_process_creds();
 	return 0;
 
 oom:
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 492c79b7800b..24d58adfe5fd 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -496,7 +496,9 @@ nfsd(void *vrqstp)
 		/* Lock the export hash tables for reading. */
 		exp_readlock();
 
+		validate_process_creds();
 		svc_process(rqstp);
+		validate_process_creds();
 
 		/* Unlock export hash tables */
 		exp_readunlock();
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 23341c1063bc..8fa09bfbcba7 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -684,6 +684,8 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
 	__be32		err;
 	int		host_err;
 
+	validate_process_creds();
+
 	/*
 	 * If we get here, then the client has already done an "open",
 	 * and (hopefully) checked permission - so allow OWNER_OVERRIDE
@@ -740,6 +742,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
 out_nfserr:
 	err = nfserrno(host_err);
 out:
+	validate_process_creds();
 	return err;
 }
 
diff --git a/fs/open.c b/fs/open.c
index 40d1fa25f5aa..31191bf513e4 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -959,6 +959,8 @@ struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags,
 	int error;
 	struct file *f;
 
+	validate_creds(cred);
+
 	/*
 	 * We must always pass in a valid mount pointer.   Historically
 	 * callers got away with not passing it, but we must enforce this at
diff --git a/include/linux/cred.h b/include/linux/cred.h
index b3c76e815d66..85439abdbc80 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -114,6 +114,13 @@ struct thread_group_cred {
  */
 struct cred {
 	atomic_t	usage;
+#ifdef CONFIG_DEBUG_CREDENTIALS
+	atomic_t	subscribers;	/* number of processes subscribed */
+	void		*put_addr;
+	unsigned	magic;
+#define CRED_MAGIC	0x43736564
+#define CRED_MAGIC_DEAD	0x44656144
+#endif
 	uid_t		uid;		/* real UID of the task */
 	gid_t		gid;		/* real GID of the task */
 	uid_t		suid;		/* saved UID of the task */
@@ -143,6 +150,7 @@ struct cred {
 };
 
 extern void __put_cred(struct cred *);
+extern void exit_creds(struct task_struct *);
 extern int copy_creds(struct task_struct *, unsigned long);
 extern struct cred *prepare_creds(void);
 extern struct cred *prepare_exec_creds(void);
@@ -158,6 +166,60 @@ extern int set_security_override_from_ctx(struct cred *, const char *);
 extern int set_create_files_as(struct cred *, struct inode *);
 extern void __init cred_init(void);
 
+/*
+ * check for validity of credentials
+ */
+#ifdef CONFIG_DEBUG_CREDENTIALS
+extern void __invalid_creds(const struct cred *, const char *, unsigned);
+extern void __validate_process_creds(struct task_struct *,
+				     const char *, unsigned);
+
+static inline bool creds_are_invalid(const struct cred *cred)
+{
+	if (cred->magic != CRED_MAGIC)
+		return true;
+	if (atomic_read(&cred->usage) < atomic_read(&cred->subscribers))
+		return true;
+#ifdef CONFIG_SECURITY_SELINUX
+	if ((unsigned long) cred->security < PAGE_SIZE)
+		return true;
+	if ((*(u32*)cred->security & 0xffffff00) ==
+	    (POISON_FREE << 24 | POISON_FREE << 16 | POISON_FREE << 8))
+		return true;
+#endif
+	return false;
+}
+
+static inline void __validate_creds(const struct cred *cred,
+				    const char *file, unsigned line)
+{
+	if (unlikely(creds_are_invalid(cred)))
+		__invalid_creds(cred, file, line);
+}
+
+#define validate_creds(cred)				\
+do {							\
+	__validate_creds((cred), __FILE__, __LINE__);	\
+} while(0)
+
+#define validate_process_creds()				\
+do {								\
+	__validate_process_creds(current, __FILE__, __LINE__);	\
+} while(0)
+
+extern void validate_creds_for_do_exit(struct task_struct *);
+#else
+static inline void validate_creds(const struct cred *cred)
+{
+}
+static inline void validate_creds_for_do_exit(struct task_struct *tsk)
+{
+}
+static inline void validate_process_creds(void)
+{
+}
+#endif
+
 /**
  * get_new_cred - Get a reference on a new set of credentials
  * @cred: The new credentials to reference
@@ -187,6 +249,7 @@ static inline struct cred *get_new_cred(struct cred *cred)
 static inline const struct cred *get_cred(const struct cred *cred)
 {
 	struct cred *nonconst_cred = (struct cred *) cred;
+	validate_creds(cred);
 	return get_new_cred(nonconst_cred);
 }
 
@@ -205,7 +268,7 @@ static inline void put_cred(const struct cred *_cred)
 {
 	struct cred *cred = (struct cred *) _cred;
 
-	BUG_ON(atomic_read(&(cred)->usage) <= 0);
+	validate_creds(cred);
 	if (atomic_dec_and_test(&(cred)->usage))
 		__put_cred(cred);
 }
diff --git a/kernel/cred.c b/kernel/cred.c
index 1bb4d7e5d616..24dd2f5104b1 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -18,6 +18,18 @@
 #include <linux/cn_proc.h>
 #include "cred-internals.h"
 
+#if 0
+#define kdebug(FMT, ...) \
+	printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__)
+#else
+static inline __attribute__((format(printf, 1, 2)))
+void no_printk(const char *fmt, ...)
+{
+}
+#define kdebug(FMT, ...) \
+	no_printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__)
+#endif
+
 static struct kmem_cache *cred_jar;
 
 /*
@@ -36,6 +48,10 @@ static struct thread_group_cred init_tgcred = {
  */
 struct cred init_cred = {
 	.usage			= ATOMIC_INIT(4),
+#ifdef CONFIG_DEBUG_CREDENTIALS
+	.subscribers		= ATOMIC_INIT(2),
+	.magic			= CRED_MAGIC,
+#endif
 	.securebits		= SECUREBITS_DEFAULT,
 	.cap_inheritable	= CAP_INIT_INH_SET,
 	.cap_permitted		= CAP_FULL_SET,
@@ -48,6 +64,31 @@ struct cred init_cred = {
 #endif
 };
 
+static inline void set_cred_subscribers(struct cred *cred, int n)
+{
+#ifdef CONFIG_DEBUG_CREDENTIALS
+	atomic_set(&cred->subscribers, n);
+#endif
+}
+
+static inline int read_cred_subscribers(const struct cred *cred)
+{
+#ifdef CONFIG_DEBUG_CREDENTIALS
+	return atomic_read(&cred->subscribers);
+#else
+	return 0;
+#endif
+}
+
+static inline void alter_cred_subscribers(const struct cred *_cred, int n)
+{
+#ifdef CONFIG_DEBUG_CREDENTIALS
+	struct cred *cred = (struct cred *) _cred;
+
+	atomic_add(n, &cred->subscribers);
+#endif
+}
+
 /*
  * Dispose of the shared task group credentials
  */
@@ -85,9 +126,22 @@ static void put_cred_rcu(struct rcu_head *rcu)
 {
 	struct cred *cred = container_of(rcu, struct cred, rcu);
 
+	kdebug("put_cred_rcu(%p)", cred);
+
+#ifdef CONFIG_DEBUG_CREDENTIALS
+	if (cred->magic != CRED_MAGIC_DEAD ||
+	    atomic_read(&cred->usage) != 0 ||
+	    read_cred_subscribers(cred) != 0)
+		panic("CRED: put_cred_rcu() sees %p with"
+		      " mag %x, put %p, usage %d, subscr %d\n",
+		      cred, cred->magic, cred->put_addr,
+		      atomic_read(&cred->usage),
+		      read_cred_subscribers(cred));
+#else
 	if (atomic_read(&cred->usage) != 0)
 		panic("CRED: put_cred_rcu() sees %p with usage %d\n",
 		      cred, atomic_read(&cred->usage));
+#endif
 
 	security_cred_free(cred);
 	key_put(cred->thread_keyring);
@@ -106,12 +160,47 @@ static void put_cred_rcu(struct rcu_head *rcu)
  */
 void __put_cred(struct cred *cred)
 {
+	kdebug("__put_cred(%p{%d,%d})", cred,
+	       atomic_read(&cred->usage),
+	       read_cred_subscribers(cred));
+
 	BUG_ON(atomic_read(&cred->usage) != 0);
+#ifdef CONFIG_DEBUG_CREDENTIALS
+	BUG_ON(read_cred_subscribers(cred) != 0);
+	cred->magic = CRED_MAGIC_DEAD;
+	cred->put_addr = __builtin_return_address(0);
+#endif
+	BUG_ON(cred == current->cred);
+	BUG_ON(cred == current->real_cred);
 
 	call_rcu(&cred->rcu, put_cred_rcu);
 }
 EXPORT_SYMBOL(__put_cred);
 
+/*
+ * Clean up a task's credentials when it exits
+ */
+void exit_creds(struct task_struct *tsk)
+{
+	struct cred *cred;
+
+	kdebug("exit_creds(%u,%p,%p,{%d,%d})", tsk->pid, tsk->real_cred, tsk->cred,
+	       atomic_read(&tsk->cred->usage),
+	       read_cred_subscribers(tsk->cred));
+
+	cred = (struct cred *) tsk->real_cred;
+	tsk->real_cred = NULL;
+	validate_creds(cred);
+	alter_cred_subscribers(cred, -1);
+	put_cred(cred);
+
+	cred = (struct cred *) tsk->cred;
+	tsk->cred = NULL;
+	validate_creds(cred);
+	alter_cred_subscribers(cred, -1);
+	put_cred(cred);
+}
+
 /**
  * prepare_creds - Prepare a new set of credentials for modification
  *
@@ -132,16 +221,19 @@ struct cred *prepare_creds(void)
 	const struct cred *old;
 	struct cred *new;
 
-	BUG_ON(atomic_read(&task->real_cred->usage) < 1);
+	validate_process_creds();
 
 	new = kmem_cache_alloc(cred_jar, GFP_KERNEL);
 	if (!new)
 		return NULL;
 
+	kdebug("prepare_creds() alloc %p", new);
+
 	old = task->cred;
 	memcpy(new, old, sizeof(struct cred));
 
 	atomic_set(&new->usage, 1);
+	set_cred_subscribers(new, 0);
 	get_group_info(new->group_info);
 	get_uid(new->user);
 
@@ -157,6 +249,7 @@ struct cred *prepare_creds(void)
 
 	if (security_prepare_creds(new, old, GFP_KERNEL) < 0)
 		goto error;
+	validate_creds(new);
 	return new;
 
 error:
@@ -229,9 +322,12 @@ struct cred *prepare_usermodehelper_creds(void)
 	if (!new)
 		return NULL;
 
+	kdebug("prepare_usermodehelper_creds() alloc %p", new);
+
 	memcpy(new, &init_cred, sizeof(struct cred));
 
 	atomic_set(&new->usage, 1);
+	set_cred_subscribers(new, 0);
 	get_group_info(new->group_info);
 	get_uid(new->user);
 
@@ -250,6 +346,7 @@ struct cred *prepare_usermodehelper_creds(void)
 #endif
 	if (security_prepare_creds(new, &init_cred, GFP_ATOMIC) < 0)
 		goto error;
+	validate_creds(new);
 
 	BUG_ON(atomic_read(&new->usage) != 1);
 	return new;
@@ -286,6 +383,10 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
 	    ) {
 		p->real_cred = get_cred(p->cred);
 		get_cred(p->cred);
+		alter_cred_subscribers(p->cred, 2);
+		kdebug("share_creds(%p{%d,%d})",
+		       p->cred, atomic_read(&p->cred->usage),
+		       read_cred_subscribers(p->cred));
 		atomic_inc(&p->cred->user->processes);
 		return 0;
 	}
@@ -331,6 +432,8 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
 
 	atomic_inc(&new->user->processes);
 	p->cred = p->real_cred = get_cred(new);
+	alter_cred_subscribers(new, 2);
+	validate_creds(new);
 	return 0;
 
 error_put:
@@ -355,13 +458,20 @@ error_put:
 int commit_creds(struct cred *new)
 {
 	struct task_struct *task = current;
-	const struct cred *old;
+	const struct cred *old = task->real_cred;
 
-	BUG_ON(task->cred != task->real_cred);
-	BUG_ON(atomic_read(&task->real_cred->usage) < 2);
+	kdebug("commit_creds(%p{%d,%d})", new,
+	       atomic_read(&new->usage),
+	       read_cred_subscribers(new));
+
+	BUG_ON(task->cred != old);
+#ifdef CONFIG_DEBUG_CREDENTIALS
+	BUG_ON(read_cred_subscribers(old) < 2);
+	validate_creds(old);
+	validate_creds(new);
+#endif
 	BUG_ON(atomic_read(&new->usage) < 1);
 
-	old = task->real_cred;
 	security_commit_creds(new, old);
 
 	get_cred(new); /* we will require a ref for the subj creds too */
@@ -390,12 +500,14 @@ int commit_creds(struct cred *new)
 	 *   cheaply with the new uid cache, so if it matters
 	 *   we should be checking for it.  -DaveM
 	 */
+	alter_cred_subscribers(new, 2);
 	if (new->user != old->user)
 		atomic_inc(&new->user->processes);
 	rcu_assign_pointer(task->real_cred, new);
 	rcu_assign_pointer(task->cred, new);
 	if (new->user != old->user)
 		atomic_dec(&old->user->processes);
+	alter_cred_subscribers(old, -2);
 
 	sched_switch_user(task);
 
@@ -428,6 +540,13 @@ EXPORT_SYMBOL(commit_creds);
  */
 void abort_creds(struct cred *new)
 {
+	kdebug("abort_creds(%p{%d,%d})", new,
+	       atomic_read(&new->usage),
+	       read_cred_subscribers(new));
+
+#ifdef CONFIG_DEBUG_CREDENTIALS
+	BUG_ON(read_cred_subscribers(new) != 0);
+#endif
 	BUG_ON(atomic_read(&new->usage) < 1);
 	put_cred(new);
 }
@@ -444,7 +563,20 @@ const struct cred *override_creds(const struct cred *new)
 {
 	const struct cred *old = current->cred;
 
-	rcu_assign_pointer(current->cred, get_cred(new));
+	kdebug("override_creds(%p{%d,%d})", new,
+	       atomic_read(&new->usage),
+	       read_cred_subscribers(new));
+
+	validate_creds(old);
+	validate_creds(new);
+	get_cred(new);
+	alter_cred_subscribers(new, 1);
+	rcu_assign_pointer(current->cred, new);
+	alter_cred_subscribers(old, -1);
+
+	kdebug("override_creds() = %p{%d,%d}", old,
+	       atomic_read(&old->usage),
+	       read_cred_subscribers(old));
 	return old;
 }
 EXPORT_SYMBOL(override_creds);
@@ -460,7 +592,15 @@ void revert_creds(const struct cred *old)
 {
 	const struct cred *override = current->cred;
 
+	kdebug("revert_creds(%p{%d,%d})", old,
+	       atomic_read(&old->usage),
+	       read_cred_subscribers(old));
+
+	validate_creds(old);
+	validate_creds(override);
+	alter_cred_subscribers(old, 1);
 	rcu_assign_pointer(current->cred, old);
+	alter_cred_subscribers(override, -1);
 	put_cred(override);
 }
 EXPORT_SYMBOL(revert_creds);
@@ -502,11 +642,15 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
 	if (!new)
 		return NULL;
 
+	kdebug("prepare_kernel_cred() alloc %p", new);
+
 	if (daemon)
 		old = get_task_cred(daemon);
 	else
 		old = get_cred(&init_cred);
 
+	validate_creds(old);
+
 	*new = *old;
 	get_uid(new->user);
 	get_group_info(new->group_info);
@@ -526,7 +670,9 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
 		goto error;
 
 	atomic_set(&new->usage, 1);
+	set_cred_subscribers(new, 0);
 	put_cred(old);
+	validate_creds(new);
 	return new;
 
 error:
@@ -589,3 +735,95 @@ int set_create_files_as(struct cred *new, struct inode *inode)
 	return security_kernel_create_files_as(new, inode);
 }
 EXPORT_SYMBOL(set_create_files_as);
+
+#ifdef CONFIG_DEBUG_CREDENTIALS
+
+/*
+ * dump invalid credentials
+ */
+static void dump_invalid_creds(const struct cred *cred, const char *label,
+			       const struct task_struct *tsk)
+{
+	printk(KERN_ERR "CRED: %s credentials: %p %s%s%s\n",
+	       label, cred,
+	       cred == &init_cred ? "[init]" : "",
+	       cred == tsk->real_cred ? "[real]" : "",
+	       cred == tsk->cred ? "[eff]" : "");
+	printk(KERN_ERR "CRED: ->magic=%x, put_addr=%p\n",
+	       cred->magic, cred->put_addr);
+	printk(KERN_ERR "CRED: ->usage=%d, subscr=%d\n",
+	       atomic_read(&cred->usage),
+	       read_cred_subscribers(cred));
+	printk(KERN_ERR "CRED: ->*uid = { %d,%d,%d,%d }\n",
+	       cred->uid, cred->euid, cred->suid, cred->fsuid);
+	printk(KERN_ERR "CRED: ->*gid = { %d,%d,%d,%d }\n",
+	       cred->gid, cred->egid, cred->sgid, cred->fsgid);
+#ifdef CONFIG_SECURITY
+	printk(KERN_ERR "CRED: ->security is %p\n", cred->security);
+	if ((unsigned long) cred->security >= PAGE_SIZE &&
+	    (((unsigned long) cred->security & 0xffffff00) !=
+	     (POISON_FREE << 24 | POISON_FREE << 16 | POISON_FREE << 8)))
+		printk(KERN_ERR "CRED: ->security {%x, %x}\n",
+		       ((u32*)cred->security)[0],
+		       ((u32*)cred->security)[1]);
+#endif
+}
+
+/*
+ * report use of invalid credentials
+ */
+void __invalid_creds(const struct cred *cred, const char *file, unsigned line)
+{
+	printk(KERN_ERR "CRED: Invalid credentials\n");
+	printk(KERN_ERR "CRED: At %s:%u\n", file, line);
+	dump_invalid_creds(cred, "Specified", current);
+	BUG();
+}
+EXPORT_SYMBOL(__invalid_creds);
+
+/*
+ * check the credentials on a process
+ */
+void __validate_process_creds(struct task_struct *tsk,
+			      const char *file, unsigned line)
+{
+	if (tsk->cred == tsk->real_cred) {
+		if (unlikely(read_cred_subscribers(tsk->cred) < 2 ||
+			     creds_are_invalid(tsk->cred)))
+			goto invalid_creds;
+	} else {
+		if (unlikely(read_cred_subscribers(tsk->real_cred) < 1 ||
+			     read_cred_subscribers(tsk->cred) < 1 ||
+			     creds_are_invalid(tsk->real_cred) ||
+			     creds_are_invalid(tsk->cred)))
+			goto invalid_creds;
+	}
+	return;
+
+invalid_creds:
+	printk(KERN_ERR "CRED: Invalid process credentials\n");
+	printk(KERN_ERR "CRED: At %s:%u\n", file, line);
+
+	dump_invalid_creds(tsk->real_cred, "Real", tsk);
+	if (tsk->cred != tsk->real_cred)
+		dump_invalid_creds(tsk->cred, "Effective", tsk);
+	else
+		printk(KERN_ERR "CRED: Effective creds == Real creds\n");
+	BUG();
+}
+EXPORT_SYMBOL(__validate_process_creds);
+
+/*
+ * check creds for do_exit()
+ */
+void validate_creds_for_do_exit(struct task_struct *tsk)
+{
+	kdebug("validate_creds_for_do_exit(%p,%p{%d,%d})",
+	       tsk->real_cred, tsk->cred,
+	       atomic_read(&tsk->cred->usage),
+	       read_cred_subscribers(tsk->cred));
+
+	__validate_process_creds(tsk, __FILE__, __LINE__);
+}
+
+#endif /* CONFIG_DEBUG_CREDENTIALS */
diff --git a/kernel/exit.c b/kernel/exit.c
index 869dc221733e..c98ff7a8025f 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -901,6 +901,8 @@ NORET_TYPE void do_exit(long code)
 
 	tracehook_report_exit(&code);
 
+	validate_creds_for_do_exit(tsk);
+
 	/*
 	 * We're taking recursive faults here in do_exit. Safest is to just
 	 * leave this task alone and wait for reboot.
@@ -1009,6 +1011,8 @@ NORET_TYPE void do_exit(long code)
 	if (tsk->splice_pipe)
 		__free_pipe_info(tsk->splice_pipe);
 
+	validate_creds_for_do_exit(tsk);
+
 	preempt_disable();
 	/* causes final put_task_struct in finish_task_switch(). */
 	tsk->state = TASK_DEAD;
diff --git a/kernel/fork.c b/kernel/fork.c
index 144326b7af50..043b5d88049b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -152,8 +152,7 @@ void __put_task_struct(struct task_struct *tsk)
 	WARN_ON(atomic_read(&tsk->usage));
 	WARN_ON(tsk == current);
 
-	put_cred(tsk->real_cred);
-	put_cred(tsk->cred);
+	exit_creds(tsk);
 	delayacct_tsk_free(tsk);
 
 	if (!profile_handoff_task(tsk))
@@ -1307,8 +1306,7 @@ bad_fork_cleanup_put_domain:
 	module_put(task_thread_info(p)->exec_domain->module);
 bad_fork_cleanup_count:
 	atomic_dec(&p->cred->user->processes);
-	put_cred(p->real_cred);
-	put_cred(p->cred);
+	exit_creds(p);
 bad_fork_free:
 	free_task(p);
 fork_out:
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 5a7ae57f983f..4e8cae2e9148 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -466,6 +466,7 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info,
 	int retval = 0;
 
 	BUG_ON(atomic_read(&sub_info->cred->usage) != 1);
+	validate_creds(sub_info->cred);
 
 	helper_lock();
 	if (sub_info->path[0] == '\0')
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 12327b2bb785..fbb87cf138c5 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -653,6 +653,21 @@ config DEBUG_NOTIFIERS
 	  This is a relatively cheap check but if you care about maximum
 	  performance, say N.
 
+config DEBUG_CREDENTIALS
+	bool "Debug credential management"
+	depends on DEBUG_KERNEL
+	help
+	  Enable this to turn on some debug checking for credential
+	  management.  The additional code keeps track of the number of
+	  pointers from task_structs to any given cred struct, and checks to
+	  see that this number never exceeds the usage count of the cred
+	  struct.
+
+	  Furthermore, if SELinux is enabled, this also checks that the
+	  security pointer in the cred struct is never seen to be invalid.
+
+	  If unsure, say N.
+
 #
 # Select this config option from the architecture Kconfig, if it
 # it is preferred to always offer frame pointers as a config
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 27b4c5527358..c3bb31ecc5aa 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -1531,6 +1531,8 @@ static int inode_has_perm(const struct cred *cred,
 	struct common_audit_data ad;
 	u32 sid;
 
+	validate_creds(cred);
+
 	if (unlikely(IS_PRIVATE(inode)))
 		return 0;
 
@@ -3236,7 +3238,9 @@ static int selinux_task_create(unsigned long clone_flags)
 static void selinux_cred_free(struct cred *cred)
 {
 	struct task_security_struct *tsec = cred->security;
-	cred->security = NULL;
+
+	BUG_ON((unsigned long) cred->security < PAGE_SIZE);
+	cred->security = (void *) 0x7UL;
 	kfree(tsec);
 }
 
-- 
cgit v1.2.3-70-g09d2


From 988abe4075e5748d9f7c79d9dfffa0cf5291611b Mon Sep 17 00:00:00 2001
From: Alex Elder <aelder@sgi.com>
Date: Wed, 2 Sep 2009 17:02:24 -0500
Subject: xfs: xfs_showargs() reports group *and* project quotas enabled

If you enable group or project quotas on an XFS file system, then the
mount table presented through /proc/self/mounts erroneously shows
that both options are in effect for the file system.  The root of
the problem is some bad logic in the xfs_showargs() function, which
is used to format the file system type-specific options in effect
for a file system.

The problem originated in this GIT commit:
    Move platform specific mount option parse out of core XFS code
    Date: 11/22/07
    Author: Dave Chinner
    SHA1 ID: a67d7c5f5d25d0b13a4dfb182697135b014fa478

For XFS quotas, project and group quota management are mutually
exclusive--only one can be in effect at a time.  There are two
parts to managing quotas:  aggregating usage information; and
enforcing limits.  It is possible to have a quota in effect
(aggregating usage) but not enforced.

These features are recorded on an XFS mount point using these flags:
    XFS_PQUOTA_ACCT - Project quotas are aggregated
    XFS_GQUOTA_ACCT - Group quotas are aggregated
    XFS_OQUOTA_ENFD - Project/group quotas are enforced

The code in error is in fs/xfs/linux-2.6/xfs_super.c:

        if (mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD))
                seq_puts(m, "," MNTOPT_PRJQUOTA);
        else if (mp->m_qflags & XFS_PQUOTA_ACCT)
                seq_puts(m, "," MNTOPT_PQUOTANOENF);

        if (mp->m_qflags & (XFS_GQUOTA_ACCT|XFS_OQUOTA_ENFD))
                seq_puts(m, "," MNTOPT_GRPQUOTA);
        else if (mp->m_qflags & XFS_GQUOTA_ACCT)
                seq_puts(m, "," MNTOPT_GQUOTANOENF);

The problem is that XFS_OQUOTA_ENFD will be set in mp->m_qflags
if either group or project quotas are enforced, and as a result
both MNTOPT_PRJQUOTA and MNTOPT_GRPQUOTA will be shown as mount
options.

Signed-off-by: Alex Elder <aelder@sgi.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Felix Blyakher <felixb@sgi.com>
---
 fs/xfs/linux-2.6/xfs_super.c | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index c709ed61a53e..5d7c60ac77b4 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -579,15 +579,19 @@ xfs_showargs(
 	else if (mp->m_qflags & XFS_UQUOTA_ACCT)
 		seq_puts(m, "," MNTOPT_UQUOTANOENF);
 
-	if (mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD))
-		seq_puts(m, "," MNTOPT_PRJQUOTA);
-	else if (mp->m_qflags & XFS_PQUOTA_ACCT)
-		seq_puts(m, "," MNTOPT_PQUOTANOENF);
-
-	if (mp->m_qflags & (XFS_GQUOTA_ACCT|XFS_OQUOTA_ENFD))
-		seq_puts(m, "," MNTOPT_GRPQUOTA);
-	else if (mp->m_qflags & XFS_GQUOTA_ACCT)
-		seq_puts(m, "," MNTOPT_GQUOTANOENF);
+	/* Either project or group quotas can be active, not both */
+
+	if (mp->m_qflags & XFS_PQUOTA_ACCT) {
+		if (mp->m_qflags & XFS_OQUOTA_ENFD)
+			seq_puts(m, "," MNTOPT_PRJQUOTA);
+		else
+			seq_puts(m, "," MNTOPT_PQUOTANOENF);
+	} else if (mp->m_qflags & XFS_GQUOTA_ACCT) {
+		if (mp->m_qflags & XFS_OQUOTA_ENFD)
+			seq_puts(m, "," MNTOPT_GRPQUOTA);
+		else
+			seq_puts(m, "," MNTOPT_GQUOTANOENF);
+	}
 
 	if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT))
 		seq_puts(m, "," MNTOPT_NOQUOTA);
-- 
cgit v1.2.3-70-g09d2


From 6c1488fd581a447ec87c4b59f0d33f95f0aa441b Mon Sep 17 00:00:00 2001
From: Mimi Zohar <zohar@linux.vnet.ibm.com>
Date: Wed, 2 Sep 2009 11:40:32 -0400
Subject: IMA: open new file for read

When creating a new file, ima_path_check() assumed the new file
was being opened for write. Call ima_path_check() with the
appropriate acc_mode so that the read/write counters are
incremented correctly.

Signed-off-by: Mimi Zohar <zohar@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 fs/namei.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index f3c5b278895a..ee01308a01d1 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1533,9 +1533,11 @@ int may_open(struct path *path, int acc_mode, int flag)
 	if (error)
 		return error;
 
-	error = ima_path_check(path,
-			       acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC),
+	error = ima_path_check(path, acc_mode ?
+			       acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC) :
+			       ACC_MODE(flag) & (MAY_READ | MAY_WRITE),
 			       IMA_COUNT_UPDATE);
+
 	if (error)
 		return error;
 	/*
-- 
cgit v1.2.3-70-g09d2


From bc8cec0dff072f1a45ce7f6b2c5234bb3411ac51 Mon Sep 17 00:00:00 2001
From: Massimo Cirillo <maxcir@gmail.com>
Date: Thu, 27 Aug 2009 10:44:09 +0200
Subject: JFFS2: add missing verify buffer allocation/deallocation

The function jffs2_nor_wbuf_flash_setup() doesn't allocate the verify buffer
if CONFIG_JFFS2_FS_WBUF_VERIFY is defined, so causing a kernel panic when
that macro is enabled and the verify function is called. Similarly the
jffs2_nor_wbuf_flash_cleanup() must free the buffer if
CONFIG_JFFS2_FS_WBUF_VERIFY is enabled.
The following patch fixes the problem.
The following patch applies to 2.6.30 kernel.

Signed-off-by: Massimo Cirillo <maxcir@gmail.com>
Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
Cc: stable@kernel.org
---
 fs/jffs2/wbuf.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'fs')

diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index d9a721e6db70..5ef7bac265e5 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -1268,10 +1268,20 @@ int jffs2_nor_wbuf_flash_setup(struct jffs2_sb_info *c) {
 	if (!c->wbuf)
 		return -ENOMEM;
 
+#ifdef CONFIG_JFFS2_FS_WBUF_VERIFY
+	c->wbuf_verify = kmalloc(c->wbuf_pagesize, GFP_KERNEL);
+	if (!c->wbuf_verify) {
+		kfree(c->wbuf);
+		return -ENOMEM;
+	}
+#endif
 	return 0;
 }
 
 void jffs2_nor_wbuf_flash_cleanup(struct jffs2_sb_info *c) {
+#ifdef CONFIG_JFFS2_FS_WBUF_VERIFY
+	kfree(c->wbuf_verify);
+#endif
 	kfree(c->wbuf);
 }
 
-- 
cgit v1.2.3-70-g09d2


From 9162ab2000e08be076883b5a295a771223264ce8 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Thu, 3 Sep 2009 12:07:17 -0400
Subject: cifs: consolidate reconnect logic in smb_init routines

There's a large cut and paste chunk of code in smb_init and
small_smb_init to handle reconnects. Break it out into a separate
function, clean it up and have both routines call it.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifssmb.c | 312 +++++++++++++++++++++---------------------------------
 1 file changed, 123 insertions(+), 189 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 5f0b80d39923..301e307e1279 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -100,110 +100,138 @@ static void mark_open_files_invalid(struct cifsTconInfo *pTcon)
 	   to this tcon */
 }
 
-/* Allocate and return pointer to an SMB request buffer, and set basic
-   SMB information in the SMB header.  If the return code is zero, this
-   function must have filled in request_buf pointer */
+/* reconnect the socket, tcon, and smb session if needed */
 static int
-small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
-		void **request_buf)
+cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command)
 {
 	int rc = 0;
+	struct cifsSesInfo *ses;
+	struct TCP_Server_Info *server;
+	struct nls_table *nls_codepage;
 
-	/* SMBs NegProt, SessSetup, uLogoff do not have tcon yet so
-	   check for tcp and smb session status done differently
-	   for those three - in the calling routine */
-	if (tcon) {
-		if (tcon->tidStatus == CifsExiting) {
-			/* only tree disconnect, open, and write,
-			(and ulogoff which does not have tcon)
-			are allowed as we start force umount */
-			if ((smb_command != SMB_COM_WRITE_ANDX) &&
-			   (smb_command != SMB_COM_OPEN_ANDX) &&
-			   (smb_command != SMB_COM_TREE_DISCONNECT)) {
-				cFYI(1, ("can not send cmd %d while umounting",
-					smb_command));
-				return -ENODEV;
-			}
+	/*
+	 * SMBs NegProt, SessSetup, uLogoff do not have tcon yet so check for
+	 * tcp and smb session status done differently for those three - in the
+	 * calling routine
+	 */
+	if (!tcon)
+		return 0;
+
+	ses = tcon->ses;
+	server = ses->server;
+
+	/*
+	 * only tree disconnect, open, and write, (and ulogoff which does not
+	 * have tcon) are allowed as we start force umount
+	 */
+	if (tcon->tidStatus == CifsExiting) {
+		if (smb_command != SMB_COM_WRITE_ANDX &&
+		    smb_command != SMB_COM_OPEN_ANDX &&
+		    smb_command != SMB_COM_TREE_DISCONNECT) {
+			cFYI(1, ("can not send cmd %d while umounting",
+				smb_command));
+			return -ENODEV;
 		}
-		if ((tcon->ses) && (tcon->ses->status != CifsExiting) &&
-				  (tcon->ses->server)) {
-			struct nls_table *nls_codepage;
-				/* Give Demultiplex thread up to 10 seconds to
-				   reconnect, should be greater than cifs socket
-				   timeout which is 7 seconds */
-			while (tcon->ses->server->tcpStatus ==
-							 CifsNeedReconnect) {
-				wait_event_interruptible_timeout(tcon->ses->server->response_q,
-					(tcon->ses->server->tcpStatus ==
-							CifsGood), 10 * HZ);
-				if (tcon->ses->server->tcpStatus ==
-							CifsNeedReconnect) {
-					/* on "soft" mounts we wait once */
-					if (!tcon->retry ||
-					   (tcon->ses->status == CifsExiting)) {
-						cFYI(1, ("gave up waiting on "
-						      "reconnect in smb_init"));
-						return -EHOSTDOWN;
-					} /* else "hard" mount - keep retrying
-					     until process is killed or server
-					     comes back on-line */
-				} else /* TCP session is reestablished now */
-					break;
-			}
+	}
 
-			nls_codepage = load_nls_default();
-		/* need to prevent multiple threads trying to
-		simultaneously reconnect the same SMB session */
-			down(&tcon->ses->sesSem);
-			if (tcon->ses->need_reconnect)
-				rc = cifs_setup_session(0, tcon->ses,
-							nls_codepage);
-			if (!rc && (tcon->need_reconnect)) {
-				mark_open_files_invalid(tcon);
-				rc = CIFSTCon(0, tcon->ses, tcon->treeName,
-					      tcon, nls_codepage);
-				up(&tcon->ses->sesSem);
-				/* BB FIXME add code to check if wsize needs
-				   update due to negotiated smb buffer size
-				   shrinking */
-				if (rc == 0) {
-					atomic_inc(&tconInfoReconnectCount);
-					/* tell server Unix caps we support */
-					if (tcon->ses->capabilities & CAP_UNIX)
-						reset_cifs_unix_caps(
-						0 /* no xid */,
-						tcon,
-						NULL /* we do not know sb */,
-						NULL /* no vol info */);
-				}
+	if (ses->status == CifsExiting)
+		return -EIO;
 
-				cFYI(1, ("reconnect tcon rc = %d", rc));
-				/* Removed call to reopen open files here.
-				   It is safer (and faster) to reopen files
-				   one at a time as needed in read and write */
-
-				/* Check if handle based operation so we
-				   know whether we can continue or not without
-				   returning to caller to reset file handle */
-				switch (smb_command) {
-					case SMB_COM_READ_ANDX:
-					case SMB_COM_WRITE_ANDX:
-					case SMB_COM_CLOSE:
-					case SMB_COM_FIND_CLOSE2:
-					case SMB_COM_LOCKING_ANDX: {
-						unload_nls(nls_codepage);
-						return -EAGAIN;
-					}
-				}
-			} else {
-				up(&tcon->ses->sesSem);
-			}
-			unload_nls(nls_codepage);
+	/*
+	 * Give demultiplex thread up to 10 seconds to reconnect, should be
+	 * greater than cifs socket timeout which is 7 seconds
+	 */
+	while (server->tcpStatus == CifsNeedReconnect) {
+		wait_event_interruptible_timeout(server->response_q,
+			(server->tcpStatus == CifsGood), 10 * HZ);
 
-		} else {
-			return -EIO;
+		/* is TCP session is reestablished now ?*/
+		if (server->tcpStatus != CifsNeedReconnect)
+			break;
+
+		/*
+		 * on "soft" mounts we wait once. Hard mounts keep
+		 * retrying until process is killed or server comes
+		 * back on-line
+		 */
+		if (!tcon->retry || ses->status == CifsExiting) {
+			cFYI(1, ("gave up waiting on reconnect in smb_init"));
+			return -EHOSTDOWN;
 		}
 	}
+
+	if (!ses->need_reconnect && !tcon->need_reconnect)
+		return 0;
+
+	nls_codepage = load_nls_default();
+
+	/*
+	 * need to prevent multiple threads trying to simultaneously
+	 * reconnect the same SMB session
+	 */
+	down(&ses->sesSem);
+	if (ses->need_reconnect)
+		rc = cifs_setup_session(0, ses, nls_codepage);
+
+	/* do we need to reconnect tcon? */
+	if (rc || !tcon->need_reconnect) {
+		up(&ses->sesSem);
+		goto out;
+	}
+
+	mark_open_files_invalid(tcon);
+	rc = CIFSTCon(0, ses, tcon->treeName, tcon, nls_codepage);
+	up(&ses->sesSem);
+	cFYI(1, ("reconnect tcon rc = %d", rc));
+
+	if (rc)
+		goto out;
+
+	/*
+	 * FIXME: check if wsize needs updated due to negotiated smb buffer
+	 * 	  size shrinking
+	 */
+	atomic_inc(&tconInfoReconnectCount);
+
+	/* tell server Unix caps we support */
+	if (ses->capabilities & CAP_UNIX)
+		reset_cifs_unix_caps(0, tcon, NULL, NULL);
+
+	/*
+	 * Removed call to reopen open files here. It is safer (and faster) to
+	 * reopen files one at a time as needed in read and write.
+	 *
+	 * FIXME: what about file locks? don't we need to reclaim them ASAP?
+	 */
+
+out:
+	/*
+	 * Check if handle based operation so we know whether we can continue
+	 * or not without returning to caller to reset file handle
+	 */
+	switch (smb_command) {
+	case SMB_COM_READ_ANDX:
+	case SMB_COM_WRITE_ANDX:
+	case SMB_COM_CLOSE:
+	case SMB_COM_FIND_CLOSE2:
+	case SMB_COM_LOCKING_ANDX:
+		rc = -EAGAIN;
+	}
+
+	unload_nls(nls_codepage);
+	return rc;
+}
+
+/* Allocate and return pointer to an SMB request buffer, and set basic
+   SMB information in the SMB header.  If the return code is zero, this
+   function must have filled in request_buf pointer */
+static int
+small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
+		void **request_buf)
+{
+	int rc = 0;
+
+	rc = cifs_reconnect_tcon(tcon, smb_command);
 	if (rc)
 		return rc;
 
@@ -256,101 +284,7 @@ smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
 {
 	int rc = 0;
 
-	/* SMBs NegProt, SessSetup, uLogoff do not have tcon yet so
-	   check for tcp and smb session status done differently
-	   for those three - in the calling routine */
-	if (tcon) {
-		if (tcon->tidStatus == CifsExiting) {
-			/* only tree disconnect, open, and write,
-			  (and ulogoff which does not have tcon)
-			  are allowed as we start force umount */
-			if ((smb_command != SMB_COM_WRITE_ANDX) &&
-			   (smb_command != SMB_COM_OPEN_ANDX) &&
-			   (smb_command != SMB_COM_TREE_DISCONNECT)) {
-				cFYI(1, ("can not send cmd %d while umounting",
-					smb_command));
-				return -ENODEV;
-			}
-		}
-
-		if ((tcon->ses) && (tcon->ses->status != CifsExiting) &&
-				  (tcon->ses->server)) {
-			struct nls_table *nls_codepage;
-				/* Give Demultiplex thread up to 10 seconds to
-				   reconnect, should be greater than cifs socket
-				   timeout which is 7 seconds */
-			while (tcon->ses->server->tcpStatus ==
-							CifsNeedReconnect) {
-				wait_event_interruptible_timeout(tcon->ses->server->response_q,
-					(tcon->ses->server->tcpStatus ==
-							CifsGood), 10 * HZ);
-				if (tcon->ses->server->tcpStatus ==
-						CifsNeedReconnect) {
-					/* on "soft" mounts we wait once */
-					if (!tcon->retry ||
-					   (tcon->ses->status == CifsExiting)) {
-						cFYI(1, ("gave up waiting on "
-						      "reconnect in smb_init"));
-						return -EHOSTDOWN;
-					} /* else "hard" mount - keep retrying
-					     until process is killed or server
-					     comes on-line */
-				} else /* TCP session is reestablished now */
-					break;
-			}
-			nls_codepage = load_nls_default();
-		/* need to prevent multiple threads trying to
-		simultaneously reconnect the same SMB session */
-			down(&tcon->ses->sesSem);
-			if (tcon->ses->need_reconnect)
-				rc = cifs_setup_session(0, tcon->ses,
-							nls_codepage);
-			if (!rc && (tcon->need_reconnect)) {
-				mark_open_files_invalid(tcon);
-				rc = CIFSTCon(0, tcon->ses, tcon->treeName,
-					      tcon, nls_codepage);
-				up(&tcon->ses->sesSem);
-				/* BB FIXME add code to check if wsize needs
-				update due to negotiated smb buffer size
-				shrinking */
-				if (rc == 0) {
-					atomic_inc(&tconInfoReconnectCount);
-					/* tell server Unix caps we support */
-					if (tcon->ses->capabilities & CAP_UNIX)
-						reset_cifs_unix_caps(
-						0 /* no xid */,
-						tcon,
-						NULL /* do not know sb */,
-						NULL /* no vol info */);
-				}
-
-				cFYI(1, ("reconnect tcon rc = %d", rc));
-				/* Removed call to reopen open files here.
-				   It is safer (and faster) to reopen files
-				   one at a time as needed in read and write */
-
-				/* Check if handle based operation so we
-				   know whether we can continue or not without
-				   returning to caller to reset file handle */
-				switch (smb_command) {
-					case SMB_COM_READ_ANDX:
-					case SMB_COM_WRITE_ANDX:
-					case SMB_COM_CLOSE:
-					case SMB_COM_FIND_CLOSE2:
-					case SMB_COM_LOCKING_ANDX: {
-						unload_nls(nls_codepage);
-						return -EAGAIN;
-					}
-				}
-			} else {
-				up(&tcon->ses->sesSem);
-			}
-			unload_nls(nls_codepage);
-
-		} else {
-			return -EIO;
-		}
-	}
+	rc = cifs_reconnect_tcon(tcon, smb_command);
 	if (rc)
 		return rc;
 
-- 
cgit v1.2.3-70-g09d2


From 8379e7c46cc48f51197dd663fc6676f47f2a1e71 Mon Sep 17 00:00:00 2001
From: Sunil Mushran <sunil.mushran@oracle.com>
Date: Fri, 4 Sep 2009 11:12:01 -0700
Subject: ocfs2: ocfs2_write_begin_nolock() should handle len=0

Bug introduced by mainline commit e7432675f8ca868a4af365759a8d4c3779a3d922
The bug causes ocfs2_write_begin_nolock() to oops when len=0.

Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
Cc: stable@kernel.org
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/aops.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index b401654011a2..8a1e61545f41 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1747,8 +1747,8 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
 	 * we know zeros will only be needed in the first and/or last cluster.
 	 */
 	if (clusters_to_alloc || extents_to_split ||
-	    wc->w_desc[0].c_needs_zero ||
-	    wc->w_desc[wc->w_clen - 1].c_needs_zero)
+	    (wc->w_clen && (wc->w_desc[0].c_needs_zero ||
+			    wc->w_desc[wc->w_clen - 1].c_needs_zero)))
 		cluster_of_pages = 1;
 	else
 		cluster_of_pages = 0;
-- 
cgit v1.2.3-70-g09d2


From 7f1346a9de6a689b03f2c1c3a387e49ec64da267 Mon Sep 17 00:00:00 2001
From: Tobias Klauser <tklauser@distanz.ch>
Date: Sat, 5 Sep 2009 09:28:54 -0400
Subject: ext4: Declare seq_operations and file_operations structures as const

Signed-off-by: Tobias Klauser <tklauser@distanz.ch>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/mballoc.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 3086b3c65adc..5ef6daf0cdc6 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2202,7 +2202,7 @@ static void ext4_mb_seq_history_stop(struct seq_file *seq, void *v)
 {
 }
 
-static struct seq_operations ext4_mb_seq_history_ops = {
+static const struct seq_operations ext4_mb_seq_history_ops = {
 	.start  = ext4_mb_seq_history_start,
 	.next   = ext4_mb_seq_history_next,
 	.stop   = ext4_mb_seq_history_stop,
@@ -2284,7 +2284,7 @@ static ssize_t ext4_mb_seq_history_write(struct file *file,
 	return count;
 }
 
-static struct file_operations ext4_mb_seq_history_fops = {
+static const struct file_operations ext4_mb_seq_history_fops = {
 	.owner		= THIS_MODULE,
 	.open		= ext4_mb_seq_history_open,
 	.read		= seq_read,
@@ -2363,7 +2363,7 @@ static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v)
 {
 }
 
-static struct seq_operations ext4_mb_seq_groups_ops = {
+static const struct seq_operations ext4_mb_seq_groups_ops = {
 	.start  = ext4_mb_seq_groups_start,
 	.next   = ext4_mb_seq_groups_next,
 	.stop   = ext4_mb_seq_groups_stop,
@@ -2384,7 +2384,7 @@ static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file)
 
 }
 
-static struct file_operations ext4_mb_seq_groups_fops = {
+static const struct file_operations ext4_mb_seq_groups_fops = {
 	.owner		= THIS_MODULE,
 	.open		= ext4_mb_seq_groups_open,
 	.read		= seq_read,
-- 
cgit v1.2.3-70-g09d2


From d0646f7b636d067d715fab52a2ba9c6f0f46b0d7 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sat, 5 Sep 2009 12:50:43 -0400
Subject: ext4: Remove journal_checksum mount option and enable it by default

There's no real cost for the journal checksum feature, and we should
make sure it is enabled all the time.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 Documentation/filesystems/ext4.txt |  8 +-------
 fs/ext4/ext4.h                     |  1 -
 fs/ext4/super.c                    | 20 ++++++--------------
 3 files changed, 7 insertions(+), 22 deletions(-)

(limited to 'fs')

diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
index 7be02ac5fa36..3e329dbac785 100644
--- a/Documentation/filesystems/ext4.txt
+++ b/Documentation/filesystems/ext4.txt
@@ -134,15 +134,9 @@ ro                   	Mount filesystem read only. Note that ext4 will
                      	mount options "ro,noload" can be used to prevent
 		     	writes to the filesystem.
 
-journal_checksum	Enable checksumming of the journal transactions.
-			This will allow the recovery code in e2fsck and the
-			kernel to detect corruption in the kernel.  It is a
-			compatible change and will be ignored by older kernels.
-
 journal_async_commit	Commit block can be written to disk without waiting
 			for descriptor blocks. If enabled older kernels cannot
-			mount the device. This will enable 'journal_checksum'
-			internally.
+			mount the device.
 
 journal=update		Update the ext4 file system's journal to the current
 			format.
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 81014f4ed22d..4dc64ed58d26 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -711,7 +711,6 @@ struct ext4_inode_info {
 #define EXT4_MOUNT_QUOTA		0x80000 /* Some quota option set */
 #define EXT4_MOUNT_USRQUOTA		0x100000 /* "old" user quota */
 #define EXT4_MOUNT_GRPQUOTA		0x200000 /* "old" group quota */
-#define EXT4_MOUNT_JOURNAL_CHECKSUM	0x800000 /* Journal checksums */
 #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT	0x1000000 /* Journal Async Commit */
 #define EXT4_MOUNT_I_VERSION            0x2000000 /* i_version support */
 #define EXT4_MOUNT_DELALLOC		0x8000000 /* Delalloc support */
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 4037fe0b5a5c..f1815d3bcfd5 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1280,11 +1280,9 @@ static int parse_options(char *options, struct super_block *sb,
 			*journal_devnum = option;
 			break;
 		case Opt_journal_checksum:
-			set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM);
-			break;
+			break;	/* Kept for backwards compatibility */
 		case Opt_journal_async_commit:
 			set_opt(sbi->s_mount_opt, JOURNAL_ASYNC_COMMIT);
-			set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM);
 			break;
 		case Opt_noload:
 			set_opt(sbi->s_mount_opt, NOLOAD);
@@ -2751,20 +2749,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		goto failed_mount4;
 	}
 
-	if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
-		jbd2_journal_set_features(sbi->s_journal,
-				JBD2_FEATURE_COMPAT_CHECKSUM, 0,
+	jbd2_journal_set_features(sbi->s_journal,
+				  JBD2_FEATURE_COMPAT_CHECKSUM, 0, 0);
+	if (test_opt(sb, JOURNAL_ASYNC_COMMIT))
+		jbd2_journal_set_features(sbi->s_journal, 0, 0,
 				JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
-	} else if (test_opt(sb, JOURNAL_CHECKSUM)) {
-		jbd2_journal_set_features(sbi->s_journal,
-				JBD2_FEATURE_COMPAT_CHECKSUM, 0, 0);
+	else
 		jbd2_journal_clear_features(sbi->s_journal, 0, 0,
 				JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
-	} else {
-		jbd2_journal_clear_features(sbi->s_journal,
-				JBD2_FEATURE_COMPAT_CHECKSUM, 0,
-				JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
-	}
 
 	/* We have now updated the journal if required, so we can
 	 * validate the data journaling mode. */
-- 
cgit v1.2.3-70-g09d2


From a2a8474c3fff88d8dd52d05cb450563fb26fd26c Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Sat, 5 Sep 2009 11:17:13 -0700
Subject: exec: do not sleep in TASK_TRACED under ->cred_guard_mutex

Tom Horsley reports that his debugger hangs when it tries to read
/proc/pid_of_tracee/maps, this happens since

	"mm_for_maps: take ->cred_guard_mutex to fix the race with exec"
	04b836cbf19e885f8366bccb2e4b0474346c02d

commit in 2.6.31.

But the root of the problem lies in the fact that do_execve() path calls
tracehook_report_exec() which can stop if the tracer sets PT_TRACE_EXEC.

The tracee must not sleep in TASK_TRACED holding this mutex.  Even if we
remove ->cred_guard_mutex from mm_for_maps() and proc_pid_attr_write(),
another task doing PTRACE_ATTACH should not hang until it is killed or the
tracee resumes.

With this patch do_execve() does not use ->cred_guard_mutex directly and
we do not hold it throughout, instead:

	- introduce prepare_bprm_creds() helper, it locks the mutex
	  and calls prepare_exec_creds() to initialize bprm->cred.

	- install_exec_creds() drops the mutex after commit_creds(),
	  and thus before tracehook_report_exec()->ptrace_stop().

	  or, if exec fails,

	  free_bprm() drops this mutex when bprm->cred != NULL which
	  indicates install_exec_creds() was not called.

Reported-by: Tom Horsley <tom.horsley@att.net>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: David Howells <dhowells@redhat.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: James Morris <jmorris@namei.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/compat.c             | 17 ++++---------
 fs/exec.c               | 63 +++++++++++++++++++++++++++++--------------------
 include/linux/binfmts.h |  1 +
 3 files changed, 43 insertions(+), 38 deletions(-)

(limited to 'fs')

diff --git a/fs/compat.c b/fs/compat.c
index 94502dab972a..6d6f98fe64a0 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1485,20 +1485,15 @@ int compat_do_execve(char * filename,
 	if (!bprm)
 		goto out_files;
 
-	retval = -ERESTARTNOINTR;
-	if (mutex_lock_interruptible(&current->cred_guard_mutex))
+	retval = prepare_bprm_creds(bprm);
+	if (retval)
 		goto out_free;
-	current->in_execve = 1;
-
-	retval = -ENOMEM;
-	bprm->cred = prepare_exec_creds();
-	if (!bprm->cred)
-		goto out_unlock;
 
 	retval = check_unsafe_exec(bprm);
 	if (retval < 0)
-		goto out_unlock;
+		goto out_free;
 	clear_in_exec = retval;
+	current->in_execve = 1;
 
 	file = open_exec(filename);
 	retval = PTR_ERR(file);
@@ -1547,7 +1542,6 @@ int compat_do_execve(char * filename,
 	/* execve succeeded */
 	current->fs->in_exec = 0;
 	current->in_execve = 0;
-	mutex_unlock(&current->cred_guard_mutex);
 	acct_update_integrals(current);
 	free_bprm(bprm);
 	if (displaced)
@@ -1567,10 +1561,7 @@ out_file:
 out_unmark:
 	if (clear_in_exec)
 		current->fs->in_exec = 0;
-
-out_unlock:
 	current->in_execve = 0;
-	mutex_unlock(&current->cred_guard_mutex);
 
 out_free:
 	free_bprm(bprm);
diff --git a/fs/exec.c b/fs/exec.c
index fb4f3cdda78c..172ceb6edde4 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1015,6 +1015,35 @@ out:
 
 EXPORT_SYMBOL(flush_old_exec);
 
+/*
+ * Prepare credentials and lock ->cred_guard_mutex.
+ * install_exec_creds() commits the new creds and drops the lock.
+ * Or, if exec fails before, free_bprm() should release ->cred and
+ * and unlock.
+ */
+int prepare_bprm_creds(struct linux_binprm *bprm)
+{
+	if (mutex_lock_interruptible(&current->cred_guard_mutex))
+		return -ERESTARTNOINTR;
+
+	bprm->cred = prepare_exec_creds();
+	if (likely(bprm->cred))
+		return 0;
+
+	mutex_unlock(&current->cred_guard_mutex);
+	return -ENOMEM;
+}
+
+void free_bprm(struct linux_binprm *bprm)
+{
+	free_arg_pages(bprm);
+	if (bprm->cred) {
+		mutex_unlock(&current->cred_guard_mutex);
+		abort_creds(bprm->cred);
+	}
+	kfree(bprm);
+}
+
 /*
  * install the new credentials for this executable
  */
@@ -1024,12 +1053,13 @@ void install_exec_creds(struct linux_binprm *bprm)
 
 	commit_creds(bprm->cred);
 	bprm->cred = NULL;
-
-	/* cred_guard_mutex must be held at least to this point to prevent
+	/*
+	 * cred_guard_mutex must be held at least to this point to prevent
 	 * ptrace_attach() from altering our determination of the task's
-	 * credentials; any time after this it may be unlocked */
-
+	 * credentials; any time after this it may be unlocked.
+	 */
 	security_bprm_committed_creds(bprm);
+	mutex_unlock(&current->cred_guard_mutex);
 }
 EXPORT_SYMBOL(install_exec_creds);
 
@@ -1246,14 +1276,6 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
 
 EXPORT_SYMBOL(search_binary_handler);
 
-void free_bprm(struct linux_binprm *bprm)
-{
-	free_arg_pages(bprm);
-	if (bprm->cred)
-		abort_creds(bprm->cred);
-	kfree(bprm);
-}
-
 /*
  * sys_execve() executes a new program.
  */
@@ -1277,20 +1299,15 @@ int do_execve(char * filename,
 	if (!bprm)
 		goto out_files;
 
-	retval = -ERESTARTNOINTR;
-	if (mutex_lock_interruptible(&current->cred_guard_mutex))
+	retval = prepare_bprm_creds(bprm);
+	if (retval)
 		goto out_free;
-	current->in_execve = 1;
-
-	retval = -ENOMEM;
-	bprm->cred = prepare_exec_creds();
-	if (!bprm->cred)
-		goto out_unlock;
 
 	retval = check_unsafe_exec(bprm);
 	if (retval < 0)
-		goto out_unlock;
+		goto out_free;
 	clear_in_exec = retval;
+	current->in_execve = 1;
 
 	file = open_exec(filename);
 	retval = PTR_ERR(file);
@@ -1340,7 +1357,6 @@ int do_execve(char * filename,
 	/* execve succeeded */
 	current->fs->in_exec = 0;
 	current->in_execve = 0;
-	mutex_unlock(&current->cred_guard_mutex);
 	acct_update_integrals(current);
 	free_bprm(bprm);
 	if (displaced)
@@ -1360,10 +1376,7 @@ out_file:
 out_unmark:
 	if (clear_in_exec)
 		current->fs->in_exec = 0;
-
-out_unlock:
 	current->in_execve = 0;
-	mutex_unlock(&current->cred_guard_mutex);
 
 out_free:
 	free_bprm(bprm);
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index 61ee18c1bdb4..2046b5b8af48 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -117,6 +117,7 @@ extern int setup_arg_pages(struct linux_binprm * bprm,
 			   int executable_stack);
 extern int bprm_mm_init(struct linux_binprm *bprm);
 extern int copy_strings_kernel(int argc,char ** argv,struct linux_binprm *bprm);
+extern int prepare_bprm_creds(struct linux_binprm *bprm);
 extern void install_exec_creds(struct linux_binprm *bprm);
 extern void do_coredump(long signr, int exit_code, struct pt_regs *regs);
 extern int set_binfmt(struct linux_binfmt *new);
-- 
cgit v1.2.3-70-g09d2


From 9de6886ec6e37f45807266a702bb7621498395ad Mon Sep 17 00:00:00 2001
From: Nicolas Pitre <nico@cam.org>
Date: Sat, 5 Sep 2009 00:25:37 -0400
Subject: ext2: fix unbalanced kmap()/kunmap()

In ext2_rename(), dir_page is acquired through ext2_dotdot().  It is
then released through ext2_set_link() but only if old_dir != new_dir.
Failing that, the pkmap reference count is never decremented and the
page remains pinned forever.  Repeat that a couple times with highmem
pages and all pkmap slots get exhausted, and every further kmap() calls
end up stalling on the pkmap_map_wait queue at which point the whole
system comes to a halt.

Signed-off-by: Nicolas Pitre <nico@marvell.com>
Acked-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ext2/namei.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index e1dedb0f7873..78d9b925fc94 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -362,6 +362,10 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
 	if (dir_de) {
 		if (old_dir != new_dir)
 			ext2_set_link(old_inode, dir_de, dir_page, new_dir, 0);
+		else {
+			kunmap(dir_page);
+			page_cache_release(dir_page);
+		}
 		inode_dec_link_count(old_dir);
 	}
 	return 0;
-- 
cgit v1.2.3-70-g09d2


From 5f3481e9a80c240f169b36ea886e2325b9aeb745 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sat, 5 Sep 2009 21:42:42 -0400
Subject: ext4: fix cache flush in ext4_sync_file

We need to flush the write cache unconditionally in ->fsync, otherwise
writes into already allocated blocks can get lost.  Writes into fully
allocated files are very common when using disk images for
virtualization, and without this fix can easily lose data after
an fdatasync, which is the typical implementation for a cache flush on
the virtual drive.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/fsync.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 83cf6415f599..ab418c0f502d 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -92,9 +92,9 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
 			.nr_to_write = 0, /* sys_fsync did this */
 		};
 		ret = sync_inode(inode, &wbc);
-		if (journal && (journal->j_flags & JBD2_BARRIER))
-			blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
 	}
 out:
+	if (journal && (journal->j_flags & JBD2_BARRIER))
+		blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
 	return ret;
 }
-- 
cgit v1.2.3-70-g09d2


From 70d5d3dcea47c16058d2b093c29e07fdf61b56ad Mon Sep 17 00:00:00 2001
From: Akira Fujita <a-fujita@rs.jp.nec.com>
Date: Wed, 16 Sep 2009 14:28:22 -0400
Subject: ext4: Fix wrong comparisons in mext_check_arguments()

The mext_check_arguments() function in move_extents.c has wrong
comparisons.  orig_start which is passed from user-space is block
unit, but i_size of inode is byte unit, therefore the checks do not
work fine.  This mis-check leads to the overflow of 'len' and then
hits BUG_ON() in ext4_move_extents().  The patch fixes this issue.

Signed-off-by: Akira Fujita <a-fujita@rs.jp.nec.com>
Reviewed-by: Greg Freemyer <greg.freemyer@gmail.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/move_extent.c | 46 +++++++++++++++++++++++++++-------------------
 1 file changed, 27 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 5821e0bee917..c593eb2b193a 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -898,6 +898,10 @@ mext_check_arguments(struct inode *orig_inode,
 			  struct inode *donor_inode, __u64 orig_start,
 			  __u64 donor_start, __u64 *len, __u64 moved_len)
 {
+	ext4_lblk_t orig_blocks, donor_blocks;
+	unsigned int blkbits = orig_inode->i_blkbits;
+	unsigned int blocksize = 1 << blkbits;
+
 	/* Regular file check */
 	if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) {
 		ext4_debug("ext4 move extent: The argument files should be "
@@ -972,43 +976,47 @@ mext_check_arguments(struct inode *orig_inode,
 	}
 
 	if (orig_inode->i_size > donor_inode->i_size) {
-		if (orig_start >= donor_inode->i_size) {
+		donor_blocks = (donor_inode->i_size + blocksize - 1) >> blkbits;
+		/* TODO: eliminate this artificial restriction */
+		if (orig_start >= donor_blocks) {
 			ext4_debug("ext4 move extent: orig start offset "
-			"[%llu] should be less than donor file size "
-			"[%lld] [ino:orig %lu, donor_inode %lu]\n",
-			orig_start, donor_inode->i_size,
+			"[%llu] should be less than donor file blocks "
+			"[%u] [ino:orig %lu, donor %lu]\n",
+			orig_start, donor_blocks,
 			orig_inode->i_ino, donor_inode->i_ino);
 			return -EINVAL;
 		}
 
-		if (orig_start + *len > donor_inode->i_size) {
+		/* TODO: eliminate this artificial restriction */
+		if (orig_start + *len > donor_blocks) {
 			ext4_debug("ext4 move extent: End offset [%llu] should "
-				"be less than donor file size [%lld]."
-				"So adjust length from %llu to %lld "
+				"be less than donor file blocks [%u]."
+				"So adjust length from %llu to %llu "
 				"[ino:orig %lu, donor %lu]\n",
-				orig_start + *len, donor_inode->i_size,
-				*len, donor_inode->i_size - orig_start,
+				orig_start + *len, donor_blocks,
+				*len, donor_blocks - orig_start,
 				orig_inode->i_ino, donor_inode->i_ino);
-			*len = donor_inode->i_size - orig_start;
+			*len = donor_blocks - orig_start;
 		}
 	} else {
-		if (orig_start >= orig_inode->i_size) {
+		orig_blocks = (orig_inode->i_size + blocksize - 1) >> blkbits;
+		if (orig_start >= orig_blocks) {
 			ext4_debug("ext4 move extent: start offset [%llu] "
-				"should be less than original file size "
-				"[%lld] [inode:orig %lu, donor %lu]\n",
-				 orig_start, orig_inode->i_size,
+				"should be less than original file blocks "
+				"[%u] [ino:orig %lu, donor %lu]\n",
+				 orig_start, orig_blocks,
 				orig_inode->i_ino, donor_inode->i_ino);
 			return -EINVAL;
 		}
 
-		if (orig_start + *len > orig_inode->i_size) {
+		if (orig_start + *len > orig_blocks) {
 			ext4_debug("ext4 move extent: Adjust length "
-				"from %llu to %lld. Because it should be "
-				"less than original file size "
+				"from %llu to %llu. Because it should be "
+				"less than original file blocks "
 				"[ino:orig %lu, donor %lu]\n",
-				*len, orig_inode->i_size - orig_start,
+				*len, orig_blocks - orig_start,
 				orig_inode->i_ino, donor_inode->i_ino);
-			*len = orig_inode->i_size - orig_start;
+			*len = orig_blocks - orig_start;
 		}
 	}
 
-- 
cgit v1.2.3-70-g09d2


From daea696dbac0e33af3cfe304efbfb8d74e0effe6 Mon Sep 17 00:00:00 2001
From: Akira Fujita <a-fujita@rs.jp.nec.com>
Date: Sat, 5 Sep 2009 22:11:55 -0400
Subject: ext4: Remove unneeded BUG_ON() in ext4_move_extents()

The ext4_move_extents() functions checks with BUG_ON() whether the
exchanged blocks count accords with request blocks count.  But, if the
target range (orig_start + len) includes sparse block(s), 'moved_len'
(exchanged blocks count) does not agree with 'len' (request blocks
count), since sparse block is not counted in 'moved_len'.  This causes
us to hit the BUG_ON(), even though the function succeeded.

Signed-off-by: Akira Fujita <a-fujita@rs.jp.nec.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/move_extent.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index c593eb2b193a..c8c66b167cd7 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -1322,8 +1322,5 @@ out2:
 	if (ret)
 		return ret;
 
-	/* All of the specified blocks must be exchanged in succeed */
-	BUG_ON(*moved_len != len);
-
 	return 0;
 }
-- 
cgit v1.2.3-70-g09d2


From 8d6669133d8cdbb7cbe0e1f0f3744e7802a84afe Mon Sep 17 00:00:00 2001
From: Akira Fujita <a-fujita@rs.jp.nec.com>
Date: Sat, 5 Sep 2009 22:46:29 -0400
Subject: ext4: Return exchanged blocks count to user space in failure

Return exchanged blocks count (moved_len) to user space,
if ext4_move_extents() failed on the way.

Signed-off-by: Akira Fujita <a-fujita@rs.jp.nec.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ioctl.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 7050a9cd04a4..c1cdf613e725 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -243,10 +243,9 @@ setversion_out:
 					me.donor_start, me.len, &me.moved_len);
 		fput(donor_filp);
 
-		if (!err)
-			if (copy_to_user((struct move_extent *)arg,
-				&me, sizeof(me)))
-				return -EFAULT;
+		if (copy_to_user((struct move_extent *)arg, &me, sizeof(me)))
+			return -EFAULT;
+
 		return err;
 	}
 
-- 
cgit v1.2.3-70-g09d2


From 44fc48f7048ab9657b524938a832fec4e0acea98 Mon Sep 17 00:00:00 2001
From: Akira Fujita <a-fujita@rs.jp.nec.com>
Date: Sat, 5 Sep 2009 23:12:41 -0400
Subject: ext4: Fix small typo for move_extent_per_page()

This function means moving extents every page, so change its name from
move_exgtent_par_page().

Signed-off-by: Akira Fujita <a-fujita@rs.jp.nec.co.jp>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/move_extent.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index c8c66b167cd7..4c4491cef357 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -740,7 +740,7 @@ out:
  * on success, or a negative error value on failure.
  */
 static int
-move_extent_par_page(struct file *o_filp, struct inode *donor_inode,
+move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
 		  pgoff_t orig_page_offset, int data_offset_in_page,
 		  int block_len_in_page, int uninit)
 {
@@ -1267,7 +1267,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
 		while (orig_page_offset <= seq_end_page) {
 
 			/* Swap original branches with new branches */
-			ret = move_extent_par_page(o_filp, donor_inode,
+			ret = move_extent_per_page(o_filp, donor_inode,
 						orig_page_offset,
 						data_offset_in_page,
 						block_len_in_page, uninit);
-- 
cgit v1.2.3-70-g09d2


From acd0c935178649f72c44ec49ca83bee35ce1f79e Mon Sep 17 00:00:00 2001
From: Mimi Zohar <zohar@linux.vnet.ibm.com>
Date: Fri, 4 Sep 2009 13:08:46 -0400
Subject: IMA: update ima_counts_put

- As ima_counts_put() may be called after the inode has been freed,
verify that the inode is not NULL, before dereferencing it.

- Maintain the IMA file counters in may_open() properly, decrementing
any counter increments on subsequent errors.

Reported-by: Ciprian Docan <docan@eden.rutgers.edu>
Reported-by: J.R. Okajima <hooanon05@yahoo.co.jp>
Signed-off-by: Mimi Zohar <zohar@us.ibm.com>
Acked-by: Eric Paris <eparis@redhat.com
Signed-off-by: James Morris <jmorris@namei.org>
---
 fs/namei.c                        | 22 +++++++++++++++-------
 security/integrity/ima/ima_main.c |  6 +++++-
 2 files changed, 20 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index f3c5b278895a..1f13751693a5 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1542,28 +1542,31 @@ int may_open(struct path *path, int acc_mode, int flag)
 	 * An append-only file must be opened in append mode for writing.
 	 */
 	if (IS_APPEND(inode)) {
+		error = -EPERM;
 		if  ((flag & FMODE_WRITE) && !(flag & O_APPEND))
-			return -EPERM;
+			goto err_out;
 		if (flag & O_TRUNC)
-			return -EPERM;
+			goto err_out;
 	}
 
 	/* O_NOATIME can only be set by the owner or superuser */
 	if (flag & O_NOATIME)
-		if (!is_owner_or_cap(inode))
-			return -EPERM;
+		if (!is_owner_or_cap(inode)) {
+			error = -EPERM;
+			goto err_out;
+		}
 
 	/*
 	 * Ensure there are no outstanding leases on the file.
 	 */
 	error = break_lease(inode, flag);
 	if (error)
-		return error;
+		goto err_out;
 
 	if (flag & O_TRUNC) {
 		error = get_write_access(inode);
 		if (error)
-			return error;
+			goto err_out;
 
 		/*
 		 * Refuse to truncate files with mandatory locks held on them.
@@ -1581,12 +1584,17 @@ int may_open(struct path *path, int acc_mode, int flag)
 		}
 		put_write_access(inode);
 		if (error)
-			return error;
+			goto err_out;
 	} else
 		if (flag & FMODE_WRITE)
 			vfs_dq_init(inode);
 
 	return 0;
+err_out:
+	ima_counts_put(path, acc_mode ?
+		       acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC) :
+		       ACC_MODE(flag) & (MAY_READ | MAY_WRITE));
+	return error;
 }
 
 /*
diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c
index 4732f5e5d127..b85e61bcf246 100644
--- a/security/integrity/ima/ima_main.c
+++ b/security/integrity/ima/ima_main.c
@@ -249,7 +249,11 @@ void ima_counts_put(struct path *path, int mask)
 	struct inode *inode = path->dentry->d_inode;
 	struct ima_iint_cache *iint;
 
-	if (!ima_initialized || !S_ISREG(inode->i_mode))
+	/* The inode may already have been freed, freeing the iint
+	 * with it. Verify the inode is not NULL before dereferencing
+	 * it.
+	 */
+	if (!ima_initialized || !inode || !S_ISREG(inode->i_mode))
 		return;
 	iint = ima_iint_find_insert_get(inode);
 	if (!iint)
-- 
cgit v1.2.3-70-g09d2


From a827eaffff07c7d58a4cb32158cbeb4849f4e33a Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Wed, 9 Sep 2009 22:36:03 -0400
Subject: ext4: Take page lock before looking at attached buffer_heads flags

In order to check whether the buffer_heads are mapped we need to hold
page lock. Otherwise a reclaim can cleanup the attached buffer_heads.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 17802a96af9f..5c0d17066f4a 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5286,12 +5286,21 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	else
 		len = PAGE_CACHE_SIZE;
 
+	lock_page(page);
+	/*
+	 * return if we have all the buffers mapped. This avoid
+	 * the need to call write_begin/write_end which does a
+	 * journal_start/journal_stop which can block and take
+	 * long time
+	 */
 	if (page_has_buffers(page)) {
-		/* return if we have all the buffers mapped */
 		if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
-				       ext4_bh_unmapped))
+					ext4_bh_unmapped)) {
+			unlock_page(page);
 			goto out_unlock;
+		}
 	}
+	unlock_page(page);
 	/*
 	 * OK, we need to fill the hole... Do write_begin write_end
 	 * to do block allocation/reservation.We are not holding
-- 
cgit v1.2.3-70-g09d2


From 80e42468d65475e92651e62175bb7807773321d0 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Tue, 8 Sep 2009 08:21:26 -0400
Subject: ext4: print more sysadmin-friendly message in check_block_validity()

Drop the WARN_ON(1), as he stack trace is not appropriate, since it is
triggered by file system corruption, and it misleads users into
thinking there is a kernel bug.  In addition, change the message
displayed by ext4_error() to make it clear that this is a file system
corruption problem.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 5c0d17066f4a..6253ecdac67f 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1121,16 +1121,15 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
 		ext4_discard_preallocations(inode);
 }
 
-static int check_block_validity(struct inode *inode, sector_t logical,
-				sector_t phys, int len)
+static int check_block_validity(struct inode *inode, const char *msg,
+				sector_t logical, sector_t phys, int len)
 {
 	if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) {
-		ext4_error(inode->i_sb, "check_block_validity",
+		ext4_error(inode->i_sb, msg,
 			   "inode #%lu logical block %llu mapped to %llu "
 			   "(size %d)", inode->i_ino,
 			   (unsigned long long) logical,
 			   (unsigned long long) phys, len);
-		WARN_ON(1);
 		return -EIO;
 	}
 	return 0;
@@ -1182,8 +1181,8 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
 	up_read((&EXT4_I(inode)->i_data_sem));
 
 	if (retval > 0 && buffer_mapped(bh)) {
-		int ret = check_block_validity(inode, block,
-					       bh->b_blocknr, retval);
+		int ret = check_block_validity(inode, "file system corruption",
+					       block, bh->b_blocknr, retval);
 		if (ret != 0)
 			return ret;
 	}
@@ -1264,8 +1263,9 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
 
 	up_write((&EXT4_I(inode)->i_data_sem));
 	if (retval > 0 && buffer_mapped(bh)) {
-		int ret = check_block_validity(inode, block,
-					       bh->b_blocknr, retval);
+		int ret = check_block_validity(inode, "file system "
+					       "corruption after allocation",
+					       block, bh->b_blocknr, retval);
 		if (ret != 0)
 			return ret;
 	}
-- 
cgit v1.2.3-70-g09d2


From acf7e2444acfaf4c8540603b76d71010eea3fc24 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Tue, 8 Sep 2009 18:00:30 +0100
Subject: GFS2: Be extra careful about deallocating inodes

There is a potential race in the inode deallocation code if two
nodes try to deallocate the same inode at the same time. Most of
the issue is solved by the iopen locking. There is still a small
window which is not covered by the iopen lock. This patches fixes
that and also makes the deallocation code more robust in the face of
any errors in the rgrp bitmaps, or erroneous iopen callbacks from
other nodes.

This does introduce one extra disk read, but that is generally not
an issue since its the same block that must be written to later
in the deallocation process. The total disk accesses therefore stay
the same,

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/export.c | 36 ++++--------------------------------
 fs/gfs2/rgrp.c   | 42 +++++++++++++++++++++++++++++++++++++++++-
 fs/gfs2/rgrp.h   |  4 ++--
 fs/gfs2/super.c  |  4 ++++
 4 files changed, 51 insertions(+), 35 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index 9200ef221716..d15876e9aa26 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -143,17 +143,14 @@ static struct dentry *gfs2_get_parent(struct dentry *child)
 }
 
 static struct dentry *gfs2_get_dentry(struct super_block *sb,
-		struct gfs2_inum_host *inum)
+				      struct gfs2_inum_host *inum)
 {
 	struct gfs2_sbd *sdp = sb->s_fs_info;
-	struct gfs2_holder i_gh, ri_gh, rgd_gh;
-	struct gfs2_rgrpd *rgd;
+	struct gfs2_holder i_gh;
 	struct inode *inode;
 	struct dentry *dentry;
 	int error;
 
-	/* System files? */
-
 	inode = gfs2_ilookup(sb, inum->no_addr);
 	if (inode) {
 		if (GFS2_I(inode)->i_no_formal_ino != inum->no_formal_ino) {
@@ -168,29 +165,11 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb,
 	if (error)
 		return ERR_PTR(error);
 
-	error = gfs2_rindex_hold(sdp, &ri_gh);
+	error = gfs2_check_blk_type(sdp, inum->no_addr, GFS2_BLKST_DINODE);
 	if (error)
 		goto fail;
 
-	error = -EINVAL;
-	rgd = gfs2_blk2rgrpd(sdp, inum->no_addr);
-	if (!rgd)
-		goto fail_rindex;
-
-	error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh);
-	if (error)
-		goto fail_rindex;
-
-	error = -ESTALE;
-	if (gfs2_get_block_type(rgd, inum->no_addr) != GFS2_BLKST_DINODE)
-		goto fail_rgd;
-
-	gfs2_glock_dq_uninit(&rgd_gh);
-	gfs2_glock_dq_uninit(&ri_gh);
-
-	inode = gfs2_inode_lookup(sb, DT_UNKNOWN,
-					inum->no_addr,
-					0, 0);
+	inode = gfs2_inode_lookup(sb, DT_UNKNOWN, inum->no_addr, 0, 0);
 	if (IS_ERR(inode)) {
 		error = PTR_ERR(inode);
 		goto fail;
@@ -224,13 +203,6 @@ out_inode:
 	if (!IS_ERR(dentry))
 		dentry->d_op = &gfs2_dops;
 	return dentry;
-
-fail_rgd:
-	gfs2_glock_dq_uninit(&rgd_gh);
-
-fail_rindex:
-	gfs2_glock_dq_uninit(&ri_gh);
-
 fail:
 	gfs2_glock_dq_uninit(&i_gh);
 	return ERR_PTR(error);
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 388a61d12fc8..caaa665d4033 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1256,7 +1256,7 @@ void gfs2_inplace_release(struct gfs2_inode *ip)
  * Returns: The block type (GFS2_BLKST_*)
  */
 
-unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block)
+static unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block)
 {
 	struct gfs2_bitmap *bi = NULL;
 	u32 length, rgrp_block, buf_block;
@@ -1693,6 +1693,46 @@ void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
 	gfs2_meta_wipe(ip, ip->i_no_addr, 1);
 }
 
+/**
+ * gfs2_check_blk_type - Check the type of a block
+ * @sdp: The superblock
+ * @no_addr: The block number to check
+ * @type: The block type we are looking for
+ *
+ * Returns: 0 if the block type matches the expected type
+ *          -ESTALE if it doesn't match
+ *          or -ve errno if something went wrong while checking
+ */
+
+int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type)
+{
+	struct gfs2_rgrpd *rgd;
+	struct gfs2_holder ri_gh, rgd_gh;
+	int error;
+
+	error = gfs2_rindex_hold(sdp, &ri_gh);
+	if (error)
+		goto fail;
+
+	error = -EINVAL;
+	rgd = gfs2_blk2rgrpd(sdp, no_addr);
+	if (!rgd)
+		goto fail_rindex;
+
+	error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh);
+	if (error)
+		goto fail_rindex;
+
+	if (gfs2_get_block_type(rgd, no_addr) != type)
+		error = -ESTALE;
+
+	gfs2_glock_dq_uninit(&rgd_gh);
+fail_rindex:
+	gfs2_glock_dq_uninit(&ri_gh);
+fail:
+	return error;
+}
+
 /**
  * gfs2_rlist_add - add a RG to a list of RGs
  * @sdp: the filesystem
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index a8dedd78245b..b4106ddaaa98 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -44,8 +44,6 @@ gfs2_inplace_reserve_i((ip), __FILE__, __LINE__)
 
 extern void gfs2_inplace_release(struct gfs2_inode *ip);
 
-extern unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block);
-
 extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n);
 extern int gfs2_alloc_di(struct gfs2_inode *ip, u64 *bn, u64 *generation);
 
@@ -53,6 +51,8 @@ extern void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen);
 extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen);
 extern void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip);
 extern void gfs2_unlink_di(struct inode *inode);
+extern int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr,
+			       unsigned int type);
 
 struct gfs2_rgrp_list {
 	unsigned int rl_rgrps;
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index d95cf777d244..0ec3ec672de1 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1286,6 +1286,10 @@ static void gfs2_delete_inode(struct inode *inode)
 		goto out;
 	}
 
+	error = gfs2_check_blk_type(sdp, ip->i_no_addr, GFS2_BLKST_UNLINKED);
+	if (error)
+		goto out_truncate;
+
 	gfs2_glock_dq_wait(&ip->i_iopen_gh);
 	gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh);
 	error = gfs2_glock_nq(&ip->i_iopen_gh);
-- 
cgit v1.2.3-70-g09d2


From e8e66ed25b5cbeebed69c475f6c108e52078a3b3 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 28 Aug 2009 10:05:33 -0700
Subject: Do not call 'ima_path_check()' for each path component

Not only is that a supremely timing-critical path, but it's hopefully
some day going to be lockless for the common case, and ima can't do
that.

Plus the integrity code doesn't even care about non-regular files, so it
was always a total waste of time and effort.

Acked-by: Serge Hallyn <serue@us.ibm.com>
Acked-by: Mimi Zohar <zohar@us.ibm.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/namei.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 1f13751693a5..a005d8b7afad 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -856,9 +856,6 @@ static int __link_path_walk(const char *name, struct nameidata *nd)
 		if (err == -EAGAIN)
 			err = inode_permission(nd->path.dentry->d_inode,
 					       MAY_EXEC);
-		if (!err)
-			err = ima_path_check(&nd->path, MAY_EXEC,
-				             IMA_COUNT_UPDATE);
  		if (err)
 			break;
 
-- 
cgit v1.2.3-70-g09d2


From b7a437b08a44a3ed7e3a052eb39d2c5f618b603b Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 28 Aug 2009 10:50:37 -0700
Subject: Simplify exec_permission_lite() logic

Instead of returning EAGAIN and having the caller do something
special for that case,  just do the special case directly.

Reviewed-by: James Morris <jmorris@namei.org>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/namei.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index a005d8b7afad..8c3580610eec 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -435,7 +435,7 @@ static int exec_permission_lite(struct inode *inode)
 	umode_t	mode = inode->i_mode;
 
 	if (inode->i_op->permission)
-		return -EAGAIN;
+		return inode_permission(inode, MAY_EXEC);
 
 	if (current_fsuid() == inode->i_uid)
 		mode >>= 6;
@@ -853,9 +853,6 @@ static int __link_path_walk(const char *name, struct nameidata *nd)
 
 		nd->flags |= LOOKUP_CONTINUE;
 		err = exec_permission_lite(inode);
-		if (err == -EAGAIN)
-			err = inode_permission(nd->path.dentry->d_inode,
-					       MAY_EXEC);
  		if (err)
 			break;
 
-- 
cgit v1.2.3-70-g09d2


From f1ac9f6bfea6f21e8ab6dbbe46879d62a6fba8c0 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 28 Aug 2009 10:53:56 -0700
Subject: Simplify exec_permission_lite() further

This function is only called for path components that are already known
to be directories (they have a '->lookup' method).  So don't bother
doing that whole S_ISDIR() testing, the whole point of the 'lite()'
version is that we know that we are looking at a directory component,
and that we're only checking name lookup permission.

Reviewed-by: James Morris <jmorris@namei.org>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/namei.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 8c3580610eec..929f535fb225 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -445,13 +445,7 @@ static int exec_permission_lite(struct inode *inode)
 	if (mode & MAY_EXEC)
 		goto ok;
 
-	if ((inode->i_mode & S_IXUGO) && capable(CAP_DAC_OVERRIDE))
-		goto ok;
-
-	if (S_ISDIR(inode->i_mode) && capable(CAP_DAC_OVERRIDE))
-		goto ok;
-
-	if (S_ISDIR(inode->i_mode) && capable(CAP_DAC_READ_SEARCH))
+	if (capable(CAP_DAC_OVERRIDE) || capable(CAP_DAC_READ_SEARCH))
 		goto ok;
 
 	return -EACCES;
-- 
cgit v1.2.3-70-g09d2


From cb9179ead0aa0e3b7b4087cdba59baf16bbeef6d Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 28 Aug 2009 11:08:31 -0700
Subject: Simplify exec_permission_lite(), part 3

Don't call down to the generic inode_permission() function just to
call the inode-specific permission function - just do it directly.

The generic inode_permission() code does things like checking MAY_WRITE
and devcgroup_inode_permission(), neither of which are relevant for the
light pathname walk permission checks (we always do just MAY_EXEC, and
the inode is never a special device).

Reviewed-by: James Morris <jmorris@namei.org>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/namei.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 929f535fb225..e645e3070360 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -434,8 +434,12 @@ static int exec_permission_lite(struct inode *inode)
 {
 	umode_t	mode = inode->i_mode;
 
-	if (inode->i_op->permission)
-		return inode_permission(inode, MAY_EXEC);
+	if (inode->i_op->permission) {
+		int ret = inode->i_op->permission(inode, MAY_EXEC);
+		if (!ret)
+			goto ok;
+		return ret;
+	}
 
 	if (current_fsuid() == inode->i_uid)
 		mode >>= 6;
-- 
cgit v1.2.3-70-g09d2


From 5909ccaa300a4a834ffa275327af4df0b9cb5295 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 28 Aug 2009 11:51:25 -0700
Subject: Make 'check_acl()' a first-class filesystem op

This is stage one in flattening out the callchains for the common
permission testing.  Rather than have most filesystem implement their
own inode->i_op->permission function that just calls back down to the
VFS layers 'generic_permission()' with the per-filesystem ACL checking
function, the filesystem can just expose its 'check_acl' function
directly, and let the VFS layer do everything for it.

This is all just preparatory - no filesystem actually enables this yet.

Reviewed-by: James Morris <jmorris@namei.org>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/namei.c         | 62 ++++++++++++++++++++++++++++++------------------------
 include/linux/fs.h |  1 +
 2 files changed, 36 insertions(+), 27 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index e645e3070360..ed27bb205b7e 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -169,19 +169,10 @@ void putname(const char *name)
 EXPORT_SYMBOL(putname);
 #endif
 
-
-/**
- * generic_permission  -  check for access rights on a Posix-like filesystem
- * @inode:	inode to check access rights for
- * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
- * @check_acl:	optional callback to check for Posix ACLs
- *
- * Used to check for read/write/execute permissions on a file.
- * We use "fsuid" for this, letting us set arbitrary permissions
- * for filesystem access without changing the "normal" uids which
- * are used for other things..
+/*
+ * This does basic POSIX ACL permission checking
  */
-int generic_permission(struct inode *inode, int mask,
+static int acl_permission_check(struct inode *inode, int mask,
 		int (*check_acl)(struct inode *inode, int mask))
 {
 	umode_t			mode = inode->i_mode;
@@ -193,9 +184,7 @@ int generic_permission(struct inode *inode, int mask,
 	else {
 		if (IS_POSIXACL(inode) && (mode & S_IRWXG) && check_acl) {
 			int error = check_acl(inode, mask);
-			if (error == -EACCES)
-				goto check_capabilities;
-			else if (error != -EAGAIN)
+			if (error != -EAGAIN)
 				return error;
 		}
 
@@ -208,8 +197,32 @@ int generic_permission(struct inode *inode, int mask,
 	 */
 	if ((mask & ~mode) == 0)
 		return 0;
+	return -EACCES;
+}
+
+/**
+ * generic_permission  -  check for access rights on a Posix-like filesystem
+ * @inode:	inode to check access rights for
+ * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
+ * @check_acl:	optional callback to check for Posix ACLs
+ *
+ * Used to check for read/write/execute permissions on a file.
+ * We use "fsuid" for this, letting us set arbitrary permissions
+ * for filesystem access without changing the "normal" uids which
+ * are used for other things..
+ */
+int generic_permission(struct inode *inode, int mask,
+		int (*check_acl)(struct inode *inode, int mask))
+{
+	int ret;
+
+	/*
+	 * Do the basic POSIX ACL permission checks.
+	 */
+	ret = acl_permission_check(inode, mask, check_acl);
+	if (ret != -EACCES)
+		return ret;
 
- check_capabilities:
 	/*
 	 * Read/write DACs are always overridable.
 	 * Executable DACs are overridable if at least one exec bit is set.
@@ -262,7 +275,7 @@ int inode_permission(struct inode *inode, int mask)
 	if (inode->i_op->permission)
 		retval = inode->i_op->permission(inode, mask);
 	else
-		retval = generic_permission(inode, mask, NULL);
+		retval = generic_permission(inode, mask, inode->i_op->check_acl);
 
 	if (retval)
 		return retval;
@@ -432,27 +445,22 @@ static struct dentry * cached_lookup(struct dentry * parent, struct qstr * name,
  */
 static int exec_permission_lite(struct inode *inode)
 {
-	umode_t	mode = inode->i_mode;
+	int ret;
 
 	if (inode->i_op->permission) {
-		int ret = inode->i_op->permission(inode, MAY_EXEC);
+		ret = inode->i_op->permission(inode, MAY_EXEC);
 		if (!ret)
 			goto ok;
 		return ret;
 	}
-
-	if (current_fsuid() == inode->i_uid)
-		mode >>= 6;
-	else if (in_group_p(inode->i_gid))
-		mode >>= 3;
-
-	if (mode & MAY_EXEC)
+	ret = acl_permission_check(inode, MAY_EXEC, inode->i_op->check_acl);
+	if (!ret)
 		goto ok;
 
 	if (capable(CAP_DAC_OVERRIDE) || capable(CAP_DAC_READ_SEARCH))
 		goto ok;
 
-	return -EACCES;
+	return ret;
 ok:
 	return security_inode_permission(inode, MAY_EXEC);
 }
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 73e9b643e455..c1f993515f51 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1528,6 +1528,7 @@ struct inode_operations {
 	void (*put_link) (struct dentry *, struct nameidata *, void *);
 	void (*truncate) (struct inode *);
 	int (*permission) (struct inode *, int);
+	int (*check_acl)(struct inode *, int);
 	int (*setattr) (struct dentry *, struct iattr *);
 	int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *);
 	int (*setxattr) (struct dentry *, const char *,const void *,size_t,int);
-- 
cgit v1.2.3-70-g09d2


From 1d5ccd1c422d7d292a9e45248aa36771900c6331 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 28 Aug 2009 12:12:24 -0700
Subject: ext[234]: move over to 'check_acl' permission model

Don't implement per-filesystem 'extX_permission()' functions that have
to be called for every path component operation, and instead just expose
the actual ACL checking so that the VFS layer can now do it for us.

Reviewed-by: James Morris <jmorris@namei.org>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ext2/acl.c   | 8 +-------
 fs/ext2/acl.h   | 4 ++--
 fs/ext2/file.c  | 2 +-
 fs/ext2/namei.c | 4 ++--
 fs/ext3/acl.c   | 8 +-------
 fs/ext3/acl.h   | 4 ++--
 fs/ext3/file.c  | 2 +-
 fs/ext3/namei.c | 4 ++--
 fs/ext4/acl.c   | 8 +-------
 fs/ext4/acl.h   | 4 ++--
 fs/ext4/file.c  | 2 +-
 fs/ext4/namei.c | 4 ++--
 12 files changed, 18 insertions(+), 36 deletions(-)

(limited to 'fs')

diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index d636e1297cad..a63d44256a70 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -230,7 +230,7 @@ ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
 	return error;
 }
 
-static int
+int
 ext2_check_acl(struct inode *inode, int mask)
 {
 	struct posix_acl *acl = ext2_get_acl(inode, ACL_TYPE_ACCESS);
@@ -246,12 +246,6 @@ ext2_check_acl(struct inode *inode, int mask)
 	return -EAGAIN;
 }
 
-int
-ext2_permission(struct inode *inode, int mask)
-{
-	return generic_permission(inode, mask, ext2_check_acl);
-}
-
 /*
  * Initialize the ACLs of a new inode. Called from ext2_new_inode.
  *
diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h
index ecefe478898f..3ff6cbb9ac44 100644
--- a/fs/ext2/acl.h
+++ b/fs/ext2/acl.h
@@ -54,13 +54,13 @@ static inline int ext2_acl_count(size_t size)
 #ifdef CONFIG_EXT2_FS_POSIX_ACL
 
 /* acl.c */
-extern int ext2_permission (struct inode *, int);
+extern int ext2_check_acl (struct inode *, int);
 extern int ext2_acl_chmod (struct inode *);
 extern int ext2_init_acl (struct inode *, struct inode *);
 
 #else
 #include <linux/sched.h>
-#define ext2_permission NULL
+#define ext2_check_acl	NULL
 #define ext2_get_acl	NULL
 #define ext2_set_acl	NULL
 
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 2b9e47dc9222..a2f3afd1a1c1 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -85,6 +85,6 @@ const struct inode_operations ext2_file_inode_operations = {
 	.removexattr	= generic_removexattr,
 #endif
 	.setattr	= ext2_setattr,
-	.permission	= ext2_permission,
+	.check_acl	= ext2_check_acl,
 	.fiemap		= ext2_fiemap,
 };
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 78d9b925fc94..23701f289e98 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -400,7 +400,7 @@ const struct inode_operations ext2_dir_inode_operations = {
 	.removexattr	= generic_removexattr,
 #endif
 	.setattr	= ext2_setattr,
-	.permission	= ext2_permission,
+	.check_acl	= ext2_check_acl,
 };
 
 const struct inode_operations ext2_special_inode_operations = {
@@ -411,5 +411,5 @@ const struct inode_operations ext2_special_inode_operations = {
 	.removexattr	= generic_removexattr,
 #endif
 	.setattr	= ext2_setattr,
-	.permission	= ext2_permission,
+	.check_acl	= ext2_check_acl,
 };
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index e167bae37ef0..c9b0df376b5f 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -238,7 +238,7 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type,
 	return error;
 }
 
-static int
+int
 ext3_check_acl(struct inode *inode, int mask)
 {
 	struct posix_acl *acl = ext3_get_acl(inode, ACL_TYPE_ACCESS);
@@ -254,12 +254,6 @@ ext3_check_acl(struct inode *inode, int mask)
 	return -EAGAIN;
 }
 
-int
-ext3_permission(struct inode *inode, int mask)
-{
-	return generic_permission(inode, mask, ext3_check_acl);
-}
-
 /*
  * Initialize the ACLs of a new inode. Called from ext3_new_inode.
  *
diff --git a/fs/ext3/acl.h b/fs/ext3/acl.h
index 07d15a3a5969..597334626de9 100644
--- a/fs/ext3/acl.h
+++ b/fs/ext3/acl.h
@@ -54,13 +54,13 @@ static inline int ext3_acl_count(size_t size)
 #ifdef CONFIG_EXT3_FS_POSIX_ACL
 
 /* acl.c */
-extern int ext3_permission (struct inode *, int);
+extern int ext3_check_acl (struct inode *, int);
 extern int ext3_acl_chmod (struct inode *);
 extern int ext3_init_acl (handle_t *, struct inode *, struct inode *);
 
 #else  /* CONFIG_EXT3_FS_POSIX_ACL */
 #include <linux/sched.h>
-#define ext3_permission NULL
+#define ext3_check_acl NULL
 
 static inline int
 ext3_acl_chmod(struct inode *inode)
diff --git a/fs/ext3/file.c b/fs/ext3/file.c
index 5b49704b231b..299253214789 100644
--- a/fs/ext3/file.c
+++ b/fs/ext3/file.c
@@ -137,7 +137,7 @@ const struct inode_operations ext3_file_inode_operations = {
 	.listxattr	= ext3_listxattr,
 	.removexattr	= generic_removexattr,
 #endif
-	.permission	= ext3_permission,
+	.check_acl	= ext3_check_acl,
 	.fiemap		= ext3_fiemap,
 };
 
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 6ff7b9730234..aad6400c9b77 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -2445,7 +2445,7 @@ const struct inode_operations ext3_dir_inode_operations = {
 	.listxattr	= ext3_listxattr,
 	.removexattr	= generic_removexattr,
 #endif
-	.permission	= ext3_permission,
+	.check_acl	= ext3_check_acl,
 };
 
 const struct inode_operations ext3_special_inode_operations = {
@@ -2456,5 +2456,5 @@ const struct inode_operations ext3_special_inode_operations = {
 	.listxattr	= ext3_listxattr,
 	.removexattr	= generic_removexattr,
 #endif
-	.permission	= ext3_permission,
+	.check_acl	= ext3_check_acl,
 };
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index f6d8967149ca..0df88b2a69b0 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -236,7 +236,7 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type,
 	return error;
 }
 
-static int
+int
 ext4_check_acl(struct inode *inode, int mask)
 {
 	struct posix_acl *acl = ext4_get_acl(inode, ACL_TYPE_ACCESS);
@@ -252,12 +252,6 @@ ext4_check_acl(struct inode *inode, int mask)
 	return -EAGAIN;
 }
 
-int
-ext4_permission(struct inode *inode, int mask)
-{
-	return generic_permission(inode, mask, ext4_check_acl);
-}
-
 /*
  * Initialize the ACLs of a new inode. Called from ext4_new_inode.
  *
diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h
index 949789d2bba6..9d843d5deac4 100644
--- a/fs/ext4/acl.h
+++ b/fs/ext4/acl.h
@@ -54,13 +54,13 @@ static inline int ext4_acl_count(size_t size)
 #ifdef CONFIG_EXT4_FS_POSIX_ACL
 
 /* acl.c */
-extern int ext4_permission(struct inode *, int);
+extern int ext4_check_acl(struct inode *, int);
 extern int ext4_acl_chmod(struct inode *);
 extern int ext4_init_acl(handle_t *, struct inode *, struct inode *);
 
 #else  /* CONFIG_EXT4_FS_POSIX_ACL */
 #include <linux/sched.h>
-#define ext4_permission NULL
+#define ext4_check_acl NULL
 
 static inline int
 ext4_acl_chmod(struct inode *inode)
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 3f1873fef1c6..27f3c5354c0e 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -207,7 +207,7 @@ const struct inode_operations ext4_file_inode_operations = {
 	.listxattr	= ext4_listxattr,
 	.removexattr	= generic_removexattr,
 #endif
-	.permission	= ext4_permission,
+	.check_acl	= ext4_check_acl,
 	.fallocate	= ext4_fallocate,
 	.fiemap		= ext4_fiemap,
 };
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index de04013d16ff..114abe5d2c1d 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2536,7 +2536,7 @@ const struct inode_operations ext4_dir_inode_operations = {
 	.listxattr	= ext4_listxattr,
 	.removexattr	= generic_removexattr,
 #endif
-	.permission	= ext4_permission,
+	.check_acl	= ext4_check_acl,
 	.fiemap         = ext4_fiemap,
 };
 
@@ -2548,5 +2548,5 @@ const struct inode_operations ext4_special_inode_operations = {
 	.listxattr	= ext4_listxattr,
 	.removexattr	= generic_removexattr,
 #endif
-	.permission	= ext4_permission,
+	.check_acl	= ext4_check_acl,
 };
-- 
cgit v1.2.3-70-g09d2


From 18f4c644773bc8de1fd9c5182b30c231aafb94ef Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 28 Aug 2009 12:29:03 -0700
Subject: jffs2/jfs/xfs: switch over to 'check_acl' rather than 'permission()'

This avoids an indirect call in the VFS for each path component lookup.

Well, at least as long as you own the directory in question, and the ACL
check is unnecessary.

Reviewed-by: James Morris <jmorris@namei.org>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/jffs2/acl.c              |  7 +------
 fs/jffs2/acl.h              |  4 ++--
 fs/jffs2/dir.c              |  2 +-
 fs/jffs2/file.c             |  2 +-
 fs/jffs2/symlink.c          |  2 +-
 fs/jfs/acl.c                |  7 +------
 fs/jfs/file.c               |  2 +-
 fs/jfs/jfs_acl.h            |  2 +-
 fs/jfs/namei.c              |  2 +-
 fs/xfs/linux-2.6/xfs_iops.c | 16 ++++------------
 10 files changed, 14 insertions(+), 32 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index 8fcb6239218e..7edb62e97419 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -258,7 +258,7 @@ static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
 	return rc;
 }
 
-static int jffs2_check_acl(struct inode *inode, int mask)
+int jffs2_check_acl(struct inode *inode, int mask)
 {
 	struct posix_acl *acl;
 	int rc;
@@ -274,11 +274,6 @@ static int jffs2_check_acl(struct inode *inode, int mask)
 	return -EAGAIN;
 }
 
-int jffs2_permission(struct inode *inode, int mask)
-{
-	return generic_permission(inode, mask, jffs2_check_acl);
-}
-
 int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, int *i_mode)
 {
 	struct posix_acl *acl, *clone;
diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h
index fc929f2a14f6..f0ba63e3c36b 100644
--- a/fs/jffs2/acl.h
+++ b/fs/jffs2/acl.h
@@ -26,7 +26,7 @@ struct jffs2_acl_header {
 
 #ifdef CONFIG_JFFS2_FS_POSIX_ACL
 
-extern int jffs2_permission(struct inode *, int);
+extern int jffs2_check_acl(struct inode *, int);
 extern int jffs2_acl_chmod(struct inode *);
 extern int jffs2_init_acl_pre(struct inode *, struct inode *, int *);
 extern int jffs2_init_acl_post(struct inode *);
@@ -36,7 +36,7 @@ extern struct xattr_handler jffs2_acl_default_xattr_handler;
 
 #else
 
-#define jffs2_permission			(NULL)
+#define jffs2_check_acl				(NULL)
 #define jffs2_acl_chmod(inode)			(0)
 #define jffs2_init_acl_pre(dir_i,inode,mode)	(0)
 #define jffs2_init_acl_post(inode)		(0)
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 6f60cc910f4c..7aa4417e085f 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -55,7 +55,7 @@ const struct inode_operations jffs2_dir_inode_operations =
 	.rmdir =	jffs2_rmdir,
 	.mknod =	jffs2_mknod,
 	.rename =	jffs2_rename,
-	.permission =	jffs2_permission,
+	.check_acl =	jffs2_check_acl,
 	.setattr =	jffs2_setattr,
 	.setxattr =	jffs2_setxattr,
 	.getxattr =	jffs2_getxattr,
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index 23c947539864..b7b74e299142 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -56,7 +56,7 @@ const struct file_operations jffs2_file_operations =
 
 const struct inode_operations jffs2_file_inode_operations =
 {
-	.permission =	jffs2_permission,
+	.check_acl =	jffs2_check_acl,
 	.setattr =	jffs2_setattr,
 	.setxattr =	jffs2_setxattr,
 	.getxattr =	jffs2_getxattr,
diff --git a/fs/jffs2/symlink.c b/fs/jffs2/symlink.c
index b7339c3b6ad9..4ec11e8bda8c 100644
--- a/fs/jffs2/symlink.c
+++ b/fs/jffs2/symlink.c
@@ -21,7 +21,7 @@ const struct inode_operations jffs2_symlink_inode_operations =
 {
 	.readlink =	generic_readlink,
 	.follow_link =	jffs2_follow_link,
-	.permission =	jffs2_permission,
+	.check_acl =	jffs2_check_acl,
 	.setattr =	jffs2_setattr,
 	.setxattr =	jffs2_setxattr,
 	.getxattr =	jffs2_getxattr,
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index a29c7c3e3fb8..d66477c34306 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -114,7 +114,7 @@ out:
 	return rc;
 }
 
-static int jfs_check_acl(struct inode *inode, int mask)
+int jfs_check_acl(struct inode *inode, int mask)
 {
 	struct posix_acl *acl = jfs_get_acl(inode, ACL_TYPE_ACCESS);
 
@@ -129,11 +129,6 @@ static int jfs_check_acl(struct inode *inode, int mask)
 	return -EAGAIN;
 }
 
-int jfs_permission(struct inode *inode, int mask)
-{
-	return generic_permission(inode, mask, jfs_check_acl);
-}
-
 int jfs_init_acl(tid_t tid, struct inode *inode, struct inode *dir)
 {
 	struct posix_acl *acl = NULL;
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 7f6063acaa3b..2b70fa78e4a7 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -96,7 +96,7 @@ const struct inode_operations jfs_file_inode_operations = {
 	.removexattr	= jfs_removexattr,
 #ifdef CONFIG_JFS_POSIX_ACL
 	.setattr	= jfs_setattr,
-	.permission	= jfs_permission,
+	.check_acl	= jfs_check_acl,
 #endif
 };
 
diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h
index 88475f10a389..b07bd417ef85 100644
--- a/fs/jfs/jfs_acl.h
+++ b/fs/jfs/jfs_acl.h
@@ -20,7 +20,7 @@
 
 #ifdef CONFIG_JFS_POSIX_ACL
 
-int jfs_permission(struct inode *, int);
+int jfs_check_acl(struct inode *, int);
 int jfs_init_acl(tid_t, struct inode *, struct inode *);
 int jfs_setattr(struct dentry *, struct iattr *);
 
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 514ee2edb92a..c79a4270f083 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -1543,7 +1543,7 @@ const struct inode_operations jfs_dir_inode_operations = {
 	.removexattr	= jfs_removexattr,
 #ifdef CONFIG_JFS_POSIX_ACL
 	.setattr	= jfs_setattr,
-	.permission	= jfs_permission,
+	.check_acl	= jfs_check_acl,
 #endif
 };
 
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 8070b34cc287..6c32f1d63d8c 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -484,14 +484,6 @@ xfs_vn_put_link(
 		kfree(s);
 }
 
-STATIC int
-xfs_vn_permission(
-	struct inode		*inode,
-	int			mask)
-{
-	return generic_permission(inode, mask, xfs_check_acl);
-}
-
 STATIC int
 xfs_vn_getattr(
 	struct vfsmount		*mnt,
@@ -696,7 +688,7 @@ xfs_vn_fiemap(
 }
 
 static const struct inode_operations xfs_inode_operations = {
-	.permission		= xfs_vn_permission,
+	.check_acl		= xfs_check_acl,
 	.truncate		= xfs_vn_truncate,
 	.getattr		= xfs_vn_getattr,
 	.setattr		= xfs_vn_setattr,
@@ -724,7 +716,7 @@ static const struct inode_operations xfs_dir_inode_operations = {
 	.rmdir			= xfs_vn_unlink,
 	.mknod			= xfs_vn_mknod,
 	.rename			= xfs_vn_rename,
-	.permission		= xfs_vn_permission,
+	.check_acl		= xfs_check_acl,
 	.getattr		= xfs_vn_getattr,
 	.setattr		= xfs_vn_setattr,
 	.setxattr		= generic_setxattr,
@@ -749,7 +741,7 @@ static const struct inode_operations xfs_dir_ci_inode_operations = {
 	.rmdir			= xfs_vn_unlink,
 	.mknod			= xfs_vn_mknod,
 	.rename			= xfs_vn_rename,
-	.permission		= xfs_vn_permission,
+	.check_acl		= xfs_check_acl,
 	.getattr		= xfs_vn_getattr,
 	.setattr		= xfs_vn_setattr,
 	.setxattr		= generic_setxattr,
@@ -762,7 +754,7 @@ static const struct inode_operations xfs_symlink_inode_operations = {
 	.readlink		= generic_readlink,
 	.follow_link		= xfs_vn_follow_link,
 	.put_link		= xfs_vn_put_link,
-	.permission		= xfs_vn_permission,
+	.check_acl		= xfs_check_acl,
 	.getattr		= xfs_vn_getattr,
 	.setattr		= xfs_vn_setattr,
 	.setxattr		= generic_setxattr,
-- 
cgit v1.2.3-70-g09d2


From dbab8360ed8abca38121109feab47c4bea895994 Mon Sep 17 00:00:00 2001
From: Harshula Jayasuriya <harshula@redhat.com>
Date: Tue, 8 Sep 2009 19:49:33 -0400
Subject: NFS: out of date comment regarding O_EXCL above nfs3_proc_create()

Hi Trond,

Recently we were observing the behaviour difference between a 2.4.x and
2.6.x kernel with respect to O_EXCL. A comment from 2.4.x era, "For now,
we don't implement O_EXCL." seems inaccurate in TOT.

If so, here's a patch to remove the comment.

This patch is against:
git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6

Signed-off-by: Harshula Jayasuriya <harshula@redhat.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs3proc.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index d0cc5ce0edfe..ee6a13f05443 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -299,7 +299,6 @@ static void nfs3_free_createdata(struct nfs3_createdata *data)
 
 /*
  * Create a regular file.
- * For now, we don't implement O_EXCL.
  */
 static int
 nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
-- 
cgit v1.2.3-70-g09d2


From 4cfd74fc99a41fdc161f243e1c16199656d33ab1 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 8 Sep 2009 19:49:47 -0400
Subject: NFS: Mount option parser should detect missing "port="

The meaning of not specifying the "port=" mount option is different
for "-t nfs" and "-t nfs4" mounts.  The default port value for
NFSv2/v3 mounts is 0, but the default for NFSv4 mounts is 2049.

To support "-t nfs -o vers=4", the mount option parser must detect
when "port=" is missing so that the correct default port value can be
set depending on which NFS version is requested.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/internal.h |  9 +++++++--
 fs/nfs/super.c    | 55 +++++++++++++++++++++++++++++--------------------------
 2 files changed, 36 insertions(+), 28 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 2e485677019c..5e686a4d744e 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -48,6 +48,11 @@ struct nfs_clone_mount {
  */
 #define NFS_MAX_SECFLAVORS	(12)
 
+/*
+ * Value used if the user did not specify a port value.
+ */
+#define NFS_UNSPEC_PORT		(-1)
+
 /*
  * In-kernel mount arguments
  */
@@ -71,7 +76,7 @@ struct nfs_parsed_mount_data {
 		size_t			addrlen;
 		char			*hostname;
 		u32			version;
-		unsigned short		port;
+		int			port;
 		unsigned short		protocol;
 	} mount_server;
 
@@ -80,7 +85,7 @@ struct nfs_parsed_mount_data {
 		size_t			addrlen;
 		char			*hostname;
 		char			*export_path;
-		unsigned short		port;
+		int			port;
 		unsigned short		protocol;
 	} nfs_server;
 
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index f3a95df4b95f..05544336f7de 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -746,6 +746,21 @@ static int nfs_verify_server_address(struct sockaddr *addr)
 	return 0;
 }
 
+/*
+ * Select between a default port value and a user-specified port value.
+ * If a zero value is set, then autobind will be used.
+ */
+static void nfs_set_default_port(struct sockaddr *sap, const int parsed_port,
+				 const unsigned short default_port)
+{
+	unsigned short port = default_port;
+
+	if (parsed_port != NFS_UNSPEC_PORT)
+		port = parsed_port;
+
+	rpc_set_port(sap, port);
+}
+
 /*
  * Sanity check the NFS transport protocol.
  *
@@ -1415,11 +1430,7 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
 		args->mount_server.addrlen = args->nfs_server.addrlen;
 	}
 	request.salen = args->mount_server.addrlen;
-
-	/*
-	 * autobind will be used if mount_server.port == 0
-	 */
-	rpc_set_port(request.sap, args->mount_server.port);
+	nfs_set_default_port(request.sap, args->mount_server.port, 0);
 
 	/*
 	 * Now ask the mount server to map our export path
@@ -1597,6 +1608,7 @@ static int nfs_validate_mount_data(void *options,
 				   const char *dev_name)
 {
 	struct nfs_mount_data *data = (struct nfs_mount_data *)options;
+	struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address;
 
 	if (data == NULL)
 		goto out_no_data;
@@ -1608,8 +1620,8 @@ static int nfs_validate_mount_data(void *options,
 	args->acregmax		= NFS_DEF_ACREGMAX;
 	args->acdirmin		= NFS_DEF_ACDIRMIN;
 	args->acdirmax		= NFS_DEF_ACDIRMAX;
-	args->mount_server.port	= 0;	/* autobind unless user sets port */
-	args->nfs_server.port	= 0;	/* autobind unless user sets port */
+	args->mount_server.port	= NFS_UNSPEC_PORT;
+	args->nfs_server.port	= NFS_UNSPEC_PORT;
 	args->nfs_server.protocol = XPRT_TRANSPORT_TCP;
 	args->auth_flavors[0]	= RPC_AUTH_UNIX;
 	args->auth_flavor_len	= 1;
@@ -1657,11 +1669,9 @@ static int nfs_validate_mount_data(void *options,
 		args->acdirmin		= data->acdirmin;
 		args->acdirmax		= data->acdirmax;
 
-		memcpy(&args->nfs_server.address, &data->addr,
-		       sizeof(data->addr));
+		memcpy(sap, &data->addr, sizeof(data->addr));
 		args->nfs_server.addrlen = sizeof(data->addr);
-		if (!nfs_verify_server_address((struct sockaddr *)
-						&args->nfs_server.address))
+		if (!nfs_verify_server_address(sap))
 			goto out_no_address;
 
 		if (!(data->flags & NFS_MOUNT_TCP))
@@ -1709,12 +1719,10 @@ static int nfs_validate_mount_data(void *options,
 		if (nfs_parse_mount_options((char *)options, args) == 0)
 			return -EINVAL;
 
-		if (!nfs_verify_server_address((struct sockaddr *)
-						&args->nfs_server.address))
+		if (!nfs_verify_server_address(sap))
 			goto out_no_address;
 
-		rpc_set_port((struct sockaddr *)&args->nfs_server.address,
-				args->nfs_server.port);
+		nfs_set_default_port(sap, args->nfs_server.port, 0);
 
 		nfs_set_mount_transport_protocol(args);
 
@@ -2261,7 +2269,7 @@ static int nfs4_validate_mount_data(void *options,
 				    struct nfs_parsed_mount_data *args,
 				    const char *dev_name)
 {
-	struct sockaddr_in *ap;
+	struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address;
 	struct nfs4_mount_data *data = (struct nfs4_mount_data *)options;
 	char *c;
 
@@ -2274,23 +2282,21 @@ static int nfs4_validate_mount_data(void *options,
 	args->acregmax		= NFS_DEF_ACREGMAX;
 	args->acdirmin		= NFS_DEF_ACDIRMIN;
 	args->acdirmax		= NFS_DEF_ACDIRMAX;
-	args->nfs_server.port	= NFS_PORT; /* 2049 unless user set port= */
+	args->nfs_server.port	= NFS_UNSPEC_PORT;
 	args->auth_flavors[0]	= RPC_AUTH_UNIX;
 	args->auth_flavor_len	= 1;
 	args->minorversion	= 0;
 
 	switch (data->version) {
 	case 1:
-		ap = (struct sockaddr_in *)&args->nfs_server.address;
 		if (data->host_addrlen > sizeof(args->nfs_server.address))
 			goto out_no_address;
 		if (data->host_addrlen == 0)
 			goto out_no_address;
 		args->nfs_server.addrlen = data->host_addrlen;
-		if (copy_from_user(ap, data->host_addr, data->host_addrlen))
+		if (copy_from_user(sap, data->host_addr, data->host_addrlen))
 			return -EFAULT;
-		if (!nfs_verify_server_address((struct sockaddr *)
-						&args->nfs_server.address))
+		if (!nfs_verify_server_address(sap))
 			goto out_no_address;
 
 		if (data->auth_flavourlen) {
@@ -2342,12 +2348,9 @@ static int nfs4_validate_mount_data(void *options,
 		if (nfs_parse_mount_options((char *)options, args) == 0)
 			return -EINVAL;
 
-		if (!nfs_verify_server_address((struct sockaddr *)
-						&args->nfs_server.address))
+		if (!nfs_verify_server_address(sap))
 			return -EINVAL;
-
-		rpc_set_port((struct sockaddr *)&args->nfs_server.address,
-				args->nfs_server.port);
+		nfs_set_default_port(sap, args->nfs_server.port, NFS_PORT);
 
 		nfs_validate_transport_protocol(args);
 
-- 
cgit v1.2.3-70-g09d2


From 7630c852e19c7fffb85b50d98eeb5516fec7c088 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 8 Sep 2009 19:49:57 -0400
Subject: NFS: Refactor NFSv4 text-based mount option validation

Clean up: Refactor the part of nfs4_validate_mount_options() that
handles text-based options, so we can call it from the NFSv2/v3
option validation function.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/super.c | 63 ++++++++++++++++++++++++++++++++--------------------------
 1 file changed, 35 insertions(+), 28 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 05544336f7de..599e8c5fe6ce 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -272,6 +272,8 @@ static const struct super_operations nfs_sops = {
 };
 
 #ifdef CONFIG_NFS_V4
+static int nfs4_validate_text_mount_data(void *options,
+	struct nfs_parsed_mount_data *args, const char *dev_name);
 static int nfs4_get_sb(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
 static int nfs4_remote_get_sb(struct file_system_type *fs_type,
@@ -2262,6 +2264,37 @@ static void nfs4_validate_mount_flags(struct nfs_parsed_mount_data *args)
 	args->flags &= ~(NFS_MOUNT_NONLM|NFS_MOUNT_NOACL|NFS_MOUNT_VER3);
 }
 
+static int nfs4_validate_text_mount_data(void *options,
+					 struct nfs_parsed_mount_data *args,
+					 const char *dev_name)
+{
+	struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address;
+
+	nfs_set_default_port(sap, args->nfs_server.port, NFS_PORT);
+
+	nfs_validate_transport_protocol(args);
+
+	nfs4_validate_mount_flags(args);
+
+	if (args->auth_flavor_len > 1) {
+		dfprintk(MOUNT,
+			 "NFS4: Too many RPC auth flavours specified\n");
+		return -EINVAL;
+	}
+
+	if (args->client_address == NULL) {
+		dfprintk(MOUNT,
+			 "NFS4: mount program didn't pass callback address\n");
+		return -EINVAL;
+	}
+
+	return nfs_parse_devname(dev_name,
+				   &args->nfs_server.hostname,
+				   NFS4_MAXNAMLEN,
+				   &args->nfs_server.export_path,
+				   NFS4_MAXPATHLEN);
+}
+
 /*
  * Validate NFSv4 mount options
  */
@@ -2342,36 +2375,14 @@ static int nfs4_validate_mount_data(void *options,
 		nfs_validate_transport_protocol(args);
 
 		break;
-	default: {
-		int status;
-
+	default:
 		if (nfs_parse_mount_options((char *)options, args) == 0)
 			return -EINVAL;
 
 		if (!nfs_verify_server_address(sap))
 			return -EINVAL;
-		nfs_set_default_port(sap, args->nfs_server.port, NFS_PORT);
-
-		nfs_validate_transport_protocol(args);
-
-		nfs4_validate_mount_flags(args);
-
-		if (args->auth_flavor_len > 1)
-			goto out_inval_auth;
-
-		if (args->client_address == NULL)
-			goto out_no_client_address;
-
-		status = nfs_parse_devname(dev_name,
-					   &args->nfs_server.hostname,
-					   NFS4_MAXNAMLEN,
-					   &args->nfs_server.export_path,
-					   NFS4_MAXPATHLEN);
-		if (status < 0)
-			return status;
 
-		break;
-		}
+		return nfs4_validate_text_mount_data(options, args, dev_name);
 	}
 
 	return 0;
@@ -2388,10 +2399,6 @@ out_inval_auth:
 out_no_address:
 	dfprintk(MOUNT, "NFS4: mount program didn't pass remote address\n");
 	return -EINVAL;
-
-out_no_client_address:
-	dfprintk(MOUNT, "NFS4: mount program didn't pass callback address\n");
-	return -EINVAL;
 }
 
 /*
-- 
cgit v1.2.3-70-g09d2


From a6fe23be90aa78783523a25330e09bfaa43a1581 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 8 Sep 2009 19:50:00 -0400
Subject: NFS: Move details of nfs4_get_sb() to a helper

Clean up: Refactor nfs4_get_sb() to allow its guts to be invoked by
nfs_get_sb().

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/super.c | 44 +++++++++++++++++++++++++++++++-------------
 1 file changed, 31 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 599e8c5fe6ce..c105e12197c2 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -274,6 +274,8 @@ static const struct super_operations nfs_sops = {
 #ifdef CONFIG_NFS_V4
 static int nfs4_validate_text_mount_data(void *options,
 	struct nfs_parsed_mount_data *args, const char *dev_name);
+static int nfs4_try_mount(int flags, const char *dev_name,
+	struct nfs_parsed_mount_data *data, struct vfsmount *mnt);
 static int nfs4_get_sb(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
 static int nfs4_remote_get_sb(struct file_system_type *fs_type,
@@ -2565,6 +2567,34 @@ out_err:
 	return ret;
 }
 
+static int nfs4_try_mount(int flags, const char *dev_name,
+			 struct nfs_parsed_mount_data *data,
+			 struct vfsmount *mnt)
+{
+	char *export_path;
+	struct vfsmount *root_mnt;
+	int error;
+
+	dfprintk(MOUNT, "--> nfs4_try_mount()\n");
+
+	export_path = data->nfs_server.export_path;
+	data->nfs_server.export_path = "/";
+	root_mnt = nfs_do_root_mount(&nfs4_remote_fs_type, flags, data,
+			data->nfs_server.hostname);
+	data->nfs_server.export_path = export_path;
+
+	error = PTR_ERR(root_mnt);
+	if (IS_ERR(root_mnt))
+		goto out;
+
+	error = nfs_follow_remote_path(root_mnt, export_path, mnt);
+
+out:
+	dfprintk(MOUNT, "<-- nfs4_try_mount() = %d%s\n", error,
+			error != 0 ? " [error]" : "");
+	return error;
+}
+
 /*
  * Get the superblock for an NFS4 mountpoint
  */
@@ -2572,8 +2602,6 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt)
 {
 	struct nfs_parsed_mount_data *data;
-	char *export_path;
-	struct vfsmount *root_mnt;
 	int error = -ENOMEM;
 
 	data = kzalloc(sizeof(*data), GFP_KERNEL);
@@ -2585,17 +2613,7 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
 	if (error < 0)
 		goto out;
 
-	export_path = data->nfs_server.export_path;
-	data->nfs_server.export_path = "/";
-	root_mnt = nfs_do_root_mount(&nfs4_remote_fs_type, flags, data,
-			data->nfs_server.hostname);
-	data->nfs_server.export_path = export_path;
-
-	error = PTR_ERR(root_mnt);
-	if (IS_ERR(root_mnt))
-		goto out;
-
-	error = nfs_follow_remote_path(root_mnt, export_path, mnt);
+	error = nfs4_try_mount(flags, dev_name, data, mnt);
 
 out:
 	kfree(data->client_address);
-- 
cgit v1.2.3-70-g09d2


From 764302ccb88dd0df062eccd507b6c6de24f1c560 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 8 Sep 2009 19:50:03 -0400
Subject: NFS: Allow the "nfs" file system type to support NFSv4

When mounting an "nfs" type file system, recognize "v4," "vers=4," or
"nfsvers=4" mount options, and convert the file system to "nfs4" under
the covers.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
[trondmy: fixed up binary mount code so it sets the 'version' field too]
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/internal.h    |  1 +
 fs/nfs/super.c       | 48 ++++++++++++++++++++++++++++++++++++++++++++++--
 include/linux/nfs4.h |  1 +
 3 files changed, 48 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 5e686a4d744e..e21b1bb9972f 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -68,6 +68,7 @@ struct nfs_parsed_mount_data {
 	unsigned int		auth_flavor_len;
 	rpc_authflavor_t	auth_flavors[1];
 	char			*client_address;
+	unsigned int		version;
 	unsigned int		minorversion;
 	char			*fscache_uniq;
 
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index c105e12197c2..34b1ccf51adf 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -73,7 +73,7 @@ enum {
 	Opt_cto, Opt_nocto,
 	Opt_ac, Opt_noac,
 	Opt_lock, Opt_nolock,
-	Opt_v2, Opt_v3,
+	Opt_v2, Opt_v3, Opt_v4,
 	Opt_udp, Opt_tcp, Opt_rdma,
 	Opt_acl, Opt_noacl,
 	Opt_rdirplus, Opt_nordirplus,
@@ -127,6 +127,7 @@ static const match_table_t nfs_mount_option_tokens = {
 	{ Opt_nolock, "nolock" },
 	{ Opt_v2, "v2" },
 	{ Opt_v3, "v3" },
+	{ Opt_v4, "v4" },
 	{ Opt_udp, "udp" },
 	{ Opt_tcp, "tcp" },
 	{ Opt_rdma, "rdma" },
@@ -934,10 +935,18 @@ static int nfs_parse_mount_options(char *raw,
 			break;
 		case Opt_v2:
 			mnt->flags &= ~NFS_MOUNT_VER3;
+			mnt->version = 2;
 			break;
 		case Opt_v3:
 			mnt->flags |= NFS_MOUNT_VER3;
+			mnt->version = 3;
 			break;
+#ifdef CONFIG_NFS_V4
+		case Opt_v4:
+			mnt->flags &= ~NFS_MOUNT_VER3;
+			mnt->version = 4;
+			break;
+#endif
 		case Opt_udp:
 			mnt->flags &= ~NFS_MOUNT_TCP;
 			mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP;
@@ -1151,10 +1160,18 @@ static int nfs_parse_mount_options(char *raw,
 			switch (option) {
 			case NFS2_VERSION:
 				mnt->flags &= ~NFS_MOUNT_VER3;
+				mnt->version = 2;
 				break;
 			case NFS3_VERSION:
 				mnt->flags |= NFS_MOUNT_VER3;
+				mnt->version = 3;
+				break;
+#ifdef CONFIG_NFS_V4
+			case NFS4_VERSION:
+				mnt->flags &= ~NFS_MOUNT_VER3;
+				mnt->version = 4;
 				break;
+#endif
 			default:
 				goto out_invalid_value;
 			}
@@ -1629,6 +1646,7 @@ static int nfs_validate_mount_data(void *options,
 	args->nfs_server.protocol = XPRT_TRANSPORT_TCP;
 	args->auth_flavors[0]	= RPC_AUTH_UNIX;
 	args->auth_flavor_len	= 1;
+	args->minorversion	= 0;
 
 	switch (data->version) {
 	case 1:
@@ -1650,8 +1668,11 @@ static int nfs_validate_mount_data(void *options,
 			if (data->root.size > NFS3_FHSIZE || data->root.size == 0)
 				goto out_invalid_fh;
 			mntfh->size = data->root.size;
-		} else
+			args->version = 3;
+		} else {
 			mntfh->size = NFS2_FHSIZE;
+			args->version = 2;
+		}
 
 
 		memcpy(mntfh->data, data->root.data, mntfh->size);
@@ -1726,6 +1747,14 @@ static int nfs_validate_mount_data(void *options,
 		if (!nfs_verify_server_address(sap))
 			goto out_no_address;
 
+		if (args->version == 4)
+#ifdef CONFIG_NFS_V4
+			return nfs4_validate_text_mount_data(options,
+							     args, dev_name);
+#else
+			goto out_v4_not_compiled;
+#endif
+
 		nfs_set_default_port(sap, args->nfs_server.port, 0);
 
 		nfs_set_mount_transport_protocol(args);
@@ -1774,6 +1803,12 @@ out_v3_not_compiled:
 	return -EPROTONOSUPPORT;
 #endif /* !CONFIG_NFS_V3 */
 
+#ifndef CONFIG_NFS_V4
+out_v4_not_compiled:
+	dfprintk(MOUNT, "NFS: NFSv4 is not compiled into kernel\n");
+	return -EPROTONOSUPPORT;
+#endif /* !CONFIG_NFS_V4 */
+
 out_nomem:
 	dfprintk(MOUNT, "NFS: not enough memory to handle mount options\n");
 	return -ENOMEM;
@@ -2069,6 +2104,14 @@ static int nfs_get_sb(struct file_system_type *fs_type,
 	if (error < 0)
 		goto out;
 
+#ifdef CONFIG_NFS_V4
+	if (data->version == 4) {
+		error = nfs4_try_mount(flags, dev_name, data, mnt);
+		kfree(data->client_address);
+		goto out;
+	}
+#endif	/* CONFIG_NFS_V4 */
+
 	/* Get a volume representation */
 	server = nfs_create_server(data, mntfh);
 	if (IS_ERR(server)) {
@@ -2320,6 +2363,7 @@ static int nfs4_validate_mount_data(void *options,
 	args->nfs_server.port	= NFS_UNSPEC_PORT;
 	args->auth_flavors[0]	= RPC_AUTH_UNIX;
 	args->auth_flavor_len	= 1;
+	args->version		= 4;
 	args->minorversion	= 0;
 
 	switch (data->version) {
diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index bd2eba530667..33b283601f62 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -472,6 +472,7 @@ enum lock_type4 {
 
 #define NFSPROC4_NULL 0
 #define NFSPROC4_COMPOUND 1
+#define NFS4_VERSION 4
 #define NFS4_MINOR_VERSION 0
 
 #if defined(CONFIG_NFS_V4_1)
-- 
cgit v1.2.3-70-g09d2


From 2ecda72b49a0849ce41e7fa1fa974a245b9119f8 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 8 Sep 2009 19:50:07 -0400
Subject: NFSv4: Disallow 'mount -t nfs4 -overs=2' and 'mount -t nfs4 -overs=3'

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/super.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs')

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 34b1ccf51adf..867f70504531 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2321,6 +2321,12 @@ static int nfs4_validate_text_mount_data(void *options,
 
 	nfs4_validate_mount_flags(args);
 
+	if (args->version != 4) {
+		dfprintk(MOUNT,
+			 "NFS4: Illegal mount version\n");
+		return -EINVAL;
+	}
+
 	if (args->auth_flavor_len > 1) {
 		dfprintk(MOUNT,
 			 "NFS4: Too many RPC auth flavours specified\n");
-- 
cgit v1.2.3-70-g09d2


From 2b88f7c535a8125213def012a67c1b0a667ceb2e Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 9 Sep 2009 15:59:35 +0100
Subject: GFS2: Remove unused sysfs file

The /sys/fs/gfs2/<fsname>/lock_module/id file has been unused for
some time now, so we can remove it. We still accept the mount option
though, as userspace still sends that.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/incore.h     | 1 -
 fs/gfs2/ops_fstype.c | 6 +-----
 fs/gfs2/sys.c        | 8 --------
 3 files changed, 1 insertion(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 52436abc3b4d..6edb423f90b3 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -496,7 +496,6 @@ struct gfs2_sb_host {
  */
 
 struct lm_lockstruct {
-	u32 ls_id;
 	unsigned int ls_jid;
 	unsigned int ls_first;
 	unsigned int ls_first_done;
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 8e3f00e1430d..52fb6c048981 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -1028,7 +1028,6 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
 
 	ls->ls_ops = lm;
 	ls->ls_first = 1;
-	ls->ls_id = 0;
 
 	for (options = args->ar_hostdata; (o = strsep(&options, ":")); ) {
 		substring_t tmp[MAX_OPT_ARGS];
@@ -1046,10 +1045,7 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
 			ls->ls_jid = option;
 			break;
 		case Opt_id:
-			ret = match_int(&tmp[0], &option);
-			if (ret)
-				goto hostdata_error;
-			ls->ls_id = option;
+			/* Obsolete, but left for backward compat purposes */
 			break;
 		case Opt_first:
 			ret = match_int(&tmp[0], &option);
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 0d4f7e974e9f..446329728d52 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -320,12 +320,6 @@ static ssize_t block_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
 	return ret;
 }
 
-static ssize_t lkid_show(struct gfs2_sbd *sdp, char *buf)
-{
-	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
-	return sprintf(buf, "%u\n", ls->ls_id);
-}
-
 static ssize_t lkfirst_show(struct gfs2_sbd *sdp, char *buf)
 {
 	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
@@ -390,7 +384,6 @@ static struct gfs2_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store)
 GDLM_ATTR(proto_name,		0444, proto_name_show,		NULL);
 GDLM_ATTR(block,		0644, block_show,		block_store);
 GDLM_ATTR(withdraw,		0644, withdraw_show,		withdraw_store);
-GDLM_ATTR(id,			0444, lkid_show,		NULL);
 GDLM_ATTR(jid,			0444, jid_show,			NULL);
 GDLM_ATTR(first,		0444, lkfirst_show,		NULL);
 GDLM_ATTR(first_done,		0444, first_done_show,		NULL);
@@ -402,7 +395,6 @@ static struct attribute *lock_module_attrs[] = {
 	&gdlm_attr_proto_name.attr,
 	&gdlm_attr_block.attr,
 	&gdlm_attr_withdraw.attr,
-	&gdlm_attr_id.attr,
 	&gdlm_attr_jid.attr,
 	&gdlm_attr_first.attr,
 	&gdlm_attr_first_done.attr,
-- 
cgit v1.2.3-70-g09d2


From 4734d401d43c6469d568caf223d37aa0fc1bf4dc Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Wed, 9 Sep 2009 18:19:02 -0500
Subject: xfs: use correct log reservation when handling ENOSPC in xfs_create

We added the ENOSPC handling patch in xfs_create just after it got mered
with xfs_mkdir.  Change the log reservation to the variable for either
the create or mkdir value so it does the right thing if get here for creating
a directory.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_vnodeops.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 03d3100559ac..a434f287962d 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -1471,8 +1471,8 @@ xfs_create(
 	if (error == ENOSPC) {
 		/* flush outstanding delalloc blocks and retry */
 		xfs_flush_inodes(dp);
-		error = xfs_trans_reserve(tp, resblks, XFS_CREATE_LOG_RES(mp), 0,
-			XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT);
+		error = xfs_trans_reserve(tp, resblks, log_res, 0,
+				XFS_TRANS_PERM_LOG_RES, log_count);
 	}
 	if (error == ENOSPC) {
 		/* No space at all so try a "no-allocation" reservation */
-- 
cgit v1.2.3-70-g09d2


From b1ab7e4b2a88d3ac13771463be8f302ce1616cfc Mon Sep 17 00:00:00 2001
From: "David P. Quigley" <dpquigl@tycho.nsa.gov>
Date: Thu, 3 Sep 2009 14:25:56 -0400
Subject: VFS: Factor out part of vfs_setxattr so it can be called from the
 SELinux hook for inode_setsecctx.

This factors out the part of the vfs_setxattr function that performs the
setting of the xattr and its notification. This is needed so the SELinux
implementation of inode_setsecctx can handle the setting of the xattr while
maintaining the proper separation of layers.

Signed-off-by: David P. Quigley <dpquigl@tycho.nsa.gov>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 fs/xattr.c            | 55 +++++++++++++++++++++++++++++++++++++++------------
 include/linux/xattr.h |  1 +
 2 files changed, 43 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/xattr.c b/fs/xattr.c
index 1c3d0af59ddf..6d4f6d3449fb 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -66,22 +66,28 @@ xattr_permission(struct inode *inode, const char *name, int mask)
 	return inode_permission(inode, mask);
 }
 
-int
-vfs_setxattr(struct dentry *dentry, const char *name, const void *value,
-		size_t size, int flags)
+/**
+ *  __vfs_setxattr_noperm - perform setxattr operation without performing
+ *  permission checks.
+ *
+ *  @dentry - object to perform setxattr on
+ *  @name - xattr name to set
+ *  @value - value to set @name to
+ *  @size - size of @value
+ *  @flags - flags to pass into filesystem operations
+ *
+ *  returns the result of the internal setxattr or setsecurity operations.
+ *
+ *  This function requires the caller to lock the inode's i_mutex before it
+ *  is executed. It also assumes that the caller will make the appropriate
+ *  permission checks.
+ */
+int __vfs_setxattr_noperm(struct dentry *dentry, const char *name,
+		const void *value, size_t size, int flags)
 {
 	struct inode *inode = dentry->d_inode;
-	int error;
-
-	error = xattr_permission(inode, name, MAY_WRITE);
-	if (error)
-		return error;
+	int error = -EOPNOTSUPP;
 
-	mutex_lock(&inode->i_mutex);
-	error = security_inode_setxattr(dentry, name, value, size, flags);
-	if (error)
-		goto out;
-	error = -EOPNOTSUPP;
 	if (inode->i_op->setxattr) {
 		error = inode->i_op->setxattr(dentry, name, value, size, flags);
 		if (!error) {
@@ -97,6 +103,29 @@ vfs_setxattr(struct dentry *dentry, const char *name, const void *value,
 		if (!error)
 			fsnotify_xattr(dentry);
 	}
+
+	return error;
+}
+
+
+int
+vfs_setxattr(struct dentry *dentry, const char *name, const void *value,
+		size_t size, int flags)
+{
+	struct inode *inode = dentry->d_inode;
+	int error;
+
+	error = xattr_permission(inode, name, MAY_WRITE);
+	if (error)
+		return error;
+
+	mutex_lock(&inode->i_mutex);
+	error = security_inode_setxattr(dentry, name, value, size, flags);
+	if (error)
+		goto out;
+
+	error = __vfs_setxattr_noperm(dentry, name, value, size, flags);
+
 out:
 	mutex_unlock(&inode->i_mutex);
 	return error;
diff --git a/include/linux/xattr.h b/include/linux/xattr.h
index d131e352cfe1..5c84af8c5f6f 100644
--- a/include/linux/xattr.h
+++ b/include/linux/xattr.h
@@ -49,6 +49,7 @@ struct xattr_handler {
 ssize_t xattr_getsecurity(struct inode *, const char *, void *, size_t);
 ssize_t vfs_getxattr(struct dentry *, const char *, void *, size_t);
 ssize_t vfs_listxattr(struct dentry *d, char *list, size_t size);
+int __vfs_setxattr_noperm(struct dentry *, const char *, const void *, size_t, int);
 int vfs_setxattr(struct dentry *, const char *, const void *, size_t, int);
 int vfs_removexattr(struct dentry *, const char *);
 
-- 
cgit v1.2.3-70-g09d2


From ddd29ec6597125c830f7badb608a86c98b936b64 Mon Sep 17 00:00:00 2001
From: "David P. Quigley" <dpquigl@tycho.nsa.gov>
Date: Wed, 9 Sep 2009 14:25:37 -0400
Subject: sysfs: Add labeling support for sysfs

This patch adds a setxattr handler to the file, directory, and symlink
inode_operations structures for sysfs. The patch uses hooks introduced in the
previous patch to handle the getting and setting of security information for
the sysfs inodes. As was suggested by Eric Biederman the struct iattr in the
sysfs_dirent structure has been replaced by a structure which contains the
iattr, secdata and secdata length to allow the changes to persist in the event
that the inode representing the sysfs_dirent is evicted. Because sysfs only
stores this information when a change is made all the optional data is moved
into one dynamically allocated field.

This patch addresses an issue where SELinux was denying virtd access to the PCI
configuration entries in sysfs. The lack of setxattr handlers for sysfs
required that a single label be assigned to all entries in sysfs. Granting virtd
access to every entry in sysfs is not an acceptable solution so fine grained
labeling of sysfs is required such that individual entries can be labeled
appropriately.

[sds:  Fixed compile-time warnings, coding style, and setting of inode security init flags.]

Signed-off-by: David P. Quigley <dpquigl@tycho.nsa.gov>
Signed-off-by: Stephen D. Smalley <sds@tycho.nsa.gov>
Signed-off-by: James Morris <jmorris@namei.org>
---
 fs/sysfs/dir.c             |   1 +
 fs/sysfs/inode.c           | 134 +++++++++++++++++++++++++++++++++------------
 fs/sysfs/symlink.c         |   2 +
 fs/sysfs/sysfs.h           |  12 +++-
 security/selinux/hooks.c   |   5 ++
 security/smack/smack_lsm.c |   1 +
 6 files changed, 118 insertions(+), 37 deletions(-)

(limited to 'fs')

diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 14f2d71ea3ce..0050fc40e8c9 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -760,6 +760,7 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry,
 const struct inode_operations sysfs_dir_inode_operations = {
 	.lookup		= sysfs_lookup,
 	.setattr	= sysfs_setattr,
+	.setxattr	= sysfs_setxattr,
 };
 
 static void remove_dir(struct sysfs_dirent *sd)
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index 555f0ff988df..2b6a8d9de73d 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -18,6 +18,8 @@
 #include <linux/capability.h>
 #include <linux/errno.h>
 #include <linux/sched.h>
+#include <linux/xattr.h>
+#include <linux/security.h>
 #include "sysfs.h"
 
 extern struct super_block * sysfs_sb;
@@ -35,6 +37,7 @@ static struct backing_dev_info sysfs_backing_dev_info = {
 
 static const struct inode_operations sysfs_inode_operations ={
 	.setattr	= sysfs_setattr,
+	.setxattr	= sysfs_setxattr,
 };
 
 int __init sysfs_inode_init(void)
@@ -42,18 +45,37 @@ int __init sysfs_inode_init(void)
 	return bdi_init(&sysfs_backing_dev_info);
 }
 
+struct sysfs_inode_attrs *sysfs_init_inode_attrs(struct sysfs_dirent *sd)
+{
+	struct sysfs_inode_attrs *attrs;
+	struct iattr *iattrs;
+
+	attrs = kzalloc(sizeof(struct sysfs_inode_attrs), GFP_KERNEL);
+	if (!attrs)
+		return NULL;
+	iattrs = &attrs->ia_iattr;
+
+	/* assign default attributes */
+	iattrs->ia_mode = sd->s_mode;
+	iattrs->ia_uid = 0;
+	iattrs->ia_gid = 0;
+	iattrs->ia_atime = iattrs->ia_mtime = iattrs->ia_ctime = CURRENT_TIME;
+
+	return attrs;
+}
 int sysfs_setattr(struct dentry * dentry, struct iattr * iattr)
 {
 	struct inode * inode = dentry->d_inode;
 	struct sysfs_dirent * sd = dentry->d_fsdata;
-	struct iattr * sd_iattr;
+	struct sysfs_inode_attrs *sd_attrs;
+	struct iattr *iattrs;
 	unsigned int ia_valid = iattr->ia_valid;
 	int error;
 
 	if (!sd)
 		return -EINVAL;
 
-	sd_iattr = sd->s_iattr;
+	sd_attrs = sd->s_iattr;
 
 	error = inode_change_ok(inode, iattr);
 	if (error)
@@ -65,42 +87,77 @@ int sysfs_setattr(struct dentry * dentry, struct iattr * iattr)
 	if (error)
 		return error;
 
-	if (!sd_iattr) {
+	if (!sd_attrs) {
 		/* setting attributes for the first time, allocate now */
-		sd_iattr = kzalloc(sizeof(struct iattr), GFP_KERNEL);
-		if (!sd_iattr)
+		sd_attrs = sysfs_init_inode_attrs(sd);
+		if (!sd_attrs)
 			return -ENOMEM;
-		/* assign default attributes */
-		sd_iattr->ia_mode = sd->s_mode;
-		sd_iattr->ia_uid = 0;
-		sd_iattr->ia_gid = 0;
-		sd_iattr->ia_atime = sd_iattr->ia_mtime = sd_iattr->ia_ctime = CURRENT_TIME;
-		sd->s_iattr = sd_iattr;
+		sd->s_iattr = sd_attrs;
+	} else {
+		/* attributes were changed at least once in past */
+		iattrs = &sd_attrs->ia_iattr;
+
+		if (ia_valid & ATTR_UID)
+			iattrs->ia_uid = iattr->ia_uid;
+		if (ia_valid & ATTR_GID)
+			iattrs->ia_gid = iattr->ia_gid;
+		if (ia_valid & ATTR_ATIME)
+			iattrs->ia_atime = timespec_trunc(iattr->ia_atime,
+					inode->i_sb->s_time_gran);
+		if (ia_valid & ATTR_MTIME)
+			iattrs->ia_mtime = timespec_trunc(iattr->ia_mtime,
+					inode->i_sb->s_time_gran);
+		if (ia_valid & ATTR_CTIME)
+			iattrs->ia_ctime = timespec_trunc(iattr->ia_ctime,
+					inode->i_sb->s_time_gran);
+		if (ia_valid & ATTR_MODE) {
+			umode_t mode = iattr->ia_mode;
+
+			if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
+				mode &= ~S_ISGID;
+			iattrs->ia_mode = sd->s_mode = mode;
+		}
 	}
+	return error;
+}
 
-	/* attributes were changed atleast once in past */
-
-	if (ia_valid & ATTR_UID)
-		sd_iattr->ia_uid = iattr->ia_uid;
-	if (ia_valid & ATTR_GID)
-		sd_iattr->ia_gid = iattr->ia_gid;
-	if (ia_valid & ATTR_ATIME)
-		sd_iattr->ia_atime = timespec_trunc(iattr->ia_atime,
-						inode->i_sb->s_time_gran);
-	if (ia_valid & ATTR_MTIME)
-		sd_iattr->ia_mtime = timespec_trunc(iattr->ia_mtime,
-						inode->i_sb->s_time_gran);
-	if (ia_valid & ATTR_CTIME)
-		sd_iattr->ia_ctime = timespec_trunc(iattr->ia_ctime,
-						inode->i_sb->s_time_gran);
-	if (ia_valid & ATTR_MODE) {
-		umode_t mode = iattr->ia_mode;
-
-		if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
-			mode &= ~S_ISGID;
-		sd_iattr->ia_mode = sd->s_mode = mode;
-	}
+int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value,
+		size_t size, int flags)
+{
+	struct sysfs_dirent *sd = dentry->d_fsdata;
+	struct sysfs_inode_attrs *iattrs;
+	void *secdata;
+	int error;
+	u32 secdata_len = 0;
+
+	if (!sd)
+		return -EINVAL;
+	if (!sd->s_iattr)
+		sd->s_iattr = sysfs_init_inode_attrs(sd);
+	if (!sd->s_iattr)
+		return -ENOMEM;
+
+	iattrs = sd->s_iattr;
+
+	if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) {
+		const char *suffix = name + XATTR_SECURITY_PREFIX_LEN;
+		error = security_inode_setsecurity(dentry->d_inode, suffix,
+						value, size, flags);
+		if (error)
+			goto out;
+		error = security_inode_getsecctx(dentry->d_inode,
+						&secdata, &secdata_len);
+		if (error)
+			goto out;
+		if (iattrs->ia_secdata)
+			security_release_secctx(iattrs->ia_secdata,
+						iattrs->ia_secdata_len);
+		iattrs->ia_secdata = secdata;
+		iattrs->ia_secdata_len = secdata_len;
 
+	} else
+		return -EINVAL;
+out:
 	return error;
 }
 
@@ -146,6 +203,7 @@ static int sysfs_count_nlink(struct sysfs_dirent *sd)
 static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode)
 {
 	struct bin_attribute *bin_attr;
+	struct sysfs_inode_attrs *iattrs;
 
 	inode->i_private = sysfs_get(sd);
 	inode->i_mapping->a_ops = &sysfs_aops;
@@ -154,16 +212,20 @@ static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode)
 	inode->i_ino = sd->s_ino;
 	lockdep_set_class(&inode->i_mutex, &sysfs_inode_imutex_key);
 
-	if (sd->s_iattr) {
+	iattrs = sd->s_iattr;
+	if (iattrs) {
 		/* sysfs_dirent has non-default attributes
 		 * get them for the new inode from persistent copy
 		 * in sysfs_dirent
 		 */
-		set_inode_attr(inode, sd->s_iattr);
+		set_inode_attr(inode, &iattrs->ia_iattr);
+		if (iattrs->ia_secdata)
+			security_inode_notifysecctx(inode,
+						iattrs->ia_secdata,
+						iattrs->ia_secdata_len);
 	} else
 		set_default_inode_attr(inode, sd->s_mode);
 
-
 	/* initialize inode according to type */
 	switch (sysfs_type(sd)) {
 	case SYSFS_DIR:
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index 1d897ad808e0..c5081ad77026 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -16,6 +16,7 @@
 #include <linux/kobject.h>
 #include <linux/namei.h>
 #include <linux/mutex.h>
+#include <linux/security.h>
 
 #include "sysfs.h"
 
@@ -209,6 +210,7 @@ static void sysfs_put_link(struct dentry *dentry, struct nameidata *nd, void *co
 }
 
 const struct inode_operations sysfs_symlink_inode_operations = {
+	.setxattr = sysfs_setxattr,
 	.readlink = generic_readlink,
 	.follow_link = sysfs_follow_link,
 	.put_link = sysfs_put_link,
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index 3fa0d98481e2..af4c4e7482ac 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -8,6 +8,8 @@
  * This file is released under the GPLv2.
  */
 
+#include <linux/fs.h>
+
 struct sysfs_open_dirent;
 
 /* type-specific structures for sysfs_dirent->s_* union members */
@@ -31,6 +33,12 @@ struct sysfs_elem_bin_attr {
 	struct hlist_head	buffers;
 };
 
+struct sysfs_inode_attrs {
+	struct iattr	ia_iattr;
+	void		*ia_secdata;
+	u32		ia_secdata_len;
+};
+
 /*
  * sysfs_dirent - the building block of sysfs hierarchy.  Each and
  * every sysfs node is represented by single sysfs_dirent.
@@ -56,7 +64,7 @@ struct sysfs_dirent {
 	unsigned int		s_flags;
 	ino_t			s_ino;
 	umode_t			s_mode;
-	struct iattr		*s_iattr;
+	struct sysfs_inode_attrs *s_iattr;
 };
 
 #define SD_DEACTIVATED_BIAS		INT_MIN
@@ -148,6 +156,8 @@ static inline void __sysfs_put(struct sysfs_dirent *sd)
 struct inode *sysfs_get_inode(struct sysfs_dirent *sd);
 void sysfs_delete_inode(struct inode *inode);
 int sysfs_setattr(struct dentry *dentry, struct iattr *iattr);
+int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value,
+		size_t size, int flags);
 int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name);
 int sysfs_inode_init(void);
 
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 7118be2a74a5..417f7c994522 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -448,6 +448,10 @@ static int sb_finish_set_opts(struct super_block *sb)
 	    sbsec->behavior > ARRAY_SIZE(labeling_behaviors))
 		sbsec->flags &= ~SE_SBLABELSUPP;
 
+	/* Special handling for sysfs. Is genfs but also has setxattr handler*/
+	if (strncmp(sb->s_type->name, "sysfs", sizeof("sysfs")) == 0)
+		sbsec->flags |= SE_SBLABELSUPP;
+
 	/* Initialize the root inode. */
 	rc = inode_doinit_with_dentry(root_inode, root);
 
@@ -2923,6 +2927,7 @@ static int selinux_inode_setsecurity(struct inode *inode, const char *name,
 		return rc;
 
 	isec->sid = newsid;
+	isec->initialized = 1;
 	return 0;
 }
 
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index 0b3bb646f90e..acae7ef4092d 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -1666,6 +1666,7 @@ static int smack_inode_setsecurity(struct inode *inode, const char *name,
 
 	if (strcmp(name, XATTR_SMACK_SUFFIX) == 0) {
 		nsp->smk_inode = sp;
+		nsp->smk_flags |= SMK_INODE_INSTANT;
 		return 0;
 	}
 	/*
-- 
cgit v1.2.3-70-g09d2


From c7acb4c16646943180bd221c167a077e0a084f9c Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Wed, 9 Sep 2009 21:32:41 -0400
Subject: ext4: Use bforget() in no journal mode for
 ext4_journal_{forget,revoke}()

When ext4 is using a journal, a metadata block which is deallocated
must be passed into the journal layer so it can be dropped from the
current transaction and/or revoked.  This is done by calling the
functions ext4_journal_forget() and ext4_journal_revoke(), which call
jbd2_journal_forget(), and jbd2_journal_revoke(), respectively.

Since the jbd2_journal_forget() and jbd2_journal_revoke() call
bforget(), if ext4 is not using a journal, ext4_journal_forget() and
ext4_journal_revoke() must call bforget() to avoid a dirty metadata
block overwriting a block after it has been reallocated and reused for
another inode's data block.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4_jbd2.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index eb27fd0f2ee8..ecb9ca455fd5 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -44,7 +44,7 @@ int __ext4_journal_forget(const char *where, handle_t *handle,
 						  handle, err);
 	}
 	else
-		brelse(bh);
+		bforget(bh);
 	return err;
 }
 
@@ -60,7 +60,7 @@ int __ext4_journal_revoke(const char *where, handle_t *handle,
 						  handle, err);
 	}
 	else
-		brelse(bh);
+		bforget(bh);
 	return err;
 }
 
-- 
cgit v1.2.3-70-g09d2


From fe188c0e084bdf3038dc0ac963c21d764f53f7da Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sat, 12 Sep 2009 13:41:55 -0400
Subject: ext4: Assure that metadata blocks are written during fsync in no
 journal mode

When there is no journal present, we must attach buffer heads
associated with extent tree and indirect blocks to the inode's
mapping->private_list via mark_buffer_dirty_inode() so that
ext4_sync_file() --- which is called to service fsync() and
fdatasync() system calls --- can write out the inode's metadata blocks
by calling sync_mapping_buffers().

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4_jbd2.c | 5 ++++-
 fs/ext4/fsync.c     | 9 +++++++--
 2 files changed, 11 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index ecb9ca455fd5..6a9409920dee 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -89,7 +89,10 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
 			ext4_journal_abort_handle(where, __func__, bh,
 						  handle, err);
 	} else {
-		mark_buffer_dirty(bh);
+		if (inode && bh)
+			mark_buffer_dirty_inode(bh, inode);
+		else
+			mark_buffer_dirty(bh);
 		if (inode && inode_needs_sync(inode)) {
 			sync_dirty_buffer(bh);
 			if (buffer_req(bh) && !buffer_uptodate(bh)) {
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index ab418c0f502d..07475740b512 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -50,7 +50,7 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
 {
 	struct inode *inode = dentry->d_inode;
 	journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
-	int ret = 0;
+	int err, ret = 0;
 
 	J_ASSERT(ext4_journal_current_handle() == NULL);
 
@@ -79,6 +79,9 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
 		goto out;
 	}
 
+	if (!journal)
+		ret = sync_mapping_buffers(inode->i_mapping);
+
 	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
 		goto out;
 
@@ -91,7 +94,9 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
 			.sync_mode = WB_SYNC_ALL,
 			.nr_to_write = 0, /* sys_fsync did this */
 		};
-		ret = sync_inode(inode, &wbc);
+		err = sync_inode(inode, &wbc);
+		if (ret == 0)
+			ret = err;
 	}
 out:
 	if (journal && (journal->j_flags & JBD2_BARRIER))
-- 
cgit v1.2.3-70-g09d2


From 91ac6f43317c0bf99969665f98016548011dfa38 Mon Sep 17 00:00:00 2001
From: Frank Mayhar <fmayhar@google.com>
Date: Wed, 9 Sep 2009 22:33:47 -0400
Subject: ext4: Make non-journal fsync work properly

Teach ext4_write_inode() and ext4_do_update_inode() about non-journal
mode:  If we're not using a journal, ext4_write_inode() now calls
ext4_do_update_inode() (after getting the iloc via ext4_get_inode_loc())
with a new "do_sync" parameter.  If that parameter is nonzero _and_ we're
not using a journal, ext4_do_update_inode() calls sync_dirty_buffer()
instead of ext4_handle_dirty_metadata().

This problem was found in power-fail testing, checking the amount of
loss of files and blocks after a power failure when using fsync() and
when not using fsync().  It turned out that using fsync() was actually
worse than not doing so, possibly because it increased the likelihood
that the inodes would remain unflushed and would therefore be lost at
the power failure.

Signed-off-by: Frank Mayhar <fmayhar@google.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 54 ++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 40 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 6253ecdac67f..d04c8428bde2 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4538,7 +4538,8 @@ static int ext4_inode_blocks_set(handle_t *handle,
  */
 static int ext4_do_update_inode(handle_t *handle,
 				struct inode *inode,
-				struct ext4_iloc *iloc)
+				struct ext4_iloc *iloc,
+				int do_sync)
 {
 	struct ext4_inode *raw_inode = ext4_raw_inode(iloc);
 	struct ext4_inode_info *ei = EXT4_I(inode);
@@ -4640,10 +4641,22 @@ static int ext4_do_update_inode(handle_t *handle,
 		raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
 	}
 
-	BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
-	rc = ext4_handle_dirty_metadata(handle, inode, bh);
-	if (!err)
-		err = rc;
+	/*
+	 * If we're not using a journal and we were called from
+	 * ext4_write_inode() to sync the inode (making do_sync true),
+	 * we can just use sync_dirty_buffer() directly to do our dirty
+	 * work.  Testing s_journal here is a bit redundant but it's
+	 * worth it to avoid potential future trouble.
+	 */
+	if (EXT4_SB(inode->i_sb)->s_journal == NULL && do_sync) {
+		BUFFER_TRACE(bh, "call sync_dirty_buffer");
+		sync_dirty_buffer(bh);
+	} else {
+		BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+		rc = ext4_handle_dirty_metadata(handle, inode, bh);
+		if (!err)
+			err = rc;
+	}
 	ei->i_state &= ~EXT4_STATE_NEW;
 
 out_brelse:
@@ -4689,19 +4702,32 @@ out_brelse:
  */
 int ext4_write_inode(struct inode *inode, int wait)
 {
+	int err;
+
 	if (current->flags & PF_MEMALLOC)
 		return 0;
 
-	if (ext4_journal_current_handle()) {
-		jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");
-		dump_stack();
-		return -EIO;
-	}
+	if (EXT4_SB(inode->i_sb)->s_journal) {
+		if (ext4_journal_current_handle()) {
+			jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");
+			dump_stack();
+			return -EIO;
+		}
 
-	if (!wait)
-		return 0;
+		if (!wait)
+			return 0;
+
+		err = ext4_force_commit(inode->i_sb);
+	} else {
+		struct ext4_iloc iloc;
 
-	return ext4_force_commit(inode->i_sb);
+		err = ext4_get_inode_loc(inode, &iloc);
+		if (err)
+			return err;
+		err = ext4_do_update_inode(EXT4_NOJOURNAL_HANDLE,
+					   inode, &iloc, wait);
+	}
+	return err;
 }
 
 /*
@@ -4995,7 +5021,7 @@ int ext4_mark_iloc_dirty(handle_t *handle,
 	get_bh(iloc->bh);
 
 	/* ext4_do_update_inode() does jbd2_journal_dirty_metadata */
-	err = ext4_do_update_inode(handle, inode, iloc);
+	err = ext4_do_update_inode(handle, inode, iloc, 0);
 	put_bh(iloc->bh);
 	return err;
 }
-- 
cgit v1.2.3-70-g09d2


From 752015d1b0683a8c623ebfe4c62893413e9b30d3 Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Tue, 8 Sep 2009 19:49:40 -0700
Subject: binfmt_elf: fix PT_INTERP bss handling

In fs/binfmt_elf.c, load_elf_interp() calls padzero() for .bss even if
the PT_LOAD has no PROT_WRITE and no .bss.  This generates EFAULT.

Here is a small test case.  (Yes, there are other, useful PT_INTERP
which have only .text and no .data/.bss.)

	----- ptinterp.S
	_start: .globl _start
		 nop
		 int3
	-----
	$ gcc -m32 -nostartfiles -nostdlib -o ptinterp ptinterp.S
	$ gcc -m32 -Wl,--dynamic-linker=ptinterp -o hello hello.c
	$ ./hello
	Segmentation fault  # during execve() itself

	After applying the patch:
	$ ./hello
	Trace trap  # user-mode execution after execve() finishes

If the ELF headers are actually self-inconsistent, then dying is fine.
But having no PROT_WRITE segment is perfectly normal and correct if
there is no segment with p_memsz > p_filesz (i.e. bss).  John Reiser
suggested checking for PROT_WRITE in the bss logic.  I think it makes
most sense to simply apply the bss logic only when there is bss.

This patch looks less trivial than it is due to some reindentation.
It just moves the "if (last_bss > elf_bss) {" test up to include the
partial-page bss logic as well as the more-pages bss logic.

Reported-by: John Reiser <jreiser@bitwagon.com>
Signed-off-by: Roland McGrath <roland@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/binfmt_elf.c | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index b7c1603cd4bd..7c1e65d54872 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -501,22 +501,22 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
 		}
 	}
 
-	/*
-	 * Now fill out the bss section.  First pad the last page up
-	 * to the page boundary, and then perform a mmap to make sure
-	 * that there are zero-mapped pages up to and including the 
-	 * last bss page.
-	 */
-	if (padzero(elf_bss)) {
-		error = -EFAULT;
-		goto out_close;
-	}
+	if (last_bss > elf_bss) {
+		/*
+		 * Now fill out the bss section.  First pad the last page up
+		 * to the page boundary, and then perform a mmap to make sure
+		 * that there are zero-mapped pages up to and including the
+		 * last bss page.
+		 */
+		if (padzero(elf_bss)) {
+			error = -EFAULT;
+			goto out_close;
+		}
 
-	/* What we have mapped so far */
-	elf_bss = ELF_PAGESTART(elf_bss + ELF_MIN_ALIGN - 1);
+		/* What we have mapped so far */
+		elf_bss = ELF_PAGESTART(elf_bss + ELF_MIN_ALIGN - 1);
 
-	/* Map the last of the bss segment */
-	if (last_bss > elf_bss) {
+		/* Map the last of the bss segment */
 		down_write(&current->mm->mmap_sem);
 		error = do_brk(elf_bss, last_bss - elf_bss);
 		up_write(&current->mm->mmap_sem);
-- 
cgit v1.2.3-70-g09d2


From b6a758ec3af3ec236dbfdcf6a06b84ac8f94957e Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Wed, 9 Sep 2009 23:47:46 -0400
Subject: ext4: move ext4_mb_init_group() function earlier in the mballoc.c

This moves the function around so that it can be called from
ext4_mb_load_buddy().

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/mballoc.c | 182 +++++++++++++++++++++++++++---------------------------
 1 file changed, 91 insertions(+), 91 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 5ef6daf0cdc6..fed5ac699141 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -910,6 +910,97 @@ out:
 	return err;
 }
 
+static noinline_for_stack
+int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
+{
+
+	int ret = 0;
+	void *bitmap;
+	int blocks_per_page;
+	int block, pnum, poff;
+	int num_grp_locked = 0;
+	struct ext4_group_info *this_grp;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct inode *inode = sbi->s_buddy_cache;
+	struct page *page = NULL, *bitmap_page = NULL;
+
+	mb_debug(1, "init group %u\n", group);
+	blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+	this_grp = ext4_get_group_info(sb, group);
+	/*
+	 * This ensures we don't add group
+	 * to this buddy cache via resize
+	 */
+	num_grp_locked =  ext4_mb_get_buddy_cache_lock(sb, group);
+	if (!EXT4_MB_GRP_NEED_INIT(this_grp)) {
+		/*
+		 * somebody initialized the group
+		 * return without doing anything
+		 */
+		ret = 0;
+		goto err;
+	}
+	/*
+	 * the buddy cache inode stores the block bitmap
+	 * and buddy information in consecutive blocks.
+	 * So for each group we need two blocks.
+	 */
+	block = group * 2;
+	pnum = block / blocks_per_page;
+	poff = block % blocks_per_page;
+	page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+	if (page) {
+		BUG_ON(page->mapping != inode->i_mapping);
+		ret = ext4_mb_init_cache(page, NULL);
+		if (ret) {
+			unlock_page(page);
+			goto err;
+		}
+		unlock_page(page);
+	}
+	if (page == NULL || !PageUptodate(page)) {
+		ret = -EIO;
+		goto err;
+	}
+	mark_page_accessed(page);
+	bitmap_page = page;
+	bitmap = page_address(page) + (poff * sb->s_blocksize);
+
+	/* init buddy cache */
+	block++;
+	pnum = block / blocks_per_page;
+	poff = block % blocks_per_page;
+	page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+	if (page == bitmap_page) {
+		/*
+		 * If both the bitmap and buddy are in
+		 * the same page we don't need to force
+		 * init the buddy
+		 */
+		unlock_page(page);
+	} else if (page) {
+		BUG_ON(page->mapping != inode->i_mapping);
+		ret = ext4_mb_init_cache(page, bitmap);
+		if (ret) {
+			unlock_page(page);
+			goto err;
+		}
+		unlock_page(page);
+	}
+	if (page == NULL || !PageUptodate(page)) {
+		ret = -EIO;
+		goto err;
+	}
+	mark_page_accessed(page);
+err:
+	ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked);
+	if (bitmap_page)
+		page_cache_release(bitmap_page);
+	if (page)
+		page_cache_release(page);
+	return ret;
+}
+
 static noinline_for_stack int
 ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
 					struct ext4_buddy *e4b)
@@ -1839,97 +1930,6 @@ void ext4_mb_put_buddy_cache_lock(struct super_block *sb,
 
 }
 
-static noinline_for_stack
-int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
-{
-
-	int ret;
-	void *bitmap;
-	int blocks_per_page;
-	int block, pnum, poff;
-	int num_grp_locked = 0;
-	struct ext4_group_info *this_grp;
-	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	struct inode *inode = sbi->s_buddy_cache;
-	struct page *page = NULL, *bitmap_page = NULL;
-
-	mb_debug(1, "init group %u\n", group);
-	blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
-	this_grp = ext4_get_group_info(sb, group);
-	/*
-	 * This ensures we don't add group
-	 * to this buddy cache via resize
-	 */
-	num_grp_locked =  ext4_mb_get_buddy_cache_lock(sb, group);
-	if (!EXT4_MB_GRP_NEED_INIT(this_grp)) {
-		/*
-		 * somebody initialized the group
-		 * return without doing anything
-		 */
-		ret = 0;
-		goto err;
-	}
-	/*
-	 * the buddy cache inode stores the block bitmap
-	 * and buddy information in consecutive blocks.
-	 * So for each group we need two blocks.
-	 */
-	block = group * 2;
-	pnum = block / blocks_per_page;
-	poff = block % blocks_per_page;
-	page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
-	if (page) {
-		BUG_ON(page->mapping != inode->i_mapping);
-		ret = ext4_mb_init_cache(page, NULL);
-		if (ret) {
-			unlock_page(page);
-			goto err;
-		}
-		unlock_page(page);
-	}
-	if (page == NULL || !PageUptodate(page)) {
-		ret = -EIO;
-		goto err;
-	}
-	mark_page_accessed(page);
-	bitmap_page = page;
-	bitmap = page_address(page) + (poff * sb->s_blocksize);
-
-	/* init buddy cache */
-	block++;
-	pnum = block / blocks_per_page;
-	poff = block % blocks_per_page;
-	page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
-	if (page == bitmap_page) {
-		/*
-		 * If both the bitmap and buddy are in
-		 * the same page we don't need to force
-		 * init the buddy
-		 */
-		unlock_page(page);
-	} else if (page) {
-		BUG_ON(page->mapping != inode->i_mapping);
-		ret = ext4_mb_init_cache(page, bitmap);
-		if (ret) {
-			unlock_page(page);
-			goto err;
-		}
-		unlock_page(page);
-	}
-	if (page == NULL || !PageUptodate(page)) {
-		ret = -EIO;
-		goto err;
-	}
-	mark_page_accessed(page);
-err:
-	ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked);
-	if (bitmap_page)
-		page_cache_release(bitmap_page);
-	if (page)
-		page_cache_release(page);
-	return ret;
-}
-
 static noinline_for_stack int
 ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 {
-- 
cgit v1.2.3-70-g09d2


From f41c0750538667b87a19c93952e5d42fcc069bd7 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Wed, 9 Sep 2009 23:34:50 -0400
Subject: ext4: check for need init flag in ext4_mb_load_buddy

We should check for need init flag with the group's alloc_sem held, to
make sure while we are loading the buddy cache and holding a reference
to it, a file system resize can't add new blocks to same group.

The patch also drops the need init flag check in
ext4_mb_regular_allocator() because doing the check without holding
alloc_sem is racy.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 fs/ext4/mballoc.c | 39 ++++++++++++++++++---------------------
 1 file changed, 18 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index fed5ac699141..484c9d10c7fa 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1034,8 +1034,26 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
 	 * groups mapped by the page is blocked
 	 * till we are done with allocation
 	 */
+repeat_load_buddy:
 	down_read(e4b->alloc_semp);
 
+	if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
+		/* we need to check for group need init flag
+		 * with alloc_semp held so that we can be sure
+		 * that new blocks didn't get added to the group
+		 * when we are loading the buddy cache
+		 */
+		up_read(e4b->alloc_semp);
+		/*
+		 * we need full data about the group
+		 * to make a good selection
+		 */
+		ret = ext4_mb_init_group(sb, group);
+		if (ret)
+			return ret;
+		goto repeat_load_buddy;
+	}
+
 	/*
 	 * the buddy cache inode stores the block bitmap
 	 * and buddy information in consecutive blocks.
@@ -2012,27 +2030,6 @@ repeat:
 			if (grp->bb_free == 0)
 				continue;
 
-			/*
-			 * if the group is already init we check whether it is
-			 * a good group and if not we don't load the buddy
-			 */
-			if (EXT4_MB_GRP_NEED_INIT(grp)) {
-				/*
-				 * we need full data about the group
-				 * to make a good selection
-				 */
-				err = ext4_mb_init_group(sb, group);
-				if (err)
-					goto out;
-			}
-
-			/*
-			 * If the particular group doesn't satisfy our
-			 * criteria we continue with the next group
-			 */
-			if (!ext4_mb_good_group(ac, group, cr))
-				continue;
-
 			err = ext4_mb_load_buddy(sb, group, &e4b);
 			if (err)
 				goto out;
-- 
cgit v1.2.3-70-g09d2


From 08c3a8133810d955d97f7146c50c43e4073f2148 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Wed, 9 Sep 2009 23:50:17 -0400
Subject: ext4: Clarify the locking details in mballoc

We don't need to take the alloc_sem lock when we are adding new
groups, since mballoc won't see the new group added until we bump
sbi->s_groups_count.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 fs/ext4/mballoc.c | 7 +++++--
 fs/ext4/resize.c  | 7 +------
 2 files changed, 6 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 484c9d10c7fa..d23056d375b3 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -928,8 +928,11 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
 	blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
 	this_grp = ext4_get_group_info(sb, group);
 	/*
-	 * This ensures we don't add group
-	 * to this buddy cache via resize
+	 * This ensures that we don't reinit the buddy cache
+	 * page which map to the group from which we are already
+	 * allocating. If we are looking at the buddy cache we would
+	 * have taken a reference using ext4_mb_load_buddy and that
+	 * would have taken the alloc_sem lock.
 	 */
 	num_grp_locked =  ext4_mb_get_buddy_cache_lock(sb, group);
 	if (!EXT4_MB_GRP_NEED_INIT(this_grp)) {
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 68b0351fc647..3cfc343c41b5 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -746,7 +746,6 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 	struct inode *inode = NULL;
 	handle_t *handle;
 	int gdb_off, gdb_num;
-	int num_grp_locked = 0;
 	int err, err2;
 
 	gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb);
@@ -856,7 +855,6 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
          * using the new disk blocks.
          */
 
-	num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, input->group);
 	/* Update group descriptor block for new group */
 	gdp = (struct ext4_group_desc *)((char *)primary->b_data +
 					 gdb_off * EXT4_DESC_SIZE(sb));
@@ -875,10 +873,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 	 * descriptor
 	 */
 	err = ext4_mb_add_groupinfo(sb, input->group, gdp);
-	if (err) {
-		ext4_mb_put_buddy_cache_lock(sb, input->group, num_grp_locked);
+	if (err)
 		goto exit_journal;
-	}
 
 	/*
 	 * Make the new blocks and inodes valid next.  We do this before
@@ -920,7 +916,6 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 
 	/* Update the global fs size fields */
 	sbi->s_groups_count++;
-	ext4_mb_put_buddy_cache_lock(sb, input->group, num_grp_locked);
 
 	ext4_handle_dirty_metadata(handle, NULL, primary);
 
-- 
cgit v1.2.3-70-g09d2


From 9f0ab4a3f0fdb1ff404d150618ace2fa069bb2e1 Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Tue, 8 Sep 2009 19:49:40 -0700
Subject: binfmt_elf: fix PT_INTERP bss handling

In fs/binfmt_elf.c, load_elf_interp() calls padzero() for .bss even if
the PT_LOAD has no PROT_WRITE and no .bss.  This generates EFAULT.

Here is a small test case.  (Yes, there are other, useful PT_INTERP
which have only .text and no .data/.bss.)

	----- ptinterp.S
	_start: .globl _start
		 nop
		 int3
	-----
	$ gcc -m32 -nostartfiles -nostdlib -o ptinterp ptinterp.S
	$ gcc -m32 -Wl,--dynamic-linker=ptinterp -o hello hello.c
	$ ./hello
	Segmentation fault  # during execve() itself

	After applying the patch:
	$ ./hello
	Trace trap  # user-mode execution after execve() finishes

If the ELF headers are actually self-inconsistent, then dying is fine.
But having no PROT_WRITE segment is perfectly normal and correct if
there is no segment with p_memsz > p_filesz (i.e. bss).  John Reiser
suggested checking for PROT_WRITE in the bss logic.  I think it makes
most sense to simply apply the bss logic only when there is bss.

This patch looks less trivial than it is due to some reindentation.
It just moves the "if (last_bss > elf_bss) {" test up to include the
partial-page bss logic as well as the more-pages bss logic.

Reported-by: John Reiser <jreiser@bitwagon.com>
Signed-off-by: Roland McGrath <roland@redhat.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 fs/binfmt_elf.c | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index b7c1603cd4bd..7c1e65d54872 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -501,22 +501,22 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
 		}
 	}
 
-	/*
-	 * Now fill out the bss section.  First pad the last page up
-	 * to the page boundary, and then perform a mmap to make sure
-	 * that there are zero-mapped pages up to and including the 
-	 * last bss page.
-	 */
-	if (padzero(elf_bss)) {
-		error = -EFAULT;
-		goto out_close;
-	}
+	if (last_bss > elf_bss) {
+		/*
+		 * Now fill out the bss section.  First pad the last page up
+		 * to the page boundary, and then perform a mmap to make sure
+		 * that there are zero-mapped pages up to and including the
+		 * last bss page.
+		 */
+		if (padzero(elf_bss)) {
+			error = -EFAULT;
+			goto out_close;
+		}
 
-	/* What we have mapped so far */
-	elf_bss = ELF_PAGESTART(elf_bss + ELF_MIN_ALIGN - 1);
+		/* What we have mapped so far */
+		elf_bss = ELF_PAGESTART(elf_bss + ELF_MIN_ALIGN - 1);
 
-	/* Map the last of the bss segment */
-	if (last_bss > elf_bss) {
+		/* Map the last of the bss segment */
 		down_write(&current->mm->mmap_sem);
 		error = do_brk(elf_bss, last_bss - elf_bss);
 		up_write(&current->mm->mmap_sem);
-- 
cgit v1.2.3-70-g09d2


From 71290b368ad5e1e0b0b300c9d5638490a9fd1a2d Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Thu, 10 Sep 2009 17:31:04 -0400
Subject: ext4: Don't update superblock write time when filesystem is read-only

This avoids updating the superblock write time when we are mounting
the root file system read/only but we need to replay the journal; at
that point, for people who are east of GMT and who make their clock
tick in localtime for Windows bug-for-bug compatibility, and this will
cause e2fsck to complain and force a full file system check.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/super.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index f1815d3bcfd5..9f6fa3f74629 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3222,7 +3222,18 @@ static int ext4_commit_super(struct super_block *sb, int sync)
 		clear_buffer_write_io_error(sbh);
 		set_buffer_uptodate(sbh);
 	}
-	es->s_wtime = cpu_to_le32(get_seconds());
+	/*
+	 * If the file system is mounted read-only, don't update the
+	 * superblock write time.  This avoids updating the superblock
+	 * write time when we are mounting the root file system
+	 * read/only but we need to replay the journal; at that point,
+	 * for people who are east of GMT and who make their clock
+	 * tick in localtime for Windows bug-for-bug compatibility,
+	 * the clock is set in the future, and this will cause e2fsck
+	 * to complain and force a full file system check.
+	 */
+	if (!(sb->s_flags & MS_RDONLY))
+		es->s_wtime = cpu_to_le32(get_seconds());
 	es->s_kbytes_written =
 		cpu_to_le64(EXT4_SB(sb)->s_kbytes_written + 
 			    ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
-- 
cgit v1.2.3-70-g09d2


From 0e3d2a6313d03413d93327202a60256d1d726fdc Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Fri, 11 Sep 2009 09:30:12 -0400
Subject: ext4: Fix async commit mode to be safe by using a barrier

Previously the journal_async_commit mount option was equivalent to
using barrier=0 (and just as unsafe).  This patch fixes it so that we
eliminate the barrier before the commit block (by not using ordered
mode), and explicitly issuing an empty barrier bio after writing the
commit block.  Because of the journal checksum, it is safe to do this;
if the journal blocks are not all written before a power failure, the
checksum in the commit block will prevent the last transaction from
being replayed.

Using the fs_mark benchmark, using journal_async_commit shows a 50%
improvement:

FSUse%        Count         Size    Files/sec     App Overhead
     8         1000        10240         30.5            28242

vs.

FSUse%        Count         Size    Files/sec     App Overhead
     8         1000        10240         45.8            28620


Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/jbd2/commit.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 0df600e9162d..26d991ddc1e6 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -25,6 +25,7 @@
 #include <linux/writeback.h>
 #include <linux/backing-dev.h>
 #include <linux/bio.h>
+#include <linux/blkdev.h>
 #include <trace/events/jbd2.h>
 
 /*
@@ -133,8 +134,8 @@ static int journal_submit_commit_record(journal_t *journal,
 	bh->b_end_io = journal_end_buffer_io_sync;
 
 	if (journal->j_flags & JBD2_BARRIER &&
-		!JBD2_HAS_INCOMPAT_FEATURE(journal,
-					 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
+	    !JBD2_HAS_INCOMPAT_FEATURE(journal,
+				       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
 		set_buffer_ordered(bh);
 		barrier_done = 1;
 	}
@@ -706,11 +707,13 @@ start_journal_io:
 	/* Done it all: now write the commit record asynchronously. */
 
 	if (JBD2_HAS_INCOMPAT_FEATURE(journal,
-		JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
+				      JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
 		err = journal_submit_commit_record(journal, commit_transaction,
 						 &cbh, crc32_sum);
 		if (err)
 			__jbd2_journal_abort_hard(journal);
+		if (journal->j_flags & JBD2_BARRIER)
+			blkdev_issue_flush(journal->j_dev, NULL);
 	}
 
 	/*
@@ -833,7 +836,7 @@ wait_for_iobuf:
 	jbd_debug(3, "JBD: commit phase 5\n");
 
 	if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
-		JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
+				       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
 		err = journal_submit_commit_record(journal, commit_transaction,
 						&cbh, crc32_sum);
 		if (err)
-- 
cgit v1.2.3-70-g09d2


From 1f7bebb9e911d870fa8f997ddff838e82b5715ea Mon Sep 17 00:00:00 2001
From: Andreas Schlick <schlick@lavabit.com>
Date: Thu, 10 Sep 2009 23:16:07 -0400
Subject: ext4: Always set dx_node's fake_dirent explicitly.

When ext4_dx_add_entry() has to split an index node, it has to ensure that
name_len of dx_node's fake_dirent is also zero, because otherwise e2fsck
won't recognise it as an intermediate htree node and consider the htree to
be corrupted.

Signed-off-by: Andreas Schlick <schlick@lavabit.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/namei.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index f27c8164c1fe..42f81d285cd5 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1597,9 +1597,9 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
 			goto cleanup;
 		node2 = (struct dx_node *)(bh2->b_data);
 		entries2 = node2->entries;
+		memset(&node2->fake, 0, sizeof(struct fake_dirent));
 		node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize,
 							   sb->s_blocksize);
-		node2->fake.inode = 0;
 		BUFFER_TRACE(frame->bh, "get_write_access");
 		err = ext4_journal_get_write_access(handle, frame->bh);
 		if (err)
-- 
cgit v1.2.3-70-g09d2


From d8a8559cd7a9ccac98d5f6f13297a2ff68a43627 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Wed, 2 Sep 2009 12:34:32 +0200
Subject: writeback: get rid of generic_sync_sb_inodes() export

This adds two new exported functions:

- writeback_inodes_sb(), which only attempts to writeback dirty inodes on
  this super_block, for WB_SYNC_NONE writeout.
- sync_inodes_sb(), which writes out all dirty inodes on this super_block
  and also waits for the IO to complete.

Acked-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/staging/pohmelfs/inode.c |  9 +-----
 fs/fs-writeback.c                | 70 ++++++++++++++++++++++++----------------
 fs/sync.c                        | 18 ++++++-----
 fs/ubifs/budget.c                | 16 ++-------
 fs/ubifs/super.c                 |  8 +----
 include/linux/fs.h               |  2 --
 include/linux/writeback.h        |  3 +-
 7 files changed, 58 insertions(+), 68 deletions(-)

(limited to 'fs')

diff --git a/drivers/staging/pohmelfs/inode.c b/drivers/staging/pohmelfs/inode.c
index 7b605795b770..e63c9bea6c54 100644
--- a/drivers/staging/pohmelfs/inode.c
+++ b/drivers/staging/pohmelfs/inode.c
@@ -1950,14 +1950,7 @@ static int pohmelfs_get_sb(struct file_system_type *fs_type,
  */
 static void pohmelfs_kill_super(struct super_block *sb)
 {
-	struct writeback_control wbc = {
-		.sync_mode	= WB_SYNC_ALL,
-		.range_start	= 0,
-		.range_end	= LLONG_MAX,
-		.nr_to_write	= LONG_MAX,
-	};
-	generic_sync_sb_inodes(sb, &wbc);
-
+	sync_inodes_sb(sb);
 	kill_anon_super(sb);
 }
 
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index c54226be5294..271e5f44e871 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -458,8 +458,8 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
  * on the writer throttling path, and we get decent balancing between many
  * throttled threads: we don't want them all piling up on inode_sync_wait.
  */
-void generic_sync_sb_inodes(struct super_block *sb,
-				struct writeback_control *wbc)
+static void generic_sync_sb_inodes(struct super_block *sb,
+				   struct writeback_control *wbc)
 {
 	const unsigned long start = jiffies;	/* livelock avoidance */
 	int sync = wbc->sync_mode == WB_SYNC_ALL;
@@ -593,13 +593,6 @@ void generic_sync_sb_inodes(struct super_block *sb,
 
 	return;		/* Leave any unwritten inodes on s_io */
 }
-EXPORT_SYMBOL_GPL(generic_sync_sb_inodes);
-
-static void sync_sb_inodes(struct super_block *sb,
-				struct writeback_control *wbc)
-{
-	generic_sync_sb_inodes(sb, wbc);
-}
 
 /*
  * Start writeback of dirty pagecache data against all unlocked inodes.
@@ -640,7 +633,7 @@ restart:
 			 */
 			if (down_read_trylock(&sb->s_umount)) {
 				if (sb->s_root)
-					sync_sb_inodes(sb, wbc);
+					generic_sync_sb_inodes(sb, wbc);
 				up_read(&sb->s_umount);
 			}
 			spin_lock(&sb_lock);
@@ -653,35 +646,56 @@ restart:
 	spin_unlock(&sb_lock);
 }
 
-/*
- * writeback and wait upon the filesystem's dirty inodes.  The caller will
- * do this in two passes - one to write, and one to wait.
- *
- * A finite limit is set on the number of pages which will be written.
- * To prevent infinite livelock of sys_sync().
+/**
+ * writeback_inodes_sb	-	writeback dirty inodes from given super_block
+ * @sb: the superblock
  *
- * We add in the number of potentially dirty inodes, because each inode write
- * can dirty pagecache in the underlying blockdev.
+ * Start writeback on some inodes on this super_block. No guarantees are made
+ * on how many (if any) will be written, and this function does not wait
+ * for IO completion of submitted IO. The number of pages submitted is
+ * returned.
  */
-void sync_inodes_sb(struct super_block *sb, int wait)
+long writeback_inodes_sb(struct super_block *sb)
 {
 	struct writeback_control wbc = {
-		.sync_mode	= wait ? WB_SYNC_ALL : WB_SYNC_NONE,
+		.sync_mode	= WB_SYNC_NONE,
 		.range_start	= 0,
 		.range_end	= LLONG_MAX,
 	};
+	unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
+	unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
+	long nr_to_write;
 
-	if (!wait) {
-		unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
-		unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
-
-		wbc.nr_to_write = nr_dirty + nr_unstable +
+	nr_to_write = nr_dirty + nr_unstable +
 			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
-	} else
-		wbc.nr_to_write = LONG_MAX; /* doesn't actually matter */
 
-	sync_sb_inodes(sb, &wbc);
+	wbc.nr_to_write = nr_to_write;
+	generic_sync_sb_inodes(sb, &wbc);
+	return nr_to_write - wbc.nr_to_write;
+}
+EXPORT_SYMBOL(writeback_inodes_sb);
+
+/**
+ * sync_inodes_sb	-	sync sb inode pages
+ * @sb: the superblock
+ *
+ * This function writes and waits on any dirty inode belonging to this
+ * super_block. The number of pages synced is returned.
+ */
+long sync_inodes_sb(struct super_block *sb)
+{
+	struct writeback_control wbc = {
+		.sync_mode	= WB_SYNC_ALL,
+		.range_start	= 0,
+		.range_end	= LLONG_MAX,
+	};
+	long nr_to_write = LONG_MAX; /* doesn't actually matter */
+
+	wbc.nr_to_write = nr_to_write;
+	generic_sync_sb_inodes(sb, &wbc);
+	return nr_to_write - wbc.nr_to_write;
 }
+EXPORT_SYMBOL(sync_inodes_sb);
 
 /**
  * write_inode_now	-	write an inode to disk
diff --git a/fs/sync.c b/fs/sync.c
index 3422ba61d86d..66f210476f40 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -19,20 +19,22 @@
 			SYNC_FILE_RANGE_WAIT_AFTER)
 
 /*
- * Do the filesystem syncing work. For simple filesystems sync_inodes_sb(sb, 0)
- * just dirties buffers with inodes so we have to submit IO for these buffers
- * via __sync_blockdev(). This also speeds up the wait == 1 case since in that
- * case write_inode() functions do sync_dirty_buffer() and thus effectively
- * write one block at a time.
+ * Do the filesystem syncing work. For simple filesystems
+ * writeback_inodes_sb(sb) just dirties buffers with inodes so we have to
+ * submit IO for these buffers via __sync_blockdev(). This also speeds up the
+ * wait == 1 case since in that case write_inode() functions do
+ * sync_dirty_buffer() and thus effectively write one block at a time.
  */
 static int __sync_filesystem(struct super_block *sb, int wait)
 {
 	/* Avoid doing twice syncing and cache pruning for quota sync */
-	if (!wait)
+	if (!wait) {
 		writeout_quota_sb(sb, -1);
-	else
+		writeback_inodes_sb(sb);
+	} else {
 		sync_quota_sb(sb, -1);
-	sync_inodes_sb(sb, wait);
+		sync_inodes_sb(sb);
+	}
 	if (sb->s_op->sync_fs)
 		sb->s_op->sync_fs(sb, wait);
 	return __sync_blockdev(sb->s_bdev, wait);
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index eaf6d891d46f..1c8991b0db13 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -65,26 +65,14 @@
 static int shrink_liability(struct ubifs_info *c, int nr_to_write)
 {
 	int nr_written;
-	struct writeback_control wbc = {
-		.sync_mode   = WB_SYNC_NONE,
-		.range_end   = LLONG_MAX,
-		.nr_to_write = nr_to_write,
-	};
-
-	generic_sync_sb_inodes(c->vfs_sb, &wbc);
-	nr_written = nr_to_write - wbc.nr_to_write;
 
+	nr_written = writeback_inodes_sb(c->vfs_sb);
 	if (!nr_written) {
 		/*
 		 * Re-try again but wait on pages/inodes which are being
 		 * written-back concurrently (e.g., by pdflush).
 		 */
-		memset(&wbc, 0, sizeof(struct writeback_control));
-		wbc.sync_mode   = WB_SYNC_ALL;
-		wbc.range_end   = LLONG_MAX;
-		wbc.nr_to_write = nr_to_write;
-		generic_sync_sb_inodes(c->vfs_sb, &wbc);
-		nr_written = nr_to_write - wbc.nr_to_write;
+		nr_written = sync_inodes_sb(c->vfs_sb);
 	}
 
 	dbg_budg("%d pages were written back", nr_written);
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 26d2e0d80465..8d6050a5966c 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -438,12 +438,6 @@ static int ubifs_sync_fs(struct super_block *sb, int wait)
 {
 	int i, err;
 	struct ubifs_info *c = sb->s_fs_info;
-	struct writeback_control wbc = {
-		.sync_mode   = WB_SYNC_ALL,
-		.range_start = 0,
-		.range_end   = LLONG_MAX,
-		.nr_to_write = LONG_MAX,
-	};
 
 	/*
 	 * Zero @wait is just an advisory thing to help the file system shove
@@ -462,7 +456,7 @@ static int ubifs_sync_fs(struct super_block *sb, int wait)
 	 * the user be able to get more accurate results of 'statfs()' after
 	 * they synchronize the file system.
 	 */
-	generic_sync_sb_inodes(sb, &wbc);
+	sync_inodes_sb(sb);
 
 	/*
 	 * Synchronize write buffers, because 'ubifs_run_commit()' does not
diff --git a/include/linux/fs.h b/include/linux/fs.h
index c1f993515f51..46ff7dd6e164 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2071,8 +2071,6 @@ static inline void invalidate_remote_inode(struct inode *inode)
 extern int invalidate_inode_pages2(struct address_space *mapping);
 extern int invalidate_inode_pages2_range(struct address_space *mapping,
 					 pgoff_t start, pgoff_t end);
-extern void generic_sync_sb_inodes(struct super_block *sb,
-				struct writeback_control *wbc);
 extern int write_inode_now(struct inode *, int);
 extern int filemap_fdatawrite(struct address_space *);
 extern int filemap_flush(struct address_space *);
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 3224820c8514..07039299603d 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -78,7 +78,8 @@ struct writeback_control {
  */	
 void writeback_inodes(struct writeback_control *wbc);
 int inode_wait(void *);
-void sync_inodes_sb(struct super_block *, int wait);
+long writeback_inodes_sb(struct super_block *);
+long sync_inodes_sb(struct super_block *);
 
 /* writeback.h requires fs.h; it, too, is not included from here. */
 static inline void wait_on_inode(struct inode *inode)
-- 
cgit v1.2.3-70-g09d2


From 66f3b8e2e103a0b93b945764d98e9ba46cb926dd Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Wed, 2 Sep 2009 09:19:46 +0200
Subject: writeback: move dirty inodes from super_block to backing_dev_info

This is a first step at introducing per-bdi flusher threads. We should
have no change in behaviour, although sb_has_dirty_inodes() is now
ridiculously expensive, as there's no easy way to answer that question.
Not a huge problem, since it'll be deleted in subsequent patches.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/fs-writeback.c           | 197 ++++++++++++++++++++++++++++----------------
 fs/super.c                  |   3 -
 include/linux/backing-dev.h |   9 ++
 include/linux/fs.h          |   5 +-
 mm/backing-dev.c            |  24 ++++++
 mm/page-writeback.c         |  11 +--
 6 files changed, 165 insertions(+), 84 deletions(-)

(limited to 'fs')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 271e5f44e871..45ad4bb700e6 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -25,6 +25,7 @@
 #include <linux/buffer_head.h>
 #include "internal.h"
 
+#define inode_to_bdi(inode)	((inode)->i_mapping->backing_dev_info)
 
 /**
  * writeback_acquire - attempt to get exclusive writeback access to a device
@@ -165,12 +166,13 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 			goto out;
 
 		/*
-		 * If the inode was already on s_dirty/s_io/s_more_io, don't
-		 * reposition it (that would break s_dirty time-ordering).
+		 * If the inode was already on b_dirty/b_io/b_more_io, don't
+		 * reposition it (that would break b_dirty time-ordering).
 		 */
 		if (!was_dirty) {
 			inode->dirtied_when = jiffies;
-			list_move(&inode->i_list, &sb->s_dirty);
+			list_move(&inode->i_list,
+					&inode_to_bdi(inode)->b_dirty);
 		}
 	}
 out:
@@ -191,31 +193,30 @@ static int write_inode(struct inode *inode, int sync)
  * furthest end of its superblock's dirty-inode list.
  *
  * Before stamping the inode's ->dirtied_when, we check to see whether it is
- * already the most-recently-dirtied inode on the s_dirty list.  If that is
+ * already the most-recently-dirtied inode on the b_dirty list.  If that is
  * the case then the inode must have been redirtied while it was being written
  * out and we don't reset its dirtied_when.
  */
 static void redirty_tail(struct inode *inode)
 {
-	struct super_block *sb = inode->i_sb;
+	struct backing_dev_info *bdi = inode_to_bdi(inode);
 
-	if (!list_empty(&sb->s_dirty)) {
-		struct inode *tail_inode;
+	if (!list_empty(&bdi->b_dirty)) {
+		struct inode *tail;
 
-		tail_inode = list_entry(sb->s_dirty.next, struct inode, i_list);
-		if (time_before(inode->dirtied_when,
-				tail_inode->dirtied_when))
+		tail = list_entry(bdi->b_dirty.next, struct inode, i_list);
+		if (time_before(inode->dirtied_when, tail->dirtied_when))
 			inode->dirtied_when = jiffies;
 	}
-	list_move(&inode->i_list, &sb->s_dirty);
+	list_move(&inode->i_list, &bdi->b_dirty);
 }
 
 /*
- * requeue inode for re-scanning after sb->s_io list is exhausted.
+ * requeue inode for re-scanning after bdi->b_io list is exhausted.
  */
 static void requeue_io(struct inode *inode)
 {
-	list_move(&inode->i_list, &inode->i_sb->s_more_io);
+	list_move(&inode->i_list, &inode_to_bdi(inode)->b_more_io);
 }
 
 static void inode_sync_complete(struct inode *inode)
@@ -262,18 +263,50 @@ static void move_expired_inodes(struct list_head *delaying_queue,
 /*
  * Queue all expired dirty inodes for io, eldest first.
  */
-static void queue_io(struct super_block *sb,
-				unsigned long *older_than_this)
+static void queue_io(struct backing_dev_info *bdi,
+		     unsigned long *older_than_this)
+{
+	list_splice_init(&bdi->b_more_io, bdi->b_io.prev);
+	move_expired_inodes(&bdi->b_dirty, &bdi->b_io, older_than_this);
+}
+
+static int sb_on_inode_list(struct super_block *sb, struct list_head *list)
 {
-	list_splice_init(&sb->s_more_io, sb->s_io.prev);
-	move_expired_inodes(&sb->s_dirty, &sb->s_io, older_than_this);
+	struct inode *inode;
+	int ret = 0;
+
+	spin_lock(&inode_lock);
+	list_for_each_entry(inode, list, i_list) {
+		if (inode->i_sb == sb) {
+			ret = 1;
+			break;
+		}
+	}
+	spin_unlock(&inode_lock);
+	return ret;
 }
 
 int sb_has_dirty_inodes(struct super_block *sb)
 {
-	return !list_empty(&sb->s_dirty) ||
-	       !list_empty(&sb->s_io) ||
-	       !list_empty(&sb->s_more_io);
+	struct backing_dev_info *bdi;
+	int ret = 0;
+
+	/*
+	 * This is REALLY expensive right now, but it'll go away
+	 * when the bdi writeback is introduced
+	 */
+	mutex_lock(&bdi_lock);
+	list_for_each_entry(bdi, &bdi_list, bdi_list) {
+		if (sb_on_inode_list(sb, &bdi->b_dirty) ||
+		    sb_on_inode_list(sb, &bdi->b_io) ||
+		    sb_on_inode_list(sb, &bdi->b_more_io)) {
+			ret = 1;
+			break;
+		}
+	}
+	mutex_unlock(&bdi_lock);
+
+	return ret;
 }
 EXPORT_SYMBOL(sb_has_dirty_inodes);
 
@@ -322,11 +355,11 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 	if (inode->i_state & I_SYNC) {
 		/*
 		 * If this inode is locked for writeback and we are not doing
-		 * writeback-for-data-integrity, move it to s_more_io so that
+		 * writeback-for-data-integrity, move it to b_more_io so that
 		 * writeback can proceed with the other inodes on s_io.
 		 *
 		 * We'll have another go at writing back this inode when we
-		 * completed a full scan of s_io.
+		 * completed a full scan of b_io.
 		 */
 		if (!wait) {
 			requeue_io(inode);
@@ -371,11 +404,11 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 			/*
 			 * We didn't write back all the pages.  nfs_writepages()
 			 * sometimes bales out without doing anything. Redirty
-			 * the inode; Move it from s_io onto s_more_io/s_dirty.
+			 * the inode; Move it from b_io onto b_more_io/b_dirty.
 			 */
 			/*
 			 * akpm: if the caller was the kupdate function we put
-			 * this inode at the head of s_dirty so it gets first
+			 * this inode at the head of b_dirty so it gets first
 			 * consideration.  Otherwise, move it to the tail, for
 			 * the reasons described there.  I'm not really sure
 			 * how much sense this makes.  Presumably I had a good
@@ -385,7 +418,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 			if (wbc->for_kupdate) {
 				/*
 				 * For the kupdate function we move the inode
-				 * to s_more_io so it will get more writeout as
+				 * to b_more_io so it will get more writeout as
 				 * soon as the queue becomes uncongested.
 				 */
 				inode->i_state |= I_DIRTY_PAGES;
@@ -433,51 +466,34 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 	return ret;
 }
 
-/*
- * Write out a superblock's list of dirty inodes.  A wait will be performed
- * upon no inodes, all inodes or the final one, depending upon sync_mode.
- *
- * If older_than_this is non-NULL, then only write out inodes which
- * had their first dirtying at a time earlier than *older_than_this.
- *
- * If we're a pdflush thread, then implement pdflush collision avoidance
- * against the entire list.
- *
- * If `bdi' is non-zero then we're being asked to writeback a specific queue.
- * This function assumes that the blockdev superblock's inodes are backed by
- * a variety of queues, so all inodes are searched.  For other superblocks,
- * assume that all inodes are backed by the same queue.
- *
- * FIXME: this linear search could get expensive with many fileystems.  But
- * how to fix?  We need to go from an address_space to all inodes which share
- * a queue with that address_space.  (Easy: have a global "dirty superblocks"
- * list).
- *
- * The inodes to be written are parked on sb->s_io.  They are moved back onto
- * sb->s_dirty as they are selected for writing.  This way, none can be missed
- * on the writer throttling path, and we get decent balancing between many
- * throttled threads: we don't want them all piling up on inode_sync_wait.
- */
-static void generic_sync_sb_inodes(struct super_block *sb,
-				   struct writeback_control *wbc)
+static void generic_sync_bdi_inodes(struct backing_dev_info *bdi,
+				    struct writeback_control *wbc,
+				    struct super_block *sb)
 {
+	const int is_blkdev_sb = sb_is_blkdev_sb(sb);
 	const unsigned long start = jiffies;	/* livelock avoidance */
-	int sync = wbc->sync_mode == WB_SYNC_ALL;
 
 	spin_lock(&inode_lock);
-	if (!wbc->for_kupdate || list_empty(&sb->s_io))
-		queue_io(sb, wbc->older_than_this);
 
-	while (!list_empty(&sb->s_io)) {
-		struct inode *inode = list_entry(sb->s_io.prev,
+	if (!wbc->for_kupdate || list_empty(&bdi->b_io))
+		queue_io(bdi, wbc->older_than_this);
+
+	while (!list_empty(&bdi->b_io)) {
+		struct inode *inode = list_entry(bdi->b_io.prev,
 						struct inode, i_list);
-		struct address_space *mapping = inode->i_mapping;
-		struct backing_dev_info *bdi = mapping->backing_dev_info;
 		long pages_skipped;
 
+		/*
+		 * super block given and doesn't match, skip this inode
+		 */
+		if (sb && sb != inode->i_sb) {
+			redirty_tail(inode);
+			continue;
+		}
+
 		if (!bdi_cap_writeback_dirty(bdi)) {
 			redirty_tail(inode);
-			if (sb_is_blkdev_sb(sb)) {
+			if (is_blkdev_sb) {
 				/*
 				 * Dirty memory-backed blockdev: the ramdisk
 				 * driver does this.  Skip just this inode
@@ -499,14 +515,14 @@ static void generic_sync_sb_inodes(struct super_block *sb,
 
 		if (wbc->nonblocking && bdi_write_congested(bdi)) {
 			wbc->encountered_congestion = 1;
-			if (!sb_is_blkdev_sb(sb))
+			if (!is_blkdev_sb)
 				break;		/* Skip a congested fs */
 			requeue_io(inode);
 			continue;		/* Skip a congested blockdev */
 		}
 
 		if (wbc->bdi && bdi != wbc->bdi) {
-			if (!sb_is_blkdev_sb(sb))
+			if (!is_blkdev_sb)
 				break;		/* fs has the wrong queue */
 			requeue_io(inode);
 			continue;		/* blockdev has wrong queue */
@@ -544,13 +560,57 @@ static void generic_sync_sb_inodes(struct super_block *sb,
 			wbc->more_io = 1;
 			break;
 		}
-		if (!list_empty(&sb->s_more_io))
+		if (!list_empty(&bdi->b_more_io))
 			wbc->more_io = 1;
 	}
 
-	if (sync) {
+	spin_unlock(&inode_lock);
+	/* Leave any unwritten inodes on b_io */
+}
+
+/*
+ * Write out a superblock's list of dirty inodes.  A wait will be performed
+ * upon no inodes, all inodes or the final one, depending upon sync_mode.
+ *
+ * If older_than_this is non-NULL, then only write out inodes which
+ * had their first dirtying at a time earlier than *older_than_this.
+ *
+ * If we're a pdlfush thread, then implement pdflush collision avoidance
+ * against the entire list.
+ *
+ * If `bdi' is non-zero then we're being asked to writeback a specific queue.
+ * This function assumes that the blockdev superblock's inodes are backed by
+ * a variety of queues, so all inodes are searched.  For other superblocks,
+ * assume that all inodes are backed by the same queue.
+ *
+ * FIXME: this linear search could get expensive with many fileystems.  But
+ * how to fix?  We need to go from an address_space to all inodes which share
+ * a queue with that address_space.  (Easy: have a global "dirty superblocks"
+ * list).
+ *
+ * The inodes to be written are parked on bdi->b_io.  They are moved back onto
+ * bdi->b_dirty as they are selected for writing.  This way, none can be missed
+ * on the writer throttling path, and we get decent balancing between many
+ * throttled threads: we don't want them all piling up on inode_sync_wait.
+ */
+static void generic_sync_sb_inodes(struct super_block *sb,
+				   struct writeback_control *wbc)
+{
+	struct backing_dev_info *bdi;
+
+	if (!wbc->bdi) {
+		mutex_lock(&bdi_lock);
+		list_for_each_entry(bdi, &bdi_list, bdi_list)
+			generic_sync_bdi_inodes(bdi, wbc, sb);
+		mutex_unlock(&bdi_lock);
+	} else
+		generic_sync_bdi_inodes(wbc->bdi, wbc, sb);
+
+	if (wbc->sync_mode == WB_SYNC_ALL) {
 		struct inode *inode, *old_inode = NULL;
 
+		spin_lock(&inode_lock);
+
 		/*
 		 * Data integrity sync. Must wait for all pages under writeback,
 		 * because there may have been pages dirtied before our sync
@@ -588,10 +648,7 @@ static void generic_sync_sb_inodes(struct super_block *sb,
 		}
 		spin_unlock(&inode_lock);
 		iput(old_inode);
-	} else
-		spin_unlock(&inode_lock);
-
-	return;		/* Leave any unwritten inodes on s_io */
+	}
 }
 
 /*
@@ -599,8 +656,8 @@ static void generic_sync_sb_inodes(struct super_block *sb,
  *
  * Note:
  * We don't need to grab a reference to superblock here. If it has non-empty
- * ->s_dirty it's hadn't been killed yet and kill_super() won't proceed
- * past sync_inodes_sb() until the ->s_dirty/s_io/s_more_io lists are all
+ * ->b_dirty it's hadn't been killed yet and kill_super() won't proceed
+ * past sync_inodes_sb() until the ->b_dirty/b_io/b_more_io lists are all
  * empty. Since __sync_single_inode() regains inode_lock before it finally moves
  * inode from superblock lists we are OK.
  *
diff --git a/fs/super.c b/fs/super.c
index 2761d3e22ed9..0d22ce3be4aa 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -62,9 +62,6 @@ static struct super_block *alloc_super(struct file_system_type *type)
 			s = NULL;
 			goto out;
 		}
-		INIT_LIST_HEAD(&s->s_dirty);
-		INIT_LIST_HEAD(&s->s_io);
-		INIT_LIST_HEAD(&s->s_more_io);
 		INIT_LIST_HEAD(&s->s_files);
 		INIT_LIST_HEAD(&s->s_instances);
 		INIT_HLIST_HEAD(&s->s_anon);
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 1d52425a6118..928cd5484f4d 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -40,6 +40,8 @@ enum bdi_stat_item {
 #define BDI_STAT_BATCH (8*(1+ilog2(nr_cpu_ids)))
 
 struct backing_dev_info {
+	struct list_head bdi_list;
+
 	unsigned long ra_pages;	/* max readahead in PAGE_CACHE_SIZE units */
 	unsigned long state;	/* Always use atomic bitops on this */
 	unsigned int capabilities; /* Device capabilities */
@@ -58,6 +60,10 @@ struct backing_dev_info {
 
 	struct device *dev;
 
+	struct list_head	b_dirty;	/* dirty inodes */
+	struct list_head	b_io;		/* parked for writeback */
+	struct list_head	b_more_io;	/* parked for more writeback */
+
 #ifdef CONFIG_DEBUG_FS
 	struct dentry *debug_dir;
 	struct dentry *debug_stats;
@@ -72,6 +78,9 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
 int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev);
 void bdi_unregister(struct backing_dev_info *bdi);
 
+extern struct mutex bdi_lock;
+extern struct list_head bdi_list;
+
 static inline void __add_bdi_stat(struct backing_dev_info *bdi,
 		enum bdi_stat_item item, s64 amount)
 {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 46ff7dd6e164..56371be1be65 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -715,7 +715,7 @@ struct posix_acl;
 
 struct inode {
 	struct hlist_node	i_hash;
-	struct list_head	i_list;
+	struct list_head	i_list;		/* backing dev IO list */
 	struct list_head	i_sb_list;
 	struct list_head	i_dentry;
 	unsigned long		i_ino;
@@ -1336,9 +1336,6 @@ struct super_block {
 	struct xattr_handler	**s_xattr;
 
 	struct list_head	s_inodes;	/* all inodes */
-	struct list_head	s_dirty;	/* dirty inodes */
-	struct list_head	s_io;		/* parked for writeback */
-	struct list_head	s_more_io;	/* parked for more writeback */
 	struct hlist_head	s_anon;		/* anonymous dentries for (nfs) exporting */
 	struct list_head	s_files;
 	/* s_dentry_lru and s_nr_dentry_unused are protected by dcache_lock */
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index c86edd244294..6f163e0f0509 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -22,6 +22,8 @@ struct backing_dev_info default_backing_dev_info = {
 EXPORT_SYMBOL_GPL(default_backing_dev_info);
 
 static struct class *bdi_class;
+DEFINE_MUTEX(bdi_lock);
+LIST_HEAD(bdi_list);
 
 #ifdef CONFIG_DEBUG_FS
 #include <linux/debugfs.h>
@@ -211,6 +213,10 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
 		goto exit;
 	}
 
+	mutex_lock(&bdi_lock);
+	list_add_tail(&bdi->bdi_list, &bdi_list);
+	mutex_unlock(&bdi_lock);
+
 	bdi->dev = dev;
 	bdi_debug_register(bdi, dev_name(dev));
 
@@ -225,9 +231,17 @@ int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev)
 }
 EXPORT_SYMBOL(bdi_register_dev);
 
+static void bdi_remove_from_list(struct backing_dev_info *bdi)
+{
+	mutex_lock(&bdi_lock);
+	list_del(&bdi->bdi_list);
+	mutex_unlock(&bdi_lock);
+}
+
 void bdi_unregister(struct backing_dev_info *bdi)
 {
 	if (bdi->dev) {
+		bdi_remove_from_list(bdi);
 		bdi_debug_unregister(bdi);
 		device_unregister(bdi->dev);
 		bdi->dev = NULL;
@@ -245,6 +259,10 @@ int bdi_init(struct backing_dev_info *bdi)
 	bdi->min_ratio = 0;
 	bdi->max_ratio = 100;
 	bdi->max_prop_frac = PROP_FRAC_BASE;
+	INIT_LIST_HEAD(&bdi->bdi_list);
+	INIT_LIST_HEAD(&bdi->b_io);
+	INIT_LIST_HEAD(&bdi->b_dirty);
+	INIT_LIST_HEAD(&bdi->b_more_io);
 
 	for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
 		err = percpu_counter_init(&bdi->bdi_stat[i], 0);
@@ -259,6 +277,8 @@ int bdi_init(struct backing_dev_info *bdi)
 err:
 		while (i--)
 			percpu_counter_destroy(&bdi->bdi_stat[i]);
+
+		bdi_remove_from_list(bdi);
 	}
 
 	return err;
@@ -269,6 +289,10 @@ void bdi_destroy(struct backing_dev_info *bdi)
 {
 	int i;
 
+	WARN_ON(!list_empty(&bdi->b_dirty));
+	WARN_ON(!list_empty(&bdi->b_io));
+	WARN_ON(!list_empty(&bdi->b_more_io));
+
 	bdi_unregister(bdi);
 
 	for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 81627ebcd313..f8341b6019bf 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -320,15 +320,13 @@ static void task_dirty_limit(struct task_struct *tsk, unsigned long *pdirty)
 /*
  *
  */
-static DEFINE_SPINLOCK(bdi_lock);
 static unsigned int bdi_min_ratio;
 
 int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
 {
 	int ret = 0;
-	unsigned long flags;
 
-	spin_lock_irqsave(&bdi_lock, flags);
+	mutex_lock(&bdi_lock);
 	if (min_ratio > bdi->max_ratio) {
 		ret = -EINVAL;
 	} else {
@@ -340,27 +338,26 @@ int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
 			ret = -EINVAL;
 		}
 	}
-	spin_unlock_irqrestore(&bdi_lock, flags);
+	mutex_unlock(&bdi_lock);
 
 	return ret;
 }
 
 int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
 {
-	unsigned long flags;
 	int ret = 0;
 
 	if (max_ratio > 100)
 		return -EINVAL;
 
-	spin_lock_irqsave(&bdi_lock, flags);
+	mutex_lock(&bdi_lock);
 	if (bdi->min_ratio > max_ratio) {
 		ret = -EINVAL;
 	} else {
 		bdi->max_ratio = max_ratio;
 		bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100;
 	}
-	spin_unlock_irqrestore(&bdi_lock, flags);
+	mutex_unlock(&bdi_lock);
 
 	return ret;
 }
-- 
cgit v1.2.3-70-g09d2


From 03ba3782e8dcc5b0e1efe440d33084f066e38cae Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Wed, 9 Sep 2009 09:08:54 +0200
Subject: writeback: switch to per-bdi threads for flushing data

This gets rid of pdflush for bdi writeout and kupdated style cleaning.
pdflush writeout suffers from lack of locality and also requires more
threads to handle the same workload, since it has to work in a
non-blocking fashion against each queue. This also introduces lumpy
behaviour and potential request starvation, since pdflush can be starved
for queue access if others are accessing it. A sample ffsb workload that
does random writes to files is about 8% faster here on a simple SATA drive
during the benchmark phase. File layout also seems a LOT more smooth in
vmstat:

 r  b   swpd   free   buff  cache   si   so    bi    bo   in    cs us sy id wa
 0  1      0 608848   2652 375372    0    0     0 71024  604    24  1 10 48 42
 0  1      0 549644   2712 433736    0    0     0 60692  505    27  1  8 48 44
 1  0      0 476928   2784 505192    0    0     4 29540  553    24  0  9 53 37
 0  1      0 457972   2808 524008    0    0     0 54876  331    16  0  4 38 58
 0  1      0 366128   2928 614284    0    0     4 92168  710    58  0 13 53 34
 0  1      0 295092   3000 684140    0    0     0 62924  572    23  0  9 53 37
 0  1      0 236592   3064 741704    0    0     4 58256  523    17  0  8 48 44
 0  1      0 165608   3132 811464    0    0     0 57460  560    21  0  8 54 38
 0  1      0 102952   3200 873164    0    0     4 74748  540    29  1 10 48 41
 0  1      0  48604   3252 926472    0    0     0 53248  469    29  0  7 47 45

where vanilla tends to fluctuate a lot in the creation phase:

 r  b   swpd   free   buff  cache   si   so    bi    bo   in    cs us sy id wa
 1  1      0 678716   5792 303380    0    0     0 74064  565    50  1 11 52 36
 1  0      0 662488   5864 319396    0    0     4   352  302   329  0  2 47 51
 0  1      0 599312   5924 381468    0    0     0 78164  516    55  0  9 51 40
 0  1      0 519952   6008 459516    0    0     4 78156  622    56  1 11 52 37
 1  1      0 436640   6092 541632    0    0     0 82244  622    54  0 11 48 41
 0  1      0 436640   6092 541660    0    0     0     8  152    39  0  0 51 49
 0  1      0 332224   6200 644252    0    0     4 102800  728    46  1 13 49 36
 1  0      0 274492   6260 701056    0    0     4 12328  459    49  0  7 50 43
 0  1      0 211220   6324 763356    0    0     0 106940  515    37  1 10 51 39
 1  0      0 160412   6376 813468    0    0     0  8224  415    43  0  6 49 45
 1  1      0  85980   6452 886556    0    0     4 113516  575    39  1 11 54 34
 0  2      0  85968   6452 886620    0    0     0  1640  158   211  0  0 46 54

A 10 disk test with btrfs performs 26% faster with per-bdi flushing. A
SSD based writeback test on XFS performs over 20% better as well, with
the throughput being very stable around 1GB/sec, where pdflush only
manages 750MB/sec and fluctuates wildly while doing so. Random buffered
writes to many files behave a lot better as well, as does random mmap'ed
writes.

A separate thread is added to sync the super blocks. In the long term,
adding sync_supers_bdi() functionality could get rid of this thread again.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/buffer.c                 |   2 +-
 fs/fs-writeback.c           | 999 +++++++++++++++++++++++++++++++-------------
 fs/super.c                  |   2 +-
 fs/sync.c                   |   2 +-
 include/linux/backing-dev.h |  55 ++-
 include/linux/fs.h          |   2 +-
 include/linux/writeback.h   |   8 +-
 mm/backing-dev.c            | 341 ++++++++++++++-
 mm/page-writeback.c         | 179 ++------
 mm/vmscan.c                 |   2 +-
 10 files changed, 1120 insertions(+), 472 deletions(-)

(limited to 'fs')

diff --git a/fs/buffer.c b/fs/buffer.c
index 28f320fac4d4..90a98865b0cc 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -281,7 +281,7 @@ static void free_more_memory(void)
 	struct zone *zone;
 	int nid;
 
-	wakeup_pdflush(1024);
+	wakeup_flusher_threads(1024);
 	yield();
 
 	for_each_online_node(nid) {
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 45ad4bb700e6..7f6dae8aa47f 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -19,6 +19,8 @@
 #include <linux/sched.h>
 #include <linux/fs.h>
 #include <linux/mm.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
 #include <linux/writeback.h>
 #include <linux/blkdev.h>
 #include <linux/backing-dev.h>
@@ -27,165 +29,208 @@
 
 #define inode_to_bdi(inode)	((inode)->i_mapping->backing_dev_info)
 
-/**
- * writeback_acquire - attempt to get exclusive writeback access to a device
- * @bdi: the device's backing_dev_info structure
- *
- * It is a waste of resources to have more than one pdflush thread blocked on
- * a single request queue.  Exclusion at the request_queue level is obtained
- * via a flag in the request_queue's backing_dev_info.state.
- *
- * Non-request_queue-backed address_spaces will share default_backing_dev_info,
- * unless they implement their own.  Which is somewhat inefficient, as this
- * may prevent concurrent writeback against multiple devices.
+/*
+ * Work items for the bdi_writeback threads
  */
-static int writeback_acquire(struct backing_dev_info *bdi)
+struct bdi_work {
+	struct list_head list;
+	struct list_head wait_list;
+	struct rcu_head rcu_head;
+
+	unsigned long seen;
+	atomic_t pending;
+
+	struct super_block *sb;
+	unsigned long nr_pages;
+	enum writeback_sync_modes sync_mode;
+
+	unsigned long state;
+};
+
+enum {
+	WS_USED_B = 0,
+	WS_ONSTACK_B,
+};
+
+#define WS_USED (1 << WS_USED_B)
+#define WS_ONSTACK (1 << WS_ONSTACK_B)
+
+static inline bool bdi_work_on_stack(struct bdi_work *work)
+{
+	return test_bit(WS_ONSTACK_B, &work->state);
+}
+
+static inline void bdi_work_init(struct bdi_work *work,
+				 struct writeback_control *wbc)
+{
+	INIT_RCU_HEAD(&work->rcu_head);
+	work->sb = wbc->sb;
+	work->nr_pages = wbc->nr_to_write;
+	work->sync_mode = wbc->sync_mode;
+	work->state = WS_USED;
+}
+
+static inline void bdi_work_init_on_stack(struct bdi_work *work,
+					  struct writeback_control *wbc)
 {
-	return !test_and_set_bit(BDI_pdflush, &bdi->state);
+	bdi_work_init(work, wbc);
+	work->state |= WS_ONSTACK;
 }
 
 /**
  * writeback_in_progress - determine whether there is writeback in progress
  * @bdi: the device's backing_dev_info structure.
  *
- * Determine whether there is writeback in progress against a backing device.
+ * Determine whether there is writeback waiting to be handled against a
+ * backing device.
  */
 int writeback_in_progress(struct backing_dev_info *bdi)
 {
-	return test_bit(BDI_pdflush, &bdi->state);
+	return !list_empty(&bdi->work_list);
 }
 
-/**
- * writeback_release - relinquish exclusive writeback access against a device.
- * @bdi: the device's backing_dev_info structure
- */
-static void writeback_release(struct backing_dev_info *bdi)
+static void bdi_work_clear(struct bdi_work *work)
 {
-	BUG_ON(!writeback_in_progress(bdi));
-	clear_bit(BDI_pdflush, &bdi->state);
+	clear_bit(WS_USED_B, &work->state);
+	smp_mb__after_clear_bit();
+	wake_up_bit(&work->state, WS_USED_B);
 }
 
-static noinline void block_dump___mark_inode_dirty(struct inode *inode)
+static void bdi_work_free(struct rcu_head *head)
 {
-	if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
-		struct dentry *dentry;
-		const char *name = "?";
+	struct bdi_work *work = container_of(head, struct bdi_work, rcu_head);
 
-		dentry = d_find_alias(inode);
-		if (dentry) {
-			spin_lock(&dentry->d_lock);
-			name = (const char *) dentry->d_name.name;
-		}
-		printk(KERN_DEBUG
-		       "%s(%d): dirtied inode %lu (%s) on %s\n",
-		       current->comm, task_pid_nr(current), inode->i_ino,
-		       name, inode->i_sb->s_id);
-		if (dentry) {
-			spin_unlock(&dentry->d_lock);
-			dput(dentry);
-		}
-	}
+	if (!bdi_work_on_stack(work))
+		kfree(work);
+	else
+		bdi_work_clear(work);
 }
 
-/**
- *	__mark_inode_dirty -	internal function
- *	@inode: inode to mark
- *	@flags: what kind of dirty (i.e. I_DIRTY_SYNC)
- *	Mark an inode as dirty. Callers should use mark_inode_dirty or
- *  	mark_inode_dirty_sync.
- *
- * Put the inode on the super block's dirty list.
- *
- * CAREFUL! We mark it dirty unconditionally, but move it onto the
- * dirty list only if it is hashed or if it refers to a blockdev.
- * If it was not hashed, it will never be added to the dirty list
- * even if it is later hashed, as it will have been marked dirty already.
- *
- * In short, make sure you hash any inodes _before_ you start marking
- * them dirty.
- *
- * This function *must* be atomic for the I_DIRTY_PAGES case -
- * set_page_dirty() is called under spinlock in several places.
- *
- * Note that for blockdevs, inode->dirtied_when represents the dirtying time of
- * the block-special inode (/dev/hda1) itself.  And the ->dirtied_when field of
- * the kernel-internal blockdev inode represents the dirtying time of the
- * blockdev's pages.  This is why for I_DIRTY_PAGES we always use
- * page->mapping->host, so the page-dirtying time is recorded in the internal
- * blockdev inode.
- */
-void __mark_inode_dirty(struct inode *inode, int flags)
+static void wb_work_complete(struct bdi_work *work)
 {
-	struct super_block *sb = inode->i_sb;
+	const enum writeback_sync_modes sync_mode = work->sync_mode;
 
 	/*
-	 * Don't do this for I_DIRTY_PAGES - that doesn't actually
-	 * dirty the inode itself
+	 * For allocated work, we can clear the done/seen bit right here.
+	 * For on-stack work, we need to postpone both the clear and free
+	 * to after the RCU grace period, since the stack could be invalidated
+	 * as soon as bdi_work_clear() has done the wakeup.
 	 */
-	if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
-		if (sb->s_op->dirty_inode)
-			sb->s_op->dirty_inode(inode);
-	}
+	if (!bdi_work_on_stack(work))
+		bdi_work_clear(work);
+	if (sync_mode == WB_SYNC_NONE || bdi_work_on_stack(work))
+		call_rcu(&work->rcu_head, bdi_work_free);
+}
 
+static void wb_clear_pending(struct bdi_writeback *wb, struct bdi_work *work)
+{
 	/*
-	 * make sure that changes are seen by all cpus before we test i_state
-	 * -- mikulas
+	 * The caller has retrieved the work arguments from this work,
+	 * drop our reference. If this is the last ref, delete and free it
 	 */
-	smp_mb();
+	if (atomic_dec_and_test(&work->pending)) {
+		struct backing_dev_info *bdi = wb->bdi;
 
-	/* avoid the locking if we can */
-	if ((inode->i_state & flags) == flags)
-		return;
-
-	if (unlikely(block_dump))
-		block_dump___mark_inode_dirty(inode);
+		spin_lock(&bdi->wb_lock);
+		list_del_rcu(&work->list);
+		spin_unlock(&bdi->wb_lock);
 
-	spin_lock(&inode_lock);
-	if ((inode->i_state & flags) != flags) {
-		const int was_dirty = inode->i_state & I_DIRTY;
+		wb_work_complete(work);
+	}
+}
 
-		inode->i_state |= flags;
+static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work)
+{
+	if (work) {
+		work->seen = bdi->wb_mask;
+		BUG_ON(!work->seen);
+		atomic_set(&work->pending, bdi->wb_cnt);
+		BUG_ON(!bdi->wb_cnt);
 
 		/*
-		 * If the inode is being synced, just update its dirty state.
-		 * The unlocker will place the inode on the appropriate
-		 * superblock list, based upon its state.
+		 * Make sure stores are seen before it appears on the list
 		 */
-		if (inode->i_state & I_SYNC)
-			goto out;
+		smp_mb();
 
-		/*
-		 * Only add valid (hashed) inodes to the superblock's
-		 * dirty list.  Add blockdev inodes as well.
-		 */
-		if (!S_ISBLK(inode->i_mode)) {
-			if (hlist_unhashed(&inode->i_hash))
-				goto out;
-		}
-		if (inode->i_state & (I_FREEING|I_CLEAR))
-			goto out;
+		spin_lock(&bdi->wb_lock);
+		list_add_tail_rcu(&work->list, &bdi->work_list);
+		spin_unlock(&bdi->wb_lock);
+	}
+
+	/*
+	 * If the default thread isn't there, make sure we add it. When
+	 * it gets created and wakes up, we'll run this work.
+	 */
+	if (unlikely(list_empty_careful(&bdi->wb_list)))
+		wake_up_process(default_backing_dev_info.wb.task);
+	else {
+		struct bdi_writeback *wb = &bdi->wb;
 
 		/*
-		 * If the inode was already on b_dirty/b_io/b_more_io, don't
-		 * reposition it (that would break b_dirty time-ordering).
+		 * If we failed allocating the bdi work item, wake up the wb
+		 * thread always. As a safety precaution, it'll flush out
+		 * everything
 		 */
-		if (!was_dirty) {
-			inode->dirtied_when = jiffies;
-			list_move(&inode->i_list,
-					&inode_to_bdi(inode)->b_dirty);
-		}
+		if (!wb_has_dirty_io(wb)) {
+			if (work)
+				wb_clear_pending(wb, work);
+		} else if (wb->task)
+			wake_up_process(wb->task);
 	}
-out:
-	spin_unlock(&inode_lock);
 }
 
-EXPORT_SYMBOL(__mark_inode_dirty);
+/*
+ * Used for on-stack allocated work items. The caller needs to wait until
+ * the wb threads have acked the work before it's safe to continue.
+ */
+static void bdi_wait_on_work_clear(struct bdi_work *work)
+{
+	wait_on_bit(&work->state, WS_USED_B, bdi_sched_wait,
+		    TASK_UNINTERRUPTIBLE);
+}
 
-static int write_inode(struct inode *inode, int sync)
+static struct bdi_work *bdi_alloc_work(struct writeback_control *wbc)
 {
-	if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode))
-		return inode->i_sb->s_op->write_inode(inode, sync);
-	return 0;
+	struct bdi_work *work;
+
+	work = kmalloc(sizeof(*work), GFP_ATOMIC);
+	if (work)
+		bdi_work_init(work, wbc);
+
+	return work;
+}
+
+void bdi_start_writeback(struct writeback_control *wbc)
+{
+	const bool must_wait = wbc->sync_mode == WB_SYNC_ALL;
+	struct bdi_work work_stack, *work = NULL;
+
+	if (!must_wait)
+		work = bdi_alloc_work(wbc);
+
+	if (!work) {
+		work = &work_stack;
+		bdi_work_init_on_stack(work, wbc);
+	}
+
+	bdi_queue_work(wbc->bdi, work);
+
+	/*
+	 * If the sync mode is WB_SYNC_ALL, block waiting for the work to
+	 * complete. If not, we only need to wait for the work to be started,
+	 * if we allocated it on-stack. We use the same mechanism, if the
+	 * wait bit is set in the bdi_work struct, then threads will not
+	 * clear pending until after they are done.
+	 *
+	 * Note that work == &work_stack if must_wait is true, so we don't
+	 * need to do call_rcu() here ever, since the completion path will
+	 * have done that for us.
+	 */
+	if (must_wait || work == &work_stack) {
+		bdi_wait_on_work_clear(work);
+		if (work != &work_stack)
+			call_rcu(&work->rcu_head, bdi_work_free);
+	}
 }
 
 /*
@@ -199,16 +244,16 @@ static int write_inode(struct inode *inode, int sync)
  */
 static void redirty_tail(struct inode *inode)
 {
-	struct backing_dev_info *bdi = inode_to_bdi(inode);
+	struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
 
-	if (!list_empty(&bdi->b_dirty)) {
+	if (!list_empty(&wb->b_dirty)) {
 		struct inode *tail;
 
-		tail = list_entry(bdi->b_dirty.next, struct inode, i_list);
+		tail = list_entry(wb->b_dirty.next, struct inode, i_list);
 		if (time_before(inode->dirtied_when, tail->dirtied_when))
 			inode->dirtied_when = jiffies;
 	}
-	list_move(&inode->i_list, &bdi->b_dirty);
+	list_move(&inode->i_list, &wb->b_dirty);
 }
 
 /*
@@ -216,7 +261,9 @@ static void redirty_tail(struct inode *inode)
  */
 static void requeue_io(struct inode *inode)
 {
-	list_move(&inode->i_list, &inode_to_bdi(inode)->b_more_io);
+	struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
+
+	list_move(&inode->i_list, &wb->b_more_io);
 }
 
 static void inode_sync_complete(struct inode *inode)
@@ -263,52 +310,18 @@ static void move_expired_inodes(struct list_head *delaying_queue,
 /*
  * Queue all expired dirty inodes for io, eldest first.
  */
-static void queue_io(struct backing_dev_info *bdi,
-		     unsigned long *older_than_this)
+static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
 {
-	list_splice_init(&bdi->b_more_io, bdi->b_io.prev);
-	move_expired_inodes(&bdi->b_dirty, &bdi->b_io, older_than_this);
+	list_splice_init(&wb->b_more_io, wb->b_io.prev);
+	move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
 }
 
-static int sb_on_inode_list(struct super_block *sb, struct list_head *list)
-{
-	struct inode *inode;
-	int ret = 0;
-
-	spin_lock(&inode_lock);
-	list_for_each_entry(inode, list, i_list) {
-		if (inode->i_sb == sb) {
-			ret = 1;
-			break;
-		}
-	}
-	spin_unlock(&inode_lock);
-	return ret;
-}
-
-int sb_has_dirty_inodes(struct super_block *sb)
+static int write_inode(struct inode *inode, int sync)
 {
-	struct backing_dev_info *bdi;
-	int ret = 0;
-
-	/*
-	 * This is REALLY expensive right now, but it'll go away
-	 * when the bdi writeback is introduced
-	 */
-	mutex_lock(&bdi_lock);
-	list_for_each_entry(bdi, &bdi_list, bdi_list) {
-		if (sb_on_inode_list(sb, &bdi->b_dirty) ||
-		    sb_on_inode_list(sb, &bdi->b_io) ||
-		    sb_on_inode_list(sb, &bdi->b_more_io)) {
-			ret = 1;
-			break;
-		}
-	}
-	mutex_unlock(&bdi_lock);
-
-	return ret;
+	if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode))
+		return inode->i_sb->s_op->write_inode(inode, sync);
+	return 0;
 }
-EXPORT_SYMBOL(sb_has_dirty_inodes);
 
 /*
  * Wait for writeback on an inode to complete.
@@ -466,20 +479,71 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 	return ret;
 }
 
-static void generic_sync_bdi_inodes(struct backing_dev_info *bdi,
-				    struct writeback_control *wbc,
-				    struct super_block *sb)
+/*
+ * For WB_SYNC_NONE writeback, the caller does not have the sb pinned
+ * before calling writeback. So make sure that we do pin it, so it doesn't
+ * go away while we are writing inodes from it.
+ *
+ * Returns 0 if the super was successfully pinned (or pinning wasn't needed),
+ * 1 if we failed.
+ */
+static int pin_sb_for_writeback(struct writeback_control *wbc,
+				   struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+
+	/*
+	 * Caller must already hold the ref for this
+	 */
+	if (wbc->sync_mode == WB_SYNC_ALL) {
+		WARN_ON(!rwsem_is_locked(&sb->s_umount));
+		return 0;
+	}
+
+	spin_lock(&sb_lock);
+	sb->s_count++;
+	if (down_read_trylock(&sb->s_umount)) {
+		if (sb->s_root) {
+			spin_unlock(&sb_lock);
+			return 0;
+		}
+		/*
+		 * umounted, drop rwsem again and fall through to failure
+		 */
+		up_read(&sb->s_umount);
+	}
+
+	sb->s_count--;
+	spin_unlock(&sb_lock);
+	return 1;
+}
+
+static void unpin_sb_for_writeback(struct writeback_control *wbc,
+				   struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+
+	if (wbc->sync_mode == WB_SYNC_ALL)
+		return;
+
+	up_read(&sb->s_umount);
+	put_super(sb);
+}
+
+static void writeback_inodes_wb(struct bdi_writeback *wb,
+				struct writeback_control *wbc)
 {
+	struct super_block *sb = wbc->sb;
 	const int is_blkdev_sb = sb_is_blkdev_sb(sb);
 	const unsigned long start = jiffies;	/* livelock avoidance */
 
 	spin_lock(&inode_lock);
 
-	if (!wbc->for_kupdate || list_empty(&bdi->b_io))
-		queue_io(bdi, wbc->older_than_this);
+	if (!wbc->for_kupdate || list_empty(&wb->b_io))
+		queue_io(wb, wbc->older_than_this);
 
-	while (!list_empty(&bdi->b_io)) {
-		struct inode *inode = list_entry(bdi->b_io.prev,
+	while (!list_empty(&wb->b_io)) {
+		struct inode *inode = list_entry(wb->b_io.prev,
 						struct inode, i_list);
 		long pages_skipped;
 
@@ -491,7 +555,7 @@ static void generic_sync_bdi_inodes(struct backing_dev_info *bdi,
 			continue;
 		}
 
-		if (!bdi_cap_writeback_dirty(bdi)) {
+		if (!bdi_cap_writeback_dirty(wb->bdi)) {
 			redirty_tail(inode);
 			if (is_blkdev_sb) {
 				/*
@@ -513,7 +577,7 @@ static void generic_sync_bdi_inodes(struct backing_dev_info *bdi,
 			continue;
 		}
 
-		if (wbc->nonblocking && bdi_write_congested(bdi)) {
+		if (wbc->nonblocking && bdi_write_congested(wb->bdi)) {
 			wbc->encountered_congestion = 1;
 			if (!is_blkdev_sb)
 				break;		/* Skip a congested fs */
@@ -521,13 +585,6 @@ static void generic_sync_bdi_inodes(struct backing_dev_info *bdi,
 			continue;		/* Skip a congested blockdev */
 		}
 
-		if (wbc->bdi && bdi != wbc->bdi) {
-			if (!is_blkdev_sb)
-				break;		/* fs has the wrong queue */
-			requeue_io(inode);
-			continue;		/* blockdev has wrong queue */
-		}
-
 		/*
 		 * Was this inode dirtied after sync_sb_inodes was called?
 		 * This keeps sync from extra jobs and livelock.
@@ -535,16 +592,16 @@ static void generic_sync_bdi_inodes(struct backing_dev_info *bdi,
 		if (inode_dirtied_after(inode, start))
 			break;
 
-		/* Is another pdflush already flushing this queue? */
-		if (current_is_pdflush() && !writeback_acquire(bdi))
-			break;
+		if (pin_sb_for_writeback(wbc, inode)) {
+			requeue_io(inode);
+			continue;
+		}
 
 		BUG_ON(inode->i_state & (I_FREEING | I_CLEAR));
 		__iget(inode);
 		pages_skipped = wbc->pages_skipped;
 		writeback_single_inode(inode, wbc);
-		if (current_is_pdflush())
-			writeback_release(bdi);
+		unpin_sb_for_writeback(wbc, inode);
 		if (wbc->pages_skipped != pages_skipped) {
 			/*
 			 * writeback is not making progress due to locked
@@ -560,7 +617,7 @@ static void generic_sync_bdi_inodes(struct backing_dev_info *bdi,
 			wbc->more_io = 1;
 			break;
 		}
-		if (!list_empty(&bdi->b_more_io))
+		if (!list_empty(&wb->b_more_io))
 			wbc->more_io = 1;
 	}
 
@@ -568,139 +625,500 @@ static void generic_sync_bdi_inodes(struct backing_dev_info *bdi,
 	/* Leave any unwritten inodes on b_io */
 }
 
+void writeback_inodes_wbc(struct writeback_control *wbc)
+{
+	struct backing_dev_info *bdi = wbc->bdi;
+
+	writeback_inodes_wb(&bdi->wb, wbc);
+}
+
 /*
- * Write out a superblock's list of dirty inodes.  A wait will be performed
- * upon no inodes, all inodes or the final one, depending upon sync_mode.
- *
- * If older_than_this is non-NULL, then only write out inodes which
- * had their first dirtying at a time earlier than *older_than_this.
- *
- * If we're a pdlfush thread, then implement pdflush collision avoidance
- * against the entire list.
+ * The maximum number of pages to writeout in a single bdi flush/kupdate
+ * operation.  We do this so we don't hold I_SYNC against an inode for
+ * enormous amounts of time, which would block a userspace task which has
+ * been forced to throttle against that inode.  Also, the code reevaluates
+ * the dirty each time it has written this many pages.
+ */
+#define MAX_WRITEBACK_PAGES     1024
+
+static inline bool over_bground_thresh(void)
+{
+	unsigned long background_thresh, dirty_thresh;
+
+	get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
+
+	return (global_page_state(NR_FILE_DIRTY) +
+		global_page_state(NR_UNSTABLE_NFS) >= background_thresh);
+}
+
+/*
+ * Explicit flushing or periodic writeback of "old" data.
  *
- * If `bdi' is non-zero then we're being asked to writeback a specific queue.
- * This function assumes that the blockdev superblock's inodes are backed by
- * a variety of queues, so all inodes are searched.  For other superblocks,
- * assume that all inodes are backed by the same queue.
+ * Define "old": the first time one of an inode's pages is dirtied, we mark the
+ * dirtying-time in the inode's address_space.  So this periodic writeback code
+ * just walks the superblock inode list, writing back any inodes which are
+ * older than a specific point in time.
  *
- * FIXME: this linear search could get expensive with many fileystems.  But
- * how to fix?  We need to go from an address_space to all inodes which share
- * a queue with that address_space.  (Easy: have a global "dirty superblocks"
- * list).
+ * Try to run once per dirty_writeback_interval.  But if a writeback event
+ * takes longer than a dirty_writeback_interval interval, then leave a
+ * one-second gap.
  *
- * The inodes to be written are parked on bdi->b_io.  They are moved back onto
- * bdi->b_dirty as they are selected for writing.  This way, none can be missed
- * on the writer throttling path, and we get decent balancing between many
- * throttled threads: we don't want them all piling up on inode_sync_wait.
+ * older_than_this takes precedence over nr_to_write.  So we'll only write back
+ * all dirty pages if they are all attached to "old" mappings.
  */
-static void generic_sync_sb_inodes(struct super_block *sb,
-				   struct writeback_control *wbc)
+static long wb_writeback(struct bdi_writeback *wb, long nr_pages,
+			 struct super_block *sb,
+			 enum writeback_sync_modes sync_mode, int for_kupdate)
 {
-	struct backing_dev_info *bdi;
-
-	if (!wbc->bdi) {
-		mutex_lock(&bdi_lock);
-		list_for_each_entry(bdi, &bdi_list, bdi_list)
-			generic_sync_bdi_inodes(bdi, wbc, sb);
-		mutex_unlock(&bdi_lock);
-	} else
-		generic_sync_bdi_inodes(wbc->bdi, wbc, sb);
+	struct writeback_control wbc = {
+		.bdi			= wb->bdi,
+		.sb			= sb,
+		.sync_mode		= sync_mode,
+		.older_than_this	= NULL,
+		.for_kupdate		= for_kupdate,
+		.range_cyclic		= 1,
+	};
+	unsigned long oldest_jif;
+	long wrote = 0;
 
-	if (wbc->sync_mode == WB_SYNC_ALL) {
-		struct inode *inode, *old_inode = NULL;
+	if (wbc.for_kupdate) {
+		wbc.older_than_this = &oldest_jif;
+		oldest_jif = jiffies -
+				msecs_to_jiffies(dirty_expire_interval * 10);
+	}
 
-		spin_lock(&inode_lock);
+	for (;;) {
+		/*
+		 * Don't flush anything for non-integrity writeback where
+		 * no nr_pages was given
+		 */
+		if (!for_kupdate && nr_pages <= 0 && sync_mode == WB_SYNC_NONE)
+			break;
 
 		/*
-		 * Data integrity sync. Must wait for all pages under writeback,
-		 * because there may have been pages dirtied before our sync
-		 * call, but which had writeout started before we write it out.
-		 * In which case, the inode may not be on the dirty list, but
-		 * we still have to wait for that writeout.
+		 * If no specific pages were given and this is just a
+		 * periodic background writeout and we are below the
+		 * background dirty threshold, don't do anything
 		 */
-		list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
-			struct address_space *mapping;
+		if (for_kupdate && nr_pages <= 0 && !over_bground_thresh())
+			break;
 
-			if (inode->i_state &
-					(I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
-				continue;
-			mapping = inode->i_mapping;
-			if (mapping->nrpages == 0)
+		wbc.more_io = 0;
+		wbc.encountered_congestion = 0;
+		wbc.nr_to_write = MAX_WRITEBACK_PAGES;
+		wbc.pages_skipped = 0;
+		writeback_inodes_wb(wb, &wbc);
+		nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
+		wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;
+
+		/*
+		 * If we ran out of stuff to write, bail unless more_io got set
+		 */
+		if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
+			if (wbc.more_io && !wbc.for_kupdate)
 				continue;
-			__iget(inode);
-			spin_unlock(&inode_lock);
+			break;
+		}
+	}
+
+	return wrote;
+}
+
+/*
+ * Return the next bdi_work struct that hasn't been processed by this
+ * wb thread yet
+ */
+static struct bdi_work *get_next_work_item(struct backing_dev_info *bdi,
+					   struct bdi_writeback *wb)
+{
+	struct bdi_work *work, *ret = NULL;
+
+	rcu_read_lock();
+
+	list_for_each_entry_rcu(work, &bdi->work_list, list) {
+		if (!test_and_clear_bit(wb->nr, &work->seen))
+			continue;
+
+		ret = work;
+		break;
+	}
+
+	rcu_read_unlock();
+	return ret;
+}
+
+static long wb_check_old_data_flush(struct bdi_writeback *wb)
+{
+	unsigned long expired;
+	long nr_pages;
+
+	expired = wb->last_old_flush +
+			msecs_to_jiffies(dirty_writeback_interval * 10);
+	if (time_before(jiffies, expired))
+		return 0;
+
+	wb->last_old_flush = jiffies;
+	nr_pages = global_page_state(NR_FILE_DIRTY) +
+			global_page_state(NR_UNSTABLE_NFS) +
+			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
+
+	if (nr_pages)
+		return wb_writeback(wb, nr_pages, NULL, WB_SYNC_NONE, 1);
+
+	return 0;
+}
+
+/*
+ * Retrieve work items and do the writeback they describe
+ */
+long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
+{
+	struct backing_dev_info *bdi = wb->bdi;
+	struct bdi_work *work;
+	long nr_pages, wrote = 0;
+
+	while ((work = get_next_work_item(bdi, wb)) != NULL) {
+		enum writeback_sync_modes sync_mode;
+
+		nr_pages = work->nr_pages;
+
+		/*
+		 * Override sync mode, in case we must wait for completion
+		 */
+		if (force_wait)
+			work->sync_mode = sync_mode = WB_SYNC_ALL;
+		else
+			sync_mode = work->sync_mode;
+
+		/*
+		 * If this isn't a data integrity operation, just notify
+		 * that we have seen this work and we are now starting it.
+		 */
+		if (sync_mode == WB_SYNC_NONE)
+			wb_clear_pending(wb, work);
+
+		wrote += wb_writeback(wb, nr_pages, work->sb, sync_mode, 0);
+
+		/*
+		 * This is a data integrity writeback, so only do the
+		 * notification when we have completed the work.
+		 */
+		if (sync_mode == WB_SYNC_ALL)
+			wb_clear_pending(wb, work);
+	}
+
+	/*
+	 * Check for periodic writeback, kupdated() style
+	 */
+	wrote += wb_check_old_data_flush(wb);
+
+	return wrote;
+}
+
+/*
+ * Handle writeback of dirty data for the device backed by this bdi. Also
+ * wakes up periodically and does kupdated style flushing.
+ */
+int bdi_writeback_task(struct bdi_writeback *wb)
+{
+	unsigned long last_active = jiffies;
+	unsigned long wait_jiffies = -1UL;
+	long pages_written;
+
+	while (!kthread_should_stop()) {
+		pages_written = wb_do_writeback(wb, 0);
+
+		if (pages_written)
+			last_active = jiffies;
+		else if (wait_jiffies != -1UL) {
+			unsigned long max_idle;
+
 			/*
-			 * We hold a reference to 'inode' so it couldn't have
-			 * been removed from s_inodes list while we dropped the
-			 * inode_lock.  We cannot iput the inode now as we can
-			 * be holding the last reference and we cannot iput it
-			 * under inode_lock. So we keep the reference and iput
-			 * it later.
+			 * Longest period of inactivity that we tolerate. If we
+			 * see dirty data again later, the task will get
+			 * recreated automatically.
 			 */
-			iput(old_inode);
-			old_inode = inode;
+			max_idle = max(5UL * 60 * HZ, wait_jiffies);
+			if (time_after(jiffies, max_idle + last_active))
+				break;
+		}
+
+		wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10);
+		set_current_state(TASK_INTERRUPTIBLE);
+		schedule_timeout(wait_jiffies);
+		try_to_freeze();
+	}
+
+	return 0;
+}
+
+/*
+ * Schedule writeback for all backing devices. Expensive! If this is a data
+ * integrity operation, writeback will be complete when this returns. If
+ * we are simply called for WB_SYNC_NONE, then writeback will merely be
+ * scheduled to run.
+ */
+static void bdi_writeback_all(struct writeback_control *wbc)
+{
+	const bool must_wait = wbc->sync_mode == WB_SYNC_ALL;
+	struct backing_dev_info *bdi;
+	struct bdi_work *work;
+	LIST_HEAD(list);
+
+restart:
+	spin_lock(&bdi_lock);
+
+	list_for_each_entry(bdi, &bdi_list, bdi_list) {
+		struct bdi_work *work;
+
+		if (!bdi_has_dirty_io(bdi))
+			continue;
 
-			filemap_fdatawait(mapping);
+		/*
+		 * If work allocation fails, do the writes inline. We drop
+		 * the lock and restart the list writeout. This should be OK,
+		 * since this happens rarely and because the writeout should
+		 * eventually make more free memory available.
+		 */
+		work = bdi_alloc_work(wbc);
+		if (!work) {
+			struct writeback_control __wbc;
 
-			cond_resched();
+			/*
+			 * Not a data integrity writeout, just continue
+			 */
+			if (!must_wait)
+				continue;
 
-			spin_lock(&inode_lock);
+			spin_unlock(&bdi_lock);
+			__wbc = *wbc;
+			__wbc.bdi = bdi;
+			writeback_inodes_wbc(&__wbc);
+			goto restart;
 		}
-		spin_unlock(&inode_lock);
-		iput(old_inode);
+		if (must_wait)
+			list_add_tail(&work->wait_list, &list);
+
+		bdi_queue_work(bdi, work);
+	}
+
+	spin_unlock(&bdi_lock);
+
+	/*
+	 * If this is for WB_SYNC_ALL, wait for pending work to complete
+	 * before returning.
+	 */
+	while (!list_empty(&list)) {
+		work = list_entry(list.next, struct bdi_work, wait_list);
+		list_del(&work->wait_list);
+		bdi_wait_on_work_clear(work);
+		call_rcu(&work->rcu_head, bdi_work_free);
 	}
 }
 
 /*
- * Start writeback of dirty pagecache data against all unlocked inodes.
+ * Start writeback of `nr_pages' pages.  If `nr_pages' is zero, write back
+ * the whole world.
+ */
+void wakeup_flusher_threads(long nr_pages)
+{
+	struct writeback_control wbc = {
+		.sync_mode	= WB_SYNC_NONE,
+		.older_than_this = NULL,
+		.range_cyclic	= 1,
+	};
+
+	if (nr_pages == 0)
+		nr_pages = global_page_state(NR_FILE_DIRTY) +
+				global_page_state(NR_UNSTABLE_NFS);
+	wbc.nr_to_write = nr_pages;
+	bdi_writeback_all(&wbc);
+}
+
+static noinline void block_dump___mark_inode_dirty(struct inode *inode)
+{
+	if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
+		struct dentry *dentry;
+		const char *name = "?";
+
+		dentry = d_find_alias(inode);
+		if (dentry) {
+			spin_lock(&dentry->d_lock);
+			name = (const char *) dentry->d_name.name;
+		}
+		printk(KERN_DEBUG
+		       "%s(%d): dirtied inode %lu (%s) on %s\n",
+		       current->comm, task_pid_nr(current), inode->i_ino,
+		       name, inode->i_sb->s_id);
+		if (dentry) {
+			spin_unlock(&dentry->d_lock);
+			dput(dentry);
+		}
+	}
+}
+
+/**
+ *	__mark_inode_dirty -	internal function
+ *	@inode: inode to mark
+ *	@flags: what kind of dirty (i.e. I_DIRTY_SYNC)
+ *	Mark an inode as dirty. Callers should use mark_inode_dirty or
+ *  	mark_inode_dirty_sync.
  *
- * Note:
- * We don't need to grab a reference to superblock here. If it has non-empty
- * ->b_dirty it's hadn't been killed yet and kill_super() won't proceed
- * past sync_inodes_sb() until the ->b_dirty/b_io/b_more_io lists are all
- * empty. Since __sync_single_inode() regains inode_lock before it finally moves
- * inode from superblock lists we are OK.
+ * Put the inode on the super block's dirty list.
+ *
+ * CAREFUL! We mark it dirty unconditionally, but move it onto the
+ * dirty list only if it is hashed or if it refers to a blockdev.
+ * If it was not hashed, it will never be added to the dirty list
+ * even if it is later hashed, as it will have been marked dirty already.
+ *
+ * In short, make sure you hash any inodes _before_ you start marking
+ * them dirty.
  *
- * If `older_than_this' is non-zero then only flush inodes which have a
- * flushtime older than *older_than_this.
+ * This function *must* be atomic for the I_DIRTY_PAGES case -
+ * set_page_dirty() is called under spinlock in several places.
  *
- * If `bdi' is non-zero then we will scan the first inode against each
- * superblock until we find the matching ones.  One group will be the dirty
- * inodes against a filesystem.  Then when we hit the dummy blockdev superblock,
- * sync_sb_inodes will seekout the blockdev which matches `bdi'.  Maybe not
- * super-efficient but we're about to do a ton of I/O...
+ * Note that for blockdevs, inode->dirtied_when represents the dirtying time of
+ * the block-special inode (/dev/hda1) itself.  And the ->dirtied_when field of
+ * the kernel-internal blockdev inode represents the dirtying time of the
+ * blockdev's pages.  This is why for I_DIRTY_PAGES we always use
+ * page->mapping->host, so the page-dirtying time is recorded in the internal
+ * blockdev inode.
  */
-void
-writeback_inodes(struct writeback_control *wbc)
+void __mark_inode_dirty(struct inode *inode, int flags)
 {
-	struct super_block *sb;
+	struct super_block *sb = inode->i_sb;
 
-	might_sleep();
-	spin_lock(&sb_lock);
-restart:
-	list_for_each_entry_reverse(sb, &super_blocks, s_list) {
-		if (sb_has_dirty_inodes(sb)) {
-			/* we're making our own get_super here */
-			sb->s_count++;
-			spin_unlock(&sb_lock);
-			/*
-			 * If we can't get the readlock, there's no sense in
-			 * waiting around, most of the time the FS is going to
-			 * be unmounted by the time it is released.
-			 */
-			if (down_read_trylock(&sb->s_umount)) {
-				if (sb->s_root)
-					generic_sync_sb_inodes(sb, wbc);
-				up_read(&sb->s_umount);
-			}
-			spin_lock(&sb_lock);
-			if (__put_super_and_need_restart(sb))
-				goto restart;
+	/*
+	 * Don't do this for I_DIRTY_PAGES - that doesn't actually
+	 * dirty the inode itself
+	 */
+	if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
+		if (sb->s_op->dirty_inode)
+			sb->s_op->dirty_inode(inode);
+	}
+
+	/*
+	 * make sure that changes are seen by all cpus before we test i_state
+	 * -- mikulas
+	 */
+	smp_mb();
+
+	/* avoid the locking if we can */
+	if ((inode->i_state & flags) == flags)
+		return;
+
+	if (unlikely(block_dump))
+		block_dump___mark_inode_dirty(inode);
+
+	spin_lock(&inode_lock);
+	if ((inode->i_state & flags) != flags) {
+		const int was_dirty = inode->i_state & I_DIRTY;
+
+		inode->i_state |= flags;
+
+		/*
+		 * If the inode is being synced, just update its dirty state.
+		 * The unlocker will place the inode on the appropriate
+		 * superblock list, based upon its state.
+		 */
+		if (inode->i_state & I_SYNC)
+			goto out;
+
+		/*
+		 * Only add valid (hashed) inodes to the superblock's
+		 * dirty list.  Add blockdev inodes as well.
+		 */
+		if (!S_ISBLK(inode->i_mode)) {
+			if (hlist_unhashed(&inode->i_hash))
+				goto out;
+		}
+		if (inode->i_state & (I_FREEING|I_CLEAR))
+			goto out;
+
+		/*
+		 * If the inode was already on b_dirty/b_io/b_more_io, don't
+		 * reposition it (that would break b_dirty time-ordering).
+		 */
+		if (!was_dirty) {
+			struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
+
+			inode->dirtied_when = jiffies;
+			list_move(&inode->i_list, &wb->b_dirty);
 		}
-		if (wbc->nr_to_write <= 0)
-			break;
 	}
-	spin_unlock(&sb_lock);
+out:
+	spin_unlock(&inode_lock);
+}
+EXPORT_SYMBOL(__mark_inode_dirty);
+
+/*
+ * Write out a superblock's list of dirty inodes.  A wait will be performed
+ * upon no inodes, all inodes or the final one, depending upon sync_mode.
+ *
+ * If older_than_this is non-NULL, then only write out inodes which
+ * had their first dirtying at a time earlier than *older_than_this.
+ *
+ * If we're a pdlfush thread, then implement pdflush collision avoidance
+ * against the entire list.
+ *
+ * If `bdi' is non-zero then we're being asked to writeback a specific queue.
+ * This function assumes that the blockdev superblock's inodes are backed by
+ * a variety of queues, so all inodes are searched.  For other superblocks,
+ * assume that all inodes are backed by the same queue.
+ *
+ * The inodes to be written are parked on bdi->b_io.  They are moved back onto
+ * bdi->b_dirty as they are selected for writing.  This way, none can be missed
+ * on the writer throttling path, and we get decent balancing between many
+ * throttled threads: we don't want them all piling up on inode_sync_wait.
+ */
+static void wait_sb_inodes(struct writeback_control *wbc)
+{
+	struct inode *inode, *old_inode = NULL;
+
+	/*
+	 * We need to be protected against the filesystem going from
+	 * r/o to r/w or vice versa.
+	 */
+	WARN_ON(!rwsem_is_locked(&wbc->sb->s_umount));
+
+	spin_lock(&inode_lock);
+
+	/*
+	 * Data integrity sync. Must wait for all pages under writeback,
+	 * because there may have been pages dirtied before our sync
+	 * call, but which had writeout started before we write it out.
+	 * In which case, the inode may not be on the dirty list, but
+	 * we still have to wait for that writeout.
+	 */
+	list_for_each_entry(inode, &wbc->sb->s_inodes, i_sb_list) {
+		struct address_space *mapping;
+
+		if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
+			continue;
+		mapping = inode->i_mapping;
+		if (mapping->nrpages == 0)
+			continue;
+		__iget(inode);
+		spin_unlock(&inode_lock);
+		/*
+		 * We hold a reference to 'inode' so it couldn't have
+		 * been removed from s_inodes list while we dropped the
+		 * inode_lock.  We cannot iput the inode now as we can
+		 * be holding the last reference and we cannot iput it
+		 * under inode_lock. So we keep the reference and iput
+		 * it later.
+		 */
+		iput(old_inode);
+		old_inode = inode;
+
+		filemap_fdatawait(mapping);
+
+		cond_resched();
+
+		spin_lock(&inode_lock);
+	}
+	spin_unlock(&inode_lock);
+	iput(old_inode);
 }
 
 /**
@@ -715,6 +1133,7 @@ restart:
 long writeback_inodes_sb(struct super_block *sb)
 {
 	struct writeback_control wbc = {
+		.sb		= sb,
 		.sync_mode	= WB_SYNC_NONE,
 		.range_start	= 0,
 		.range_end	= LLONG_MAX,
@@ -727,7 +1146,7 @@ long writeback_inodes_sb(struct super_block *sb)
 			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
 
 	wbc.nr_to_write = nr_to_write;
-	generic_sync_sb_inodes(sb, &wbc);
+	bdi_writeback_all(&wbc);
 	return nr_to_write - wbc.nr_to_write;
 }
 EXPORT_SYMBOL(writeback_inodes_sb);
@@ -742,6 +1161,7 @@ EXPORT_SYMBOL(writeback_inodes_sb);
 long sync_inodes_sb(struct super_block *sb)
 {
 	struct writeback_control wbc = {
+		.sb		= sb,
 		.sync_mode	= WB_SYNC_ALL,
 		.range_start	= 0,
 		.range_end	= LLONG_MAX,
@@ -749,7 +1169,8 @@ long sync_inodes_sb(struct super_block *sb)
 	long nr_to_write = LONG_MAX; /* doesn't actually matter */
 
 	wbc.nr_to_write = nr_to_write;
-	generic_sync_sb_inodes(sb, &wbc);
+	bdi_writeback_all(&wbc);
+	wait_sb_inodes(&wbc);
 	return nr_to_write - wbc.nr_to_write;
 }
 EXPORT_SYMBOL(sync_inodes_sb);
diff --git a/fs/super.c b/fs/super.c
index 0d22ce3be4aa..9cda337ddae2 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -168,7 +168,7 @@ int __put_super_and_need_restart(struct super_block *sb)
  *	Drops a temporary reference, frees superblock if there's no
  *	references left.
  */
-static void put_super(struct super_block *sb)
+void put_super(struct super_block *sb)
 {
 	spin_lock(&sb_lock);
 	__put_super(sb);
diff --git a/fs/sync.c b/fs/sync.c
index 66f210476f40..103cc7fdd3df 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -120,7 +120,7 @@ restart:
  */
 SYSCALL_DEFINE0(sync)
 {
-	wakeup_pdflush(0);
+	wakeup_flusher_threads(0);
 	sync_filesystems(0);
 	sync_filesystems(1);
 	if (unlikely(laptop_mode))
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 928cd5484f4d..d045f5f615c7 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -13,6 +13,8 @@
 #include <linux/proportions.h>
 #include <linux/kernel.h>
 #include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/writeback.h>
 #include <asm/atomic.h>
 
 struct page;
@@ -23,7 +25,8 @@ struct dentry;
  * Bits in backing_dev_info.state
  */
 enum bdi_state {
-	BDI_pdflush,		/* A pdflush thread is working this device */
+	BDI_pending,		/* On its way to being activated */
+	BDI_wb_alloc,		/* Default embedded wb allocated */
 	BDI_async_congested,	/* The async (write) queue is getting full */
 	BDI_sync_congested,	/* The sync queue is getting full */
 	BDI_unused,		/* Available bits start here */
@@ -39,9 +42,22 @@ enum bdi_stat_item {
 
 #define BDI_STAT_BATCH (8*(1+ilog2(nr_cpu_ids)))
 
+struct bdi_writeback {
+	struct list_head list;			/* hangs off the bdi */
+
+	struct backing_dev_info *bdi;		/* our parent bdi */
+	unsigned int nr;
+
+	unsigned long last_old_flush;		/* last old data flush */
+
+	struct task_struct	*task;		/* writeback task */
+	struct list_head	b_dirty;	/* dirty inodes */
+	struct list_head	b_io;		/* parked for writeback */
+	struct list_head	b_more_io;	/* parked for more writeback */
+};
+
 struct backing_dev_info {
 	struct list_head bdi_list;
-
 	unsigned long ra_pages;	/* max readahead in PAGE_CACHE_SIZE units */
 	unsigned long state;	/* Always use atomic bitops on this */
 	unsigned int capabilities; /* Device capabilities */
@@ -58,11 +74,15 @@ struct backing_dev_info {
 	unsigned int min_ratio;
 	unsigned int max_ratio, max_prop_frac;
 
-	struct device *dev;
+	struct bdi_writeback wb;  /* default writeback info for this bdi */
+	spinlock_t wb_lock;	  /* protects update side of wb_list */
+	struct list_head wb_list; /* the flusher threads hanging off this bdi */
+	unsigned long wb_mask;	  /* bitmask of registered tasks */
+	unsigned int wb_cnt;	  /* number of registered tasks */
 
-	struct list_head	b_dirty;	/* dirty inodes */
-	struct list_head	b_io;		/* parked for writeback */
-	struct list_head	b_more_io;	/* parked for more writeback */
+	struct list_head work_list;
+
+	struct device *dev;
 
 #ifdef CONFIG_DEBUG_FS
 	struct dentry *debug_dir;
@@ -77,10 +97,20 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
 		const char *fmt, ...);
 int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev);
 void bdi_unregister(struct backing_dev_info *bdi);
+void bdi_start_writeback(struct writeback_control *wbc);
+int bdi_writeback_task(struct bdi_writeback *wb);
+int bdi_has_dirty_io(struct backing_dev_info *bdi);
 
-extern struct mutex bdi_lock;
+extern spinlock_t bdi_lock;
 extern struct list_head bdi_list;
 
+static inline int wb_has_dirty_io(struct bdi_writeback *wb)
+{
+	return !list_empty(&wb->b_dirty) ||
+	       !list_empty(&wb->b_io) ||
+	       !list_empty(&wb->b_more_io);
+}
+
 static inline void __add_bdi_stat(struct backing_dev_info *bdi,
 		enum bdi_stat_item item, s64 amount)
 {
@@ -270,6 +300,11 @@ static inline bool bdi_cap_swap_backed(struct backing_dev_info *bdi)
 	return bdi->capabilities & BDI_CAP_SWAP_BACKED;
 }
 
+static inline bool bdi_cap_flush_forker(struct backing_dev_info *bdi)
+{
+	return bdi == &default_backing_dev_info;
+}
+
 static inline bool mapping_cap_writeback_dirty(struct address_space *mapping)
 {
 	return bdi_cap_writeback_dirty(mapping->backing_dev_info);
@@ -285,4 +320,10 @@ static inline bool mapping_cap_swap_backed(struct address_space *mapping)
 	return bdi_cap_swap_backed(mapping->backing_dev_info);
 }
 
+static inline int bdi_sched_wait(void *word)
+{
+	schedule();
+	return 0;
+}
+
 #endif		/* _LINUX_BACKING_DEV_H */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 56371be1be65..26da98f61116 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1786,6 +1786,7 @@ extern int get_sb_pseudo(struct file_system_type *, char *,
 	struct vfsmount *mnt);
 extern void simple_set_mnt(struct vfsmount *mnt, struct super_block *sb);
 int __put_super_and_need_restart(struct super_block *sb);
+void put_super(struct super_block *sb);
 
 /* Alas, no aliases. Too much hassle with bringing module.h everywhere */
 #define fops_get(fops) \
@@ -2182,7 +2183,6 @@ extern int bdev_read_only(struct block_device *);
 extern int set_blocksize(struct block_device *, int);
 extern int sb_set_blocksize(struct super_block *, int);
 extern int sb_min_blocksize(struct super_block *, int);
-extern int sb_has_dirty_inodes(struct super_block *);
 
 extern int generic_file_mmap(struct file *, struct vm_area_struct *);
 extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *);
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 07039299603d..cef75527a14c 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -40,6 +40,8 @@ enum writeback_sync_modes {
 struct writeback_control {
 	struct backing_dev_info *bdi;	/* If !NULL, only write back this
 					   queue */
+	struct super_block *sb;		/* if !NULL, only write inodes from
+					   this super_block */
 	enum writeback_sync_modes sync_mode;
 	unsigned long *older_than_this;	/* If !NULL, only write back inodes
 					   older than this */
@@ -76,10 +78,13 @@ struct writeback_control {
 /*
  * fs/fs-writeback.c
  */	
-void writeback_inodes(struct writeback_control *wbc);
+struct bdi_writeback;
 int inode_wait(void *);
 long writeback_inodes_sb(struct super_block *);
 long sync_inodes_sb(struct super_block *);
+void writeback_inodes_wbc(struct writeback_control *wbc);
+long wb_do_writeback(struct bdi_writeback *wb, int force_wait);
+void wakeup_flusher_threads(long nr_pages);
 
 /* writeback.h requires fs.h; it, too, is not included from here. */
 static inline void wait_on_inode(struct inode *inode)
@@ -99,7 +104,6 @@ static inline void inode_sync_wait(struct inode *inode)
 /*
  * mm/page-writeback.c
  */
-int wakeup_pdflush(long nr_pages);
 void laptop_io_completion(void);
 void laptop_sync_completion(void);
 void throttle_vm_writeout(gfp_t gfp_mask);
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 6f163e0f0509..7f3fa79f25c0 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -1,8 +1,11 @@
 
 #include <linux/wait.h>
 #include <linux/backing-dev.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
+#include <linux/mm.h>
 #include <linux/sched.h>
 #include <linux/module.h>
 #include <linux/writeback.h>
@@ -22,8 +25,18 @@ struct backing_dev_info default_backing_dev_info = {
 EXPORT_SYMBOL_GPL(default_backing_dev_info);
 
 static struct class *bdi_class;
-DEFINE_MUTEX(bdi_lock);
+DEFINE_SPINLOCK(bdi_lock);
 LIST_HEAD(bdi_list);
+LIST_HEAD(bdi_pending_list);
+
+static struct task_struct *sync_supers_tsk;
+static struct timer_list sync_supers_timer;
+
+static int bdi_sync_supers(void *);
+static void sync_supers_timer_fn(unsigned long);
+static void arm_supers_timer(void);
+
+static void bdi_add_default_flusher_task(struct backing_dev_info *bdi);
 
 #ifdef CONFIG_DEBUG_FS
 #include <linux/debugfs.h>
@@ -187,6 +200,13 @@ static int __init default_bdi_init(void)
 {
 	int err;
 
+	sync_supers_tsk = kthread_run(bdi_sync_supers, NULL, "sync_supers");
+	BUG_ON(IS_ERR(sync_supers_tsk));
+
+	init_timer(&sync_supers_timer);
+	setup_timer(&sync_supers_timer, sync_supers_timer_fn, 0);
+	arm_supers_timer();
+
 	err = bdi_init(&default_backing_dev_info);
 	if (!err)
 		bdi_register(&default_backing_dev_info, NULL, "default");
@@ -195,6 +215,242 @@ static int __init default_bdi_init(void)
 }
 subsys_initcall(default_bdi_init);
 
+static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
+{
+	memset(wb, 0, sizeof(*wb));
+
+	wb->bdi = bdi;
+	wb->last_old_flush = jiffies;
+	INIT_LIST_HEAD(&wb->b_dirty);
+	INIT_LIST_HEAD(&wb->b_io);
+	INIT_LIST_HEAD(&wb->b_more_io);
+}
+
+static void bdi_task_init(struct backing_dev_info *bdi,
+			  struct bdi_writeback *wb)
+{
+	struct task_struct *tsk = current;
+
+	spin_lock(&bdi->wb_lock);
+	list_add_tail_rcu(&wb->list, &bdi->wb_list);
+	spin_unlock(&bdi->wb_lock);
+
+	tsk->flags |= PF_FLUSHER | PF_SWAPWRITE;
+	set_freezable();
+
+	/*
+	 * Our parent may run at a different priority, just set us to normal
+	 */
+	set_user_nice(tsk, 0);
+}
+
+static int bdi_start_fn(void *ptr)
+{
+	struct bdi_writeback *wb = ptr;
+	struct backing_dev_info *bdi = wb->bdi;
+	int ret;
+
+	/*
+	 * Add us to the active bdi_list
+	 */
+	spin_lock(&bdi_lock);
+	list_add(&bdi->bdi_list, &bdi_list);
+	spin_unlock(&bdi_lock);
+
+	bdi_task_init(bdi, wb);
+
+	/*
+	 * Clear pending bit and wakeup anybody waiting to tear us down
+	 */
+	clear_bit(BDI_pending, &bdi->state);
+	smp_mb__after_clear_bit();
+	wake_up_bit(&bdi->state, BDI_pending);
+
+	ret = bdi_writeback_task(wb);
+
+	/*
+	 * Remove us from the list
+	 */
+	spin_lock(&bdi->wb_lock);
+	list_del_rcu(&wb->list);
+	spin_unlock(&bdi->wb_lock);
+
+	/*
+	 * Flush any work that raced with us exiting. No new work
+	 * will be added, since this bdi isn't discoverable anymore.
+	 */
+	if (!list_empty(&bdi->work_list))
+		wb_do_writeback(wb, 1);
+
+	wb->task = NULL;
+	return ret;
+}
+
+int bdi_has_dirty_io(struct backing_dev_info *bdi)
+{
+	return wb_has_dirty_io(&bdi->wb);
+}
+
+static void bdi_flush_io(struct backing_dev_info *bdi)
+{
+	struct writeback_control wbc = {
+		.bdi			= bdi,
+		.sync_mode		= WB_SYNC_NONE,
+		.older_than_this	= NULL,
+		.range_cyclic		= 1,
+		.nr_to_write		= 1024,
+	};
+
+	writeback_inodes_wbc(&wbc);
+}
+
+/*
+ * kupdated() used to do this. We cannot do it from the bdi_forker_task()
+ * or we risk deadlocking on ->s_umount. The longer term solution would be
+ * to implement sync_supers_bdi() or similar and simply do it from the
+ * bdi writeback tasks individually.
+ */
+static int bdi_sync_supers(void *unused)
+{
+	set_user_nice(current, 0);
+
+	while (!kthread_should_stop()) {
+		set_current_state(TASK_INTERRUPTIBLE);
+		schedule();
+
+		/*
+		 * Do this periodically, like kupdated() did before.
+		 */
+		sync_supers();
+	}
+
+	return 0;
+}
+
+static void arm_supers_timer(void)
+{
+	unsigned long next;
+
+	next = msecs_to_jiffies(dirty_writeback_interval * 10) + jiffies;
+	mod_timer(&sync_supers_timer, round_jiffies_up(next));
+}
+
+static void sync_supers_timer_fn(unsigned long unused)
+{
+	wake_up_process(sync_supers_tsk);
+	arm_supers_timer();
+}
+
+static int bdi_forker_task(void *ptr)
+{
+	struct bdi_writeback *me = ptr;
+
+	bdi_task_init(me->bdi, me);
+
+	for (;;) {
+		struct backing_dev_info *bdi, *tmp;
+		struct bdi_writeback *wb;
+
+		/*
+		 * Temporary measure, we want to make sure we don't see
+		 * dirty data on the default backing_dev_info
+		 */
+		if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list))
+			wb_do_writeback(me, 0);
+
+		spin_lock(&bdi_lock);
+
+		/*
+		 * Check if any existing bdi's have dirty data without
+		 * a thread registered. If so, set that up.
+		 */
+		list_for_each_entry_safe(bdi, tmp, &bdi_list, bdi_list) {
+			if (bdi->wb.task)
+				continue;
+			if (list_empty(&bdi->work_list) &&
+			    !bdi_has_dirty_io(bdi))
+				continue;
+
+			bdi_add_default_flusher_task(bdi);
+		}
+
+		set_current_state(TASK_INTERRUPTIBLE);
+
+		if (list_empty(&bdi_pending_list)) {
+			unsigned long wait;
+
+			spin_unlock(&bdi_lock);
+			wait = msecs_to_jiffies(dirty_writeback_interval * 10);
+			schedule_timeout(wait);
+			try_to_freeze();
+			continue;
+		}
+
+		__set_current_state(TASK_RUNNING);
+
+		/*
+		 * This is our real job - check for pending entries in
+		 * bdi_pending_list, and create the tasks that got added
+		 */
+		bdi = list_entry(bdi_pending_list.next, struct backing_dev_info,
+				 bdi_list);
+		list_del_init(&bdi->bdi_list);
+		spin_unlock(&bdi_lock);
+
+		wb = &bdi->wb;
+		wb->task = kthread_run(bdi_start_fn, wb, "flush-%s",
+					dev_name(bdi->dev));
+		/*
+		 * If task creation fails, then readd the bdi to
+		 * the pending list and force writeout of the bdi
+		 * from this forker thread. That will free some memory
+		 * and we can try again.
+		 */
+		if (IS_ERR(wb->task)) {
+			wb->task = NULL;
+
+			/*
+			 * Add this 'bdi' to the back, so we get
+			 * a chance to flush other bdi's to free
+			 * memory.
+			 */
+			spin_lock(&bdi_lock);
+			list_add_tail(&bdi->bdi_list, &bdi_pending_list);
+			spin_unlock(&bdi_lock);
+
+			bdi_flush_io(bdi);
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Add the default flusher task that gets created for any bdi
+ * that has dirty data pending writeout
+ */
+void static bdi_add_default_flusher_task(struct backing_dev_info *bdi)
+{
+	if (!bdi_cap_writeback_dirty(bdi))
+		return;
+
+	/*
+	 * Check with the helper whether to proceed adding a task. Will only
+	 * abort if we two or more simultanous calls to
+	 * bdi_add_default_flusher_task() occured, further additions will block
+	 * waiting for previous additions to finish.
+	 */
+	if (!test_and_set_bit(BDI_pending, &bdi->state)) {
+		list_move_tail(&bdi->bdi_list, &bdi_pending_list);
+
+		/*
+		 * We are now on the pending list, wake up bdi_forker_task()
+		 * to finish the job and add us back to the active bdi_list
+		 */
+		wake_up_process(default_backing_dev_info.wb.task);
+	}
+}
+
 int bdi_register(struct backing_dev_info *bdi, struct device *parent,
 		const char *fmt, ...)
 {
@@ -213,13 +469,34 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
 		goto exit;
 	}
 
-	mutex_lock(&bdi_lock);
+	spin_lock(&bdi_lock);
 	list_add_tail(&bdi->bdi_list, &bdi_list);
-	mutex_unlock(&bdi_lock);
+	spin_unlock(&bdi_lock);
 
 	bdi->dev = dev;
-	bdi_debug_register(bdi, dev_name(dev));
 
+	/*
+	 * Just start the forker thread for our default backing_dev_info,
+	 * and add other bdi's to the list. They will get a thread created
+	 * on-demand when they need it.
+	 */
+	if (bdi_cap_flush_forker(bdi)) {
+		struct bdi_writeback *wb = &bdi->wb;
+
+		wb->task = kthread_run(bdi_forker_task, wb, "bdi-%s",
+						dev_name(dev));
+		if (IS_ERR(wb->task)) {
+			wb->task = NULL;
+			ret = -ENOMEM;
+
+			spin_lock(&bdi_lock);
+			list_del(&bdi->bdi_list);
+			spin_unlock(&bdi_lock);
+			goto exit;
+		}
+	}
+
+	bdi_debug_register(bdi, dev_name(dev));
 exit:
 	return ret;
 }
@@ -231,17 +508,42 @@ int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev)
 }
 EXPORT_SYMBOL(bdi_register_dev);
 
-static void bdi_remove_from_list(struct backing_dev_info *bdi)
+/*
+ * Remove bdi from the global list and shutdown any threads we have running
+ */
+static void bdi_wb_shutdown(struct backing_dev_info *bdi)
 {
-	mutex_lock(&bdi_lock);
+	struct bdi_writeback *wb;
+
+	if (!bdi_cap_writeback_dirty(bdi))
+		return;
+
+	/*
+	 * If setup is pending, wait for that to complete first
+	 */
+	wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait,
+			TASK_UNINTERRUPTIBLE);
+
+	/*
+	 * Make sure nobody finds us on the bdi_list anymore
+	 */
+	spin_lock(&bdi_lock);
 	list_del(&bdi->bdi_list);
-	mutex_unlock(&bdi_lock);
+	spin_unlock(&bdi_lock);
+
+	/*
+	 * Finally, kill the kernel threads. We don't need to be RCU
+	 * safe anymore, since the bdi is gone from visibility.
+	 */
+	list_for_each_entry(wb, &bdi->wb_list, list)
+		kthread_stop(wb->task);
 }
 
 void bdi_unregister(struct backing_dev_info *bdi)
 {
 	if (bdi->dev) {
-		bdi_remove_from_list(bdi);
+		if (!bdi_cap_flush_forker(bdi))
+			bdi_wb_shutdown(bdi);
 		bdi_debug_unregister(bdi);
 		device_unregister(bdi->dev);
 		bdi->dev = NULL;
@@ -251,18 +553,25 @@ EXPORT_SYMBOL(bdi_unregister);
 
 int bdi_init(struct backing_dev_info *bdi)
 {
-	int i;
-	int err;
+	int i, err;
 
 	bdi->dev = NULL;
 
 	bdi->min_ratio = 0;
 	bdi->max_ratio = 100;
 	bdi->max_prop_frac = PROP_FRAC_BASE;
+	spin_lock_init(&bdi->wb_lock);
 	INIT_LIST_HEAD(&bdi->bdi_list);
-	INIT_LIST_HEAD(&bdi->b_io);
-	INIT_LIST_HEAD(&bdi->b_dirty);
-	INIT_LIST_HEAD(&bdi->b_more_io);
+	INIT_LIST_HEAD(&bdi->wb_list);
+	INIT_LIST_HEAD(&bdi->work_list);
+
+	bdi_wb_init(&bdi->wb, bdi);
+
+	/*
+	 * Just one thread support for now, hard code mask and count
+	 */
+	bdi->wb_mask = 1;
+	bdi->wb_cnt = 1;
 
 	for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
 		err = percpu_counter_init(&bdi->bdi_stat[i], 0);
@@ -277,8 +586,6 @@ int bdi_init(struct backing_dev_info *bdi)
 err:
 		while (i--)
 			percpu_counter_destroy(&bdi->bdi_stat[i]);
-
-		bdi_remove_from_list(bdi);
 	}
 
 	return err;
@@ -289,9 +596,7 @@ void bdi_destroy(struct backing_dev_info *bdi)
 {
 	int i;
 
-	WARN_ON(!list_empty(&bdi->b_dirty));
-	WARN_ON(!list_empty(&bdi->b_io));
-	WARN_ON(!list_empty(&bdi->b_more_io));
+	WARN_ON(bdi_has_dirty_io(bdi));
 
 	bdi_unregister(bdi);
 
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index f8341b6019bf..25e7770309b8 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -35,15 +35,6 @@
 #include <linux/buffer_head.h>
 #include <linux/pagevec.h>
 
-/*
- * The maximum number of pages to writeout in a single bdflush/kupdate
- * operation.  We do this so we don't hold I_SYNC against an inode for
- * enormous amounts of time, which would block a userspace task which has
- * been forced to throttle against that inode.  Also, the code reevaluates
- * the dirty each time it has written this many pages.
- */
-#define MAX_WRITEBACK_PAGES	1024
-
 /*
  * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
  * will look to see if it needs to force writeback or throttling.
@@ -117,8 +108,6 @@ EXPORT_SYMBOL(laptop_mode);
 /* End of sysctl-exported parameters */
 
 
-static void background_writeout(unsigned long _min_pages);
-
 /*
  * Scale the writeback cache size proportional to the relative writeout speeds.
  *
@@ -326,7 +315,7 @@ int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
 {
 	int ret = 0;
 
-	mutex_lock(&bdi_lock);
+	spin_lock(&bdi_lock);
 	if (min_ratio > bdi->max_ratio) {
 		ret = -EINVAL;
 	} else {
@@ -338,7 +327,7 @@ int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
 			ret = -EINVAL;
 		}
 	}
-	mutex_unlock(&bdi_lock);
+	spin_unlock(&bdi_lock);
 
 	return ret;
 }
@@ -350,14 +339,14 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
 	if (max_ratio > 100)
 		return -EINVAL;
 
-	mutex_lock(&bdi_lock);
+	spin_lock(&bdi_lock);
 	if (bdi->min_ratio > max_ratio) {
 		ret = -EINVAL;
 	} else {
 		bdi->max_ratio = max_ratio;
 		bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100;
 	}
-	mutex_unlock(&bdi_lock);
+	spin_unlock(&bdi_lock);
 
 	return ret;
 }
@@ -543,7 +532,7 @@ static void balance_dirty_pages(struct address_space *mapping)
 		 * up.
 		 */
 		if (bdi_nr_reclaimable > bdi_thresh) {
-			writeback_inodes(&wbc);
+			writeback_inodes_wbc(&wbc);
 			pages_written += write_chunk - wbc.nr_to_write;
 			get_dirty_limits(&background_thresh, &dirty_thresh,
 				       &bdi_thresh, bdi);
@@ -572,7 +561,7 @@ static void balance_dirty_pages(struct address_space *mapping)
 		if (pages_written >= write_chunk)
 			break;		/* We've done our duty */
 
-		congestion_wait(BLK_RW_ASYNC, HZ/10);
+		schedule_timeout(1);
 	}
 
 	if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh &&
@@ -591,10 +580,18 @@ static void balance_dirty_pages(struct address_space *mapping)
 	 * background_thresh, to keep the amount of dirty memory low.
 	 */
 	if ((laptop_mode && pages_written) ||
-			(!laptop_mode && (global_page_state(NR_FILE_DIRTY)
-					  + global_page_state(NR_UNSTABLE_NFS)
-					  > background_thresh)))
-		pdflush_operation(background_writeout, 0);
+	    (!laptop_mode && ((nr_writeback = global_page_state(NR_FILE_DIRTY)
+					  + global_page_state(NR_UNSTABLE_NFS))
+					  > background_thresh))) {
+		struct writeback_control wbc = {
+			.bdi		= bdi,
+			.sync_mode	= WB_SYNC_NONE,
+			.nr_to_write	= nr_writeback,
+		};
+
+
+		bdi_start_writeback(&wbc);
+	}
 }
 
 void set_page_dirty_balance(struct page *page, int page_mkwrite)
@@ -678,124 +675,10 @@ void throttle_vm_writeout(gfp_t gfp_mask)
         }
 }
 
-/*
- * writeback at least _min_pages, and keep writing until the amount of dirty
- * memory is less than the background threshold, or until we're all clean.
- */
-static void background_writeout(unsigned long _min_pages)
-{
-	long min_pages = _min_pages;
-	struct writeback_control wbc = {
-		.bdi		= NULL,
-		.sync_mode	= WB_SYNC_NONE,
-		.older_than_this = NULL,
-		.nr_to_write	= 0,
-		.nonblocking	= 1,
-		.range_cyclic	= 1,
-	};
-
-	for ( ; ; ) {
-		unsigned long background_thresh;
-		unsigned long dirty_thresh;
-
-		get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
-		if (global_page_state(NR_FILE_DIRTY) +
-			global_page_state(NR_UNSTABLE_NFS) < background_thresh
-				&& min_pages <= 0)
-			break;
-		wbc.more_io = 0;
-		wbc.encountered_congestion = 0;
-		wbc.nr_to_write = MAX_WRITEBACK_PAGES;
-		wbc.pages_skipped = 0;
-		writeback_inodes(&wbc);
-		min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
-		if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
-			/* Wrote less than expected */
-			if (wbc.encountered_congestion || wbc.more_io)
-				congestion_wait(BLK_RW_ASYNC, HZ/10);
-			else
-				break;
-		}
-	}
-}
-
-/*
- * Start writeback of `nr_pages' pages.  If `nr_pages' is zero, write back
- * the whole world.  Returns 0 if a pdflush thread was dispatched.  Returns
- * -1 if all pdflush threads were busy.
- */
-int wakeup_pdflush(long nr_pages)
-{
-	if (nr_pages == 0)
-		nr_pages = global_page_state(NR_FILE_DIRTY) +
-				global_page_state(NR_UNSTABLE_NFS);
-	return pdflush_operation(background_writeout, nr_pages);
-}
-
-static void wb_timer_fn(unsigned long unused);
 static void laptop_timer_fn(unsigned long unused);
 
-static DEFINE_TIMER(wb_timer, wb_timer_fn, 0, 0);
 static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0);
 
-/*
- * Periodic writeback of "old" data.
- *
- * Define "old": the first time one of an inode's pages is dirtied, we mark the
- * dirtying-time in the inode's address_space.  So this periodic writeback code
- * just walks the superblock inode list, writing back any inodes which are
- * older than a specific point in time.
- *
- * Try to run once per dirty_writeback_interval.  But if a writeback event
- * takes longer than a dirty_writeback_interval interval, then leave a
- * one-second gap.
- *
- * older_than_this takes precedence over nr_to_write.  So we'll only write back
- * all dirty pages if they are all attached to "old" mappings.
- */
-static void wb_kupdate(unsigned long arg)
-{
-	unsigned long oldest_jif;
-	unsigned long start_jif;
-	unsigned long next_jif;
-	long nr_to_write;
-	struct writeback_control wbc = {
-		.bdi		= NULL,
-		.sync_mode	= WB_SYNC_NONE,
-		.older_than_this = &oldest_jif,
-		.nr_to_write	= 0,
-		.nonblocking	= 1,
-		.for_kupdate	= 1,
-		.range_cyclic	= 1,
-	};
-
-	sync_supers();
-
-	oldest_jif = jiffies - msecs_to_jiffies(dirty_expire_interval * 10);
-	start_jif = jiffies;
-	next_jif = start_jif + msecs_to_jiffies(dirty_writeback_interval * 10);
-	nr_to_write = global_page_state(NR_FILE_DIRTY) +
-			global_page_state(NR_UNSTABLE_NFS) +
-			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
-	while (nr_to_write > 0) {
-		wbc.more_io = 0;
-		wbc.encountered_congestion = 0;
-		wbc.nr_to_write = MAX_WRITEBACK_PAGES;
-		writeback_inodes(&wbc);
-		if (wbc.nr_to_write > 0) {
-			if (wbc.encountered_congestion || wbc.more_io)
-				congestion_wait(BLK_RW_ASYNC, HZ/10);
-			else
-				break;	/* All the old data is written */
-		}
-		nr_to_write -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
-	}
-	if (time_before(next_jif, jiffies + HZ))
-		next_jif = jiffies + HZ;
-	if (dirty_writeback_interval)
-		mod_timer(&wb_timer, next_jif);
-}
-
 /*
  * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
  */
@@ -803,28 +686,24 @@ int dirty_writeback_centisecs_handler(ctl_table *table, int write,
 	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
 {
 	proc_dointvec(table, write, file, buffer, length, ppos);
-	if (dirty_writeback_interval)
-		mod_timer(&wb_timer, jiffies +
-			msecs_to_jiffies(dirty_writeback_interval * 10));
-	else
-		del_timer(&wb_timer);
 	return 0;
 }
 
-static void wb_timer_fn(unsigned long unused)
-{
-	if (pdflush_operation(wb_kupdate, 0) < 0)
-		mod_timer(&wb_timer, jiffies + HZ); /* delay 1 second */
-}
-
-static void laptop_flush(unsigned long unused)
+static void do_laptop_sync(struct work_struct *work)
 {
-	sys_sync();
+	wakeup_flusher_threads(0);
+	kfree(work);
 }
 
 static void laptop_timer_fn(unsigned long unused)
 {
-	pdflush_operation(laptop_flush, 0);
+	struct work_struct *work;
+
+	work = kmalloc(sizeof(*work), GFP_ATOMIC);
+	if (work) {
+		INIT_WORK(work, do_laptop_sync);
+		schedule_work(work);
+	}
 }
 
 /*
@@ -907,8 +786,6 @@ void __init page_writeback_init(void)
 {
 	int shift;
 
-	mod_timer(&wb_timer,
-		  jiffies + msecs_to_jiffies(dirty_writeback_interval * 10));
 	writeback_set_ratelimit();
 	register_cpu_notifier(&ratelimit_nb);
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 94e86dd6954c..ba8228e0a806 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1720,7 +1720,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 		 */
 		if (total_scanned > sc->swap_cluster_max +
 					sc->swap_cluster_max / 2) {
-			wakeup_pdflush(laptop_mode ? 0 : total_scanned);
+			wakeup_flusher_threads(laptop_mode ? 0 : total_scanned);
 			sc->may_writepage = 1;
 		}
 
-- 
cgit v1.2.3-70-g09d2


From d0bceac747b547c0b4769b91fec7d3c15600153f Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Mon, 18 May 2009 08:20:32 +0200
Subject: writeback: get rid of pdflush completely

It is now unused, so kill it off.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/fs-writeback.c         |   5 +
 include/linux/writeback.h |  12 ---
 mm/Makefile               |   2 +-
 mm/pdflush.c              | 269 ----------------------------------------------
 4 files changed, 6 insertions(+), 282 deletions(-)
 delete mode 100644 mm/pdflush.c

(limited to 'fs')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 7f6dae8aa47f..2e601ce581c9 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -29,6 +29,11 @@
 
 #define inode_to_bdi(inode)	((inode)->i_mapping->backing_dev_info)
 
+/*
+ * We don't actually have pdflush, but this one is exported though /proc...
+ */
+int nr_pdflush_threads;
+
 /*
  * Work items for the bdi_writeback threads
  */
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index cef75527a14c..78b1e4684cc9 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -13,17 +13,6 @@ extern spinlock_t inode_lock;
 extern struct list_head inode_in_use;
 extern struct list_head inode_unused;
 
-/*
- * Yes, writeback.h requires sched.h
- * No, sched.h is not included from here.
- */
-static inline int task_is_pdflush(struct task_struct *task)
-{
-	return task->flags & PF_FLUSHER;
-}
-
-#define current_is_pdflush()	task_is_pdflush(current)
-
 /*
  * fs/fs-writeback.c
  */
@@ -155,7 +144,6 @@ balance_dirty_pages_ratelimited(struct address_space *mapping)
 typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc,
 				void *data);
 
-int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0);
 int generic_writepages(struct address_space *mapping,
 		       struct writeback_control *wbc);
 int write_cache_pages(struct address_space *mapping,
diff --git a/mm/Makefile b/mm/Makefile
index 5e0bd6426693..147a7a7873c4 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -8,7 +8,7 @@ mmu-$(CONFIG_MMU)	:= fremap.o highmem.o madvise.o memory.o mincore.o \
 			   vmalloc.o
 
 obj-y			:= bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
-			   maccess.o page_alloc.o page-writeback.o pdflush.o \
+			   maccess.o page_alloc.o page-writeback.o \
 			   readahead.o swap.o truncate.o vmscan.o shmem.o \
 			   prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
 			   page_isolation.o mm_init.o $(mmu-y)
diff --git a/mm/pdflush.c b/mm/pdflush.c
deleted file mode 100644
index 235ac440c44e..000000000000
--- a/mm/pdflush.c
+++ /dev/null
@@ -1,269 +0,0 @@
-/*
- * mm/pdflush.c - worker threads for writing back filesystem data
- *
- * Copyright (C) 2002, Linus Torvalds.
- *
- * 09Apr2002	Andrew Morton
- *		Initial version
- * 29Feb2004	kaos@sgi.com
- *		Move worker thread creation to kthread to avoid chewing
- *		up stack space with nested calls to kernel_thread.
- */
-
-#include <linux/sched.h>
-#include <linux/list.h>
-#include <linux/signal.h>
-#include <linux/spinlock.h>
-#include <linux/gfp.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/fs.h>		/* Needed by writeback.h	  */
-#include <linux/writeback.h>	/* Prototypes pdflush_operation() */
-#include <linux/kthread.h>
-#include <linux/cpuset.h>
-#include <linux/freezer.h>
-
-
-/*
- * Minimum and maximum number of pdflush instances
- */
-#define MIN_PDFLUSH_THREADS	2
-#define MAX_PDFLUSH_THREADS	8
-
-static void start_one_pdflush_thread(void);
-
-
-/*
- * The pdflush threads are worker threads for writing back dirty data.
- * Ideally, we'd like one thread per active disk spindle.  But the disk
- * topology is very hard to divine at this level.   Instead, we take
- * care in various places to prevent more than one pdflush thread from
- * performing writeback against a single filesystem.  pdflush threads
- * have the PF_FLUSHER flag set in current->flags to aid in this.
- */
-
-/*
- * All the pdflush threads.  Protected by pdflush_lock
- */
-static LIST_HEAD(pdflush_list);
-static DEFINE_SPINLOCK(pdflush_lock);
-
-/*
- * The count of currently-running pdflush threads.  Protected
- * by pdflush_lock.
- *
- * Readable by sysctl, but not writable.  Published to userspace at
- * /proc/sys/vm/nr_pdflush_threads.
- */
-int nr_pdflush_threads = 0;
-
-/*
- * The time at which the pdflush thread pool last went empty
- */
-static unsigned long last_empty_jifs;
-
-/*
- * The pdflush thread.
- *
- * Thread pool management algorithm:
- * 
- * - The minimum and maximum number of pdflush instances are bound
- *   by MIN_PDFLUSH_THREADS and MAX_PDFLUSH_THREADS.
- * 
- * - If there have been no idle pdflush instances for 1 second, create
- *   a new one.
- * 
- * - If the least-recently-went-to-sleep pdflush thread has been asleep
- *   for more than one second, terminate a thread.
- */
-
-/*
- * A structure for passing work to a pdflush thread.  Also for passing
- * state information between pdflush threads.  Protected by pdflush_lock.
- */
-struct pdflush_work {
-	struct task_struct *who;	/* The thread */
-	void (*fn)(unsigned long);	/* A callback function */
-	unsigned long arg0;		/* An argument to the callback */
-	struct list_head list;		/* On pdflush_list, when idle */
-	unsigned long when_i_went_to_sleep;
-};
-
-static int __pdflush(struct pdflush_work *my_work)
-{
-	current->flags |= PF_FLUSHER | PF_SWAPWRITE;
-	set_freezable();
-	my_work->fn = NULL;
-	my_work->who = current;
-	INIT_LIST_HEAD(&my_work->list);
-
-	spin_lock_irq(&pdflush_lock);
-	for ( ; ; ) {
-		struct pdflush_work *pdf;
-
-		set_current_state(TASK_INTERRUPTIBLE);
-		list_move(&my_work->list, &pdflush_list);
-		my_work->when_i_went_to_sleep = jiffies;
-		spin_unlock_irq(&pdflush_lock);
-		schedule();
-		try_to_freeze();
-		spin_lock_irq(&pdflush_lock);
-		if (!list_empty(&my_work->list)) {
-			/*
-			 * Someone woke us up, but without removing our control
-			 * structure from the global list.  swsusp will do this
-			 * in try_to_freeze()->refrigerator().  Handle it.
-			 */
-			my_work->fn = NULL;
-			continue;
-		}
-		if (my_work->fn == NULL) {
-			printk("pdflush: bogus wakeup\n");
-			continue;
-		}
-		spin_unlock_irq(&pdflush_lock);
-
-		(*my_work->fn)(my_work->arg0);
-
-		spin_lock_irq(&pdflush_lock);
-
-		/*
-		 * Thread creation: For how long have there been zero
-		 * available threads?
-		 *
-		 * To throttle creation, we reset last_empty_jifs.
-		 */
-		if (time_after(jiffies, last_empty_jifs + 1 * HZ)) {
-			if (list_empty(&pdflush_list)) {
-				if (nr_pdflush_threads < MAX_PDFLUSH_THREADS) {
-					last_empty_jifs = jiffies;
-					nr_pdflush_threads++;
-					spin_unlock_irq(&pdflush_lock);
-					start_one_pdflush_thread();
-					spin_lock_irq(&pdflush_lock);
-				}
-			}
-		}
-
-		my_work->fn = NULL;
-
-		/*
-		 * Thread destruction: For how long has the sleepiest
-		 * thread slept?
-		 */
-		if (list_empty(&pdflush_list))
-			continue;
-		if (nr_pdflush_threads <= MIN_PDFLUSH_THREADS)
-			continue;
-		pdf = list_entry(pdflush_list.prev, struct pdflush_work, list);
-		if (time_after(jiffies, pdf->when_i_went_to_sleep + 1 * HZ)) {
-			/* Limit exit rate */
-			pdf->when_i_went_to_sleep = jiffies;
-			break;					/* exeunt */
-		}
-	}
-	nr_pdflush_threads--;
-	spin_unlock_irq(&pdflush_lock);
-	return 0;
-}
-
-/*
- * Of course, my_work wants to be just a local in __pdflush().  It is
- * separated out in this manner to hopefully prevent the compiler from
- * performing unfortunate optimisations against the auto variables.  Because
- * these are visible to other tasks and CPUs.  (No problem has actually
- * been observed.  This is just paranoia).
- */
-static int pdflush(void *dummy)
-{
-	struct pdflush_work my_work;
-	cpumask_var_t cpus_allowed;
-
-	/*
-	 * Since the caller doesn't even check kthread_run() worked, let's not
-	 * freak out too much if this fails.
-	 */
-	if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
-		printk(KERN_WARNING "pdflush failed to allocate cpumask\n");
-		return 0;
-	}
-
-	/*
-	 * pdflush can spend a lot of time doing encryption via dm-crypt.  We
-	 * don't want to do that at keventd's priority.
-	 */
-	set_user_nice(current, 0);
-
-	/*
-	 * Some configs put our parent kthread in a limited cpuset,
-	 * which kthread() overrides, forcing cpus_allowed == cpu_all_mask.
-	 * Our needs are more modest - cut back to our cpusets cpus_allowed.
-	 * This is needed as pdflush's are dynamically created and destroyed.
-	 * The boottime pdflush's are easily placed w/o these 2 lines.
-	 */
-	cpuset_cpus_allowed(current, cpus_allowed);
-	set_cpus_allowed_ptr(current, cpus_allowed);
-	free_cpumask_var(cpus_allowed);
-
-	return __pdflush(&my_work);
-}
-
-/*
- * Attempt to wake up a pdflush thread, and get it to do some work for you.
- * Returns zero if it indeed managed to find a worker thread, and passed your
- * payload to it.
- */
-int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0)
-{
-	unsigned long flags;
-	int ret = 0;
-
-	BUG_ON(fn == NULL);	/* Hard to diagnose if it's deferred */
-
-	spin_lock_irqsave(&pdflush_lock, flags);
-	if (list_empty(&pdflush_list)) {
-		ret = -1;
-	} else {
-		struct pdflush_work *pdf;
-
-		pdf = list_entry(pdflush_list.next, struct pdflush_work, list);
-		list_del_init(&pdf->list);
-		if (list_empty(&pdflush_list))
-			last_empty_jifs = jiffies;
-		pdf->fn = fn;
-		pdf->arg0 = arg0;
-		wake_up_process(pdf->who);
-	}
-	spin_unlock_irqrestore(&pdflush_lock, flags);
-
-	return ret;
-}
-
-static void start_one_pdflush_thread(void)
-{
-	struct task_struct *k;
-
-	k = kthread_run(pdflush, NULL, "pdflush");
-	if (unlikely(IS_ERR(k))) {
-		spin_lock_irq(&pdflush_lock);
-		nr_pdflush_threads--;
-		spin_unlock_irq(&pdflush_lock);
-	}
-}
-
-static int __init pdflush_init(void)
-{
-	int i;
-
-	/*
-	 * Pre-set nr_pdflush_threads...  If we fail to create,
-	 * the count will be decremented.
-	 */
-	nr_pdflush_threads = MIN_PDFLUSH_THREADS;
-
-	for (i = 0; i < MIN_PDFLUSH_THREADS; i++)
-		start_one_pdflush_thread();
-	return 0;
-}
-
-module_init(pdflush_init);
-- 
cgit v1.2.3-70-g09d2


From d993831fa7ffeb89e994f046f93eeb09ec91df08 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Fri, 12 Jun 2009 14:45:52 +0200
Subject: writeback: add name to backing_dev_info

This enables us to track who does what and print info. Its main use
is catching dirty inodes on the default_backing_dev_info, so we can
fix that up.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-core.c            | 1 +
 drivers/block/aoe/aoeblk.c  | 1 +
 drivers/char/mem.c          | 1 +
 fs/btrfs/disk-io.c          | 1 +
 fs/char_dev.c               | 1 +
 fs/configfs/inode.c         | 1 +
 fs/fuse/inode.c             | 1 +
 fs/hugetlbfs/inode.c        | 1 +
 fs/nfs/client.c             | 1 +
 fs/ocfs2/dlm/dlmfs.c        | 1 +
 fs/ramfs/inode.c            | 1 +
 fs/sysfs/inode.c            | 1 +
 fs/ubifs/super.c            | 1 +
 include/linux/backing-dev.h | 2 ++
 kernel/cgroup.c             | 1 +
 mm/backing-dev.c            | 1 +
 mm/swap_state.c             | 1 +
 17 files changed, 18 insertions(+)

(limited to 'fs')

diff --git a/block/blk-core.c b/block/blk-core.c
index e3299a77a0d8..e695634882a6 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -501,6 +501,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 			(VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
 	q->backing_dev_info.state = 0;
 	q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY;
+	q->backing_dev_info.name = "block";
 
 	err = bdi_init(&q->backing_dev_info);
 	if (err) {
diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c
index 1e15889c4b98..95d344971eda 100644
--- a/drivers/block/aoe/aoeblk.c
+++ b/drivers/block/aoe/aoeblk.c
@@ -268,6 +268,7 @@ aoeblk_gdalloc(void *vp)
 	if (!d->blkq)
 		goto err_mempool;
 	blk_queue_make_request(d->blkq, aoeblk_make_request);
+	d->blkq->backing_dev_info.name = "aoe";
 	if (bdi_init(&d->blkq->backing_dev_info))
 		goto err_blkq;
 	spin_lock_irqsave(&d->lock, flags);
diff --git a/drivers/char/mem.c b/drivers/char/mem.c
index afa8813e737a..645237bda682 100644
--- a/drivers/char/mem.c
+++ b/drivers/char/mem.c
@@ -822,6 +822,7 @@ static const struct file_operations zero_fops = {
  * - permits private mappings, "copies" are taken of the source of zeros
  */
 static struct backing_dev_info zero_bdi = {
+	.name		= "char/mem",
 	.capabilities	= BDI_CAP_MAP_COPY,
 };
 
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index e83be2e4602c..15831d5c7367 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1352,6 +1352,7 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
 {
 	int err;
 
+	bdi->name = "btrfs";
 	bdi->capabilities = BDI_CAP_MAP_COPY;
 	err = bdi_init(bdi);
 	if (err)
diff --git a/fs/char_dev.c b/fs/char_dev.c
index a173551e19d7..7c27a8ebef6a 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -31,6 +31,7 @@
  * - no readahead or I/O queue unplugging required
  */
 struct backing_dev_info directly_mappable_cdev_bdi = {
+	.name = "char",
 	.capabilities	= (
 #ifdef CONFIG_MMU
 		/* permit private copies of the data to be taken */
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index 4921e7426d95..a2f746066c5d 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -51,6 +51,7 @@ static const struct address_space_operations configfs_aops = {
 };
 
 static struct backing_dev_info configfs_backing_dev_info = {
+	.name		= "configfs",
 	.ra_pages	= 0,	/* No readahead */
 	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK,
 };
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index f91ccc4a189d..4567db6f9430 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -801,6 +801,7 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
 {
 	int err;
 
+	fc->bdi.name = "fuse";
 	fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
 	fc->bdi.unplug_io_fn = default_unplug_io_fn;
 	/* fuse does it's own writeback accounting */
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index cb88dac8ccaa..a93b885311d8 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -44,6 +44,7 @@ static const struct inode_operations hugetlbfs_dir_inode_operations;
 static const struct inode_operations hugetlbfs_inode_operations;
 
 static struct backing_dev_info hugetlbfs_backing_dev_info = {
+	.name		= "hugetlbfs",
 	.ra_pages	= 0,	/* No readahead */
 	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK,
 };
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 8d25ccb2d51d..c6be84a161f6 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -879,6 +879,7 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *
 		server->rsize = NFS_MAX_FILE_IO_SIZE;
 	server->rpages = (server->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 
+	server->backing_dev_info.name = "nfs";
 	server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD;
 
 	if (server->wsize > max_rpc_payload)
diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c
index 1c9efb406a96..02bf17808bdc 100644
--- a/fs/ocfs2/dlm/dlmfs.c
+++ b/fs/ocfs2/dlm/dlmfs.c
@@ -325,6 +325,7 @@ clear_fields:
 }
 
 static struct backing_dev_info dlmfs_backing_dev_info = {
+	.name		= "ocfs2-dlmfs",
 	.ra_pages	= 0,	/* No readahead */
 	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK,
 };
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index 0ff7566c767c..a7f0110fca4c 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -46,6 +46,7 @@ static const struct super_operations ramfs_ops;
 static const struct inode_operations ramfs_dir_inode_operations;
 
 static struct backing_dev_info ramfs_backing_dev_info = {
+	.name		= "ramfs",
 	.ra_pages	= 0,	/* No readahead */
 	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK |
 			  BDI_CAP_MAP_DIRECT | BDI_CAP_MAP_COPY |
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index 555f0ff988df..e57f98e54fce 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -29,6 +29,7 @@ static const struct address_space_operations sysfs_aops = {
 };
 
 static struct backing_dev_info sysfs_backing_dev_info = {
+	.name		= "sysfs",
 	.ra_pages	= 0,	/* No readahead */
 	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK,
 };
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 8d6050a5966c..51763aa8f4de 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1965,6 +1965,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
 	 *
 	 * Read-ahead will be disabled because @c->bdi.ra_pages is 0.
 	 */
+	c->bdi.name = "ubifs",
 	c->bdi.capabilities = BDI_CAP_MAP_COPY;
 	c->bdi.unplug_io_fn = default_unplug_io_fn;
 	err  = bdi_init(&c->bdi);
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index d045f5f615c7..2f218b7cb063 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -66,6 +66,8 @@ struct backing_dev_info {
 	void (*unplug_io_fn)(struct backing_dev_info *, struct page *);
 	void *unplug_io_data;
 
+	char *name;
+
 	struct percpu_counter bdi_stat[NR_BDI_STAT_ITEMS];
 
 	struct prop_local_percpu completions;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index b6eadfe30e7b..c7ece8f027f2 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -600,6 +600,7 @@ static struct inode_operations cgroup_dir_inode_operations;
 static struct file_operations proc_cgroupstats_operations;
 
 static struct backing_dev_info cgroup_backing_dev_info = {
+	.name		= "cgroup",
 	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK,
 };
 
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 22c45e932e3a..5cb32c5b93d8 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -17,6 +17,7 @@ void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
 EXPORT_SYMBOL(default_unplug_io_fn);
 
 struct backing_dev_info default_backing_dev_info = {
+	.name		= "default",
 	.ra_pages	= VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE,
 	.state		= 0,
 	.capabilities	= BDI_CAP_MAP_COPY,
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 42cd38eba79f..5ae6b8b78c80 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -34,6 +34,7 @@ static const struct address_space_operations swap_aops = {
 };
 
 static struct backing_dev_info swap_backing_dev_info = {
+	.name		= "swap",
 	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
 	.unplug_io_fn	= swap_unplug_io_fn,
 };
-- 
cgit v1.2.3-70-g09d2


From 500b067c5e6ceea49cf280a02597b1169320e08c Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Wed, 9 Sep 2009 09:10:25 +0200
Subject: writeback: check for registered bdi in flusher add and inode dirty

Also a debugging aid. We want to catch dirty inodes being added to
backing devices that don't do writeback.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/fs-writeback.c           | 8 ++++++++
 include/linux/backing-dev.h | 1 +
 mm/backing-dev.c            | 7 +++++++
 3 files changed, 16 insertions(+)

(limited to 'fs')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 2e601ce581c9..da86ef58e427 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1046,6 +1046,14 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 		 */
 		if (!was_dirty) {
 			struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
+			struct backing_dev_info *bdi = wb->bdi;
+
+			if (bdi_cap_writeback_dirty(bdi) &&
+			    !test_bit(BDI_registered, &bdi->state)) {
+				WARN_ON(1);
+				printk(KERN_ERR "bdi-%s not registered\n",
+								bdi->name);
+			}
 
 			inode->dirtied_when = jiffies;
 			list_move(&inode->i_list, &wb->b_dirty);
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 2f218b7cb063..f169bcb90b58 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -29,6 +29,7 @@ enum bdi_state {
 	BDI_wb_alloc,		/* Default embedded wb allocated */
 	BDI_async_congested,	/* The async (write) queue is getting full */
 	BDI_sync_congested,	/* The sync queue is getting full */
+	BDI_registered,		/* bdi_register() was done */
 	BDI_unused,		/* Available bits start here */
 };
 
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 5cb32c5b93d8..d3ca0dac1111 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -465,6 +465,12 @@ void static bdi_add_default_flusher_task(struct backing_dev_info *bdi)
 	if (!bdi_cap_writeback_dirty(bdi))
 		return;
 
+	if (WARN_ON(!test_bit(BDI_registered, &bdi->state))) {
+		printk(KERN_ERR "bdi %p/%s is not registered!\n",
+							bdi, bdi->name);
+		return;
+	}
+
 	/*
 	 * Check with the helper whether to proceed adding a task. Will only
 	 * abort if we two or more simultanous calls to
@@ -528,6 +534,7 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
 	}
 
 	bdi_debug_register(bdi, dev_name(dev));
+	set_bit(BDI_registered, &bdi->state);
 exit:
 	return ret;
 }
-- 
cgit v1.2.3-70-g09d2


From 1f98a13f623e0ef666690a18c1250335fc6d7ef1 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Fri, 11 Sep 2009 14:32:04 +0200
Subject: bio: first step in sanitizing the bio->bi_rw flag testing

Get rid of any functions that test for these bits and make callers
use bio_rw_flagged() directly. Then it is at least directly apparent
what variable and flag they check.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-core.c            | 25 +++++++++++++------------
 block/cfq-iosched.c         |  2 +-
 block/elevator.c            |  3 ++-
 drivers/block/loop.c        |  2 +-
 drivers/md/dm-raid1.c       |  2 +-
 drivers/md/dm-stripe.c      |  2 +-
 drivers/md/dm.c             | 12 ++++++------
 drivers/md/linear.c         |  2 +-
 drivers/md/multipath.c      |  4 ++--
 drivers/md/raid0.c          |  2 +-
 drivers/md/raid1.c          | 14 ++++++++------
 drivers/md/raid10.c         |  6 +++---
 drivers/md/raid5.c          |  2 +-
 drivers/staging/dst/dcore.c |  5 +++--
 fs/btrfs/volumes.c          |  4 ++--
 include/linux/bio.h         | 25 +++++++------------------
 include/linux/blkdev.h      |  2 +-
 17 files changed, 54 insertions(+), 60 deletions(-)

(limited to 'fs')

diff --git a/block/blk-core.c b/block/blk-core.c
index c822239bcc9d..52559715cb90 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1114,24 +1114,24 @@ void init_request_from_bio(struct request *req, struct bio *bio)
 	 * Inherit FAILFAST from bio (for read-ahead, and explicit
 	 * FAILFAST).  FAILFAST flags are identical for req and bio.
 	 */
-	if (bio_rw_ahead(bio))
+	if (bio_rw_flagged(bio, BIO_RW_AHEAD))
 		req->cmd_flags |= REQ_FAILFAST_MASK;
 	else
 		req->cmd_flags |= bio->bi_rw & REQ_FAILFAST_MASK;
 
-	if (unlikely(bio_discard(bio))) {
+	if (unlikely(bio_rw_flagged(bio, BIO_RW_DISCARD))) {
 		req->cmd_flags |= REQ_DISCARD;
-		if (bio_barrier(bio))
+		if (bio_rw_flagged(bio, BIO_RW_BARRIER))
 			req->cmd_flags |= REQ_SOFTBARRIER;
 		req->q->prepare_discard_fn(req->q, req);
-	} else if (unlikely(bio_barrier(bio)))
+	} else if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER)))
 		req->cmd_flags |= REQ_HARDBARRIER;
 
-	if (bio_sync(bio))
+	if (bio_rw_flagged(bio, BIO_RW_SYNCIO))
 		req->cmd_flags |= REQ_RW_SYNC;
-	if (bio_rw_meta(bio))
+	if (bio_rw_flagged(bio, BIO_RW_META))
 		req->cmd_flags |= REQ_RW_META;
-	if (bio_noidle(bio))
+	if (bio_rw_flagged(bio, BIO_RW_NOIDLE))
 		req->cmd_flags |= REQ_NOIDLE;
 
 	req->errors = 0;
@@ -1155,12 +1155,12 @@ static int __make_request(struct request_queue *q, struct bio *bio)
 	int el_ret;
 	unsigned int bytes = bio->bi_size;
 	const unsigned short prio = bio_prio(bio);
-	const int sync = bio_sync(bio);
-	const int unplug = bio_unplug(bio);
+	const bool sync = bio_rw_flagged(bio, BIO_RW_SYNCIO);
+	const bool unplug = bio_rw_flagged(bio, BIO_RW_UNPLUG);
 	const unsigned int ff = bio->bi_rw & REQ_FAILFAST_MASK;
 	int rw_flags;
 
-	if (bio_barrier(bio) && bio_has_data(bio) &&
+	if (bio_rw_flagged(bio, BIO_RW_BARRIER) && bio_has_data(bio) &&
 	    (q->next_ordered == QUEUE_ORDERED_NONE)) {
 		bio_endio(bio, -EOPNOTSUPP);
 		return 0;
@@ -1174,7 +1174,7 @@ static int __make_request(struct request_queue *q, struct bio *bio)
 
 	spin_lock_irq(q->queue_lock);
 
-	if (unlikely(bio_barrier(bio)) || elv_queue_empty(q))
+	if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER)) || elv_queue_empty(q))
 		goto get_rq;
 
 	el_ret = elv_merge(q, &req, bio);
@@ -1470,7 +1470,8 @@ static inline void __generic_make_request(struct bio *bio)
 		if (bio_check_eod(bio, nr_sectors))
 			goto end_io;
 
-		if (bio_discard(bio) && !q->prepare_discard_fn) {
+		if (bio_rw_flagged(bio, BIO_RW_DISCARD) &&
+		    !q->prepare_discard_fn) {
 			err = -EOPNOTSUPP;
 			goto end_io;
 		}
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index ca0d7e71324b..9e6d0af6c990 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -257,7 +257,7 @@ static inline void cic_set_cfqq(struct cfq_io_context *cic,
  */
 static inline int cfq_bio_sync(struct bio *bio)
 {
-	if (bio_data_dir(bio) == READ || bio_sync(bio))
+	if (bio_data_dir(bio) == READ || bio_rw_flagged(bio, BIO_RW_SYNCIO))
 		return 1;
 
 	return 0;
diff --git a/block/elevator.c b/block/elevator.c
index ca861927ba41..1975b619c86d 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -79,7 +79,8 @@ int elv_rq_merge_ok(struct request *rq, struct bio *bio)
 	/*
 	 * Don't merge file system requests and discard requests
 	 */
-	if (bio_discard(bio) != bio_discard(rq->bio))
+	if (bio_rw_flagged(bio, BIO_RW_DISCARD) !=
+	    bio_rw_flagged(rq->bio, BIO_RW_DISCARD))
 		return 0;
 
 	/*
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 5757188cd1fb..bbb79441d895 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -475,7 +475,7 @@ static int do_bio_filebacked(struct loop_device *lo, struct bio *bio)
 	pos = ((loff_t) bio->bi_sector << 9) + lo->lo_offset;
 
 	if (bio_rw(bio) == WRITE) {
-		int barrier = bio_barrier(bio);
+		bool barrier = bio_rw_flagged(bio, BIO_RW_BARRIER);
 		struct file *file = lo->lo_backing_file;
 
 		if (barrier) {
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 33f179e66bf5..cc9dc79b0784 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -1129,7 +1129,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio,
 	if (error == -EOPNOTSUPP)
 		goto out;
 
-	if ((error == -EWOULDBLOCK) && bio_rw_ahead(bio))
+	if ((error == -EWOULDBLOCK) && bio_rw_flagged(bio, BIO_RW_AHEAD))
 		goto out;
 
 	if (unlikely(error)) {
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index 3e563d251733..fde658ccbcec 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -285,7 +285,7 @@ static int stripe_end_io(struct dm_target *ti, struct bio *bio,
 	if (!error)
 		return 0; /* I/O complete */
 
-	if ((error == -EWOULDBLOCK) && bio_rw_ahead(bio))
+	if ((error == -EWOULDBLOCK) && bio_rw_flagged(bio, BIO_RW_AHEAD))
 		return error;
 
 	if (error == -EOPNOTSUPP)
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index b4845b14740d..ec012f030240 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -586,7 +586,7 @@ static void dec_pending(struct dm_io *io, int error)
 			 */
 			spin_lock_irqsave(&md->deferred_lock, flags);
 			if (__noflush_suspending(md)) {
-				if (!bio_barrier(io->bio))
+				if (!bio_rw_flagged(io->bio, BIO_RW_BARRIER))
 					bio_list_add_head(&md->deferred,
 							  io->bio);
 			} else
@@ -598,7 +598,7 @@ static void dec_pending(struct dm_io *io, int error)
 		io_error = io->error;
 		bio = io->bio;
 
-		if (bio_barrier(bio)) {
+		if (bio_rw_flagged(bio, BIO_RW_BARRIER)) {
 			/*
 			 * There can be just one barrier request so we use
 			 * a per-device variable for error reporting.
@@ -1209,7 +1209,7 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
 
 	ci.map = dm_get_table(md);
 	if (unlikely(!ci.map)) {
-		if (!bio_barrier(bio))
+		if (!bio_rw_flagged(bio, BIO_RW_BARRIER))
 			bio_io_error(bio);
 		else
 			if (!md->barrier_error)
@@ -1321,7 +1321,7 @@ static int _dm_request(struct request_queue *q, struct bio *bio)
 	 * we have to queue this io for later.
 	 */
 	if (unlikely(test_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags)) ||
-	    unlikely(bio_barrier(bio))) {
+	    unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
 		up_read(&md->io_lock);
 
 		if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) &&
@@ -1344,7 +1344,7 @@ static int dm_make_request(struct request_queue *q, struct bio *bio)
 {
 	struct mapped_device *md = q->queuedata;
 
-	if (unlikely(bio_barrier(bio))) {
+	if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
 		bio_endio(bio, -EOPNOTSUPP);
 		return 0;
 	}
@@ -2164,7 +2164,7 @@ static void dm_wq_work(struct work_struct *work)
 		if (dm_request_based(md))
 			generic_make_request(c);
 		else {
-			if (bio_barrier(c))
+			if (bio_rw_flagged(c, BIO_RW_BARRIER))
 				process_barrier(md, c);
 			else
 				__split_and_process_bio(md, c);
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 5fe39c2a3d2b..ea4842905444 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -288,7 +288,7 @@ static int linear_make_request (struct request_queue *q, struct bio *bio)
 	sector_t start_sector;
 	int cpu;
 
-	if (unlikely(bio_barrier(bio))) {
+	if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
 		bio_endio(bio, -EOPNOTSUPP);
 		return 0;
 	}
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 7140909f6662..89e76819f61f 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -90,7 +90,7 @@ static void multipath_end_request(struct bio *bio, int error)
 
 	if (uptodate)
 		multipath_end_bh_io(mp_bh, 0);
-	else if (!bio_rw_ahead(bio)) {
+	else if (!bio_rw_flagged(bio, BIO_RW_AHEAD)) {
 		/*
 		 * oops, IO error:
 		 */
@@ -144,7 +144,7 @@ static int multipath_make_request (struct request_queue *q, struct bio * bio)
 	const int rw = bio_data_dir(bio);
 	int cpu;
 
-	if (unlikely(bio_barrier(bio))) {
+	if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
 		bio_endio(bio, -EOPNOTSUPP);
 		return 0;
 	}
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 898e2bdfee47..f845ed98fec9 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -448,7 +448,7 @@ static int raid0_make_request(struct request_queue *q, struct bio *bio)
 	const int rw = bio_data_dir(bio);
 	int cpu;
 
-	if (unlikely(bio_barrier(bio))) {
+	if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
 		bio_endio(bio, -EOPNOTSUPP);
 		return 0;
 	}
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 8726fd7ebce5..ff7ed3335995 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -782,8 +782,9 @@ static int make_request(struct request_queue *q, struct bio * bio)
 	struct bio_list bl;
 	struct page **behind_pages = NULL;
 	const int rw = bio_data_dir(bio);
-	const int do_sync = bio_sync(bio);
-	int cpu, do_barriers;
+	const bool do_sync = bio_rw_flagged(bio, BIO_RW_SYNCIO);
+	int cpu;
+	bool do_barriers;
 	mdk_rdev_t *blocked_rdev;
 
 	/*
@@ -797,7 +798,8 @@ static int make_request(struct request_queue *q, struct bio * bio)
 
 	md_write_start(mddev, bio); /* wait on superblock update early */
 
-	if (unlikely(!mddev->barriers_work && bio_barrier(bio))) {
+	if (unlikely(!mddev->barriers_work &&
+		     bio_rw_flagged(bio, BIO_RW_BARRIER))) {
 		if (rw == WRITE)
 			md_write_end(mddev);
 		bio_endio(bio, -EOPNOTSUPP);
@@ -925,7 +927,7 @@ static int make_request(struct request_queue *q, struct bio * bio)
 	atomic_set(&r1_bio->remaining, 0);
 	atomic_set(&r1_bio->behind_remaining, 0);
 
-	do_barriers = bio_barrier(bio);
+	do_barriers = bio_rw_flagged(bio, BIO_RW_BARRIER);
 	if (do_barriers)
 		set_bit(R1BIO_Barrier, &r1_bio->state);
 
@@ -1600,7 +1602,7 @@ static void raid1d(mddev_t *mddev)
 			 * We already have a nr_pending reference on these rdevs.
 			 */
 			int i;
-			const int do_sync = bio_sync(r1_bio->master_bio);
+			const bool do_sync = bio_rw_flagged(r1_bio->master_bio, BIO_RW_SYNCIO);
 			clear_bit(R1BIO_BarrierRetry, &r1_bio->state);
 			clear_bit(R1BIO_Barrier, &r1_bio->state);
 			for (i=0; i < conf->raid_disks; i++)
@@ -1654,7 +1656,7 @@ static void raid1d(mddev_t *mddev)
 				       (unsigned long long)r1_bio->sector);
 				raid_end_bio_io(r1_bio);
 			} else {
-				const int do_sync = bio_sync(r1_bio->master_bio);
+				const bool do_sync = bio_rw_flagged(r1_bio->master_bio, BIO_RW_SYNCIO);
 				r1_bio->bios[r1_bio->read_disk] =
 					mddev->ro ? IO_BLOCKED : NULL;
 				r1_bio->read_disk = disk;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 3d9020cf6f6e..d0a2152e064f 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -796,12 +796,12 @@ static int make_request(struct request_queue *q, struct bio * bio)
 	int i;
 	int chunk_sects = conf->chunk_mask + 1;
 	const int rw = bio_data_dir(bio);
-	const int do_sync = bio_sync(bio);
+	const bool do_sync = bio_rw_flagged(bio, BIO_RW_SYNCIO);
 	struct bio_list bl;
 	unsigned long flags;
 	mdk_rdev_t *blocked_rdev;
 
-	if (unlikely(bio_barrier(bio))) {
+	if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
 		bio_endio(bio, -EOPNOTSUPP);
 		return 0;
 	}
@@ -1610,7 +1610,7 @@ static void raid10d(mddev_t *mddev)
 				raid_end_bio_io(r10_bio);
 				bio_put(bio);
 			} else {
-				const int do_sync = bio_sync(r10_bio->master_bio);
+				const bool do_sync = bio_rw_flagged(r10_bio->master_bio, BIO_RW_SYNCIO);
 				bio_put(bio);
 				rdev = conf->mirrors[mirror].rdev;
 				if (printk_ratelimit())
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index b8a2c5dc67ba..826eb3467357 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3606,7 +3606,7 @@ static int make_request(struct request_queue *q, struct bio * bi)
 	const int rw = bio_data_dir(bi);
 	int cpu, remaining;
 
-	if (unlikely(bio_barrier(bi))) {
+	if (unlikely(bio_rw_flagged(bi, BIO_RW_BARRIER))) {
 		bio_endio(bi, -EOPNOTSUPP);
 		return 0;
 	}
diff --git a/drivers/staging/dst/dcore.c b/drivers/staging/dst/dcore.c
index fad25b753042..b1c258ca2102 100644
--- a/drivers/staging/dst/dcore.c
+++ b/drivers/staging/dst/dcore.c
@@ -112,8 +112,9 @@ static int dst_request(struct request_queue *q, struct bio *bio)
 		 * I worked with.
 		 *
 		 * Empty barriers are not allowed anyway, see 51fd77bd9f512
-		 * for example, although later it was changed to bio_discard()
-		 * only, which does not work in this case.
+		 * for example, although later it was changed to
+		 * bio_rw_flagged(bio, BIO_RW_DISCARD) only, which does not
+		 * work in this case.
 		 */
 		//err = -EOPNOTSUPP;
 		err = 0;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 5dbefd11b4af..5cf405b0828d 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -260,7 +260,7 @@ loop_lock:
 		num_run++;
 		batch_run++;
 
-		if (bio_sync(cur))
+		if (bio_rw_flagged(cur, BIO_RW_SYNCIO))
 			num_sync_run++;
 
 		if (need_resched()) {
@@ -2903,7 +2903,7 @@ static noinline int schedule_bio(struct btrfs_root *root,
 	bio->bi_rw |= rw;
 
 	spin_lock(&device->io_lock);
-	if (bio_sync(bio))
+	if (bio_rw_flagged(bio, BIO_RW_SYNCIO))
 		pending_bios = &device->pending_sync_bios;
 	else
 		pending_bios = &device->pending_bios;
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 4f8fd0221cd2..5be93f18d842 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -177,28 +177,17 @@ enum bio_rw_flags {
 	BIO_RW_NOIDLE,
 };
 
+/*
+ * First four bits must match between bio->bi_rw and rq->cmd_flags, make
+ * that explicit here.
+ */
+#define BIO_RW_RQ_MASK		0xf
+
 static inline bool bio_rw_flagged(struct bio *bio, enum bio_rw_flags flag)
 {
 	return (bio->bi_rw & (1 << flag)) != 0;
 }
 
-/*
- * Old defines, these should eventually be replaced by direct usage of
- * bio_rw_flagged()
- */
-#define bio_barrier(bio)	bio_rw_flagged(bio, BIO_RW_BARRIER)
-#define bio_sync(bio)		bio_rw_flagged(bio, BIO_RW_SYNCIO)
-#define bio_unplug(bio)		bio_rw_flagged(bio, BIO_RW_UNPLUG)
-#define bio_failfast_dev(bio)	bio_rw_flagged(bio, BIO_RW_FAILFAST_DEV)
-#define bio_failfast_transport(bio)	\
-		bio_rw_flagged(bio, BIO_RW_FAILFAST_TRANSPORT)
-#define bio_failfast_driver(bio) 	\
-		bio_rw_flagged(bio, BIO_RW_FAILFAST_DRIVER)
-#define bio_rw_ahead(bio)	bio_rw_flagged(bio, BIO_RW_AHEAD)
-#define bio_rw_meta(bio)	bio_rw_flagged(bio, BIO_RW_META)
-#define bio_discard(bio)	bio_rw_flagged(bio, BIO_RW_DISCARD)
-#define bio_noidle(bio)		bio_rw_flagged(bio, BIO_RW_NOIDLE)
-
 /*
  * upper 16 bits of bi_rw define the io priority of this bio
  */
@@ -222,7 +211,7 @@ static inline bool bio_rw_flagged(struct bio *bio, enum bio_rw_flags flag)
 #define bio_offset(bio)		bio_iovec((bio))->bv_offset
 #define bio_segments(bio)	((bio)->bi_vcnt - (bio)->bi_idx)
 #define bio_sectors(bio)	((bio)->bi_size >> 9)
-#define bio_empty_barrier(bio)	(bio_barrier(bio) && !bio_has_data(bio) && !bio_discard(bio))
+#define bio_empty_barrier(bio)	(bio_rw_flagged(bio, BIO_RW_BARRIER) && !bio_has_data(bio) && !bio_rw_flagged(bio, BIO_RW_DISCARD))
 
 static inline unsigned int bio_cur_bytes(struct bio *bio)
 {
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 650b6a9cb679..88edb62def82 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -86,7 +86,7 @@ enum {
 };
 
 /*
- * request type modified bits. first two bits match BIO_RW* bits, important
+ * request type modified bits. first four bits match BIO_RW* bits, important
  */
 enum rq_flag_bits {
 	__REQ_RW,		/* not set, read. set, write */
-- 
cgit v1.2.3-70-g09d2


From 723590ed52d244b025f10a8e2f54c2f57ed02b4d Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Sat, 15 Aug 2009 08:43:22 +0200
Subject: splice: update mtime and atime on files

Splice should update the modification and access times on regular
files just like read and write. Not updating mtime will confuse
backup tools, etc...

This patch only adds the time updates for regular files.  For pipes
and other special files that splice touches the need for updating the
times is less clear.  Let's discuss and fix that separately.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/splice.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/splice.c b/fs/splice.c
index 73766d24f97b..aec4014291be 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -502,8 +502,10 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
 		len = left;
 
 	ret = __generic_file_splice_read(in, ppos, pipe, len, flags);
-	if (ret > 0)
+	if (ret > 0) {
 		*ppos += ret;
+		file_accessed(in);
+	}
 
 	return ret;
 }
@@ -963,8 +965,10 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
 
 		mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
 		ret = file_remove_suid(out);
-		if (!ret)
+		if (!ret) {
+			file_update_time(out);
 			ret = splice_from_pipe_feed(pipe, &sd, pipe_to_file);
+		}
 		mutex_unlock(&inode->i_mutex);
 	} while (ret > 0);
 	splice_from_pipe_end(pipe, &sd);
-- 
cgit v1.2.3-70-g09d2


From 7ad9bb651fc2036ea94bed94da76a4b08959a911 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Fri, 11 Sep 2009 16:51:28 -0400
Subject: ext4: Fix initalization of s_flex_groups

The s_flex_groups array should have been initialized using atomic_add
to sum up the free counts from the block groups that make up a
flex_bg.  By using atomic_set, the value of the s_flex_groups array
was set to the values of the last block group in the flex_bg.

The impact of this bug is that the block and inode allocation
algorithms might not pick the best flex_bg for new allocation.

Thanks to Damien Guibouret for pointing out this problem!

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/super.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 9f6fa3f74629..04c693357336 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1694,12 +1694,12 @@ static int ext4_fill_flex_info(struct super_block *sb)
 		gdp = ext4_get_group_desc(sb, i, NULL);
 
 		flex_group = ext4_flex_group(sbi, i);
-		atomic_set(&sbi->s_flex_groups[flex_group].free_inodes,
-			   ext4_free_inodes_count(sb, gdp));
-		atomic_set(&sbi->s_flex_groups[flex_group].free_blocks,
-			   ext4_free_blks_count(sb, gdp));
-		atomic_set(&sbi->s_flex_groups[flex_group].used_dirs,
-			   ext4_used_dirs_count(sb, gdp));
+		atomic_add(ext4_free_inodes_count(sb, gdp),
+			   &sbi->s_flex_groups[flex_group].free_inodes);
+		atomic_add(ext4_free_blks_count(sb, gdp),
+			   &sbi->s_flex_groups[flex_group].free_blocks);
+		atomic_add(ext4_used_dirs_count(sb, gdp),
+			   &sbi->s_flex_groups[flex_group].used_dirs);
 	}
 
 	return 1;
-- 
cgit v1.2.3-70-g09d2


From a9327cac440be4d8333bba975cbbf76045096275 Mon Sep 17 00:00:00 2001
From: Nikanth Karthikesan <knikanth@suse.de>
Date: Fri, 11 Sep 2009 09:18:54 +0200
Subject: Seperate read and write statistics of in_flight requests

Currently, there is a single in_flight counter measuring the number of
requests in the request_queue. But some monitoring tools would like to
know how many read requests and write requests are in progress. Split the
current in_flight counter into two seperate counters for read and write.

This information is exported as a sysfs attribute, as changing the
currently available stat files would break the existing tools.

Signed-off-by: Nikanth Karthikesan <knikanth@suse.de>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-core.c      |  6 +++---
 block/blk-merge.c     |  2 +-
 block/genhd.c         |  4 +++-
 drivers/md/dm.c       | 16 ++++++++++------
 fs/partitions/check.c | 12 +++++++++++-
 include/linux/genhd.h | 21 ++++++++++++++-------
 6 files changed, 42 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/block/blk-core.c b/block/blk-core.c
index 982d634e67f9..182ebe3eb9e0 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -69,7 +69,7 @@ static void drive_stat_acct(struct request *rq, int new_io)
 		part_stat_inc(cpu, part, merges[rw]);
 	else {
 		part_round_stats(cpu, part);
-		part_inc_in_flight(part);
+		part_inc_in_flight(part, rw);
 	}
 
 	part_stat_unlock();
@@ -1030,7 +1030,7 @@ static void part_round_stats_single(int cpu, struct hd_struct *part,
 
 	if (part->in_flight) {
 		__part_stat_add(cpu, part, time_in_queue,
-				part->in_flight * (now - part->stamp));
+				part_in_flight(part) * (now - part->stamp));
 		__part_stat_add(cpu, part, io_ticks, (now - part->stamp));
 	}
 	part->stamp = now;
@@ -1737,7 +1737,7 @@ static void blk_account_io_done(struct request *req)
 		part_stat_inc(cpu, part, ios[rw]);
 		part_stat_add(cpu, part, ticks[rw], duration);
 		part_round_stats(cpu, part);
-		part_dec_in_flight(part);
+		part_dec_in_flight(part, rw);
 
 		part_stat_unlock();
 	}
diff --git a/block/blk-merge.c b/block/blk-merge.c
index b0de8574fdc8..99cb5cf1f447 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -351,7 +351,7 @@ static void blk_account_io_merge(struct request *req)
 		part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
 
 		part_round_stats(cpu, part);
-		part_dec_in_flight(part);
+		part_dec_in_flight(part, rq_data_dir(req));
 
 		part_stat_unlock();
 	}
diff --git a/block/genhd.c b/block/genhd.c
index b89328eceee2..5b76bf55d05c 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -869,6 +869,7 @@ static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
 static DEVICE_ATTR(alignment_offset, S_IRUGO, disk_alignment_offset_show, NULL);
 static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL);
 static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
+static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL);
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 static struct device_attribute dev_attr_fail =
 	__ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store);
@@ -888,6 +889,7 @@ static struct attribute *disk_attrs[] = {
 	&dev_attr_alignment_offset.attr,
 	&dev_attr_capability.attr,
 	&dev_attr_stat.attr,
+	&dev_attr_inflight.attr,
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 	&dev_attr_fail.attr,
 #endif
@@ -1053,7 +1055,7 @@ static int diskstats_show(struct seq_file *seqf, void *v)
 			   part_stat_read(hd, merges[1]),
 			   (unsigned long long)part_stat_read(hd, sectors[1]),
 			   jiffies_to_msecs(part_stat_read(hd, ticks[1])),
-			   hd->in_flight,
+			   part_in_flight(hd),
 			   jiffies_to_msecs(part_stat_read(hd, io_ticks)),
 			   jiffies_to_msecs(part_stat_read(hd, time_in_queue))
 			);
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index ec012f030240..eee28fac210c 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -130,7 +130,7 @@ struct mapped_device {
 	/*
 	 * A list of ios that arrived while we were suspended.
 	 */
-	atomic_t pending;
+	atomic_t pending[2];
 	wait_queue_head_t wait;
 	struct work_struct work;
 	struct bio_list deferred;
@@ -453,13 +453,14 @@ static void start_io_acct(struct dm_io *io)
 {
 	struct mapped_device *md = io->md;
 	int cpu;
+	int rw = bio_data_dir(io->bio);
 
 	io->start_time = jiffies;
 
 	cpu = part_stat_lock();
 	part_round_stats(cpu, &dm_disk(md)->part0);
 	part_stat_unlock();
-	dm_disk(md)->part0.in_flight = atomic_inc_return(&md->pending);
+	dm_disk(md)->part0.in_flight[rw] = atomic_inc_return(&md->pending[rw]);
 }
 
 static void end_io_acct(struct dm_io *io)
@@ -479,8 +480,9 @@ static void end_io_acct(struct dm_io *io)
 	 * After this is decremented the bio must not be touched if it is
 	 * a barrier.
 	 */
-	dm_disk(md)->part0.in_flight = pending =
-		atomic_dec_return(&md->pending);
+	dm_disk(md)->part0.in_flight[rw] = pending =
+		atomic_dec_return(&md->pending[rw]);
+	pending += atomic_read(&md->pending[rw^0x1]);
 
 	/* nudge anyone waiting on suspend queue */
 	if (!pending)
@@ -1785,7 +1787,8 @@ static struct mapped_device *alloc_dev(int minor)
 	if (!md->disk)
 		goto bad_disk;
 
-	atomic_set(&md->pending, 0);
+	atomic_set(&md->pending[0], 0);
+	atomic_set(&md->pending[1], 0);
 	init_waitqueue_head(&md->wait);
 	INIT_WORK(&md->work, dm_wq_work);
 	init_waitqueue_head(&md->eventq);
@@ -2088,7 +2091,8 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
 				break;
 			}
 			spin_unlock_irqrestore(q->queue_lock, flags);
-		} else if (!atomic_read(&md->pending))
+		} else if (!atomic_read(&md->pending[0]) &&
+					!atomic_read(&md->pending[1]))
 			break;
 
 		if (interruptible == TASK_INTERRUPTIBLE &&
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index ea4e6cb29e13..619ba99dfe39 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -248,11 +248,19 @@ ssize_t part_stat_show(struct device *dev,
 		part_stat_read(p, merges[WRITE]),
 		(unsigned long long)part_stat_read(p, sectors[WRITE]),
 		jiffies_to_msecs(part_stat_read(p, ticks[WRITE])),
-		p->in_flight,
+		part_in_flight(p),
 		jiffies_to_msecs(part_stat_read(p, io_ticks)),
 		jiffies_to_msecs(part_stat_read(p, time_in_queue)));
 }
 
+ssize_t part_inflight_show(struct device *dev,
+			struct device_attribute *attr, char *buf)
+{
+	struct hd_struct *p = dev_to_part(dev);
+
+	return sprintf(buf, "%8u %8u\n", p->in_flight[0], p->in_flight[1]);
+}
+
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 ssize_t part_fail_show(struct device *dev,
 		       struct device_attribute *attr, char *buf)
@@ -281,6 +289,7 @@ static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL);
 static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
 static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL);
 static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
+static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL);
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 static struct device_attribute dev_attr_fail =
 	__ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store);
@@ -292,6 +301,7 @@ static struct attribute *part_attrs[] = {
 	&dev_attr_size.attr,
 	&dev_attr_alignment_offset.attr,
 	&dev_attr_stat.attr,
+	&dev_attr_inflight.attr,
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 	&dev_attr_fail.attr,
 #endif
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 45fc320a53c6..44263cb27121 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -98,7 +98,7 @@ struct hd_struct {
 	int make_it_fail;
 #endif
 	unsigned long stamp;
-	int in_flight;
+	int in_flight[2];
 #ifdef	CONFIG_SMP
 	struct disk_stats *dkstats;
 #else
@@ -322,18 +322,23 @@ static inline void free_part_stats(struct hd_struct *part)
 #define part_stat_sub(cpu, gendiskp, field, subnd)			\
 	part_stat_add(cpu, gendiskp, field, -subnd)
 
-static inline void part_inc_in_flight(struct hd_struct *part)
+static inline void part_inc_in_flight(struct hd_struct *part, int rw)
 {
-	part->in_flight++;
+	part->in_flight[rw]++;
 	if (part->partno)
-		part_to_disk(part)->part0.in_flight++;
+		part_to_disk(part)->part0.in_flight[rw]++;
 }
 
-static inline void part_dec_in_flight(struct hd_struct *part)
+static inline void part_dec_in_flight(struct hd_struct *part, int rw)
 {
-	part->in_flight--;
+	part->in_flight[rw]--;
 	if (part->partno)
-		part_to_disk(part)->part0.in_flight--;
+		part_to_disk(part)->part0.in_flight[rw]--;
+}
+
+static inline int part_in_flight(struct hd_struct *part)
+{
+	return part->in_flight[0] + part->in_flight[1];
 }
 
 /* block/blk-core.c */
@@ -546,6 +551,8 @@ extern ssize_t part_size_show(struct device *dev,
 			      struct device_attribute *attr, char *buf);
 extern ssize_t part_stat_show(struct device *dev,
 			      struct device_attribute *attr, char *buf);
+extern ssize_t part_inflight_show(struct device *dev,
+			      struct device_attribute *attr, char *buf);
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 extern ssize_t part_fail_show(struct device *dev,
 			      struct device_attribute *attr, char *buf);
-- 
cgit v1.2.3-70-g09d2


From 746cd1e7e4a555ddaee53b19a46e05c9c61eaf09 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Sat, 12 Sep 2009 07:35:43 +0200
Subject: block: use blkdev_issue_discard in blk_ioctl_discard

blk_ioctl_discard duplicates large amounts of code from blkdev_issue_discard,
the only difference between the two is that blkdev_issue_discard needs to
send a barrier discard request and blk_ioctl_discard a non-barrier one,
and blk_ioctl_discard needs to wait on the request.  To facilitates this
add a flags argument to blkdev_issue_discard to control both aspects of the
behaviour.  This will be very useful later on for using the waiting
funcitonality for other callers.

Based on an earlier patch from Matthew Wilcox <matthew@wil.cx>.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-barrier.c    | 31 +++++++++++++++++++------------
 block/ioctl.c          | 49 ++-----------------------------------------------
 fs/btrfs/extent-tree.c |  3 ++-
 fs/gfs2/rgrp.c         |  6 ++++--
 include/linux/blkdev.h |  9 ++++++---
 mm/swapfile.c          |  6 ++++--
 6 files changed, 37 insertions(+), 67 deletions(-)

(limited to 'fs')

diff --git a/block/blk-barrier.c b/block/blk-barrier.c
index 30022b4e2f63..6593ab39cfe9 100644
--- a/block/blk-barrier.c
+++ b/block/blk-barrier.c
@@ -348,6 +348,9 @@ static void blkdev_discard_end_io(struct bio *bio, int err)
 		clear_bit(BIO_UPTODATE, &bio->bi_flags);
 	}
 
+	if (bio->bi_private)
+		complete(bio->bi_private);
+
 	bio_put(bio);
 }
 
@@ -357,21 +360,20 @@ static void blkdev_discard_end_io(struct bio *bio, int err)
  * @sector:	start sector
  * @nr_sects:	number of sectors to discard
  * @gfp_mask:	memory allocation flags (for bio_alloc)
+ * @flags:	DISCARD_FL_* flags to control behaviour
  *
  * Description:
- *    Issue a discard request for the sectors in question. Does not wait.
+ *    Issue a discard request for the sectors in question.
  */
-int blkdev_issue_discard(struct block_device *bdev,
-			 sector_t sector, sector_t nr_sects, gfp_t gfp_mask)
+int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
+		sector_t nr_sects, gfp_t gfp_mask, int flags)
 {
-	struct request_queue *q;
-	struct bio *bio;
+	DECLARE_COMPLETION_ONSTACK(wait);
+	struct request_queue *q = bdev_get_queue(bdev);
+	int type = flags & DISCARD_FL_BARRIER ?
+		DISCARD_BARRIER : DISCARD_NOBARRIER;
 	int ret = 0;
 
-	if (bdev->bd_disk == NULL)
-		return -ENXIO;
-
-	q = bdev_get_queue(bdev);
 	if (!q)
 		return -ENXIO;
 
@@ -379,12 +381,14 @@ int blkdev_issue_discard(struct block_device *bdev,
 		return -EOPNOTSUPP;
 
 	while (nr_sects && !ret) {
-		bio = bio_alloc(gfp_mask, 0);
+		struct bio *bio = bio_alloc(gfp_mask, 0);
 		if (!bio)
 			return -ENOMEM;
 
 		bio->bi_end_io = blkdev_discard_end_io;
 		bio->bi_bdev = bdev;
+		if (flags & DISCARD_FL_WAIT)
+			bio->bi_private = &wait;
 
 		bio->bi_sector = sector;
 
@@ -396,10 +400,13 @@ int blkdev_issue_discard(struct block_device *bdev,
 			bio->bi_size = nr_sects << 9;
 			nr_sects = 0;
 		}
+
 		bio_get(bio);
-		submit_bio(DISCARD_BARRIER, bio);
+		submit_bio(type, bio);
+
+		if (flags & DISCARD_FL_WAIT)
+			wait_for_completion(&wait);
 
-		/* Check if it failed immediately */
 		if (bio_flagged(bio, BIO_EOPNOTSUPP))
 			ret = -EOPNOTSUPP;
 		else if (!bio_flagged(bio, BIO_UPTODATE))
diff --git a/block/ioctl.c b/block/ioctl.c
index 500e4c73cc52..d3e6b5827a34 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -112,22 +112,9 @@ static int blkdev_reread_part(struct block_device *bdev)
 	return res;
 }
 
-static void blk_ioc_discard_endio(struct bio *bio, int err)
-{
-	if (err) {
-		if (err == -EOPNOTSUPP)
-			set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
-		clear_bit(BIO_UPTODATE, &bio->bi_flags);
-	}
-	complete(bio->bi_private);
-}
-
 static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
 			     uint64_t len)
 {
-	struct request_queue *q = bdev_get_queue(bdev);
-	int ret = 0;
-
 	if (start & 511)
 		return -EINVAL;
 	if (len & 511)
@@ -137,40 +124,8 @@ static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
 
 	if (start + len > (bdev->bd_inode->i_size >> 9))
 		return -EINVAL;
-
-	if (!q->prepare_discard_fn)
-		return -EOPNOTSUPP;
-
-	while (len && !ret) {
-		DECLARE_COMPLETION_ONSTACK(wait);
-		struct bio *bio;
-
-		bio = bio_alloc(GFP_KERNEL, 0);
-
-		bio->bi_end_io = blk_ioc_discard_endio;
-		bio->bi_bdev = bdev;
-		bio->bi_private = &wait;
-		bio->bi_sector = start;
-
-		if (len > queue_max_hw_sectors(q)) {
-			bio->bi_size = queue_max_hw_sectors(q) << 9;
-			len -= queue_max_hw_sectors(q);
-			start += queue_max_hw_sectors(q);
-		} else {
-			bio->bi_size = len << 9;
-			len = 0;
-		}
-		submit_bio(DISCARD_NOBARRIER, bio);
-
-		wait_for_completion(&wait);
-
-		if (bio_flagged(bio, BIO_EOPNOTSUPP))
-			ret = -EOPNOTSUPP;
-		else if (!bio_flagged(bio, BIO_UPTODATE))
-			ret = -EIO;
-		bio_put(bio);
-	}
-	return ret;
+	return blkdev_issue_discard(bdev, start, len, GFP_KERNEL,
+				    DISCARD_FL_WAIT);
 }
 
 static int put_ushort(unsigned long arg, unsigned short val)
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 72a2b9c28e9f..535f85ba104f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1511,7 +1511,8 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
 static void btrfs_issue_discard(struct block_device *bdev,
 				u64 start, u64 len)
 {
-	blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL);
+	blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL,
+			     DISCARD_FL_BARRIER);
 }
 #endif
 
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index fba795798d3a..fbc43241f2ef 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -857,7 +857,8 @@ static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
 					goto start_new_extent;
 				if ((start + nr_sects) != blk) {
 					rv = blkdev_issue_discard(bdev, start,
-							    nr_sects, GFP_NOFS);
+							    nr_sects, GFP_NOFS,
+							    DISCARD_FL_BARRIER);
 					if (rv)
 						goto fail;
 					nr_sects = 0;
@@ -871,7 +872,8 @@ start_new_extent:
 		}
 	}
 	if (nr_sects) {
-		rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS);
+		rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS,
+					 DISCARD_FL_BARRIER);
 		if (rv)
 			goto fail;
 	}
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 86c2bdff3b89..e23a86cae5ac 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -998,15 +998,18 @@ static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt,
 }
 
 extern int blkdev_issue_flush(struct block_device *, sector_t *);
-extern int blkdev_issue_discard(struct block_device *,
-				sector_t sector, sector_t nr_sects, gfp_t);
+#define DISCARD_FL_WAIT		0x01	/* wait for completion */
+#define DISCARD_FL_BARRIER	0x02	/* issue DISCARD_BARRIER request */
+extern int blkdev_issue_discard(struct block_device *, sector_t sector,
+		sector_t nr_sects, gfp_t, int flags);
 
 static inline int sb_issue_discard(struct super_block *sb,
 				   sector_t block, sector_t nr_blocks)
 {
 	block <<= (sb->s_blocksize_bits - 9);
 	nr_blocks <<= (sb->s_blocksize_bits - 9);
-	return blkdev_issue_discard(sb->s_bdev, block, nr_blocks, GFP_KERNEL);
+	return blkdev_issue_discard(sb->s_bdev, block, nr_blocks, GFP_KERNEL,
+				    DISCARD_FL_BARRIER);
 }
 
 extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 8ffdc0d23c53..74f1102e8749 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -161,7 +161,8 @@ static int discard_swap(struct swap_info_struct *si)
 		}
 
 		err = blkdev_issue_discard(si->bdev, start_block,
-						nr_blocks, GFP_KERNEL);
+						nr_blocks, GFP_KERNEL,
+						DISCARD_FL_BARRIER);
 		if (err)
 			break;
 
@@ -200,7 +201,8 @@ static void discard_swap_cluster(struct swap_info_struct *si,
 			start_block <<= PAGE_SHIFT - 9;
 			nr_blocks <<= PAGE_SHIFT - 9;
 			if (blkdev_issue_discard(si->bdev, start_block,
-							nr_blocks, GFP_NOIO))
+							nr_blocks, GFP_NOIO,
+							DISCARD_FL_BARRIER))
 				break;
 		}
 
-- 
cgit v1.2.3-70-g09d2


From 86d006365610fe6cda243d89b67d5047dca44656 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Mon, 14 Sep 2009 09:50:57 +0100
Subject: GFS2: Whitespace fixes

Reported-by: Daniel Walker <dwalker@fifo99.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/rgrp.c  | 2 +-
 fs/gfs2/xattr.c | 4 ++--
 fs/gfs2/xattr.h | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index caaa665d4033..18d3a28554ac 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1463,7 +1463,7 @@ static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd)
 {
 	struct gfs2_sbd *sdp = rgd->rd_sbd;
 	fs_warn(sdp, "rgrp %llu has an error, marking it readonly until umount\n",
-	        (unsigned long long)rgd->rd_addr);
+		(unsigned long long)rgd->rd_addr);
 	fs_warn(sdp, "umount on all nodes and run fsck.gfs2 to fix the error\n");
 	gfs2_rgrp_dump(NULL, rgd->rd_gl);
 	rgd->rd_flags |= GFS2_RDF_ERROR;
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 34ae9ba4c9fe..8a0f8ef6ee27 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -562,7 +562,7 @@ int gfs2_xattr_get(struct inode *inode, int type, const char *name,
 		return error;
 	if (!el.el_ea)
 		return -ENODATA;
-	if (size) 
+	if (size)
 		error = gfs2_ea_get_copy(ip, &el, buffer, size);
 	else
 		error = GFS2_EA_DATA_LEN(el.el_ea);
@@ -1046,7 +1046,7 @@ static int ea_set_remove_unstuffed(struct gfs2_inode *ip,
 				     GFS2_EA2NEXT(el->el_prev) == el->el_ea);
 	}
 
-	return ea_remove_unstuffed(ip, el->el_bh, el->el_ea, el->el_prev,0);
+	return ea_remove_unstuffed(ip, el->el_bh, el->el_ea, el->el_prev, 0);
 }
 
 static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el)
diff --git a/fs/gfs2/xattr.h b/fs/gfs2/xattr.h
index 4040a188f63b..cbdfd7743733 100644
--- a/fs/gfs2/xattr.h
+++ b/fs/gfs2/xattr.h
@@ -19,7 +19,7 @@ struct iattr;
 #define GFS2_EA_SIZE(ea) \
 ALIGN(sizeof(struct gfs2_ea_header) + (ea)->ea_name_len + \
       ((GFS2_EA_IS_STUFFED(ea)) ? GFS2_EA_DATA_LEN(ea) : \
-                                  (sizeof(__be64) * (ea)->ea_num_ptrs)), 8)
+				  (sizeof(__be64) * (ea)->ea_num_ptrs)), 8)
 
 #define GFS2_EA_IS_STUFFED(ea) (!(ea)->ea_num_ptrs)
 #define GFS2_EA_IS_LAST(ea) ((ea)->ea_flags & GFS2_EAFLAG_LAST)
-- 
cgit v1.2.3-70-g09d2


From 1b2f5a641bca91966fd91dc489238068a0ab4b69 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Sat, 22 Aug 2009 19:10:07 +0900
Subject: nilfs2: fix ignored error code in __nilfs_read_inode()

The __nilfs_read_inode function is ignoring the error code returned
from nilfs_read_inode_common(), and wrongly delivers a success code
(zero) when it escapes from the function in erroneous cases.

This adds the missing error handling.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/inode.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index fe9d8f2a13f8..807e584b163d 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -430,7 +430,8 @@ static int __nilfs_read_inode(struct super_block *sb, unsigned long ino,
 
 	raw_inode = nilfs_ifile_map_inode(sbi->s_ifile, ino, bh);
 
-	if (nilfs_read_inode_common(inode, raw_inode))
+	err = nilfs_read_inode_common(inode, raw_inode);
+	if (err)
 		goto failed_unmap;
 
 	if (S_ISREG(inode->i_mode)) {
-- 
cgit v1.2.3-70-g09d2


From b5696e5e0dbfb6323277d51d67d230317c18aba9 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Thu, 3 Sep 2009 17:42:48 +0200
Subject: nilfs2: fix format string compile warning (ino_t)

Unlike on most other architectures ino_t is an unsigned int on s390.
So add an explicit cast to avoid this compile warning:

fs/nilfs2/recovery.c: In function 'recover_dsync_blocks':
fs/nilfs2/recovery.c:555: warning: format '%lu' expects type 'long unsigned int', but argument 3 has type 'ino_t'

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/recovery.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index d80cc71be749..6dc83591d118 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -552,7 +552,8 @@ static int recover_dsync_blocks(struct nilfs_sb_info *sbi,
 		printk(KERN_WARNING
 		       "NILFS warning: error recovering data block "
 		       "(err=%d, ino=%lu, block-offset=%llu)\n",
-		       err, rb->ino, (unsigned long long)rb->blkoff);
+		       err, (unsigned long)rb->ino,
+		       (unsigned long long)rb->blkoff);
 		if (!err2)
 			err2 = err;
  next:
-- 
cgit v1.2.3-70-g09d2


From 027d6404eb4327878454db72a006adfcb8001bb8 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Sun, 2 Aug 2009 22:45:33 +0900
Subject: nilfs2: use semaphore to protect pointer to a writable FS-instance

will get rid of nilfs_get_writer() and nilfs_put_writer() pair used to
retain a writable FS-instance for a period.

The pair functions were making up some kind of recursive lock with a
mutex, but they became overkill since the commit
201913ed746c7724a40d33ee5a0b6a1fd2ef3193.  Furthermore, they caused
the following lockdep warning because the mutex can be released by a
task which didn't lock it:

 =====================================
 [ BUG: bad unlock balance detected! ]
 -------------------------------------
 kswapd0/422 is trying to release lock (&nilfs->ns_writer_mutex) at:
 [<c1359ff5>] mutex_unlock+0x8/0xa
 but there are no more locks to release!

 other info that might help us debug this:
 no locks held by kswapd0/422.

 stack backtrace:
 Pid: 422, comm: kswapd0 Not tainted 2.6.31-rc4-nilfs #51
 Call Trace:
  [<c1358f97>] ? printk+0xf/0x18
  [<c104fea7>] print_unlock_inbalance_bug+0xcc/0xd7
  [<c11578de>] ? prop_put_global+0x3/0x35
  [<c1050195>] lock_release+0xed/0x1dc
  [<c1359ff5>] ? mutex_unlock+0x8/0xa
  [<c1359f83>] __mutex_unlock_slowpath+0xaf/0x119
  [<c1359ff5>] mutex_unlock+0x8/0xa
  [<d1284add>] nilfs_mdt_write_page+0xd8/0xe1 [nilfs2]
  [<c1092653>] shrink_page_list+0x379/0x68d
  [<c109171b>] ? isolate_pages_global+0xb4/0x18c
  [<c1092bd2>] shrink_list+0x26b/0x54b
  [<c10930be>] shrink_zone+0x20c/0x2a2
  [<c10936b7>] kswapd+0x407/0x591
  [<c1091667>] ? isolate_pages_global+0x0/0x18c
  [<c1040603>] ? autoremove_wake_function+0x0/0x33
  [<c10932b0>] ? kswapd+0x0/0x591
  [<c104033b>] kthread+0x69/0x6e
  [<c10402d2>] ? kthread+0x0/0x6e
  [<c1003e33>] kernel_thread_helper+0x7/0x1a

This patch uses a reader/writer semaphore instead of the own lock and
kills this warning.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/mdt.c       |  8 +++++---
 fs/nilfs2/the_nilfs.c |  3 +--
 fs/nilfs2/the_nilfs.h | 27 ++++++---------------------
 3 files changed, 12 insertions(+), 26 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 2dfd47714ae5..a5cd80162dc5 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -402,6 +402,7 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc)
 	struct inode *inode = container_of(page->mapping,
 					   struct inode, i_data);
 	struct super_block *sb = inode->i_sb;
+	struct the_nilfs *nilfs = NILFS_MDT(inode)->mi_nilfs;
 	struct nilfs_sb_info *writer = NULL;
 	int err = 0;
 
@@ -411,9 +412,10 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc)
 	if (page->mapping->assoc_mapping)
 		return 0; /* Do not request flush for shadow page cache */
 	if (!sb) {
-		writer = nilfs_get_writer(NILFS_MDT(inode)->mi_nilfs);
+		down_read(&nilfs->ns_writer_sem);
+		writer = nilfs->ns_writer;
 		if (!writer) {
-			nilfs_put_writer(NILFS_MDT(inode)->mi_nilfs);
+			up_read(&nilfs->ns_writer_sem);
 			return -EROFS;
 		}
 		sb = writer->s_super;
@@ -425,7 +427,7 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc)
 		nilfs_flush_segment(sb, inode->i_ino);
 
 	if (writer)
-		nilfs_put_writer(NILFS_MDT(inode)->mi_nilfs);
+		up_read(&nilfs->ns_writer_sem);
 	return err;
 }
 
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 8b8889825716..0c4573653b87 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -68,12 +68,11 @@ static struct the_nilfs *alloc_nilfs(struct block_device *bdev)
 
 	nilfs->ns_bdev = bdev;
 	atomic_set(&nilfs->ns_count, 1);
-	atomic_set(&nilfs->ns_writer_refcount, -1);
 	atomic_set(&nilfs->ns_ndirtyblks, 0);
 	init_rwsem(&nilfs->ns_sem);
 	init_rwsem(&nilfs->ns_super_sem);
 	mutex_init(&nilfs->ns_mount_mutex);
-	mutex_init(&nilfs->ns_writer_mutex);
+	init_rwsem(&nilfs->ns_writer_sem);
 	INIT_LIST_HEAD(&nilfs->ns_list);
 	INIT_LIST_HEAD(&nilfs->ns_supers);
 	spin_lock_init(&nilfs->ns_last_segment_lock);
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index 1b9caafb8662..fa3a1dfe4053 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -50,8 +50,7 @@ enum {
  * @ns_sem: semaphore for shared states
  * @ns_super_sem: semaphore for global operations across super block instances
  * @ns_mount_mutex: mutex protecting mount process of nilfs
- * @ns_writer_mutex: mutex protecting ns_writer attach/detach
- * @ns_writer_refcount: number of referrers on ns_writer
+ * @ns_writer_sem: semaphore protecting ns_writer attach/detach
  * @ns_current: back pointer to current mount
  * @ns_sbh: buffer heads of on-disk super blocks
  * @ns_sbp: pointers to super block data
@@ -100,8 +99,7 @@ struct the_nilfs {
 	struct rw_semaphore	ns_sem;
 	struct rw_semaphore	ns_super_sem;
 	struct mutex		ns_mount_mutex;
-	struct mutex		ns_writer_mutex;
-	atomic_t		ns_writer_refcount;
+	struct rw_semaphore	ns_writer_sem;
 
 	/*
 	 * components protected by ns_super_sem
@@ -221,34 +219,21 @@ static inline void get_nilfs(struct the_nilfs *nilfs)
 	atomic_inc(&nilfs->ns_count);
 }
 
-static inline struct nilfs_sb_info *nilfs_get_writer(struct the_nilfs *nilfs)
-{
-	if (atomic_inc_and_test(&nilfs->ns_writer_refcount))
-		mutex_lock(&nilfs->ns_writer_mutex);
-	return nilfs->ns_writer;
-}
-
-static inline void nilfs_put_writer(struct the_nilfs *nilfs)
-{
-	if (atomic_add_negative(-1, &nilfs->ns_writer_refcount))
-		mutex_unlock(&nilfs->ns_writer_mutex);
-}
-
 static inline void
 nilfs_attach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
 {
-	mutex_lock(&nilfs->ns_writer_mutex);
+	down_write(&nilfs->ns_writer_sem);
 	nilfs->ns_writer = sbi;
-	mutex_unlock(&nilfs->ns_writer_mutex);
+	up_write(&nilfs->ns_writer_sem);
 }
 
 static inline void
 nilfs_detach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
 {
-	mutex_lock(&nilfs->ns_writer_mutex);
+	down_write(&nilfs->ns_writer_sem);
 	if (sbi == nilfs->ns_writer)
 		nilfs->ns_writer = NULL;
-	mutex_unlock(&nilfs->ns_writer_mutex);
+	up_write(&nilfs->ns_writer_sem);
 }
 
 static inline void nilfs_put_sbinfo(struct nilfs_sb_info *sbi)
-- 
cgit v1.2.3-70-g09d2


From 143511046765504d2d1be633efd710f8d84e0407 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Sun, 6 Sep 2009 17:49:49 +0900
Subject: nilfs2: always lookup disk block address before reading metadata
 block

The current metadata file code skips disk address lookup for its data
block if the buffer has a mapped flag.

This has a potential risk to cause read request to be performed
against the stale block address that GC moved, and it may lead to meta
data corruption.  The mapped flag is safe if the buffer has an
uptodate flag, otherwise it may prevent necessary update of disk
address in the next read.

This will avoid the potential problem by ensuring disk address lookup
before reading metadata block even for buffers with the mapped flag.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/mdt.c | 25 ++++++++++---------------
 1 file changed, 10 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index a5cd80162dc5..1ae8d56052c3 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -103,15 +103,12 @@ static int nilfs_mdt_create_block(struct inode *inode, unsigned long block,
 		goto failed_unlock;
 
 	err = -EEXIST;
-	if (buffer_uptodate(bh) || buffer_mapped(bh))
+	if (buffer_uptodate(bh))
 		goto failed_bh;
-#if 0
-	/* The uptodate flag is not protected by the page lock, but
-	   the mapped flag is.  Thus, we don't have to wait the buffer. */
+
 	wait_on_buffer(bh);
 	if (buffer_uptodate(bh))
 		goto failed_bh;
-#endif
 
 	bh->b_bdev = nilfs->ns_bdev;
 	err = nilfs_mdt_insert_new_block(inode, block, bh, init_block);
@@ -162,17 +159,15 @@ nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff,
 		unlock_buffer(bh);
 		goto out;
 	}
-	if (!buffer_mapped(bh)) { /* unused buffer */
-		ret = nilfs_bmap_lookup(NILFS_I(inode)->i_bmap, blkoff,
-					&blknum);
-		if (unlikely(ret)) {
-			unlock_buffer(bh);
-			goto failed_bh;
-		}
-		bh->b_bdev = NILFS_MDT(inode)->mi_nilfs->ns_bdev;
-		bh->b_blocknr = blknum;
-		set_buffer_mapped(bh);
+
+	ret = nilfs_bmap_lookup(NILFS_I(inode)->i_bmap, blkoff, &blknum);
+	if (unlikely(ret)) {
+		unlock_buffer(bh);
+		goto failed_bh;
 	}
+	bh->b_bdev = NILFS_MDT(inode)->mi_nilfs->ns_bdev;
+	bh->b_blocknr = blknum;
+	set_buffer_mapped(bh);
 
 	bh->b_end_io = end_buffer_read_sync;
 	get_bh(bh);
-- 
cgit v1.2.3-70-g09d2


From b58a285ba40866e22e5876969a8f796a74311d9e Mon Sep 17 00:00:00 2001
From: Jiro SEKIBA <jir@unicus.jp>
Date: Wed, 24 Jun 2009 20:06:34 +0900
Subject: nilfs2: implement nilfs_show_options to display mount options in
 /proc/mounts

This is a patch to display mount options in procfs.
Mount options will show up in the /proc/mounts as other fs does.

...
/dev/sda6 /mnt nilfs2 ro,relatime,barrier=off,cp=3,order=strict 0 0
...

Signed-off-by: Jiro SEKIBA <jir@unicus.jp>
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/super.c | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 151964f0de4c..ebbefb9d26c9 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -50,6 +50,8 @@
 #include <linux/writeback.h>
 #include <linux/kobject.h>
 #include <linux/exportfs.h>
+#include <linux/seq_file.h>
+#include <linux/mount.h>
 #include "nilfs.h"
 #include "mdt.h"
 #include "alloc.h"
@@ -529,6 +531,26 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	return 0;
 }
 
+static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
+{
+	struct super_block *sb = vfs->mnt_sb;
+	struct nilfs_sb_info *sbi = NILFS_SB(sb);
+
+	if (!nilfs_test_opt(sbi, BARRIER))
+		seq_printf(seq, ",barrier=off");
+	if (nilfs_test_opt(sbi, SNAPSHOT))
+		seq_printf(seq, ",cp=%llu",
+			   (unsigned long long int)sbi->s_snapshot_cno);
+	if (nilfs_test_opt(sbi, ERRORS_RO))
+		seq_printf(seq, ",errors=remount-ro");
+	if (nilfs_test_opt(sbi, ERRORS_PANIC))
+		seq_printf(seq, ",errors=panic");
+	if (nilfs_test_opt(sbi, STRICT_ORDER))
+		seq_printf(seq, ",order=strict");
+
+	return 0;
+}
+
 static struct super_operations nilfs_sops = {
 	.alloc_inode    = nilfs_alloc_inode,
 	.destroy_inode  = nilfs_destroy_inode,
@@ -546,7 +568,7 @@ static struct super_operations nilfs_sops = {
 	.remount_fs     = nilfs_remount,
 	.clear_inode    = nilfs_clear_inode,
 	/* .umount_begin */
-	/* .show_options */
+	.show_options = nilfs_show_options
 };
 
 static struct inode *
-- 
cgit v1.2.3-70-g09d2


From ec5d66abdb0caf8b753a138568f20770a3d64c8c Mon Sep 17 00:00:00 2001
From: Jiro SEKIBA <jir@unicus.jp>
Date: Fri, 10 Jul 2009 19:57:28 +0900
Subject: nilfs2: remove redundant super block commit

This removes redundant super block commit.

nilfs_write_super will call nilfs_commit_super to store super block
into block device.  However, nilfs_put_super will call
nilfs_commit_super right after calling nilfs_write_super.  So calling
nilfs_write_super in nilfs_put_super would be redundant.

Signed-off-by: Jiro SEKIBA <jir@unicus.jp>
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/super.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index ebbefb9d26c9..e543eda0d939 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -313,9 +313,6 @@ static void nilfs_put_super(struct super_block *sb)
 
 	lock_kernel();
 
-	if (sb->s_dirt)
-		nilfs_write_super(sb);
-
 	nilfs_detach_segment_constructor(sbi);
 
 	if (!(sb->s_flags & MS_RDONLY)) {
-- 
cgit v1.2.3-70-g09d2


From 6233caa9d5b153c2190d6c1c35c1dd1010104fc1 Mon Sep 17 00:00:00 2001
From: Jiro SEKIBA <jir@unicus.jp>
Date: Thu, 23 Jul 2009 01:26:33 +0900
Subject: nilfs2: fix disorder of nilfs_write_super in nilfs_sync_fs

This fixes disorder of nilfs_write_super in nilfs_sync_fs.  Commiting
super block must be the end of the function so that every changes are
reflected.

->sync_fs() is not called frequently so this makes nilfs_sync_fs call
nilfs_commit_super instead of nilfs_write_super.

Signed-off-by: Jiro SEKIBA <jir@unicus.jp>
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/super.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index e543eda0d939..9926a1d6d225 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -385,13 +385,19 @@ static void nilfs_write_super(struct super_block *sb)
 
 static int nilfs_sync_fs(struct super_block *sb, int wait)
 {
+	struct nilfs_sb_info *sbi = NILFS_SB(sb);
+	struct the_nilfs *nilfs = sbi->s_nilfs;
 	int err = 0;
 
-	nilfs_write_super(sb);
-
 	/* This function is called when super block should be written back */
 	if (wait)
 		err = nilfs_construct_segment(sb);
+
+	down_write(&nilfs->ns_sem);
+	if (sb->s_dirt)
+		nilfs_commit_super(sbi, 1);
+	up_write(&nilfs->ns_sem);
+
 	return err;
 }
 
-- 
cgit v1.2.3-70-g09d2


From 79efdd94111f30c373fce05e4e5822d8ff671c2a Mon Sep 17 00:00:00 2001
From: Jiro SEKIBA <jir@unicus.jp>
Date: Thu, 23 Jul 2009 01:26:34 +0900
Subject: nilfs2: clean up nilfs_write_super

Separate conditions that check if syncing super block and alternative
super block are required as inline functions to reuse the conditions.

Signed-off-by: Jiro SEKIBA <jir@unicus.jp>
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/super.c     | 11 +++--------
 fs/nilfs2/the_nilfs.h | 14 ++++++++++++++
 2 files changed, 17 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 9926a1d6d225..70e8613a1954 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -367,17 +367,12 @@ static void nilfs_write_super(struct super_block *sb)
 
 	down_write(&nilfs->ns_sem);
 	if (!(sb->s_flags & MS_RDONLY)) {
-		struct nilfs_super_block **sbp = nilfs->ns_sbp;
-		u64 t = get_seconds();
-		int dupsb;
-
-		if (!nilfs_discontinued(nilfs) && t >= nilfs->ns_sbwtime[0] &&
-		    t < nilfs->ns_sbwtime[0] + NILFS_SB_FREQ) {
+		if (!nilfs_discontinued(nilfs) &&
+		    !nilfs_sb_need_update(nilfs)) {
 			up_write(&nilfs->ns_sem);
 			return;
 		}
-		dupsb = sbp[1] && t > nilfs->ns_sbwtime[1] + NILFS_ALTSB_FREQ;
-		nilfs_commit_super(sbi, dupsb);
+		nilfs_commit_super(sbi, nilfs_altsb_need_update(nilfs));
 	}
 	sb->s_dirt = 0;
 	up_write(&nilfs->ns_sem);
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index fa3a1dfe4053..68e9626e3d44 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -200,6 +200,20 @@ THE_NILFS_FNS(DISCONTINUED, discontinued)
 #define NILFS_SB_FREQ		10
 #define NILFS_ALTSB_FREQ	60  /* spare superblock */
 
+static inline int nilfs_sb_need_update(struct the_nilfs *nilfs)
+{
+	u64 t = get_seconds();
+	return t < nilfs->ns_sbwtime[0] ||
+		 t > nilfs->ns_sbwtime[0] + NILFS_SB_FREQ;
+}
+
+static inline int nilfs_altsb_need_update(struct the_nilfs *nilfs)
+{
+	u64 t = get_seconds();
+	struct nilfs_super_block **sbp = nilfs->ns_sbp;
+	return sbp[1] && t > nilfs->ns_sbwtime[1] + NILFS_ALTSB_FREQ;
+}
+
 void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64);
 struct the_nilfs *find_or_create_nilfs(struct block_device *);
 void put_nilfs(struct the_nilfs *);
-- 
cgit v1.2.3-70-g09d2


From 1dfa27105a6cf4171c439fd40a004a0a17838115 Mon Sep 17 00:00:00 2001
From: Jiro SEKIBA <jir@unicus.jp>
Date: Thu, 23 Jul 2009 01:33:49 +0900
Subject: nilfs2: stop using periodic write_super callback

This removes nilfs_write_super and commit super block in nilfs
internal thread, instead of periodic write_super callback.

VFS layer calls ->write_super callback periodically.  However,
it looks like that calling back is ommited when disk I/O is busy.
And when cleanerd (nilfs GC) is runnig, disk I/O tend to be busy thus
nilfs superblock is not synchronized as nilfs designed.

To avoid it, syncing superblock by nilfs thread instead of pdflush.

Signed-off-by: Jiro SEKIBA <jir@unicus.jp>
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/segment.c |  7 ++++++-
 fs/nilfs2/super.c   | 46 +---------------------------------------------
 2 files changed, 7 insertions(+), 46 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 51ff3d0a4ee2..683df89dbae5 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -2501,7 +2501,8 @@ static int nilfs_segctor_construct(struct nilfs_sc_info *sci,
 		if (test_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags) &&
 		    nilfs_discontinued(nilfs)) {
 			down_write(&nilfs->ns_sem);
-			req->sb_err = nilfs_commit_super(sbi, 0);
+			req->sb_err = nilfs_commit_super(sbi,
+					nilfs_altsb_need_update(nilfs));
 			up_write(&nilfs->ns_sem);
 		}
 	}
@@ -2689,6 +2690,7 @@ static int nilfs_segctor_thread(void *arg)
 	} else {
 		DEFINE_WAIT(wait);
 		int should_sleep = 1;
+		struct the_nilfs *nilfs;
 
 		prepare_to_wait(&sci->sc_wait_daemon, &wait,
 				TASK_INTERRUPTIBLE);
@@ -2709,6 +2711,9 @@ static int nilfs_segctor_thread(void *arg)
 		finish_wait(&sci->sc_wait_daemon, &wait);
 		timeout = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) &&
 			   time_after_eq(jiffies, sci->sc_timer->expires));
+		nilfs = sci->sc_sbi->s_nilfs;
+		if (sci->sc_super->s_dirt && nilfs_sb_need_update(nilfs))
+			set_nilfs_discontinued(nilfs);
 	}
 	goto loop;
 
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 70e8613a1954..ed8ec5718ae6 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -67,7 +67,6 @@ MODULE_DESCRIPTION("A New Implementation of the Log-structured Filesystem "
 		   "(NILFS)");
 MODULE_LICENSE("GPL");
 
-static void nilfs_write_super(struct super_block *sb);
 static int nilfs_remount(struct super_block *sb, int *flags, char *data);
 
 /**
@@ -335,49 +334,6 @@ static void nilfs_put_super(struct super_block *sb)
 	unlock_kernel();
 }
 
-/**
- * nilfs_write_super - write super block(s) of NILFS
- * @sb: super_block
- *
- * nilfs_write_super() gets a fs-dependent lock, writes super block(s), and
- * clears s_dirt.  This function is called in the section protected by
- * lock_super().
- *
- * The s_dirt flag is managed by each filesystem and we protect it by ns_sem
- * of the struct the_nilfs.  Lock order must be as follows:
- *
- *   1. lock_super()
- *   2.    down_write(&nilfs->ns_sem)
- *
- * Inside NILFS, locking ns_sem is enough to protect s_dirt and the buffer
- * of the super block (nilfs->ns_sbp[]).
- *
- * In most cases, VFS functions call lock_super() before calling these
- * methods.  So we must be careful not to bring on deadlocks when using
- * lock_super();  see generic_shutdown_super(), write_super(), and so on.
- *
- * Note that order of lock_kernel() and lock_super() depends on contexts
- * of VFS.  We should also note that lock_kernel() can be used in its
- * protective section and only the outermost one has an effect.
- */
-static void nilfs_write_super(struct super_block *sb)
-{
-	struct nilfs_sb_info *sbi = NILFS_SB(sb);
-	struct the_nilfs *nilfs = sbi->s_nilfs;
-
-	down_write(&nilfs->ns_sem);
-	if (!(sb->s_flags & MS_RDONLY)) {
-		if (!nilfs_discontinued(nilfs) &&
-		    !nilfs_sb_need_update(nilfs)) {
-			up_write(&nilfs->ns_sem);
-			return;
-		}
-		nilfs_commit_super(sbi, nilfs_altsb_need_update(nilfs));
-	}
-	sb->s_dirt = 0;
-	up_write(&nilfs->ns_sem);
-}
-
 static int nilfs_sync_fs(struct super_block *sb, int wait)
 {
 	struct nilfs_sb_info *sbi = NILFS_SB(sb);
@@ -558,7 +514,7 @@ static struct super_operations nilfs_sops = {
 	/* .drop_inode	  = nilfs_drop_inode, */
 	.delete_inode   = nilfs_delete_inode,
 	.put_super      = nilfs_put_super,
-	.write_super    = nilfs_write_super,
+	/* .write_super    = nilfs_write_super, */
 	.sync_fs        = nilfs_sync_fs,
 	/* .write_super_lockfs */
 	/* .unlockfs */
-- 
cgit v1.2.3-70-g09d2


From c1b353f04a42f1b531be920149b85343d56a6b3b Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Fri, 19 Jun 2009 15:25:42 +0900
Subject: nilfs2: use GFP_NOIO for bio_alloc instead of GFP_NOWAIT

Alberto Bertogli advised me about bio_alloc() use in nilfs:
On Sat, 13 Jun 2009 22:52:40 -0300, Alberto Bertogli wrote:
> By the way, those bio_alloc()s are using GFP_NOWAIT but it looks
> like they could use at least GFP_NOIO or GFP_NOFS, since the caller
> can (and sometimes do) sleep. The only caller is nilfs_submit_bh(),
> which calls nilfs_submit_seg_bio() which can sleep calling
> wait_for_completion().

This takes in the comment and replaces the use of GFP_NOWAIT flag with
GFP_NOIO.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/segbuf.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 9e3fe17bb96b..e6d9e37fa241 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -316,10 +316,10 @@ static struct bio *nilfs_alloc_seg_bio(struct super_block *sb, sector_t start,
 {
 	struct bio *bio;
 
-	bio = bio_alloc(GFP_NOWAIT, nr_vecs);
+	bio = bio_alloc(GFP_NOIO, nr_vecs);
 	if (bio == NULL) {
 		while (!bio && (nr_vecs >>= 1))
-			bio = bio_alloc(GFP_NOWAIT, nr_vecs);
+			bio = bio_alloc(GFP_NOIO, nr_vecs);
 	}
 	if (likely(bio)) {
 		bio->bi_bdev = sb->s_bdev;
-- 
cgit v1.2.3-70-g09d2


From a4f0b9c5b4ae83636dafde8f3a0e04b5e411a0f3 Mon Sep 17 00:00:00 2001
From: Zhang Qiang <zhangqiang.buaa@gmail.com>
Date: Sun, 9 Aug 2009 19:13:10 +0800
Subject: nilfs2: An unassigned variable is assigned to a never used structure
 member

nilfs2: In procedure 'nilfs_get_sb()', when a nilfs filesysttem is
mounted for the first time, local variable 'nilfs->ns_last_cno' is
used before loading the latest checkpoint number from disk (in
'nilfs_fill_super'). 'nilfs->ns_last_cno' is assigned to 'sd.cno', but
'sd.cno' has never been used in the procedure.

Signed-off-by: Zhang Qiang <zhangqiang.buaa@gmail.com>
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/super.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index ed8ec5718ae6..019752f7d2da 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -1103,10 +1103,6 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
 	 */
 	sd.sbi = nilfs_find_sbinfo(nilfs, !(flags & MS_RDONLY), sd.cno);
 
-	if (!sd.cno)
-		/* trying to get the latest checkpoint.  */
-		sd.cno = nilfs_last_cno(nilfs);
-
 	/*
 	 * Get super block instance holding the nilfs_sb_info struct.
 	 * A new instance is allocated if no existing mount is present or
-- 
cgit v1.2.3-70-g09d2


From 43be0ec0387a5ccce2e064cb78502e7b2b4dd590 Mon Sep 17 00:00:00 2001
From: Zhu Yanhai <zhu.yanhai@gmail.com>
Date: Wed, 12 Aug 2009 14:17:59 +0800
Subject: nilfs2: add more check routines in mount process

nilfs2: Add more safeguard routines and protections in mount process,
which also makes nilfs2 report consistency error messages when
checkpoint number is invalid.

Signed-off-by: Zhu Yanhai <zhu.yanhai@gmail.com>
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/cpfile.c | 11 ++++++++---
 fs/nilfs2/super.c  |  7 ++++++-
 2 files changed, 14 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index aec942cf79e3..1c6cfb59128d 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -815,8 +815,10 @@ int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno)
 	void *kaddr;
 	int ret;
 
-	if (cno == 0)
-		return -ENOENT; /* checkpoint number 0 is invalid */
+	/* CP number is invalid if it's zero or larger than the
+	largest	exist one.*/
+	if (cno == 0 || cno >= nilfs_mdt_cno(cpfile))
+		return -ENOENT;
 	down_read(&NILFS_MDT(cpfile)->mi_sem);
 
 	ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &bh);
@@ -824,7 +826,10 @@ int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno)
 		goto out;
 	kaddr = kmap_atomic(bh->b_page, KM_USER0);
 	cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr);
-	ret = nilfs_checkpoint_snapshot(cp);
+	if (nilfs_checkpoint_invalid(cp))
+		ret = -ENOENT;
+	else
+		ret = nilfs_checkpoint_snapshot(cp);
 	kunmap_atomic(kaddr, KM_USER0);
 	brelse(bh);
 
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 019752f7d2da..50284add7880 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -792,10 +792,15 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
 
 	if (sb->s_flags & MS_RDONLY) {
 		if (nilfs_test_opt(sbi, SNAPSHOT)) {
+			down_read(&nilfs->ns_segctor_sem);
 			err = nilfs_cpfile_is_snapshot(nilfs->ns_cpfile,
 						       sbi->s_snapshot_cno);
-			if (err < 0)
+			up_read(&nilfs->ns_segctor_sem);
+			if (err < 0) {
+				if (err == -ENOENT)
+					err = -EINVAL;
 				goto failed_sbi;
+			}
 			if (!err) {
 				printk(KERN_ERR
 				       "NILFS: The specified checkpoint is "
-- 
cgit v1.2.3-70-g09d2


From 1cf58fa840472ec7df6bf2312885949ebb308853 Mon Sep 17 00:00:00 2001
From: Jiro SEKIBA <jir@unicus.jp>
Date: Thu, 3 Sep 2009 22:24:17 +0900
Subject: nilfs2: shorten freeze period due to GC in write operation v3

This is a re-revised patch to shorten freeze period.
This version include a fix of the bug Konishi-san mentioned last time.

When GC is runnning, GC moves live block to difference segments.
Copying live blocks into memory is done in a transaction,
however it is not necessarily to be in the transaction.
This patch will get the nilfs_ioctl_move_blocks() out from
transaction lock and put it before the transaction.

I ran sysbench fileio test against nilfs partition.
I copied some DVD/CD images and created snapshot to create live blocks
before starting the benchmark.

Followings are summary of rc8 and rc8 w/ the patch of per-request
statistics, which is min/max and avg.  I ran each test three times and
bellow is average of those numers.

According to this benchmark result, average time is slightly degrated.
However, worstcase (max) result is significantly improved.
This can address a few seconds write freeze.

- random write per-request performance of rc8
 min   0.843ms
 max 680.406ms
 avg   3.050ms
- random write per-request performance of rc8 w/ this patch
 min   0.843ms -> 100.00%
 max 380.490ms ->  55.90%
 avg   3.233ms -> 106.00%

- sequential write per-request performance of rc8
 min   0.736ms
 max 774.343ms
 avg   2.883ms
- sequential write per-request performance of rc8 w/ this patch
 min   0.720ms ->  97.80%
 max  644.280ms->  83.20%
 avg   3.130ms -> 108.50%

-----8<-----8<-----nilfs_cleanerd.conf-----8<-----8<-----
protection_period       150
selection_policy        timestamp       # timestamp in ascend order
nsegments_per_clean     2
cleaning_interval       2
retry_interval          60
use_mmap
log_priority            info
-----8<-----8<-----nilfs_cleanerd.conf-----8<-----8<-----

Signed-off-by: Jiro SEKIBA <jir@unicus.jp>
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/ioctl.c     | 26 +++++++++++++++++++-------
 fs/nilfs2/the_nilfs.h |  2 ++
 2 files changed, 21 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 6ea5f872e2de..6572ea4bc4df 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -442,12 +442,6 @@ int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *nilfs,
 	const char *msg;
 	int ret;
 
-	ret = nilfs_ioctl_move_blocks(nilfs, &argv[0], kbufs[0]);
-	if (ret < 0) {
-		msg = "cannot read source blocks";
-		goto failed;
-	}
-
 	ret = nilfs_ioctl_delete_checkpoints(nilfs, &argv[1], kbufs[1]);
 	if (ret < 0) {
 		/*
@@ -548,7 +542,25 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
 		}
 	}
 
-	ret = nilfs_clean_segments(inode->i_sb, argv, kbufs);
+	/*
+	 * nilfs_ioctl_move_blocks() will call nilfs_gc_iget(),
+	 * which will operates an inode list without blocking.
+	 * To protect the list from concurrent operations,
+	 * nilfs_ioctl_move_blocks should be atomic operation.
+	 */
+	if (test_and_set_bit(THE_NILFS_GC_RUNNING, &nilfs->ns_flags)) {
+		ret = -EBUSY;
+		goto out_free;
+	}
+
+	ret = nilfs_ioctl_move_blocks(nilfs, &argv[0], kbufs[0]);
+	if (ret < 0)
+		printk(KERN_ERR "NILFS: GC failed during preparation: "
+			"cannot read source blocks: err=%d\n", ret);
+	else
+		ret = nilfs_clean_segments(inode->i_sb, argv, kbufs);
+
+	clear_nilfs_gc_running(nilfs);
 
  out_free:
 	while (--n >= 0)
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index 68e9626e3d44..20abd55881e0 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -37,6 +37,7 @@ enum {
 	THE_NILFS_LOADED,       /* Roll-back/roll-forward has done and
 				   the latest checkpoint was loaded */
 	THE_NILFS_DISCONTINUED,	/* 'next' pointer chain has broken */
+	THE_NILFS_GC_RUNNING,	/* gc process is running */
 };
 
 /**
@@ -195,6 +196,7 @@ static inline int nilfs_##name(struct the_nilfs *nilfs)			\
 THE_NILFS_FNS(INIT, init)
 THE_NILFS_FNS(LOADED, loaded)
 THE_NILFS_FNS(DISCONTINUED, discontinued)
+THE_NILFS_FNS(GC_RUNNING, gc_running)
 
 /* Minimum interval of periodical update of superblocks (in seconds) */
 #define NILFS_SB_FREQ		10
-- 
cgit v1.2.3-70-g09d2


From 9ead98637300dc7caefd904bbe1e092bf4d21f87 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Fri, 14 Aug 2009 23:39:45 +0900
Subject: nilfs2: remove nilfs_dat_abort_start and nilfs_dat_abort_free

These functions are not called from any functions.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/dat.c | 11 -----------
 fs/nilfs2/dat.h |  1 -
 2 files changed, 12 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index 8927ca27e6f7..1cfcc1a0fdf1 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -109,12 +109,6 @@ void nilfs_dat_commit_free(struct inode *dat, struct nilfs_palloc_req *req)
 	nilfs_palloc_commit_free_entry(dat, req);
 }
 
-void nilfs_dat_abort_free(struct inode *dat, struct nilfs_palloc_req *req)
-{
-	nilfs_dat_abort_entry(dat, req);
-	nilfs_palloc_abort_free_entry(dat, req);
-}
-
 int nilfs_dat_prepare_start(struct inode *dat, struct nilfs_palloc_req *req)
 {
 	int ret;
@@ -140,11 +134,6 @@ void nilfs_dat_commit_start(struct inode *dat, struct nilfs_palloc_req *req,
 	nilfs_dat_commit_entry(dat, req);
 }
 
-void nilfs_dat_abort_start(struct inode *dat, struct nilfs_palloc_req *req)
-{
-	nilfs_dat_abort_entry(dat, req);
-}
-
 int nilfs_dat_prepare_end(struct inode *dat, struct nilfs_palloc_req *req)
 {
 	struct nilfs_dat_entry *entry;
diff --git a/fs/nilfs2/dat.h b/fs/nilfs2/dat.h
index d328b81eead4..4b0b47ea85ed 100644
--- a/fs/nilfs2/dat.h
+++ b/fs/nilfs2/dat.h
@@ -39,7 +39,6 @@ void nilfs_dat_abort_alloc(struct inode *, struct nilfs_palloc_req *);
 int nilfs_dat_prepare_start(struct inode *, struct nilfs_palloc_req *);
 void nilfs_dat_commit_start(struct inode *, struct nilfs_palloc_req *,
 			    sector_t);
-void nilfs_dat_abort_start(struct inode *, struct nilfs_palloc_req *);
 int nilfs_dat_prepare_end(struct inode *, struct nilfs_palloc_req *);
 void nilfs_dat_commit_end(struct inode *, struct nilfs_palloc_req *, int);
 void nilfs_dat_abort_end(struct inode *, struct nilfs_palloc_req *);
-- 
cgit v1.2.3-70-g09d2


From 6d28f7ea43856449ed2f344cb209af3ba1c6b757 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Sat, 15 Aug 2009 01:14:10 +0900
Subject: nilfs2: remove unused btree argument from btree functions

Even though many btree functions take a btree object as their first
argument, most of them are not used in their functions.

This sticky use of the btree argument is hurting code readability and
giving the possibility of inefficient code generation.

So, this removes the unnecessary btree arguments.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/btree.c | 456 +++++++++++++++++++++++++-----------------------------
 1 file changed, 208 insertions(+), 248 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index aa412724b64e..21ed8ccea4b9 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -71,21 +71,17 @@ void nilfs_btree_path_cache_destroy(void)
 	kmem_cache_destroy(nilfs_btree_path_cache);
 }
 
-static inline struct nilfs_btree_path *
-nilfs_btree_alloc_path(const struct nilfs_btree *btree)
+static inline struct nilfs_btree_path *nilfs_btree_alloc_path(void)
 {
-	return (struct nilfs_btree_path *)
-		kmem_cache_alloc(nilfs_btree_path_cache, GFP_NOFS);
+	return kmem_cache_alloc(nilfs_btree_path_cache, GFP_NOFS);
 }
 
-static inline void nilfs_btree_free_path(const struct nilfs_btree *btree,
-					 struct nilfs_btree_path *path)
+static inline void nilfs_btree_free_path(struct nilfs_btree_path *path)
 {
 	kmem_cache_free(nilfs_btree_path_cache, path);
 }
 
-static void nilfs_btree_init_path(const struct nilfs_btree *btree,
-				  struct nilfs_btree_path *path)
+static void nilfs_btree_init_path(struct nilfs_btree_path *path)
 {
 	int level;
 
@@ -101,8 +97,7 @@ static void nilfs_btree_init_path(const struct nilfs_btree *btree,
 	}
 }
 
-static void nilfs_btree_clear_path(const struct nilfs_btree *btree,
-				   struct nilfs_btree_path *path)
+static void nilfs_btree_clear_path(struct nilfs_btree_path *path)
 {
 	int level;
 
@@ -148,129 +143,110 @@ static int nilfs_btree_get_new_block(const struct nilfs_btree *btree,
 }
 
 static inline int
-nilfs_btree_node_get_flags(const struct nilfs_btree *btree,
-			   const struct nilfs_btree_node *node)
+nilfs_btree_node_get_flags(const struct nilfs_btree_node *node)
 {
 	return node->bn_flags;
 }
 
 static inline void
-nilfs_btree_node_set_flags(struct nilfs_btree *btree,
-			   struct nilfs_btree_node *node,
-			   int flags)
+nilfs_btree_node_set_flags(struct nilfs_btree_node *node, int flags)
 {
 	node->bn_flags = flags;
 }
 
-static inline int nilfs_btree_node_root(const struct nilfs_btree *btree,
-					const struct nilfs_btree_node *node)
+static inline int nilfs_btree_node_root(const struct nilfs_btree_node *node)
 {
-	return nilfs_btree_node_get_flags(btree, node) & NILFS_BTREE_NODE_ROOT;
+	return nilfs_btree_node_get_flags(node) & NILFS_BTREE_NODE_ROOT;
 }
 
 static inline int
-nilfs_btree_node_get_level(const struct nilfs_btree *btree,
-			   const struct nilfs_btree_node *node)
+nilfs_btree_node_get_level(const struct nilfs_btree_node *node)
 {
 	return node->bn_level;
 }
 
 static inline void
-nilfs_btree_node_set_level(struct nilfs_btree *btree,
-			   struct nilfs_btree_node *node,
-			   int level)
+nilfs_btree_node_set_level(struct nilfs_btree_node *node, int level)
 {
 	node->bn_level = level;
 }
 
 static inline int
-nilfs_btree_node_get_nchildren(const struct nilfs_btree *btree,
-			       const struct nilfs_btree_node *node)
+nilfs_btree_node_get_nchildren(const struct nilfs_btree_node *node)
 {
 	return le16_to_cpu(node->bn_nchildren);
 }
 
 static inline void
-nilfs_btree_node_set_nchildren(struct nilfs_btree *btree,
-			       struct nilfs_btree_node *node,
-			       int nchildren)
+nilfs_btree_node_set_nchildren(struct nilfs_btree_node *node, int nchildren)
 {
 	node->bn_nchildren = cpu_to_le16(nchildren);
 }
 
-static inline int
-nilfs_btree_node_size(const struct nilfs_btree *btree)
+static inline int nilfs_btree_node_size(const struct nilfs_btree *btree)
 {
 	return 1 << btree->bt_bmap.b_inode->i_blkbits;
 }
 
 static inline int
-nilfs_btree_node_nchildren_min(const struct nilfs_btree *btree,
-			       const struct nilfs_btree_node *node)
+nilfs_btree_node_nchildren_min(const struct nilfs_btree_node *node,
+			       const struct nilfs_btree *btree)
 {
-	return nilfs_btree_node_root(btree, node) ?
+	return nilfs_btree_node_root(node) ?
 		NILFS_BTREE_ROOT_NCHILDREN_MIN :
 		NILFS_BTREE_NODE_NCHILDREN_MIN(nilfs_btree_node_size(btree));
 }
 
 static inline int
-nilfs_btree_node_nchildren_max(const struct nilfs_btree *btree,
-			       const struct nilfs_btree_node *node)
+nilfs_btree_node_nchildren_max(const struct nilfs_btree_node *node,
+			       const struct nilfs_btree *btree)
 {
-	return nilfs_btree_node_root(btree, node) ?
+	return nilfs_btree_node_root(node) ?
 		NILFS_BTREE_ROOT_NCHILDREN_MAX :
 		NILFS_BTREE_NODE_NCHILDREN_MAX(nilfs_btree_node_size(btree));
 }
 
 static inline __le64 *
-nilfs_btree_node_dkeys(const struct nilfs_btree *btree,
-		       const struct nilfs_btree_node *node)
+nilfs_btree_node_dkeys(const struct nilfs_btree_node *node)
 {
 	return (__le64 *)((char *)(node + 1) +
-			  (nilfs_btree_node_root(btree, node) ?
+			  (nilfs_btree_node_root(node) ?
 			   0 : NILFS_BTREE_NODE_EXTRA_PAD_SIZE));
 }
 
 static inline __le64 *
-nilfs_btree_node_dptrs(const struct nilfs_btree *btree,
-		       const struct nilfs_btree_node *node)
+nilfs_btree_node_dptrs(const struct nilfs_btree_node *node,
+		       const struct nilfs_btree *btree)
 {
-	return (__le64 *)(nilfs_btree_node_dkeys(btree, node) +
-			  nilfs_btree_node_nchildren_max(btree, node));
+	return (__le64 *)(nilfs_btree_node_dkeys(node) +
+			  nilfs_btree_node_nchildren_max(node, btree));
 }
 
 static inline __u64
-nilfs_btree_node_get_key(const struct nilfs_btree *btree,
-			 const struct nilfs_btree_node *node, int index)
+nilfs_btree_node_get_key(const struct nilfs_btree_node *node, int index)
 {
-	return nilfs_bmap_dkey_to_key(*(nilfs_btree_node_dkeys(btree, node) +
-					index));
+	return nilfs_bmap_dkey_to_key(*(nilfs_btree_node_dkeys(node) + index));
 }
 
 static inline void
-nilfs_btree_node_set_key(struct nilfs_btree *btree,
-			 struct nilfs_btree_node *node, int index, __u64 key)
+nilfs_btree_node_set_key(struct nilfs_btree_node *node, int index, __u64 key)
 {
-	*(nilfs_btree_node_dkeys(btree, node) + index) =
-		nilfs_bmap_key_to_dkey(key);
+	*(nilfs_btree_node_dkeys(node) + index) = nilfs_bmap_key_to_dkey(key);
 }
 
 static inline __u64
 nilfs_btree_node_get_ptr(const struct nilfs_btree *btree,
-			 const struct nilfs_btree_node *node,
-			 int index)
+			 const struct nilfs_btree_node *node, int index)
 {
-	return nilfs_bmap_dptr_to_ptr(*(nilfs_btree_node_dptrs(btree, node) +
+	return nilfs_bmap_dptr_to_ptr(*(nilfs_btree_node_dptrs(node, btree) +
 					index));
 }
 
 static inline void
 nilfs_btree_node_set_ptr(struct nilfs_btree *btree,
-			 struct nilfs_btree_node *node,
-			 int index,
-			 __u64 ptr)
+			 struct nilfs_btree_node *node, int index, __u64 ptr)
 {
-	*(nilfs_btree_node_dptrs(btree, node) + index) =
+	*(nilfs_btree_node_dptrs(node, btree) + index) =
 		nilfs_bmap_ptr_to_dptr(ptr);
 }
 
@@ -283,12 +259,12 @@ static void nilfs_btree_node_init(struct nilfs_btree *btree,
 	__le64 *dptrs;
 	int i;
 
-	nilfs_btree_node_set_flags(btree, node, flags);
-	nilfs_btree_node_set_level(btree, node, level);
-	nilfs_btree_node_set_nchildren(btree, node, nchildren);
+	nilfs_btree_node_set_flags(node, flags);
+	nilfs_btree_node_set_level(node, level);
+	nilfs_btree_node_set_nchildren(node, nchildren);
 
-	dkeys = nilfs_btree_node_dkeys(btree, node);
-	dptrs = nilfs_btree_node_dptrs(btree, node);
+	dkeys = nilfs_btree_node_dkeys(node);
+	dptrs = nilfs_btree_node_dptrs(node, btree);
 	for (i = 0; i < nchildren; i++) {
 		dkeys[i] = nilfs_bmap_key_to_dkey(keys[i]);
 		dptrs[i] = nilfs_bmap_ptr_to_dptr(ptrs[i]);
@@ -305,13 +281,13 @@ static void nilfs_btree_node_move_left(struct nilfs_btree *btree,
 	__le64 *ldptrs, *rdptrs;
 	int lnchildren, rnchildren;
 
-	ldkeys = nilfs_btree_node_dkeys(btree, left);
-	ldptrs = nilfs_btree_node_dptrs(btree, left);
-	lnchildren = nilfs_btree_node_get_nchildren(btree, left);
+	ldkeys = nilfs_btree_node_dkeys(left);
+	ldptrs = nilfs_btree_node_dptrs(left, btree);
+	lnchildren = nilfs_btree_node_get_nchildren(left);
 
-	rdkeys = nilfs_btree_node_dkeys(btree, right);
-	rdptrs = nilfs_btree_node_dptrs(btree, right);
-	rnchildren = nilfs_btree_node_get_nchildren(btree, right);
+	rdkeys = nilfs_btree_node_dkeys(right);
+	rdptrs = nilfs_btree_node_dptrs(right, btree);
+	rnchildren = nilfs_btree_node_get_nchildren(right);
 
 	memcpy(ldkeys + lnchildren, rdkeys, n * sizeof(*rdkeys));
 	memcpy(ldptrs + lnchildren, rdptrs, n * sizeof(*rdptrs));
@@ -320,8 +296,8 @@ static void nilfs_btree_node_move_left(struct nilfs_btree *btree,
 
 	lnchildren += n;
 	rnchildren -= n;
-	nilfs_btree_node_set_nchildren(btree, left, lnchildren);
-	nilfs_btree_node_set_nchildren(btree, right, rnchildren);
+	nilfs_btree_node_set_nchildren(left, lnchildren);
+	nilfs_btree_node_set_nchildren(right, rnchildren);
 }
 
 /* Assume that the buffer heads corresponding to left and right are locked. */
@@ -334,13 +310,13 @@ static void nilfs_btree_node_move_right(struct nilfs_btree *btree,
 	__le64 *ldptrs, *rdptrs;
 	int lnchildren, rnchildren;
 
-	ldkeys = nilfs_btree_node_dkeys(btree, left);
-	ldptrs = nilfs_btree_node_dptrs(btree, left);
-	lnchildren = nilfs_btree_node_get_nchildren(btree, left);
+	ldkeys = nilfs_btree_node_dkeys(left);
+	ldptrs = nilfs_btree_node_dptrs(left, btree);
+	lnchildren = nilfs_btree_node_get_nchildren(left);
 
-	rdkeys = nilfs_btree_node_dkeys(btree, right);
-	rdptrs = nilfs_btree_node_dptrs(btree, right);
-	rnchildren = nilfs_btree_node_get_nchildren(btree, right);
+	rdkeys = nilfs_btree_node_dkeys(right);
+	rdptrs = nilfs_btree_node_dptrs(right, btree);
+	rnchildren = nilfs_btree_node_get_nchildren(right);
 
 	memmove(rdkeys + n, rdkeys, rnchildren * sizeof(*rdkeys));
 	memmove(rdptrs + n, rdptrs, rnchildren * sizeof(*rdptrs));
@@ -349,8 +325,8 @@ static void nilfs_btree_node_move_right(struct nilfs_btree *btree,
 
 	lnchildren -= n;
 	rnchildren += n;
-	nilfs_btree_node_set_nchildren(btree, left, lnchildren);
-	nilfs_btree_node_set_nchildren(btree, right, rnchildren);
+	nilfs_btree_node_set_nchildren(left, lnchildren);
+	nilfs_btree_node_set_nchildren(right, rnchildren);
 }
 
 /* Assume that the buffer head corresponding to node is locked. */
@@ -362,9 +338,9 @@ static void nilfs_btree_node_insert(struct nilfs_btree *btree,
 	__le64 *dptrs;
 	int nchildren;
 
-	dkeys = nilfs_btree_node_dkeys(btree, node);
-	dptrs = nilfs_btree_node_dptrs(btree, node);
-	nchildren = nilfs_btree_node_get_nchildren(btree, node);
+	dkeys = nilfs_btree_node_dkeys(node);
+	dptrs = nilfs_btree_node_dptrs(node, btree);
+	nchildren = nilfs_btree_node_get_nchildren(node);
 	if (index < nchildren) {
 		memmove(dkeys + index + 1, dkeys + index,
 			(nchildren - index) * sizeof(*dkeys));
@@ -374,7 +350,7 @@ static void nilfs_btree_node_insert(struct nilfs_btree *btree,
 	dkeys[index] = nilfs_bmap_key_to_dkey(key);
 	dptrs[index] = nilfs_bmap_ptr_to_dptr(ptr);
 	nchildren++;
-	nilfs_btree_node_set_nchildren(btree, node, nchildren);
+	nilfs_btree_node_set_nchildren(node, nchildren);
 }
 
 /* Assume that the buffer head corresponding to node is locked. */
@@ -388,11 +364,11 @@ static void nilfs_btree_node_delete(struct nilfs_btree *btree,
 	__le64 *dptrs;
 	int nchildren;
 
-	dkeys = nilfs_btree_node_dkeys(btree, node);
-	dptrs = nilfs_btree_node_dptrs(btree, node);
+	dkeys = nilfs_btree_node_dkeys(node);
+	dptrs = nilfs_btree_node_dptrs(node, btree);
 	key = nilfs_bmap_dkey_to_key(dkeys[index]);
 	ptr = nilfs_bmap_dptr_to_ptr(dptrs[index]);
-	nchildren = nilfs_btree_node_get_nchildren(btree, node);
+	nchildren = nilfs_btree_node_get_nchildren(node);
 	if (keyp != NULL)
 		*keyp = key;
 	if (ptrp != NULL)
@@ -405,11 +381,10 @@ static void nilfs_btree_node_delete(struct nilfs_btree *btree,
 			(nchildren - index - 1) * sizeof(*dptrs));
 	}
 	nchildren--;
-	nilfs_btree_node_set_nchildren(btree, node, nchildren);
+	nilfs_btree_node_set_nchildren(node, nchildren);
 }
 
-static int nilfs_btree_node_lookup(const struct nilfs_btree *btree,
-				   const struct nilfs_btree_node *node,
+static int nilfs_btree_node_lookup(const struct nilfs_btree_node *node,
 				   __u64 key, int *indexp)
 {
 	__u64 nkey;
@@ -417,12 +392,12 @@ static int nilfs_btree_node_lookup(const struct nilfs_btree *btree,
 
 	/* binary search */
 	low = 0;
-	high = nilfs_btree_node_get_nchildren(btree, node) - 1;
+	high = nilfs_btree_node_get_nchildren(node) - 1;
 	index = 0;
 	s = 0;
 	while (low <= high) {
 		index = (low + high) / 2;
-		nkey = nilfs_btree_node_get_key(btree, node, index);
+		nkey = nilfs_btree_node_get_key(node, index);
 		if (nkey == key) {
 			s = 0;
 			goto out;
@@ -436,9 +411,8 @@ static int nilfs_btree_node_lookup(const struct nilfs_btree *btree,
 	}
 
 	/* adjust index */
-	if (nilfs_btree_node_get_level(btree, node) >
-	    NILFS_BTREE_LEVEL_NODE_MIN) {
-		if ((s > 0) && (index > 0))
+	if (nilfs_btree_node_get_level(node) > NILFS_BTREE_LEVEL_NODE_MIN) {
+		if (s > 0 && index > 0)
 			index--;
 	} else if (s < 0)
 		index++;
@@ -456,25 +430,20 @@ nilfs_btree_get_root(const struct nilfs_btree *btree)
 }
 
 static inline struct nilfs_btree_node *
-nilfs_btree_get_nonroot_node(const struct nilfs_btree *btree,
-			     const struct nilfs_btree_path *path,
-			     int level)
+nilfs_btree_get_nonroot_node(const struct nilfs_btree_path *path, int level)
 {
 	return (struct nilfs_btree_node *)path[level].bp_bh->b_data;
 }
 
 static inline struct nilfs_btree_node *
-nilfs_btree_get_sib_node(const struct nilfs_btree *btree,
-			 const struct nilfs_btree_path *path,
-			 int level)
+nilfs_btree_get_sib_node(const struct nilfs_btree_path *path, int level)
 {
 	return (struct nilfs_btree_node *)path[level].bp_sib_bh->b_data;
 }
 
 static inline int nilfs_btree_height(const struct nilfs_btree *btree)
 {
-	return nilfs_btree_node_get_level(btree, nilfs_btree_get_root(btree))
-		+ 1;
+	return nilfs_btree_node_get_level(nilfs_btree_get_root(btree)) + 1;
 }
 
 static inline struct nilfs_btree_node *
@@ -484,7 +453,7 @@ nilfs_btree_get_node(const struct nilfs_btree *btree,
 {
 	return (level == nilfs_btree_height(btree) - 1) ?
 		nilfs_btree_get_root(btree) :
-		nilfs_btree_get_nonroot_node(btree, path, level);
+		nilfs_btree_get_nonroot_node(path, level);
 }
 
 static int nilfs_btree_do_lookup(const struct nilfs_btree *btree,
@@ -496,12 +465,11 @@ static int nilfs_btree_do_lookup(const struct nilfs_btree *btree,
 	int level, index, found, ret;
 
 	node = nilfs_btree_get_root(btree);
-	level = nilfs_btree_node_get_level(btree, node);
-	if ((level < minlevel) ||
-	    (nilfs_btree_node_get_nchildren(btree, node) <= 0))
+	level = nilfs_btree_node_get_level(node);
+	if (level < minlevel || nilfs_btree_node_get_nchildren(node) <= 0)
 		return -ENOENT;
 
-	found = nilfs_btree_node_lookup(btree, node, key, &index);
+	found = nilfs_btree_node_lookup(node, key, &index);
 	ptr = nilfs_btree_node_get_ptr(btree, node, index);
 	path[level].bp_bh = NULL;
 	path[level].bp_index = index;
@@ -510,14 +478,13 @@ static int nilfs_btree_do_lookup(const struct nilfs_btree *btree,
 		ret = nilfs_btree_get_block(btree, ptr, &path[level].bp_bh);
 		if (ret < 0)
 			return ret;
-		node = nilfs_btree_get_nonroot_node(btree, path, level);
-		BUG_ON(level != nilfs_btree_node_get_level(btree, node));
+		node = nilfs_btree_get_nonroot_node(path, level);
+		BUG_ON(level != nilfs_btree_node_get_level(node));
 		if (!found)
-			found = nilfs_btree_node_lookup(btree, node, key,
-							&index);
+			found = nilfs_btree_node_lookup(node, key, &index);
 		else
 			index = 0;
-		if (index < nilfs_btree_node_nchildren_max(btree, node))
+		if (index < nilfs_btree_node_nchildren_max(node, btree))
 			ptr = nilfs_btree_node_get_ptr(btree, node, index);
 		else {
 			WARN_ON(found || level != NILFS_BTREE_LEVEL_NODE_MIN);
@@ -544,10 +511,10 @@ static int nilfs_btree_do_lookup_last(const struct nilfs_btree *btree,
 	int index, level, ret;
 
 	node = nilfs_btree_get_root(btree);
-	index = nilfs_btree_node_get_nchildren(btree, node) - 1;
+	index = nilfs_btree_node_get_nchildren(node) - 1;
 	if (index < 0)
 		return -ENOENT;
-	level = nilfs_btree_node_get_level(btree, node);
+	level = nilfs_btree_node_get_level(node);
 	ptr = nilfs_btree_node_get_ptr(btree, node, index);
 	path[level].bp_bh = NULL;
 	path[level].bp_index = index;
@@ -556,15 +523,15 @@ static int nilfs_btree_do_lookup_last(const struct nilfs_btree *btree,
 		ret = nilfs_btree_get_block(btree, ptr, &path[level].bp_bh);
 		if (ret < 0)
 			return ret;
-		node = nilfs_btree_get_nonroot_node(btree, path, level);
-		BUG_ON(level != nilfs_btree_node_get_level(btree, node));
-		index = nilfs_btree_node_get_nchildren(btree, node) - 1;
+		node = nilfs_btree_get_nonroot_node(path, level);
+		BUG_ON(level != nilfs_btree_node_get_level(node));
+		index = nilfs_btree_node_get_nchildren(node) - 1;
 		ptr = nilfs_btree_node_get_ptr(btree, node, index);
 		path[level].bp_index = index;
 	}
 
 	if (keyp != NULL)
-		*keyp = nilfs_btree_node_get_key(btree, node, index);
+		*keyp = nilfs_btree_node_get_key(node, index);
 	if (ptrp != NULL)
 		*ptrp = ptr;
 
@@ -580,18 +547,18 @@ static int nilfs_btree_lookup(const struct nilfs_bmap *bmap,
 	int ret;
 
 	btree = (struct nilfs_btree *)bmap;
-	path = nilfs_btree_alloc_path(btree);
+	path = nilfs_btree_alloc_path();
 	if (path == NULL)
 		return -ENOMEM;
-	nilfs_btree_init_path(btree, path);
+	nilfs_btree_init_path(path);
 
 	ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level);
 
 	if (ptrp != NULL)
 		*ptrp = ptr;
 
-	nilfs_btree_clear_path(btree, path);
-	nilfs_btree_free_path(btree, path);
+	nilfs_btree_clear_path(path);
+	nilfs_btree_free_path(path);
 
 	return ret;
 }
@@ -608,10 +575,10 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
 	int level = NILFS_BTREE_LEVEL_NODE_MIN;
 	int ret, cnt, index, maxlevel;
 
-	path = nilfs_btree_alloc_path(btree);
+	path = nilfs_btree_alloc_path();
 	if (path == NULL)
 		return -ENOMEM;
-	nilfs_btree_init_path(btree, path);
+	nilfs_btree_init_path(path);
 	ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level);
 	if (ret < 0)
 		goto out;
@@ -631,8 +598,8 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
 	node = nilfs_btree_get_node(btree, path, level);
 	index = path[level].bp_index + 1;
 	for (;;) {
-		while (index < nilfs_btree_node_get_nchildren(btree, node)) {
-			if (nilfs_btree_node_get_key(btree, node, index) !=
+		while (index < nilfs_btree_node_get_nchildren(node)) {
+			if (nilfs_btree_node_get_key(node, index) !=
 			    key + cnt)
 				goto end;
 			ptr2 = nilfs_btree_node_get_ptr(btree, node, index);
@@ -653,8 +620,8 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
 		/* look-up right sibling node */
 		node = nilfs_btree_get_node(btree, path, level + 1);
 		index = path[level + 1].bp_index + 1;
-		if (index >= nilfs_btree_node_get_nchildren(btree, node) ||
-		    nilfs_btree_node_get_key(btree, node, index) != key + cnt)
+		if (index >= nilfs_btree_node_get_nchildren(node) ||
+		    nilfs_btree_node_get_key(node, index) != key + cnt)
 			break;
 		ptr2 = nilfs_btree_node_get_ptr(btree, node, index);
 		path[level + 1].bp_index = index;
@@ -664,7 +631,7 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
 		ret = nilfs_btree_get_block(btree, ptr2, &path[level].bp_bh);
 		if (ret < 0)
 			goto out;
-		node = nilfs_btree_get_nonroot_node(btree, path, level);
+		node = nilfs_btree_get_nonroot_node(path, level);
 		index = 0;
 		path[level].bp_index = index;
 	}
@@ -672,8 +639,8 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
 	*ptrp = ptr;
 	ret = cnt;
  out:
-	nilfs_btree_clear_path(btree, path);
-	nilfs_btree_free_path(btree, path);
+	nilfs_btree_clear_path(path);
+	nilfs_btree_free_path(path);
 	return ret;
 }
 
@@ -685,9 +652,7 @@ static void nilfs_btree_promote_key(struct nilfs_btree *btree,
 		do {
 			lock_buffer(path[level].bp_bh);
 			nilfs_btree_node_set_key(
-				btree,
-				nilfs_btree_get_nonroot_node(
-					btree, path, level),
+				nilfs_btree_get_nonroot_node(path, level),
 				path[level].bp_index, key);
 			if (!buffer_dirty(path[level].bp_bh))
 				nilfs_btnode_mark_dirty(path[level].bp_bh);
@@ -698,8 +663,7 @@ static void nilfs_btree_promote_key(struct nilfs_btree *btree,
 
 	/* root */
 	if (level == nilfs_btree_height(btree) - 1) {
-		nilfs_btree_node_set_key(btree,
-					 nilfs_btree_get_root(btree),
+		nilfs_btree_node_set_key(nilfs_btree_get_root(btree),
 					 path[level].bp_index, key);
 	}
 }
@@ -712,7 +676,7 @@ static void nilfs_btree_do_insert(struct nilfs_btree *btree,
 
 	if (level < nilfs_btree_height(btree) - 1) {
 		lock_buffer(path[level].bp_bh);
-		node = nilfs_btree_get_nonroot_node(btree, path, level);
+		node = nilfs_btree_get_nonroot_node(path, level);
 		nilfs_btree_node_insert(btree, node, *keyp, *ptrp,
 					path[level].bp_index);
 		if (!buffer_dirty(path[level].bp_bh))
@@ -721,8 +685,8 @@ static void nilfs_btree_do_insert(struct nilfs_btree *btree,
 
 		if (path[level].bp_index == 0)
 			nilfs_btree_promote_key(btree, path, level + 1,
-						nilfs_btree_node_get_key(
-							btree, node, 0));
+						nilfs_btree_node_get_key(node,
+									 0));
 	} else {
 		node = nilfs_btree_get_root(btree);
 		nilfs_btree_node_insert(btree, node, *keyp, *ptrp,
@@ -740,10 +704,10 @@ static void nilfs_btree_carry_left(struct nilfs_btree *btree,
 	lock_buffer(path[level].bp_bh);
 	lock_buffer(path[level].bp_sib_bh);
 
-	node = nilfs_btree_get_nonroot_node(btree, path, level);
-	left = nilfs_btree_get_sib_node(btree, path, level);
-	nchildren = nilfs_btree_node_get_nchildren(btree, node);
-	lnchildren = nilfs_btree_node_get_nchildren(btree, left);
+	node = nilfs_btree_get_nonroot_node(path, level);
+	left = nilfs_btree_get_sib_node(path, level);
+	nchildren = nilfs_btree_node_get_nchildren(node);
+	lnchildren = nilfs_btree_node_get_nchildren(left);
 	move = 0;
 
 	n = (nchildren + lnchildren + 1) / 2 - lnchildren;
@@ -764,7 +728,7 @@ static void nilfs_btree_carry_left(struct nilfs_btree *btree,
 	unlock_buffer(path[level].bp_sib_bh);
 
 	nilfs_btree_promote_key(btree, path, level + 1,
-				nilfs_btree_node_get_key(btree, node, 0));
+				nilfs_btree_node_get_key(node, 0));
 
 	if (move) {
 		brelse(path[level].bp_bh);
@@ -791,10 +755,10 @@ static void nilfs_btree_carry_right(struct nilfs_btree *btree,
 	lock_buffer(path[level].bp_bh);
 	lock_buffer(path[level].bp_sib_bh);
 
-	node = nilfs_btree_get_nonroot_node(btree, path, level);
-	right = nilfs_btree_get_sib_node(btree, path, level);
-	nchildren = nilfs_btree_node_get_nchildren(btree, node);
-	rnchildren = nilfs_btree_node_get_nchildren(btree, right);
+	node = nilfs_btree_get_nonroot_node(path, level);
+	right = nilfs_btree_get_sib_node(path, level);
+	nchildren = nilfs_btree_node_get_nchildren(node);
+	rnchildren = nilfs_btree_node_get_nchildren(right);
 	move = 0;
 
 	n = (nchildren + rnchildren + 1) / 2 - rnchildren;
@@ -816,15 +780,14 @@ static void nilfs_btree_carry_right(struct nilfs_btree *btree,
 
 	path[level + 1].bp_index++;
 	nilfs_btree_promote_key(btree, path, level + 1,
-				nilfs_btree_node_get_key(btree, right, 0));
+				nilfs_btree_node_get_key(right, 0));
 	path[level + 1].bp_index--;
 
 	if (move) {
 		brelse(path[level].bp_bh);
 		path[level].bp_bh = path[level].bp_sib_bh;
 		path[level].bp_sib_bh = NULL;
-		path[level].bp_index -=
-			nilfs_btree_node_get_nchildren(btree, node);
+		path[level].bp_index -= nilfs_btree_node_get_nchildren(node);
 		path[level + 1].bp_index++;
 	} else {
 		brelse(path[level].bp_sib_bh);
@@ -846,9 +809,9 @@ static void nilfs_btree_split(struct nilfs_btree *btree,
 	lock_buffer(path[level].bp_bh);
 	lock_buffer(path[level].bp_sib_bh);
 
-	node = nilfs_btree_get_nonroot_node(btree, path, level);
-	right = nilfs_btree_get_sib_node(btree, path, level);
-	nchildren = nilfs_btree_node_get_nchildren(btree, node);
+	node = nilfs_btree_get_nonroot_node(path, level);
+	right = nilfs_btree_get_sib_node(path, level);
+	nchildren = nilfs_btree_node_get_nchildren(node);
 	move = 0;
 
 	n = (nchildren + 1) / 2;
@@ -867,16 +830,15 @@ static void nilfs_btree_split(struct nilfs_btree *btree,
 	unlock_buffer(path[level].bp_bh);
 	unlock_buffer(path[level].bp_sib_bh);
 
-	newkey = nilfs_btree_node_get_key(btree, right, 0);
+	newkey = nilfs_btree_node_get_key(right, 0);
 	newptr = path[level].bp_newreq.bpr_ptr;
 
 	if (move) {
-		path[level].bp_index -=
-			nilfs_btree_node_get_nchildren(btree, node);
+		path[level].bp_index -= nilfs_btree_node_get_nchildren(node);
 		nilfs_btree_node_insert(btree, right, *keyp, *ptrp,
 					path[level].bp_index);
 
-		*keyp = nilfs_btree_node_get_key(btree, right, 0);
+		*keyp = nilfs_btree_node_get_key(right, 0);
 		*ptrp = path[level].bp_newreq.bpr_ptr;
 
 		brelse(path[level].bp_bh);
@@ -885,7 +847,7 @@ static void nilfs_btree_split(struct nilfs_btree *btree,
 	} else {
 		nilfs_btree_do_insert(btree, path, level, keyp, ptrp);
 
-		*keyp = nilfs_btree_node_get_key(btree, right, 0);
+		*keyp = nilfs_btree_node_get_key(right, 0);
 		*ptrp = path[level].bp_newreq.bpr_ptr;
 
 		brelse(path[level].bp_sib_bh);
@@ -905,12 +867,12 @@ static void nilfs_btree_grow(struct nilfs_btree *btree,
 	lock_buffer(path[level].bp_sib_bh);
 
 	root = nilfs_btree_get_root(btree);
-	child = nilfs_btree_get_sib_node(btree, path, level);
+	child = nilfs_btree_get_sib_node(path, level);
 
-	n = nilfs_btree_node_get_nchildren(btree, root);
+	n = nilfs_btree_node_get_nchildren(root);
 
 	nilfs_btree_node_move_right(btree, root, child, n);
-	nilfs_btree_node_set_level(btree, root, level + 1);
+	nilfs_btree_node_set_level(root, level + 1);
 
 	if (!buffer_dirty(path[level].bp_sib_bh))
 		nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
@@ -922,7 +884,7 @@ static void nilfs_btree_grow(struct nilfs_btree *btree,
 
 	nilfs_btree_do_insert(btree, path, level, keyp, ptrp);
 
-	*keyp = nilfs_btree_node_get_key(btree, child, 0);
+	*keyp = nilfs_btree_node_get_key(child, 0);
 	*ptrp = path[level].bp_newreq.bpr_ptr;
 }
 
@@ -1007,9 +969,9 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
 	for (level = NILFS_BTREE_LEVEL_NODE_MIN;
 	     level < nilfs_btree_height(btree) - 1;
 	     level++) {
-		node = nilfs_btree_get_nonroot_node(btree, path, level);
-		if (nilfs_btree_node_get_nchildren(btree, node) <
-		    nilfs_btree_node_nchildren_max(btree, node)) {
+		node = nilfs_btree_get_nonroot_node(path, level);
+		if (nilfs_btree_node_get_nchildren(node) <
+		    nilfs_btree_node_nchildren_max(node, btree)) {
 			path[level].bp_op = nilfs_btree_do_insert;
 			stats->bs_nblocks++;
 			goto out;
@@ -1026,8 +988,8 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
 			if (ret < 0)
 				goto err_out_child_node;
 			sib = (struct nilfs_btree_node *)bh->b_data;
-			if (nilfs_btree_node_get_nchildren(btree, sib) <
-			    nilfs_btree_node_nchildren_max(btree, sib)) {
+			if (nilfs_btree_node_get_nchildren(sib) <
+			    nilfs_btree_node_nchildren_max(sib, btree)) {
 				path[level].bp_sib_bh = bh;
 				path[level].bp_op = nilfs_btree_carry_left;
 				stats->bs_nblocks++;
@@ -1038,15 +1000,15 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
 
 		/* right sibling */
 		if (pindex <
-		    nilfs_btree_node_get_nchildren(btree, parent) - 1) {
+		    nilfs_btree_node_get_nchildren(parent) - 1) {
 			sibptr = nilfs_btree_node_get_ptr(btree, parent,
 							  pindex + 1);
 			ret = nilfs_btree_get_block(btree, sibptr, &bh);
 			if (ret < 0)
 				goto err_out_child_node;
 			sib = (struct nilfs_btree_node *)bh->b_data;
-			if (nilfs_btree_node_get_nchildren(btree, sib) <
-			    nilfs_btree_node_nchildren_max(btree, sib)) {
+			if (nilfs_btree_node_get_nchildren(sib) <
+			    nilfs_btree_node_nchildren_max(sib, btree)) {
 				path[level].bp_sib_bh = bh;
 				path[level].bp_op = nilfs_btree_carry_right;
 				stats->bs_nblocks++;
@@ -1081,8 +1043,8 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
 
 	/* root */
 	node = nilfs_btree_get_root(btree);
-	if (nilfs_btree_node_get_nchildren(btree, node) <
-	    nilfs_btree_node_nchildren_max(btree, node)) {
+	if (nilfs_btree_node_get_nchildren(node) <
+	    nilfs_btree_node_nchildren_max(node, btree)) {
 		path[level].bp_op = nilfs_btree_do_insert;
 		stats->bs_nblocks++;
 		goto out;
@@ -1164,10 +1126,10 @@ static int nilfs_btree_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
 	int level, ret;
 
 	btree = (struct nilfs_btree *)bmap;
-	path = nilfs_btree_alloc_path(btree);
+	path = nilfs_btree_alloc_path();
 	if (path == NULL)
 		return -ENOMEM;
-	nilfs_btree_init_path(btree, path);
+	nilfs_btree_init_path(path);
 
 	ret = nilfs_btree_do_lookup(btree, path, key, NULL,
 				    NILFS_BTREE_LEVEL_NODE_MIN);
@@ -1184,8 +1146,8 @@ static int nilfs_btree_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
 	nilfs_bmap_add_blocks(bmap, stats.bs_nblocks);
 
  out:
-	nilfs_btree_clear_path(btree, path);
-	nilfs_btree_free_path(btree, path);
+	nilfs_btree_clear_path(path);
+	nilfs_btree_free_path(path);
 	return ret;
 }
 
@@ -1197,7 +1159,7 @@ static void nilfs_btree_do_delete(struct nilfs_btree *btree,
 
 	if (level < nilfs_btree_height(btree) - 1) {
 		lock_buffer(path[level].bp_bh);
-		node = nilfs_btree_get_nonroot_node(btree, path, level);
+		node = nilfs_btree_get_nonroot_node(path, level);
 		nilfs_btree_node_delete(btree, node, keyp, ptrp,
 					path[level].bp_index);
 		if (!buffer_dirty(path[level].bp_bh))
@@ -1205,7 +1167,7 @@ static void nilfs_btree_do_delete(struct nilfs_btree *btree,
 		unlock_buffer(path[level].bp_bh);
 		if (path[level].bp_index == 0)
 			nilfs_btree_promote_key(btree, path, level + 1,
-				nilfs_btree_node_get_key(btree, node, 0));
+				nilfs_btree_node_get_key(node, 0));
 	} else {
 		node = nilfs_btree_get_root(btree);
 		nilfs_btree_node_delete(btree, node, keyp, ptrp,
@@ -1225,10 +1187,10 @@ static void nilfs_btree_borrow_left(struct nilfs_btree *btree,
 	lock_buffer(path[level].bp_bh);
 	lock_buffer(path[level].bp_sib_bh);
 
-	node = nilfs_btree_get_nonroot_node(btree, path, level);
-	left = nilfs_btree_get_sib_node(btree, path, level);
-	nchildren = nilfs_btree_node_get_nchildren(btree, node);
-	lnchildren = nilfs_btree_node_get_nchildren(btree, left);
+	node = nilfs_btree_get_nonroot_node(path, level);
+	left = nilfs_btree_get_sib_node(path, level);
+	nchildren = nilfs_btree_node_get_nchildren(node);
+	lnchildren = nilfs_btree_node_get_nchildren(left);
 
 	n = (nchildren + lnchildren) / 2 - nchildren;
 
@@ -1243,7 +1205,7 @@ static void nilfs_btree_borrow_left(struct nilfs_btree *btree,
 	unlock_buffer(path[level].bp_sib_bh);
 
 	nilfs_btree_promote_key(btree, path, level + 1,
-				nilfs_btree_node_get_key(btree, node, 0));
+				nilfs_btree_node_get_key(node, 0));
 
 	brelse(path[level].bp_sib_bh);
 	path[level].bp_sib_bh = NULL;
@@ -1262,10 +1224,10 @@ static void nilfs_btree_borrow_right(struct nilfs_btree *btree,
 	lock_buffer(path[level].bp_bh);
 	lock_buffer(path[level].bp_sib_bh);
 
-	node = nilfs_btree_get_nonroot_node(btree, path, level);
-	right = nilfs_btree_get_sib_node(btree, path, level);
-	nchildren = nilfs_btree_node_get_nchildren(btree, node);
-	rnchildren = nilfs_btree_node_get_nchildren(btree, right);
+	node = nilfs_btree_get_nonroot_node(path, level);
+	right = nilfs_btree_get_sib_node(path, level);
+	nchildren = nilfs_btree_node_get_nchildren(node);
+	rnchildren = nilfs_btree_node_get_nchildren(right);
 
 	n = (nchildren + rnchildren) / 2 - nchildren;
 
@@ -1281,7 +1243,7 @@ static void nilfs_btree_borrow_right(struct nilfs_btree *btree,
 
 	path[level + 1].bp_index++;
 	nilfs_btree_promote_key(btree, path, level + 1,
-				nilfs_btree_node_get_key(btree, right, 0));
+				nilfs_btree_node_get_key(right, 0));
 	path[level + 1].bp_index--;
 
 	brelse(path[level].bp_sib_bh);
@@ -1300,10 +1262,10 @@ static void nilfs_btree_concat_left(struct nilfs_btree *btree,
 	lock_buffer(path[level].bp_bh);
 	lock_buffer(path[level].bp_sib_bh);
 
-	node = nilfs_btree_get_nonroot_node(btree, path, level);
-	left = nilfs_btree_get_sib_node(btree, path, level);
+	node = nilfs_btree_get_nonroot_node(path, level);
+	left = nilfs_btree_get_sib_node(path, level);
 
-	n = nilfs_btree_node_get_nchildren(btree, node);
+	n = nilfs_btree_node_get_nchildren(node);
 
 	nilfs_btree_node_move_left(btree, left, node, n);
 
@@ -1316,7 +1278,7 @@ static void nilfs_btree_concat_left(struct nilfs_btree *btree,
 	nilfs_btnode_delete(path[level].bp_bh);
 	path[level].bp_bh = path[level].bp_sib_bh;
 	path[level].bp_sib_bh = NULL;
-	path[level].bp_index += nilfs_btree_node_get_nchildren(btree, left);
+	path[level].bp_index += nilfs_btree_node_get_nchildren(left);
 }
 
 static void nilfs_btree_concat_right(struct nilfs_btree *btree,
@@ -1331,10 +1293,10 @@ static void nilfs_btree_concat_right(struct nilfs_btree *btree,
 	lock_buffer(path[level].bp_bh);
 	lock_buffer(path[level].bp_sib_bh);
 
-	node = nilfs_btree_get_nonroot_node(btree, path, level);
-	right = nilfs_btree_get_sib_node(btree, path, level);
+	node = nilfs_btree_get_nonroot_node(path, level);
+	right = nilfs_btree_get_sib_node(path, level);
 
-	n = nilfs_btree_node_get_nchildren(btree, right);
+	n = nilfs_btree_node_get_nchildren(right);
 
 	nilfs_btree_node_move_left(btree, node, right, n);
 
@@ -1360,11 +1322,11 @@ static void nilfs_btree_shrink(struct nilfs_btree *btree,
 
 	lock_buffer(path[level].bp_bh);
 	root = nilfs_btree_get_root(btree);
-	child = nilfs_btree_get_nonroot_node(btree, path, level);
+	child = nilfs_btree_get_nonroot_node(path, level);
 
 	nilfs_btree_node_delete(btree, root, NULL, NULL, 0);
-	nilfs_btree_node_set_level(btree, root, level);
-	n = nilfs_btree_node_get_nchildren(btree, child);
+	nilfs_btree_node_set_level(root, level);
+	n = nilfs_btree_node_get_nchildren(child);
 	nilfs_btree_node_move_left(btree, root, child, n);
 	unlock_buffer(path[level].bp_bh);
 
@@ -1388,7 +1350,7 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
 	for (level = NILFS_BTREE_LEVEL_NODE_MIN;
 	     level < nilfs_btree_height(btree) - 1;
 	     level++) {
-		node = nilfs_btree_get_nonroot_node(btree, path, level);
+		node = nilfs_btree_get_nonroot_node(path, level);
 		path[level].bp_oldreq.bpr_ptr =
 			nilfs_btree_node_get_ptr(btree, node,
 						 path[level].bp_index);
@@ -1397,8 +1359,8 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
 		if (ret < 0)
 			goto err_out_child_node;
 
-		if (nilfs_btree_node_get_nchildren(btree, node) >
-		    nilfs_btree_node_nchildren_min(btree, node)) {
+		if (nilfs_btree_node_get_nchildren(node) >
+		    nilfs_btree_node_nchildren_min(node, btree)) {
 			path[level].bp_op = nilfs_btree_do_delete;
 			stats->bs_nblocks++;
 			goto out;
@@ -1415,8 +1377,8 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
 			if (ret < 0)
 				goto err_out_curr_node;
 			sib = (struct nilfs_btree_node *)bh->b_data;
-			if (nilfs_btree_node_get_nchildren(btree, sib) >
-			    nilfs_btree_node_nchildren_min(btree, sib)) {
+			if (nilfs_btree_node_get_nchildren(sib) >
+			    nilfs_btree_node_nchildren_min(sib, btree)) {
 				path[level].bp_sib_bh = bh;
 				path[level].bp_op = nilfs_btree_borrow_left;
 				stats->bs_nblocks++;
@@ -1428,7 +1390,7 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
 				/* continue; */
 			}
 		} else if (pindex <
-			   nilfs_btree_node_get_nchildren(btree, parent) - 1) {
+			   nilfs_btree_node_get_nchildren(parent) - 1) {
 			/* right sibling */
 			sibptr = nilfs_btree_node_get_ptr(btree, parent,
 							  pindex + 1);
@@ -1436,8 +1398,8 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
 			if (ret < 0)
 				goto err_out_curr_node;
 			sib = (struct nilfs_btree_node *)bh->b_data;
-			if (nilfs_btree_node_get_nchildren(btree, sib) >
-			    nilfs_btree_node_nchildren_min(btree, sib)) {
+			if (nilfs_btree_node_get_nchildren(sib) >
+			    nilfs_btree_node_nchildren_min(sib, btree)) {
 				path[level].bp_sib_bh = bh;
 				path[level].bp_op = nilfs_btree_borrow_right;
 				stats->bs_nblocks++;
@@ -1452,7 +1414,7 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
 			/* no siblings */
 			/* the only child of the root node */
 			WARN_ON(level != nilfs_btree_height(btree) - 2);
-			if (nilfs_btree_node_get_nchildren(btree, node) - 1 <=
+			if (nilfs_btree_node_get_nchildren(node) - 1 <=
 			    NILFS_BTREE_ROOT_NCHILDREN_MAX) {
 				path[level].bp_op = nilfs_btree_shrink;
 				stats->bs_nblocks += 2;
@@ -1523,10 +1485,10 @@ static int nilfs_btree_delete(struct nilfs_bmap *bmap, __u64 key)
 	int level, ret;
 
 	btree = (struct nilfs_btree *)bmap;
-	path = nilfs_btree_alloc_path(btree);
+	path = nilfs_btree_alloc_path();
 	if (path == NULL)
 		return -ENOMEM;
-	nilfs_btree_init_path(btree, path);
+	nilfs_btree_init_path(path);
 	ret = nilfs_btree_do_lookup(btree, path, key, NULL,
 				    NILFS_BTREE_LEVEL_NODE_MIN);
 	if (ret < 0)
@@ -1539,8 +1501,8 @@ static int nilfs_btree_delete(struct nilfs_bmap *bmap, __u64 key)
 	nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks);
 
 out:
-	nilfs_btree_clear_path(btree, path);
-	nilfs_btree_free_path(btree, path);
+	nilfs_btree_clear_path(path);
+	nilfs_btree_free_path(path);
 	return ret;
 }
 
@@ -1551,15 +1513,15 @@ static int nilfs_btree_last_key(const struct nilfs_bmap *bmap, __u64 *keyp)
 	int ret;
 
 	btree = (struct nilfs_btree *)bmap;
-	path = nilfs_btree_alloc_path(btree);
+	path = nilfs_btree_alloc_path();
 	if (path == NULL)
 		return -ENOMEM;
-	nilfs_btree_init_path(btree, path);
+	nilfs_btree_init_path(path);
 
 	ret = nilfs_btree_do_lookup_last(btree, path, keyp, NULL);
 
-	nilfs_btree_clear_path(btree, path);
-	nilfs_btree_free_path(btree, path);
+	nilfs_btree_clear_path(path);
+	nilfs_btree_free_path(path);
 
 	return ret;
 }
@@ -1581,7 +1543,7 @@ static int nilfs_btree_check_delete(struct nilfs_bmap *bmap, __u64 key)
 		node = root;
 		break;
 	case 3:
-		nchildren = nilfs_btree_node_get_nchildren(btree, root);
+		nchildren = nilfs_btree_node_get_nchildren(root);
 		if (nchildren > 1)
 			return 0;
 		ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1);
@@ -1594,10 +1556,10 @@ static int nilfs_btree_check_delete(struct nilfs_bmap *bmap, __u64 key)
 		return 0;
 	}
 
-	nchildren = nilfs_btree_node_get_nchildren(btree, node);
-	maxkey = nilfs_btree_node_get_key(btree, node, nchildren - 1);
+	nchildren = nilfs_btree_node_get_nchildren(node);
+	maxkey = nilfs_btree_node_get_key(node, nchildren - 1);
 	nextmaxkey = (nchildren > 1) ?
-		nilfs_btree_node_get_key(btree, node, nchildren - 2) : 0;
+		nilfs_btree_node_get_key(node, nchildren - 2) : 0;
 	if (bh != NULL)
 		brelse(bh);
 
@@ -1623,7 +1585,7 @@ static int nilfs_btree_gather_data(struct nilfs_bmap *bmap,
 		node = root;
 		break;
 	case 3:
-		nchildren = nilfs_btree_node_get_nchildren(btree, root);
+		nchildren = nilfs_btree_node_get_nchildren(root);
 		WARN_ON(nchildren > 1);
 		ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1);
 		ret = nilfs_btree_get_block(btree, ptr, &bh);
@@ -1636,11 +1598,11 @@ static int nilfs_btree_gather_data(struct nilfs_bmap *bmap,
 		return -EINVAL;
 	}
 
-	nchildren = nilfs_btree_node_get_nchildren(btree, node);
+	nchildren = nilfs_btree_node_get_nchildren(node);
 	if (nchildren < nitems)
 		nitems = nchildren;
-	dkeys = nilfs_btree_node_dkeys(btree, node);
-	dptrs = nilfs_btree_node_dptrs(btree, node);
+	dkeys = nilfs_btree_node_dkeys(node);
+	dptrs = nilfs_btree_node_dptrs(node, btree);
 	for (i = 0; i < nitems; i++) {
 		keys[i] = nilfs_bmap_dkey_to_key(dkeys[i]);
 		ptrs[i] = nilfs_bmap_dptr_to_ptr(dptrs[i]);
@@ -1986,15 +1948,15 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap,
 	WARN_ON(!buffer_dirty(bh));
 
 	btree = (struct nilfs_btree *)bmap;
-	path = nilfs_btree_alloc_path(btree);
+	path = nilfs_btree_alloc_path();
 	if (path == NULL)
 		return -ENOMEM;
-	nilfs_btree_init_path(btree, path);
+	nilfs_btree_init_path(path);
 
 	if (buffer_nilfs_node(bh)) {
 		node = (struct nilfs_btree_node *)bh->b_data;
-		key = nilfs_btree_node_get_key(btree, node, 0);
-		level = nilfs_btree_node_get_level(btree, node);
+		key = nilfs_btree_node_get_key(node, 0);
+		level = nilfs_btree_node_get_level(node);
 	} else {
 		key = nilfs_bmap_data_get_key(bmap, bh);
 		level = NILFS_BTREE_LEVEL_DATA;
@@ -2013,8 +1975,8 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap,
 		nilfs_btree_propagate_p(btree, path, level, bh);
 
  out:
-	nilfs_btree_clear_path(btree, path);
-	nilfs_btree_free_path(btree, path);
+	nilfs_btree_clear_path(path);
+	nilfs_btree_free_path(path);
 
 	return ret;
 }
@@ -2037,12 +1999,12 @@ static void nilfs_btree_add_dirty_buffer(struct nilfs_btree *btree,
 
 	get_bh(bh);
 	node = (struct nilfs_btree_node *)bh->b_data;
-	key = nilfs_btree_node_get_key(btree, node, 0);
-	level = nilfs_btree_node_get_level(btree, node);
+	key = nilfs_btree_node_get_key(node, 0);
+	level = nilfs_btree_node_get_level(node);
 	list_for_each(head, &lists[level]) {
 		cbh = list_entry(head, struct buffer_head, b_assoc_buffers);
 		cnode = (struct nilfs_btree_node *)cbh->b_data;
-		ckey = nilfs_btree_node_get_key(btree, cnode, 0);
+		ckey = nilfs_btree_node_get_key(cnode, 0);
 		if (key < ckey)
 			break;
 	}
@@ -2120,8 +2082,7 @@ static int nilfs_btree_assign_p(struct nilfs_btree *btree,
 	nilfs_btree_node_set_ptr(btree, parent,
 				 path[level + 1].bp_index, blocknr);
 
-	key = nilfs_btree_node_get_key(btree, parent,
-				       path[level + 1].bp_index);
+	key = nilfs_btree_node_get_key(parent, path[level + 1].bp_index);
 	/* on-disk format */
 	binfo->bi_dat.bi_blkoff = nilfs_bmap_key_to_dkey(key);
 	binfo->bi_dat.bi_level = level;
@@ -2150,8 +2111,7 @@ static int nilfs_btree_assign_v(struct nilfs_btree *btree,
 	if (unlikely(ret < 0))
 		return ret;
 
-	key = nilfs_btree_node_get_key(btree, parent,
-				       path[level + 1].bp_index);
+	key = nilfs_btree_node_get_key(parent, path[level + 1].bp_index);
 	/* on-disk format */
 	binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr);
 	binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key);
@@ -2171,15 +2131,15 @@ static int nilfs_btree_assign(struct nilfs_bmap *bmap,
 	int level, ret;
 
 	btree = (struct nilfs_btree *)bmap;
-	path = nilfs_btree_alloc_path(btree);
+	path = nilfs_btree_alloc_path();
 	if (path == NULL)
 		return -ENOMEM;
-	nilfs_btree_init_path(btree, path);
+	nilfs_btree_init_path(path);
 
 	if (buffer_nilfs_node(*bh)) {
 		node = (struct nilfs_btree_node *)(*bh)->b_data;
-		key = nilfs_btree_node_get_key(btree, node, 0);
-		level = nilfs_btree_node_get_level(btree, node);
+		key = nilfs_btree_node_get_key(node, 0);
+		level = nilfs_btree_node_get_level(node);
 	} else {
 		key = nilfs_bmap_data_get_key(bmap, *bh);
 		level = NILFS_BTREE_LEVEL_DATA;
@@ -2196,8 +2156,8 @@ static int nilfs_btree_assign(struct nilfs_bmap *bmap,
 		nilfs_btree_assign_p(btree, path, level, bh, blocknr, binfo);
 
  out:
-	nilfs_btree_clear_path(btree, path);
-	nilfs_btree_free_path(btree, path);
+	nilfs_btree_clear_path(path);
+	nilfs_btree_free_path(path);
 
 	return ret;
 }
@@ -2219,7 +2179,7 @@ static int nilfs_btree_assign_gc(struct nilfs_bmap *bmap,
 
 	if (buffer_nilfs_node(*bh)) {
 		node = (struct nilfs_btree_node *)(*bh)->b_data;
-		key = nilfs_btree_node_get_key(btree, node, 0);
+		key = nilfs_btree_node_get_key(node, 0);
 	} else
 		key = nilfs_bmap_data_get_key(bmap, *bh);
 
@@ -2239,10 +2199,10 @@ static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level)
 	int ret;
 
 	btree = (struct nilfs_btree *)bmap;
-	path = nilfs_btree_alloc_path(btree);
+	path = nilfs_btree_alloc_path();
 	if (path == NULL)
 		return -ENOMEM;
-	nilfs_btree_init_path(btree, path);
+	nilfs_btree_init_path(path);
 
 	ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level + 1);
 	if (ret < 0) {
@@ -2262,8 +2222,8 @@ static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level)
 		nilfs_bmap_set_dirty(&btree->bt_bmap);
 
  out:
-	nilfs_btree_clear_path(btree, path);
-	nilfs_btree_free_path(btree, path);
+	nilfs_btree_clear_path(path);
+	nilfs_btree_free_path(path);
 	return ret;
 }
 
-- 
cgit v1.2.3-70-g09d2


From 3218929dbd25245e0f601df1e359a3ed3f7fb03b Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Sat, 15 Aug 2009 01:54:59 +0900
Subject: nilfs2: stop zero-fill of btree path just before free it

The btree path object is cleared just before it is freed.

This will remove the code doing the unnecessary clear operation.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/btree.c | 36 ++++++++++++------------------------
 1 file changed, 12 insertions(+), 24 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index 21ed8ccea4b9..115b157d508b 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -97,25 +97,13 @@ static void nilfs_btree_init_path(struct nilfs_btree_path *path)
 	}
 }
 
-static void nilfs_btree_clear_path(struct nilfs_btree_path *path)
+static void nilfs_btree_release_path(struct nilfs_btree_path *path)
 {
 	int level;
 
-	for (level = NILFS_BTREE_LEVEL_DATA;
-	     level < NILFS_BTREE_LEVEL_MAX;
-	     level++) {
-		if (path[level].bp_bh != NULL) {
-			brelse(path[level].bp_bh);
-			path[level].bp_bh = NULL;
-		}
-		/* sib_bh is released or deleted by prepare or commit
-		 * operations. */
-		path[level].bp_sib_bh = NULL;
-		path[level].bp_index = 0;
-		path[level].bp_oldreq.bpr_ptr = NILFS_BMAP_INVALID_PTR;
-		path[level].bp_newreq.bpr_ptr = NILFS_BMAP_INVALID_PTR;
-		path[level].bp_op = NULL;
-	}
+	for (level = NILFS_BTREE_LEVEL_DATA; level < NILFS_BTREE_LEVEL_MAX;
+	     level++)
+		brelse(path[level].bp_bh);
 }
 
 /*
@@ -557,7 +545,7 @@ static int nilfs_btree_lookup(const struct nilfs_bmap *bmap,
 	if (ptrp != NULL)
 		*ptrp = ptr;
 
-	nilfs_btree_clear_path(path);
+	nilfs_btree_release_path(path);
 	nilfs_btree_free_path(path);
 
 	return ret;
@@ -639,7 +627,7 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
 	*ptrp = ptr;
 	ret = cnt;
  out:
-	nilfs_btree_clear_path(path);
+	nilfs_btree_release_path(path);
 	nilfs_btree_free_path(path);
 	return ret;
 }
@@ -1146,7 +1134,7 @@ static int nilfs_btree_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
 	nilfs_bmap_add_blocks(bmap, stats.bs_nblocks);
 
  out:
-	nilfs_btree_clear_path(path);
+	nilfs_btree_release_path(path);
 	nilfs_btree_free_path(path);
 	return ret;
 }
@@ -1501,7 +1489,7 @@ static int nilfs_btree_delete(struct nilfs_bmap *bmap, __u64 key)
 	nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks);
 
 out:
-	nilfs_btree_clear_path(path);
+	nilfs_btree_release_path(path);
 	nilfs_btree_free_path(path);
 	return ret;
 }
@@ -1520,7 +1508,7 @@ static int nilfs_btree_last_key(const struct nilfs_bmap *bmap, __u64 *keyp)
 
 	ret = nilfs_btree_do_lookup_last(btree, path, keyp, NULL);
 
-	nilfs_btree_clear_path(path);
+	nilfs_btree_release_path(path);
 	nilfs_btree_free_path(path);
 
 	return ret;
@@ -1975,7 +1963,7 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap,
 		nilfs_btree_propagate_p(btree, path, level, bh);
 
  out:
-	nilfs_btree_clear_path(path);
+	nilfs_btree_release_path(path);
 	nilfs_btree_free_path(path);
 
 	return ret;
@@ -2156,7 +2144,7 @@ static int nilfs_btree_assign(struct nilfs_bmap *bmap,
 		nilfs_btree_assign_p(btree, path, level, bh, blocknr, binfo);
 
  out:
-	nilfs_btree_clear_path(path);
+	nilfs_btree_release_path(path);
 	nilfs_btree_free_path(path);
 
 	return ret;
@@ -2222,7 +2210,7 @@ static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level)
 		nilfs_bmap_set_dirty(&btree->bt_bmap);
 
  out:
-	nilfs_btree_clear_path(path);
+	nilfs_btree_release_path(path);
 	nilfs_btree_free_path(path);
 	return ret;
 }
-- 
cgit v1.2.3-70-g09d2


From 7a102b09232be1ad7c180dfd1f46c7aa95dff1e0 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Sat, 15 Aug 2009 13:47:09 +0900
Subject: nilfs2: remove individual gfp constants for each metadata file

This gets rid of NILFS_CPFILE_GFP, NILFS_SUFILE_GFP, NILFS_DAT_GFP,
and NILFS_IFILE_GFP.  All of these constants refer to NILFS_MDT_GFP,
and can be removed.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/cpfile.h    |  2 --
 fs/nilfs2/dat.h       |  1 -
 fs/nilfs2/ifile.h     |  1 -
 fs/nilfs2/mdt.c       |  5 +++--
 fs/nilfs2/mdt.h       |  3 +--
 fs/nilfs2/sufile.h    |  1 -
 fs/nilfs2/super.c     |  3 +--
 fs/nilfs2/the_nilfs.c | 12 ++++--------
 8 files changed, 9 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h
index 788a45950197..debea896e701 100644
--- a/fs/nilfs2/cpfile.h
+++ b/fs/nilfs2/cpfile.h
@@ -27,8 +27,6 @@
 #include <linux/buffer_head.h>
 #include <linux/nilfs2_fs.h>
 
-#define NILFS_CPFILE_GFP	NILFS_MDT_GFP
-
 
 int nilfs_cpfile_get_checkpoint(struct inode *, __u64, int,
 				struct nilfs_checkpoint **,
diff --git a/fs/nilfs2/dat.h b/fs/nilfs2/dat.h
index 4b0b47ea85ed..91dc3372a39a 100644
--- a/fs/nilfs2/dat.h
+++ b/fs/nilfs2/dat.h
@@ -27,7 +27,6 @@
 #include <linux/buffer_head.h>
 #include <linux/fs.h>
 
-#define NILFS_DAT_GFP	NILFS_MDT_GFP
 
 struct nilfs_palloc_req;
 
diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h
index 5d30a35679b5..ecc3ba76db47 100644
--- a/fs/nilfs2/ifile.h
+++ b/fs/nilfs2/ifile.h
@@ -31,7 +31,6 @@
 #include "mdt.h"
 #include "alloc.h"
 
-#define NILFS_IFILE_GFP  NILFS_MDT_GFP
 
 static inline struct nilfs_inode *
 nilfs_ifile_map_inode(struct inode *ifile, ino_t ino, struct buffer_head *ibh)
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 1ae8d56052c3..9cb831899119 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -513,9 +513,10 @@ nilfs_mdt_new_common(struct the_nilfs *nilfs, struct super_block *sb,
 }
 
 struct inode *nilfs_mdt_new(struct the_nilfs *nilfs, struct super_block *sb,
-			    ino_t ino, gfp_t gfp_mask)
+			    ino_t ino)
 {
-	struct inode *inode = nilfs_mdt_new_common(nilfs, sb, ino, gfp_mask);
+	struct inode *inode = nilfs_mdt_new_common(nilfs, sb, ino,
+						   NILFS_MDT_GFP);
 
 	if (!inode)
 		return NULL;
diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h
index df683e0bca6a..431599733c9b 100644
--- a/fs/nilfs2/mdt.h
+++ b/fs/nilfs2/mdt.h
@@ -74,8 +74,7 @@ int nilfs_mdt_forget_block(struct inode *, unsigned long);
 int nilfs_mdt_mark_block_dirty(struct inode *, unsigned long);
 int nilfs_mdt_fetch_dirty(struct inode *);
 
-struct inode *nilfs_mdt_new(struct the_nilfs *, struct super_block *, ino_t,
-			    gfp_t);
+struct inode *nilfs_mdt_new(struct the_nilfs *, struct super_block *, ino_t);
 struct inode *nilfs_mdt_new_common(struct the_nilfs *, struct super_block *,
 				   ino_t, gfp_t);
 void nilfs_mdt_destroy(struct inode *);
diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h
index a2c4d76c3366..0e99e5c0bd0f 100644
--- a/fs/nilfs2/sufile.h
+++ b/fs/nilfs2/sufile.h
@@ -28,7 +28,6 @@
 #include <linux/nilfs2_fs.h>
 #include "mdt.h"
 
-#define NILFS_SUFILE_GFP	NILFS_MDT_GFP
 
 static inline unsigned long nilfs_sufile_get_nsegments(struct inode *sufile)
 {
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 50284add7880..55f3d6b60732 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -363,8 +363,7 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno)
 	list_add(&sbi->s_list, &nilfs->ns_supers);
 	up_write(&nilfs->ns_super_sem);
 
-	sbi->s_ifile = nilfs_mdt_new(
-		nilfs, sbi->s_super, NILFS_IFILE_INO, NILFS_IFILE_GFP);
+	sbi->s_ifile = nilfs_mdt_new(nilfs, sbi->s_super, NILFS_IFILE_INO);
 	if (!sbi->s_ifile)
 		return -ENOMEM;
 
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 0c4573653b87..d4168e269c5d 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -187,23 +187,19 @@ static int nilfs_load_super_root(struct the_nilfs *nilfs,
 	inode_size = nilfs->ns_inode_size;
 
 	err = -ENOMEM;
-	nilfs->ns_dat = nilfs_mdt_new(
-		nilfs, NULL, NILFS_DAT_INO, NILFS_DAT_GFP);
+	nilfs->ns_dat = nilfs_mdt_new(nilfs, NULL, NILFS_DAT_INO);
 	if (unlikely(!nilfs->ns_dat))
 		goto failed;
 
-	nilfs->ns_gc_dat = nilfs_mdt_new(
-		nilfs, NULL, NILFS_DAT_INO, NILFS_DAT_GFP);
+	nilfs->ns_gc_dat = nilfs_mdt_new(nilfs, NULL, NILFS_DAT_INO);
 	if (unlikely(!nilfs->ns_gc_dat))
 		goto failed_dat;
 
-	nilfs->ns_cpfile = nilfs_mdt_new(
-		nilfs, NULL, NILFS_CPFILE_INO, NILFS_CPFILE_GFP);
+	nilfs->ns_cpfile = nilfs_mdt_new(nilfs, NULL, NILFS_CPFILE_INO);
 	if (unlikely(!nilfs->ns_cpfile))
 		goto failed_gc_dat;
 
-	nilfs->ns_sufile = nilfs_mdt_new(
-		nilfs, NULL, NILFS_SUFILE_INO, NILFS_SUFILE_GFP);
+	nilfs->ns_sufile = nilfs_mdt_new(nilfs, NULL, NILFS_SUFILE_INO);
 	if (unlikely(!nilfs->ns_sufile))
 		goto failed_cpfile;
 
-- 
cgit v1.2.3-70-g09d2


From bd8169efae8bdd292675c386010f6b35f0771057 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Sat, 15 Aug 2009 17:22:13 +0900
Subject: nilfs2: add update functions of virtual block address to dat

This is a preparation for the successive cleanup ("nilfs2: allow btree
to directly call dat operations").

This adds functions bundling a few operations to change an entry of
virtual block address on the dat file.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/bmap.c | 27 +++++++--------------------
 fs/nilfs2/dat.c  | 31 +++++++++++++++++++++++++++++++
 fs/nilfs2/dat.h  |  6 ++++++
 3 files changed, 44 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index 99d58a028b94..13e95a907ece 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -533,38 +533,25 @@ int nilfs_bmap_prepare_update_v(struct nilfs_bmap *bmap,
 				union nilfs_bmap_ptr_req *oldreq,
 				union nilfs_bmap_ptr_req *newreq)
 {
-	struct inode *dat = nilfs_bmap_get_dat(bmap);
-	int ret;
-
-	ret = nilfs_dat_prepare_end(dat, &oldreq->bpr_req);
-	if (ret < 0)
-		return ret;
-	ret = nilfs_dat_prepare_alloc(dat, &newreq->bpr_req);
-	if (ret < 0)
-		nilfs_dat_abort_end(dat, &oldreq->bpr_req);
-
-	return ret;
+	return nilfs_dat_prepare_update(nilfs_bmap_get_dat(bmap),
+					&oldreq->bpr_req, &newreq->bpr_req);
 }
 
 void nilfs_bmap_commit_update_v(struct nilfs_bmap *bmap,
 				union nilfs_bmap_ptr_req *oldreq,
 				union nilfs_bmap_ptr_req *newreq)
 {
-	struct inode *dat = nilfs_bmap_get_dat(bmap);
-
-	nilfs_dat_commit_end(dat, &oldreq->bpr_req,
-			     bmap->b_ptr_type == NILFS_BMAP_PTR_VS);
-	nilfs_dat_commit_alloc(dat, &newreq->bpr_req);
+	nilfs_dat_commit_update(nilfs_bmap_get_dat(bmap),
+				&oldreq->bpr_req, &newreq->bpr_req,
+				bmap->b_ptr_type == NILFS_BMAP_PTR_VS);
 }
 
 void nilfs_bmap_abort_update_v(struct nilfs_bmap *bmap,
 			       union nilfs_bmap_ptr_req *oldreq,
 			       union nilfs_bmap_ptr_req *newreq)
 {
-	struct inode *dat = nilfs_bmap_get_dat(bmap);
-
-	nilfs_dat_abort_end(dat, &oldreq->bpr_req);
-	nilfs_dat_abort_alloc(dat, &newreq->bpr_req);
+	nilfs_dat_abort_update(nilfs_bmap_get_dat(bmap),
+			       &oldreq->bpr_req, &newreq->bpr_req);
 }
 
 static struct lock_class_key nilfs_bmap_dat_lock_key;
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index 1cfcc1a0fdf1..1ff8e15bd36b 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -211,6 +211,37 @@ void nilfs_dat_abort_end(struct inode *dat, struct nilfs_palloc_req *req)
 	nilfs_dat_abort_entry(dat, req);
 }
 
+int nilfs_dat_prepare_update(struct inode *dat,
+			     struct nilfs_palloc_req *oldreq,
+			     struct nilfs_palloc_req *newreq)
+{
+	int ret;
+
+	ret = nilfs_dat_prepare_end(dat, oldreq);
+	if (!ret) {
+		ret = nilfs_dat_prepare_alloc(dat, newreq);
+		if (ret < 0)
+			nilfs_dat_abort_end(dat, oldreq);
+	}
+	return ret;
+}
+
+void nilfs_dat_commit_update(struct inode *dat,
+			     struct nilfs_palloc_req *oldreq,
+			     struct nilfs_palloc_req *newreq, int dead)
+{
+	nilfs_dat_commit_end(dat, oldreq, dead);
+	nilfs_dat_commit_alloc(dat, newreq);
+}
+
+void nilfs_dat_abort_update(struct inode *dat,
+			    struct nilfs_palloc_req *oldreq,
+			    struct nilfs_palloc_req *newreq)
+{
+	nilfs_dat_abort_end(dat, oldreq);
+	nilfs_dat_abort_alloc(dat, newreq);
+}
+
 /**
  * nilfs_dat_mark_dirty -
  * @dat: DAT file inode
diff --git a/fs/nilfs2/dat.h b/fs/nilfs2/dat.h
index 91dc3372a39a..406070d3ff49 100644
--- a/fs/nilfs2/dat.h
+++ b/fs/nilfs2/dat.h
@@ -41,6 +41,12 @@ void nilfs_dat_commit_start(struct inode *, struct nilfs_palloc_req *,
 int nilfs_dat_prepare_end(struct inode *, struct nilfs_palloc_req *);
 void nilfs_dat_commit_end(struct inode *, struct nilfs_palloc_req *, int);
 void nilfs_dat_abort_end(struct inode *, struct nilfs_palloc_req *);
+int nilfs_dat_prepare_update(struct inode *, struct nilfs_palloc_req *,
+			     struct nilfs_palloc_req *);
+void nilfs_dat_commit_update(struct inode *, struct nilfs_palloc_req *,
+			     struct nilfs_palloc_req *, int);
+void nilfs_dat_abort_update(struct inode *, struct nilfs_palloc_req *,
+			    struct nilfs_palloc_req *);
 
 int nilfs_dat_mark_dirty(struct inode *, __u64);
 int nilfs_dat_freev(struct inode *, __u64 *, size_t);
-- 
cgit v1.2.3-70-g09d2


From 2e0c2c73923fed27337039ddfd69985e6c4b91fe Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Sat, 15 Aug 2009 15:34:33 +0900
Subject: nilfs2: allow btree code to directly call dat operations

The current btree code is written so that btree functions call dat
operations via wrapper functions in bmap.c when they allocate, free,
or modify virtual block addresses.

This abstraction requires additional function calls and causes
frequent call of nilfs_bmap_get_dat() function since it is used in the
every wrapper function.

This removes the wrapper functions and makes them available from
btree.c and direct.c, which will increase the opportunity of
compiler optimization.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/bmap.c   |  85 ----------------------------
 fs/nilfs2/bmap.h   |  69 +++++++++--------------
 fs/nilfs2/btree.c  | 151 +++++++++++++++++++++++++++----------------------
 fs/nilfs2/direct.c | 161 +++++++++++++++++++----------------------------------
 4 files changed, 167 insertions(+), 299 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index 13e95a907ece..f98c5c4cf6e7 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -469,91 +469,6 @@ __u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *bmap)
 		(entries_per_group / NILFS_BMAP_GROUP_DIV);
 }
 
-int nilfs_bmap_prepare_alloc_v(struct nilfs_bmap *bmap,
-				 union nilfs_bmap_ptr_req *req)
-{
-	return nilfs_dat_prepare_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req);
-}
-
-void nilfs_bmap_commit_alloc_v(struct nilfs_bmap *bmap,
-				 union nilfs_bmap_ptr_req *req)
-{
-	nilfs_dat_commit_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req);
-}
-
-void nilfs_bmap_abort_alloc_v(struct nilfs_bmap *bmap,
-			      union nilfs_bmap_ptr_req *req)
-{
-	nilfs_dat_abort_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req);
-}
-
-int nilfs_bmap_start_v(struct nilfs_bmap *bmap, union nilfs_bmap_ptr_req *req,
-		       sector_t blocknr)
-{
-	struct inode *dat = nilfs_bmap_get_dat(bmap);
-	int ret;
-
-	ret = nilfs_dat_prepare_start(dat, &req->bpr_req);
-	if (likely(!ret))
-		nilfs_dat_commit_start(dat, &req->bpr_req, blocknr);
-	return ret;
-}
-
-int nilfs_bmap_prepare_end_v(struct nilfs_bmap *bmap,
-			     union nilfs_bmap_ptr_req *req)
-{
-	return nilfs_dat_prepare_end(nilfs_bmap_get_dat(bmap), &req->bpr_req);
-}
-
-void nilfs_bmap_commit_end_v(struct nilfs_bmap *bmap,
-			     union nilfs_bmap_ptr_req *req)
-{
-	nilfs_dat_commit_end(nilfs_bmap_get_dat(bmap), &req->bpr_req,
-			     bmap->b_ptr_type == NILFS_BMAP_PTR_VS);
-}
-
-void nilfs_bmap_abort_end_v(struct nilfs_bmap *bmap,
-			    union nilfs_bmap_ptr_req *req)
-{
-	nilfs_dat_abort_end(nilfs_bmap_get_dat(bmap), &req->bpr_req);
-}
-
-int nilfs_bmap_move_v(const struct nilfs_bmap *bmap, __u64 vblocknr,
-		      sector_t blocknr)
-{
-	return nilfs_dat_move(nilfs_bmap_get_dat(bmap), vblocknr, blocknr);
-}
-
-int nilfs_bmap_mark_dirty(const struct nilfs_bmap *bmap, __u64 vblocknr)
-{
-	return nilfs_dat_mark_dirty(nilfs_bmap_get_dat(bmap), vblocknr);
-}
-
-int nilfs_bmap_prepare_update_v(struct nilfs_bmap *bmap,
-				union nilfs_bmap_ptr_req *oldreq,
-				union nilfs_bmap_ptr_req *newreq)
-{
-	return nilfs_dat_prepare_update(nilfs_bmap_get_dat(bmap),
-					&oldreq->bpr_req, &newreq->bpr_req);
-}
-
-void nilfs_bmap_commit_update_v(struct nilfs_bmap *bmap,
-				union nilfs_bmap_ptr_req *oldreq,
-				union nilfs_bmap_ptr_req *newreq)
-{
-	nilfs_dat_commit_update(nilfs_bmap_get_dat(bmap),
-				&oldreq->bpr_req, &newreq->bpr_req,
-				bmap->b_ptr_type == NILFS_BMAP_PTR_VS);
-}
-
-void nilfs_bmap_abort_update_v(struct nilfs_bmap *bmap,
-			       union nilfs_bmap_ptr_req *oldreq,
-			       union nilfs_bmap_ptr_req *newreq)
-{
-	nilfs_dat_abort_update(nilfs_bmap_get_dat(bmap),
-			       &oldreq->bpr_req, &newreq->bpr_req);
-}
-
 static struct lock_class_key nilfs_bmap_dat_lock_key;
 static struct lock_class_key nilfs_bmap_mdt_lock_key;
 
diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h
index b2890cdcef12..a4f64e54424c 100644
--- a/fs/nilfs2/bmap.h
+++ b/fs/nilfs2/bmap.h
@@ -28,6 +28,7 @@
 #include <linux/buffer_head.h>
 #include <linux/nilfs2_fs.h>
 #include "alloc.h"
+#include "dat.h"
 
 #define NILFS_BMAP_INVALID_PTR	0
 
@@ -164,86 +165,66 @@ void nilfs_bmap_commit_gcdat(struct nilfs_bmap *, struct nilfs_bmap *);
  * Internal use only
  */
 struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *);
-int nilfs_bmap_prepare_alloc_v(struct nilfs_bmap *,
-			       union nilfs_bmap_ptr_req *);
-void nilfs_bmap_commit_alloc_v(struct nilfs_bmap *,
-			       union nilfs_bmap_ptr_req *);
-void nilfs_bmap_abort_alloc_v(struct nilfs_bmap *,
-			      union nilfs_bmap_ptr_req *);
 
 static inline int nilfs_bmap_prepare_alloc_ptr(struct nilfs_bmap *bmap,
-					       union nilfs_bmap_ptr_req *req)
+					       union nilfs_bmap_ptr_req *req,
+					       struct inode *dat)
 {
-	if (NILFS_BMAP_USE_VBN(bmap))
-		return nilfs_bmap_prepare_alloc_v(bmap, req);
+	if (dat)
+		return nilfs_dat_prepare_alloc(dat, &req->bpr_req);
 	/* ignore target ptr */
 	req->bpr_ptr = bmap->b_last_allocated_ptr++;
 	return 0;
 }
 
 static inline void nilfs_bmap_commit_alloc_ptr(struct nilfs_bmap *bmap,
-					       union nilfs_bmap_ptr_req *req)
+					       union nilfs_bmap_ptr_req *req,
+					       struct inode *dat)
 {
-	if (NILFS_BMAP_USE_VBN(bmap))
-		nilfs_bmap_commit_alloc_v(bmap, req);
+	if (dat)
+		nilfs_dat_commit_alloc(dat, &req->bpr_req);
 }
 
 static inline void nilfs_bmap_abort_alloc_ptr(struct nilfs_bmap *bmap,
-					      union nilfs_bmap_ptr_req *req)
+					      union nilfs_bmap_ptr_req *req,
+					      struct inode *dat)
 {
-	if (NILFS_BMAP_USE_VBN(bmap))
-		nilfs_bmap_abort_alloc_v(bmap, req);
+	if (dat)
+		nilfs_dat_abort_alloc(dat, &req->bpr_req);
 	else
 		bmap->b_last_allocated_ptr--;
 }
 
-int nilfs_bmap_prepare_end_v(struct nilfs_bmap *, union nilfs_bmap_ptr_req *);
-void nilfs_bmap_commit_end_v(struct nilfs_bmap *, union nilfs_bmap_ptr_req *);
-void nilfs_bmap_abort_end_v(struct nilfs_bmap *, union nilfs_bmap_ptr_req *);
-
 static inline int nilfs_bmap_prepare_end_ptr(struct nilfs_bmap *bmap,
-					     union nilfs_bmap_ptr_req *req)
+					     union nilfs_bmap_ptr_req *req,
+					     struct inode *dat)
 {
-	return NILFS_BMAP_USE_VBN(bmap) ?
-		nilfs_bmap_prepare_end_v(bmap, req) : 0;
+	return dat ? nilfs_dat_prepare_end(dat, &req->bpr_req) : 0;
 }
 
 static inline void nilfs_bmap_commit_end_ptr(struct nilfs_bmap *bmap,
-					     union nilfs_bmap_ptr_req *req)
+					     union nilfs_bmap_ptr_req *req,
+					     struct inode *dat)
 {
-	if (NILFS_BMAP_USE_VBN(bmap))
-		nilfs_bmap_commit_end_v(bmap, req);
+	if (dat)
+		nilfs_dat_commit_end(dat, &req->bpr_req,
+				     bmap->b_ptr_type == NILFS_BMAP_PTR_VS);
 }
 
 static inline void nilfs_bmap_abort_end_ptr(struct nilfs_bmap *bmap,
-					    union nilfs_bmap_ptr_req *req)
+					    union nilfs_bmap_ptr_req *req,
+					    struct inode *dat)
 {
-	if (NILFS_BMAP_USE_VBN(bmap))
-		nilfs_bmap_abort_end_v(bmap, req);
+	if (dat)
+		nilfs_dat_abort_end(dat, &req->bpr_req);
 }
 
-int nilfs_bmap_start_v(struct nilfs_bmap *, union nilfs_bmap_ptr_req *,
-		       sector_t);
-int nilfs_bmap_move_v(const struct nilfs_bmap *, __u64, sector_t);
-int nilfs_bmap_mark_dirty(const struct nilfs_bmap *, __u64);
-
-
 __u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *,
 			      const struct buffer_head *);
 
 __u64 nilfs_bmap_find_target_seq(const struct nilfs_bmap *, __u64);
 __u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *);
 
-int nilfs_bmap_prepare_update_v(struct nilfs_bmap *,
-				union nilfs_bmap_ptr_req *,
-				union nilfs_bmap_ptr_req *);
-void nilfs_bmap_commit_update_v(struct nilfs_bmap *,
-				union nilfs_bmap_ptr_req *,
-				union nilfs_bmap_ptr_req *);
-void nilfs_bmap_abort_update_v(struct nilfs_bmap *,
-			       union nilfs_bmap_ptr_req *,
-			       union nilfs_bmap_ptr_req *);
-
 void nilfs_bmap_add_blocks(const struct nilfs_bmap *, int);
 void nilfs_bmap_sub_blocks(const struct nilfs_bmap *, int);
 
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index 115b157d508b..e25b507a474f 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -940,17 +940,20 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
 	struct nilfs_btree_node *node, *parent, *sib;
 	__u64 sibptr;
 	int pindex, level, ret;
+	struct inode *dat = NULL;
 
 	stats->bs_nblocks = 0;
 	level = NILFS_BTREE_LEVEL_DATA;
 
 	/* allocate a new ptr for data block */
-	if (NILFS_BMAP_USE_VBN(&btree->bt_bmap))
+	if (NILFS_BMAP_USE_VBN(&btree->bt_bmap)) {
 		path[level].bp_newreq.bpr_ptr =
 			nilfs_btree_find_target_v(btree, path, key);
+		dat = nilfs_bmap_get_dat(&btree->bt_bmap);
+	}
 
 	ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap,
-					   &path[level].bp_newreq);
+					   &path[level].bp_newreq, dat);
 	if (ret < 0)
 		goto err_out_data;
 
@@ -1009,7 +1012,7 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
 		path[level].bp_newreq.bpr_ptr =
 			path[level - 1].bp_newreq.bpr_ptr + 1;
 		ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap,
-						   &path[level].bp_newreq);
+						   &path[level].bp_newreq, dat);
 		if (ret < 0)
 			goto err_out_child_node;
 		ret = nilfs_btree_get_new_block(btree,
@@ -1041,7 +1044,7 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
 	/* grow */
 	path[level].bp_newreq.bpr_ptr = path[level - 1].bp_newreq.bpr_ptr + 1;
 	ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap,
-					   &path[level].bp_newreq);
+					   &path[level].bp_newreq, dat);
 	if (ret < 0)
 		goto err_out_child_node;
 	ret = nilfs_btree_get_new_block(btree, path[level].bp_newreq.bpr_ptr,
@@ -1069,16 +1072,18 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
 
 	/* error */
  err_out_curr_node:
-	nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq);
+	nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq,
+				   dat);
  err_out_child_node:
 	for (level--; level > NILFS_BTREE_LEVEL_DATA; level--) {
 		nilfs_btnode_delete(path[level].bp_sib_bh);
 		nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap,
-					   &path[level].bp_newreq);
+					   &path[level].bp_newreq, dat);
 
 	}
 
-	nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq);
+	nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq,
+				   dat);
  err_out_data:
 	*levelp = level;
 	stats->bs_nblocks = 0;
@@ -1089,16 +1094,19 @@ static void nilfs_btree_commit_insert(struct nilfs_btree *btree,
 				      struct nilfs_btree_path *path,
 				      int maxlevel, __u64 key, __u64 ptr)
 {
+	struct inode *dat = NULL;
 	int level;
 
 	set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr));
 	ptr = path[NILFS_BTREE_LEVEL_DATA].bp_newreq.bpr_ptr;
-	if (NILFS_BMAP_USE_VBN(&btree->bt_bmap))
+	if (NILFS_BMAP_USE_VBN(&btree->bt_bmap)) {
 		nilfs_btree_set_target_v(btree, key, ptr);
+		dat = nilfs_bmap_get_dat(&btree->bt_bmap);
+	}
 
 	for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) {
 		nilfs_bmap_commit_alloc_ptr(&btree->bt_bmap,
-					    &path[level - 1].bp_newreq);
+					    &path[level - 1].bp_newreq, dat);
 		path[level].bp_op(btree, path, level, &key, &ptr);
 	}
 
@@ -1326,7 +1334,8 @@ static void nilfs_btree_shrink(struct nilfs_btree *btree,
 static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
 				      struct nilfs_btree_path *path,
 				      int *levelp,
-				      struct nilfs_bmap_stats *stats)
+				      struct nilfs_bmap_stats *stats,
+				      struct inode *dat)
 {
 	struct buffer_head *bh;
 	struct nilfs_btree_node *node, *parent, *sib;
@@ -1343,7 +1352,7 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
 			nilfs_btree_node_get_ptr(btree, node,
 						 path[level].bp_index);
 		ret = nilfs_bmap_prepare_end_ptr(&btree->bt_bmap,
-						 &path[level].bp_oldreq);
+						 &path[level].bp_oldreq, dat);
 		if (ret < 0)
 			goto err_out_child_node;
 
@@ -1421,7 +1430,7 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
 		nilfs_btree_node_get_ptr(btree, node, path[level].bp_index);
 
 	ret = nilfs_bmap_prepare_end_ptr(&btree->bt_bmap,
-					 &path[level].bp_oldreq);
+					 &path[level].bp_oldreq, dat);
 	if (ret < 0)
 		goto err_out_child_node;
 
@@ -1436,12 +1445,12 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
 
 	/* error */
  err_out_curr_node:
-	nilfs_bmap_abort_end_ptr(&btree->bt_bmap, &path[level].bp_oldreq);
+	nilfs_bmap_abort_end_ptr(&btree->bt_bmap, &path[level].bp_oldreq, dat);
  err_out_child_node:
 	for (level--; level >= NILFS_BTREE_LEVEL_NODE_MIN; level--) {
 		brelse(path[level].bp_sib_bh);
 		nilfs_bmap_abort_end_ptr(&btree->bt_bmap,
-					 &path[level].bp_oldreq);
+					 &path[level].bp_oldreq, dat);
 	}
 	*levelp = level;
 	stats->bs_nblocks = 0;
@@ -1450,13 +1459,13 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
 
 static void nilfs_btree_commit_delete(struct nilfs_btree *btree,
 				      struct nilfs_btree_path *path,
-				      int maxlevel)
+				      int maxlevel, struct inode *dat)
 {
 	int level;
 
 	for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) {
 		nilfs_bmap_commit_end_ptr(&btree->bt_bmap,
-					  &path[level].bp_oldreq);
+					  &path[level].bp_oldreq, dat);
 		path[level].bp_op(btree, path, level, NULL, NULL);
 	}
 
@@ -1470,6 +1479,7 @@ static int nilfs_btree_delete(struct nilfs_bmap *bmap, __u64 key)
 	struct nilfs_btree *btree;
 	struct nilfs_btree_path *path;
 	struct nilfs_bmap_stats stats;
+	struct inode *dat;
 	int level, ret;
 
 	btree = (struct nilfs_btree *)bmap;
@@ -1482,10 +1492,14 @@ static int nilfs_btree_delete(struct nilfs_bmap *bmap, __u64 key)
 	if (ret < 0)
 		goto out;
 
-	ret = nilfs_btree_prepare_delete(btree, path, &level, &stats);
+
+	dat = NILFS_BMAP_USE_VBN(&btree->bt_bmap) ?
+		nilfs_bmap_get_dat(&btree->bt_bmap) : NULL;
+
+	ret = nilfs_btree_prepare_delete(btree, path, &level, &stats, dat);
 	if (ret < 0)
 		goto out;
-	nilfs_btree_commit_delete(btree, path, level);
+	nilfs_btree_commit_delete(btree, path, level, dat);
 	nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks);
 
 out:
@@ -1610,18 +1624,20 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key,
 				       struct nilfs_bmap_stats *stats)
 {
 	struct buffer_head *bh;
-	struct nilfs_btree *btree;
+	struct nilfs_btree *btree = (struct nilfs_btree *)bmap;
+	struct inode *dat = NULL;
 	int ret;
 
-	btree = (struct nilfs_btree *)bmap;
 	stats->bs_nblocks = 0;
 
 	/* for data */
 	/* cannot find near ptr */
-	if (NILFS_BMAP_USE_VBN(bmap))
+	if (NILFS_BMAP_USE_VBN(bmap)) {
 		dreq->bpr_ptr = nilfs_btree_find_target_v(btree, NULL, key);
+		dat = nilfs_bmap_get_dat(bmap);
+	}
 
-	ret = nilfs_bmap_prepare_alloc_ptr(bmap, dreq);
+	ret = nilfs_bmap_prepare_alloc_ptr(bmap, dreq, dat);
 	if (ret < 0)
 		return ret;
 
@@ -1629,7 +1645,7 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key,
 	stats->bs_nblocks++;
 	if (nreq != NULL) {
 		nreq->bpr_ptr = dreq->bpr_ptr + 1;
-		ret = nilfs_bmap_prepare_alloc_ptr(bmap, nreq);
+		ret = nilfs_bmap_prepare_alloc_ptr(bmap, nreq, dat);
 		if (ret < 0)
 			goto err_out_dreq;
 
@@ -1646,9 +1662,9 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key,
 
 	/* error */
  err_out_nreq:
-	nilfs_bmap_abort_alloc_ptr(bmap, nreq);
+	nilfs_bmap_abort_alloc_ptr(bmap, nreq, dat);
  err_out_dreq:
-	nilfs_bmap_abort_alloc_ptr(bmap, dreq);
+	nilfs_bmap_abort_alloc_ptr(bmap, dreq, dat);
 	stats->bs_nblocks = 0;
 	return ret;
 
@@ -1663,8 +1679,9 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
 				      union nilfs_bmap_ptr_req *nreq,
 				      struct buffer_head *bh)
 {
-	struct nilfs_btree *btree;
+	struct nilfs_btree *btree = (struct nilfs_btree *)bmap;
 	struct nilfs_btree_node *node;
+	struct inode *dat;
 	__u64 tmpptr;
 
 	/* free resources */
@@ -1675,11 +1692,11 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
 	set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr));
 
 	/* convert and insert */
-	btree = (struct nilfs_btree *)bmap;
+	dat = NILFS_BMAP_USE_VBN(bmap) ? nilfs_bmap_get_dat(bmap) : NULL;
 	nilfs_btree_init(bmap);
 	if (nreq != NULL) {
-		nilfs_bmap_commit_alloc_ptr(bmap, dreq);
-		nilfs_bmap_commit_alloc_ptr(bmap, nreq);
+		nilfs_bmap_commit_alloc_ptr(bmap, dreq, dat);
+		nilfs_bmap_commit_alloc_ptr(bmap, nreq, dat);
 
 		/* create child node at level 1 */
 		lock_buffer(bh);
@@ -1701,7 +1718,7 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
 		nilfs_btree_node_init(btree, node, NILFS_BTREE_NODE_ROOT,
 				      2, 1, &keys[0], &tmpptr);
 	} else {
-		nilfs_bmap_commit_alloc_ptr(bmap, dreq);
+		nilfs_bmap_commit_alloc_ptr(bmap, dreq, dat);
 
 		/* create root node at level 1 */
 		node = nilfs_btree_get_root(btree);
@@ -1772,7 +1789,7 @@ static int nilfs_btree_propagate_p(struct nilfs_btree *btree,
 
 static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree,
 					struct nilfs_btree_path *path,
-					int level)
+					int level, struct inode *dat)
 {
 	struct nilfs_btree_node *parent;
 	int ret;
@@ -1782,9 +1799,8 @@ static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree,
 		nilfs_btree_node_get_ptr(btree, parent,
 					 path[level + 1].bp_index);
 	path[level].bp_newreq.bpr_ptr = path[level].bp_oldreq.bpr_ptr + 1;
-	ret = nilfs_bmap_prepare_update_v(&btree->bt_bmap,
-					  &path[level].bp_oldreq,
-					  &path[level].bp_newreq);
+	ret = nilfs_dat_prepare_update(dat, &path[level].bp_oldreq.bpr_req,
+				       &path[level].bp_newreq.bpr_req);
 	if (ret < 0)
 		return ret;
 
@@ -1796,9 +1812,9 @@ static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree,
 			&NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache,
 			&path[level].bp_ctxt);
 		if (ret < 0) {
-			nilfs_bmap_abort_update_v(&btree->bt_bmap,
-						  &path[level].bp_oldreq,
-						  &path[level].bp_newreq);
+			nilfs_dat_abort_update(dat,
+					       &path[level].bp_oldreq.bpr_req,
+					       &path[level].bp_newreq.bpr_req);
 			return ret;
 		}
 	}
@@ -1808,13 +1824,13 @@ static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree,
 
 static void nilfs_btree_commit_update_v(struct nilfs_btree *btree,
 					struct nilfs_btree_path *path,
-					int level)
+					int level, struct inode *dat)
 {
 	struct nilfs_btree_node *parent;
 
-	nilfs_bmap_commit_update_v(&btree->bt_bmap,
-				   &path[level].bp_oldreq,
-				   &path[level].bp_newreq);
+	nilfs_dat_commit_update(dat, &path[level].bp_oldreq.bpr_req,
+				&path[level].bp_newreq.bpr_req,
+				btree->bt_bmap.b_ptr_type == NILFS_BMAP_PTR_VS);
 
 	if (buffer_nilfs_node(path[level].bp_bh)) {
 		nilfs_btnode_commit_change_key(
@@ -1831,11 +1847,10 @@ static void nilfs_btree_commit_update_v(struct nilfs_btree *btree,
 
 static void nilfs_btree_abort_update_v(struct nilfs_btree *btree,
 				       struct nilfs_btree_path *path,
-				       int level)
+				       int level, struct inode *dat)
 {
-	nilfs_bmap_abort_update_v(&btree->bt_bmap,
-				  &path[level].bp_oldreq,
-				  &path[level].bp_newreq);
+	nilfs_dat_abort_update(dat, &path[level].bp_oldreq.bpr_req,
+			       &path[level].bp_newreq.bpr_req);
 	if (buffer_nilfs_node(path[level].bp_bh))
 		nilfs_btnode_abort_change_key(
 			&NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache,
@@ -1844,14 +1859,14 @@ static void nilfs_btree_abort_update_v(struct nilfs_btree *btree,
 
 static int nilfs_btree_prepare_propagate_v(struct nilfs_btree *btree,
 					   struct nilfs_btree_path *path,
-					   int minlevel,
-					   int *maxlevelp)
+					   int minlevel, int *maxlevelp,
+					   struct inode *dat)
 {
 	int level, ret;
 
 	level = minlevel;
 	if (!buffer_nilfs_volatile(path[level].bp_bh)) {
-		ret = nilfs_btree_prepare_update_v(btree, path, level);
+		ret = nilfs_btree_prepare_update_v(btree, path, level, dat);
 		if (ret < 0)
 			return ret;
 	}
@@ -1859,7 +1874,7 @@ static int nilfs_btree_prepare_propagate_v(struct nilfs_btree *btree,
 	       !buffer_dirty(path[level].bp_bh)) {
 
 		WARN_ON(buffer_nilfs_volatile(path[level].bp_bh));
-		ret = nilfs_btree_prepare_update_v(btree, path, level);
+		ret = nilfs_btree_prepare_update_v(btree, path, level, dat);
 		if (ret < 0)
 			goto out;
 	}
@@ -1871,39 +1886,40 @@ static int nilfs_btree_prepare_propagate_v(struct nilfs_btree *btree,
 	/* error */
  out:
 	while (--level > minlevel)
-		nilfs_btree_abort_update_v(btree, path, level);
+		nilfs_btree_abort_update_v(btree, path, level, dat);
 	if (!buffer_nilfs_volatile(path[level].bp_bh))
-		nilfs_btree_abort_update_v(btree, path, level);
+		nilfs_btree_abort_update_v(btree, path, level, dat);
 	return ret;
 }
 
 static void nilfs_btree_commit_propagate_v(struct nilfs_btree *btree,
 					   struct nilfs_btree_path *path,
-					   int minlevel,
-					   int maxlevel,
-					   struct buffer_head *bh)
+					   int minlevel, int maxlevel,
+					   struct buffer_head *bh,
+					   struct inode *dat)
 {
 	int level;
 
 	if (!buffer_nilfs_volatile(path[minlevel].bp_bh))
-		nilfs_btree_commit_update_v(btree, path, minlevel);
+		nilfs_btree_commit_update_v(btree, path, minlevel, dat);
 
 	for (level = minlevel + 1; level <= maxlevel; level++)
-		nilfs_btree_commit_update_v(btree, path, level);
+		nilfs_btree_commit_update_v(btree, path, level, dat);
 }
 
 static int nilfs_btree_propagate_v(struct nilfs_btree *btree,
 				   struct nilfs_btree_path *path,
-				   int level,
-				   struct buffer_head *bh)
+				   int level, struct buffer_head *bh)
 {
 	int maxlevel, ret;
 	struct nilfs_btree_node *parent;
+	struct inode *dat = nilfs_bmap_get_dat(&btree->bt_bmap);
 	__u64 ptr;
 
 	get_bh(bh);
 	path[level].bp_bh = bh;
-	ret = nilfs_btree_prepare_propagate_v(btree, path, level, &maxlevel);
+	ret = nilfs_btree_prepare_propagate_v(btree, path, level, &maxlevel,
+					      dat);
 	if (ret < 0)
 		goto out;
 
@@ -1911,12 +1927,12 @@ static int nilfs_btree_propagate_v(struct nilfs_btree *btree,
 		parent = nilfs_btree_get_node(btree, path, level + 1);
 		ptr = nilfs_btree_node_get_ptr(btree, parent,
 					       path[level + 1].bp_index);
-		ret = nilfs_bmap_mark_dirty(&btree->bt_bmap, ptr);
+		ret = nilfs_dat_mark_dirty(dat, ptr);
 		if (ret < 0)
 			goto out;
 	}
 
-	nilfs_btree_commit_propagate_v(btree, path, level, maxlevel, bh);
+	nilfs_btree_commit_propagate_v(btree, path, level, maxlevel, bh, dat);
 
  out:
 	brelse(path[level].bp_bh);
@@ -1972,7 +1988,7 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap,
 static int nilfs_btree_propagate_gc(const struct nilfs_bmap *bmap,
 				    struct buffer_head *bh)
 {
-	return nilfs_bmap_mark_dirty(bmap, bh->b_blocknr);
+	return nilfs_dat_mark_dirty(nilfs_bmap_get_dat(bmap), bh->b_blocknr);
 }
 
 static void nilfs_btree_add_dirty_buffer(struct nilfs_btree *btree,
@@ -2086,6 +2102,7 @@ static int nilfs_btree_assign_v(struct nilfs_btree *btree,
 				union nilfs_binfo *binfo)
 {
 	struct nilfs_btree_node *parent;
+	struct inode *dat = nilfs_bmap_get_dat(&btree->bt_bmap);
 	__u64 key;
 	__u64 ptr;
 	union nilfs_bmap_ptr_req req;
@@ -2095,9 +2112,10 @@ static int nilfs_btree_assign_v(struct nilfs_btree *btree,
 	ptr = nilfs_btree_node_get_ptr(btree, parent,
 				       path[level + 1].bp_index);
 	req.bpr_ptr = ptr;
-	ret = nilfs_bmap_start_v(&btree->bt_bmap, &req, blocknr);
-	if (unlikely(ret < 0))
+	ret = nilfs_dat_prepare_start(dat, &req.bpr_req);
+	if (ret < 0)
 		return ret;
+	nilfs_dat_commit_start(dat, &req.bpr_req, blocknr);
 
 	key = nilfs_btree_node_get_key(parent, path[level + 1].bp_index);
 	/* on-disk format */
@@ -2155,13 +2173,12 @@ static int nilfs_btree_assign_gc(struct nilfs_bmap *bmap,
 				 sector_t blocknr,
 				 union nilfs_binfo *binfo)
 {
-	struct nilfs_btree *btree;
 	struct nilfs_btree_node *node;
 	__u64 key;
 	int ret;
 
-	btree = (struct nilfs_btree *)bmap;
-	ret = nilfs_bmap_move_v(bmap, (*bh)->b_blocknr, blocknr);
+	ret = nilfs_dat_move(nilfs_bmap_get_dat(bmap), (*bh)->b_blocknr,
+			     blocknr);
 	if (ret < 0)
 		return ret;
 
diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c
index 342d9765df8d..d369ac718277 100644
--- a/fs/nilfs2/direct.c
+++ b/fs/nilfs2/direct.c
@@ -125,106 +125,64 @@ static void nilfs_direct_set_target_v(struct nilfs_direct *direct,
 	direct->d_bmap.b_last_allocated_ptr = ptr;
 }
 
-static int nilfs_direct_prepare_insert(struct nilfs_direct *direct,
-				       __u64 key,
-				       union nilfs_bmap_ptr_req *req,
-				       struct nilfs_bmap_stats *stats)
-{
-	int ret;
-
-	if (NILFS_BMAP_USE_VBN(&direct->d_bmap))
-		req->bpr_ptr = nilfs_direct_find_target_v(direct, key);
-	ret = nilfs_bmap_prepare_alloc_ptr(&direct->d_bmap, req);
-	if (ret < 0)
-		return ret;
-
-	stats->bs_nblocks = 1;
-	return 0;
-}
-
-static void nilfs_direct_commit_insert(struct nilfs_direct *direct,
-				       union nilfs_bmap_ptr_req *req,
-				       __u64 key, __u64 ptr)
-{
-	struct buffer_head *bh;
-
-	/* ptr must be a pointer to a buffer head. */
-	bh = (struct buffer_head *)((unsigned long)ptr);
-	set_buffer_nilfs_volatile(bh);
-
-	nilfs_bmap_commit_alloc_ptr(&direct->d_bmap, req);
-	nilfs_direct_set_ptr(direct, key, req->bpr_ptr);
-
-	if (!nilfs_bmap_dirty(&direct->d_bmap))
-		nilfs_bmap_set_dirty(&direct->d_bmap);
-
-	if (NILFS_BMAP_USE_VBN(&direct->d_bmap))
-		nilfs_direct_set_target_v(direct, key, req->bpr_ptr);
-}
-
 static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
 {
-	struct nilfs_direct *direct;
+	struct nilfs_direct *direct = (struct nilfs_direct *)bmap;
 	union nilfs_bmap_ptr_req req;
-	struct nilfs_bmap_stats stats;
+	struct inode *dat = NULL;
+	struct buffer_head *bh;
 	int ret;
 
-	direct = (struct nilfs_direct *)bmap;
 	if (key > NILFS_DIRECT_KEY_MAX)
 		return -ENOENT;
 	if (nilfs_direct_get_ptr(direct, key) != NILFS_BMAP_INVALID_PTR)
 		return -EEXIST;
 
-	ret = nilfs_direct_prepare_insert(direct, key, &req, &stats);
-	if (ret < 0)
-		return ret;
-	nilfs_direct_commit_insert(direct, &req, key, ptr);
-	nilfs_bmap_add_blocks(bmap, stats.bs_nblocks);
+	if (NILFS_BMAP_USE_VBN(bmap)) {
+		req.bpr_ptr = nilfs_direct_find_target_v(direct, key);
+		dat = nilfs_bmap_get_dat(bmap);
+	}
+	ret = nilfs_bmap_prepare_alloc_ptr(bmap, &req, dat);
+	if (!ret) {
+		/* ptr must be a pointer to a buffer head. */
+		bh = (struct buffer_head *)((unsigned long)ptr);
+		set_buffer_nilfs_volatile(bh);
 
-	return 0;
-}
+		nilfs_bmap_commit_alloc_ptr(bmap, &req, dat);
+		nilfs_direct_set_ptr(direct, key, req.bpr_ptr);
 
-static int nilfs_direct_prepare_delete(struct nilfs_direct *direct,
-				       union nilfs_bmap_ptr_req *req,
-				       __u64 key,
-				       struct nilfs_bmap_stats *stats)
-{
-	int ret;
+		if (!nilfs_bmap_dirty(bmap))
+			nilfs_bmap_set_dirty(bmap);
 
-	req->bpr_ptr = nilfs_direct_get_ptr(direct, key);
-	ret = nilfs_bmap_prepare_end_ptr(&direct->d_bmap, req);
-	if (!ret)
-		stats->bs_nblocks = 1;
-	return ret;
-}
+		if (NILFS_BMAP_USE_VBN(bmap))
+			nilfs_direct_set_target_v(direct, key, req.bpr_ptr);
 
-static void nilfs_direct_commit_delete(struct nilfs_direct *direct,
-				       union nilfs_bmap_ptr_req *req,
-				       __u64 key)
-{
-	nilfs_bmap_commit_end_ptr(&direct->d_bmap, req);
-	nilfs_direct_set_ptr(direct, key, NILFS_BMAP_INVALID_PTR);
+		nilfs_bmap_add_blocks(bmap, 1);
+	}
+	return ret;
 }
 
 static int nilfs_direct_delete(struct nilfs_bmap *bmap, __u64 key)
 {
-	struct nilfs_direct *direct;
+	struct nilfs_direct *direct = (struct nilfs_direct *)bmap;
 	union nilfs_bmap_ptr_req req;
-	struct nilfs_bmap_stats stats;
+	struct inode *dat;
 	int ret;
 
-	direct = (struct nilfs_direct *)bmap;
-	if ((key > NILFS_DIRECT_KEY_MAX) ||
+	if (key > NILFS_DIRECT_KEY_MAX ||
 	    nilfs_direct_get_ptr(direct, key) == NILFS_BMAP_INVALID_PTR)
 		return -ENOENT;
 
-	ret = nilfs_direct_prepare_delete(direct, &req, key, &stats);
-	if (ret < 0)
-		return ret;
-	nilfs_direct_commit_delete(direct, &req, key);
-	nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks);
+	dat = NILFS_BMAP_USE_VBN(bmap) ? nilfs_bmap_get_dat(bmap) : NULL;
+	req.bpr_ptr = nilfs_direct_get_ptr(direct, key);
 
-	return 0;
+	ret = nilfs_bmap_prepare_end_ptr(bmap, &req, dat);
+	if (!ret) {
+		nilfs_bmap_commit_end_ptr(bmap, &req, dat);
+		nilfs_direct_set_ptr(direct, key, NILFS_BMAP_INVALID_PTR);
+		nilfs_bmap_sub_blocks(bmap, 1);
+	}
+	return ret;
 }
 
 static int nilfs_direct_last_key(const struct nilfs_bmap *bmap, __u64 *keyp)
@@ -310,59 +268,56 @@ int nilfs_direct_delete_and_convert(struct nilfs_bmap *bmap,
 	return 0;
 }
 
-static int nilfs_direct_propagate_v(struct nilfs_direct *direct,
-				    struct buffer_head *bh)
+static int nilfs_direct_propagate(const struct nilfs_bmap *bmap,
+				  struct buffer_head *bh)
 {
-	union nilfs_bmap_ptr_req oldreq, newreq;
+	struct nilfs_direct *direct = (struct nilfs_direct *)bmap;
+	struct nilfs_palloc_req oldreq, newreq;
+	struct inode *dat;
 	__u64 key;
 	__u64 ptr;
 	int ret;
 
-	key = nilfs_bmap_data_get_key(&direct->d_bmap, bh);
+	if (!NILFS_BMAP_USE_VBN(bmap))
+		return 0;
+
+	dat = nilfs_bmap_get_dat(bmap);
+	key = nilfs_bmap_data_get_key(bmap, bh);
 	ptr = nilfs_direct_get_ptr(direct, key);
 	if (!buffer_nilfs_volatile(bh)) {
-		oldreq.bpr_ptr = ptr;
-		newreq.bpr_ptr = ptr;
-		ret = nilfs_bmap_prepare_update_v(&direct->d_bmap, &oldreq,
-						  &newreq);
+		oldreq.pr_entry_nr = ptr;
+		newreq.pr_entry_nr = ptr;
+		ret = nilfs_dat_prepare_update(dat, &oldreq, &newreq);
 		if (ret < 0)
 			return ret;
-		nilfs_bmap_commit_update_v(&direct->d_bmap, &oldreq, &newreq);
+		nilfs_dat_commit_update(dat, &oldreq, &newreq,
+					bmap->b_ptr_type == NILFS_BMAP_PTR_VS);
 		set_buffer_nilfs_volatile(bh);
-		nilfs_direct_set_ptr(direct, key, newreq.bpr_ptr);
+		nilfs_direct_set_ptr(direct, key, newreq.pr_entry_nr);
 	} else
-		ret = nilfs_bmap_mark_dirty(&direct->d_bmap, ptr);
+		ret = nilfs_dat_mark_dirty(dat, ptr);
 
 	return ret;
 }
 
-static int nilfs_direct_propagate(const struct nilfs_bmap *bmap,
-				  struct buffer_head *bh)
-{
-	struct nilfs_direct *direct = (struct nilfs_direct *)bmap;
-
-	return NILFS_BMAP_USE_VBN(bmap) ?
-		nilfs_direct_propagate_v(direct, bh) : 0;
-}
-
 static int nilfs_direct_assign_v(struct nilfs_direct *direct,
 				 __u64 key, __u64 ptr,
 				 struct buffer_head **bh,
 				 sector_t blocknr,
 				 union nilfs_binfo *binfo)
 {
+	struct inode *dat = nilfs_bmap_get_dat(&direct->d_bmap);
 	union nilfs_bmap_ptr_req req;
 	int ret;
 
 	req.bpr_ptr = ptr;
-	ret = nilfs_bmap_start_v(&direct->d_bmap, &req, blocknr);
-	if (unlikely(ret < 0))
-		return ret;
-
-	binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr);
-	binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key);
-
-	return 0;
+	ret = nilfs_dat_prepare_start(dat, &req.bpr_req);
+	if (!ret) {
+		nilfs_dat_commit_start(dat, &req.bpr_req, blocknr);
+		binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr);
+		binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key);
+	}
+	return ret;
 }
 
 static int nilfs_direct_assign_p(struct nilfs_direct *direct,
-- 
cgit v1.2.3-70-g09d2


From 0f3fe33b398abbecfcf9f08c16959d1a9a14a49a Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Sat, 15 Aug 2009 02:29:28 +0900
Subject: nilfs2: convert nilfs_bmap_lookup to an inline function

The nilfs_bmap_lookup() is now a wrapper function of
nilfs_bmap_lookup_at_level().

This moves the nilfs_bmap_lookup() to a header file converting it to
an inline function and gives an opportunity for optimization.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/bmap.c | 53 ++++++++++++++++++++---------------------------------
 fs/nilfs2/bmap.h |  7 ++++++-
 fs/nilfs2/mdt.c  |  4 ++--
 3 files changed, 28 insertions(+), 36 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index f98c5c4cf6e7..08834df6ec68 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -36,6 +36,26 @@ struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *bmap)
 	return nilfs_dat_inode(NILFS_I_NILFS(bmap->b_inode));
 }
 
+/**
+ * nilfs_bmap_lookup_at_level - find a data block or node block
+ * @bmap: bmap
+ * @key: key
+ * @level: level
+ * @ptrp: place to store the value associated to @key
+ *
+ * Description: nilfs_bmap_lookup_at_level() finds a record whose key
+ * matches @key in the block at @level of the bmap.
+ *
+ * Return Value: On success, 0 is returned and the record associated with @key
+ * is stored in the place pointed by @ptrp. On error, one of the following
+ * negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-ENOENT - A record associated with @key does not exist.
+ */
 int nilfs_bmap_lookup_at_level(struct nilfs_bmap *bmap, __u64 key, int level,
 			       __u64 *ptrp)
 {
@@ -69,39 +89,6 @@ int nilfs_bmap_lookup_contig(struct nilfs_bmap *bmap, __u64 key, __u64 *ptrp,
 	return ret;
 }
 
-/**
- * nilfs_bmap_lookup - find a record
- * @bmap: bmap
- * @key: key
- * @recp: pointer to record
- *
- * Description: nilfs_bmap_lookup() finds a record whose key matches @key in
- * @bmap.
- *
- * Return Value: On success, 0 is returned and the record associated with @key
- * is stored in the place pointed by @recp. On error, one of the following
- * negative error codes is returned.
- *
- * %-EIO - I/O error.
- *
- * %-ENOMEM - Insufficient amount of memory available.
- *
- * %-ENOENT - A record associated with @key does not exist.
- */
-int nilfs_bmap_lookup(struct nilfs_bmap *bmap,
-		      unsigned long key,
-		      unsigned long *recp)
-{
-	__u64 ptr;
-	int ret;
-
-	/* XXX: use macro for level 1 */
-	ret = nilfs_bmap_lookup_at_level(bmap, key, 1, &ptr);
-	if (recp != NULL)
-		*recp = ptr;
-	return ret;
-}
-
 static int nilfs_bmap_do_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
 {
 	__u64 keys[NILFS_BMAP_SMALL_HIGH + 1];
diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h
index a4f64e54424c..9980d7dbab91 100644
--- a/fs/nilfs2/bmap.h
+++ b/fs/nilfs2/bmap.h
@@ -142,7 +142,6 @@ struct nilfs_bmap {
 int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *);
 int nilfs_bmap_read(struct nilfs_bmap *, struct nilfs_inode *);
 void nilfs_bmap_write(struct nilfs_bmap *, struct nilfs_inode *);
-int nilfs_bmap_lookup(struct nilfs_bmap *, unsigned long, unsigned long *);
 int nilfs_bmap_lookup_contig(struct nilfs_bmap *, __u64, __u64 *, unsigned);
 int nilfs_bmap_insert(struct nilfs_bmap *, unsigned long, unsigned long);
 int nilfs_bmap_delete(struct nilfs_bmap *, unsigned long);
@@ -161,6 +160,12 @@ void nilfs_bmap_init_gcdat(struct nilfs_bmap *, struct nilfs_bmap *);
 void nilfs_bmap_commit_gcdat(struct nilfs_bmap *, struct nilfs_bmap *);
 
 
+static inline int nilfs_bmap_lookup(struct nilfs_bmap *bmap, __u64 key,
+				    __u64 *ptr)
+{
+	return nilfs_bmap_lookup_at_level(bmap, key, 1, ptr);
+}
+
 /*
  * Internal use only
  */
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 9cb831899119..156bf6091a96 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -136,7 +136,7 @@ nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff,
 		       int mode, struct buffer_head **out_bh)
 {
 	struct buffer_head *bh;
-	unsigned long blknum = 0;
+	__u64 blknum = 0;
 	int ret = -ENOMEM;
 
 	bh = nilfs_grab_buffer(inode, inode->i_mapping, blkoff, 0);
@@ -166,7 +166,7 @@ nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff,
 		goto failed_bh;
 	}
 	bh->b_bdev = NILFS_MDT(inode)->mi_nilfs->ns_bdev;
-	bh->b_blocknr = blknum;
+	bh->b_blocknr = (sector_t)blknum;
 	set_buffer_mapped(bh);
 
 	bh->b_end_io = end_buffer_read_sync;
-- 
cgit v1.2.3-70-g09d2


From 41f4db0f48c72db3a93cc1a0b18368d9a9aca700 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Sat, 8 Aug 2009 16:09:46 +0900
Subject: fs/Kconfig: move nilfs2 outside misc filesystems

Some people asked me questions like the following:

On Wed, 15 Jul 2009 13:11:21 +0200, Leon Woestenberg wrote:
> just wondering, any reasons why NILFS2 is one of the miscellaneous
> filesystems and, for example, btrfs, is not in Kconfig?

Actually, nilfs is NOT a filesystem came from other operating systems,
but a filesystem created purely for Linux.  Nor is it a flash
filesystem but that for generic block devices.

So, this moves nilfs outside the misc category as I responded in LKML
"Re: Why does NILFS2 hide under Miscellaneous filesystems?"
(Message-Id: <20090716.002526.93465395.ryusuke@osrg.net>).

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/Kconfig        | 2 +-
 fs/nilfs2/Kconfig | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index 0e7da7bb5d93..455aa207e67e 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -43,6 +43,7 @@ source "fs/xfs/Kconfig"
 source "fs/gfs2/Kconfig"
 source "fs/ocfs2/Kconfig"
 source "fs/btrfs/Kconfig"
+source "fs/nilfs2/Kconfig"
 
 endif # BLOCK
 
@@ -186,7 +187,6 @@ source "fs/romfs/Kconfig"
 source "fs/sysv/Kconfig"
 source "fs/ufs/Kconfig"
 source "fs/exofs/Kconfig"
-source "fs/nilfs2/Kconfig"
 
 endif # MISC_FILESYSTEMS
 
diff --git a/fs/nilfs2/Kconfig b/fs/nilfs2/Kconfig
index 72da095d4009..251da07b2a1d 100644
--- a/fs/nilfs2/Kconfig
+++ b/fs/nilfs2/Kconfig
@@ -1,6 +1,6 @@
 config NILFS2_FS
 	tristate "NILFS2 file system support (EXPERIMENTAL)"
-	depends on BLOCK && EXPERIMENTAL
+	depends on EXPERIMENTAL
 	select CRC32
 	help
 	  NILFS2 is a log-structured file system (LFS) supporting continuous
-- 
cgit v1.2.3-70-g09d2


From 918941a3f3d46c2a69971b4718aaf13b1be2f1a7 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 17 Aug 2009 18:50:08 +0200
Subject: ocfs2: Use __generic_file_aio_write instead of
 generic_file_aio_write_nolock

Use the new helper. We have to submit data pages ourselves in case of O_SYNC
write because __generic_file_aio_write does not do it for us. OCFS2 developpers
might think about moving the sync out of i_mutex which seems to be easily
possible but that's out of scope of this patch.

CC: ocfs2-devel@oss.oracle.com
Acked-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ocfs2/file.c | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index aa501d3f93f1..600227389387 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1871,8 +1871,7 @@ relock:
 			goto out_dio;
 		}
 	} else {
-		written = generic_file_aio_write_nolock(iocb, iov, nr_segs,
-							*ppos);
+		written = __generic_file_aio_write(iocb, iov, nr_segs, ppos);
 	}
 
 out_dio:
@@ -1880,18 +1879,21 @@ out_dio:
 	BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT));
 
 	if ((file->f_flags & O_SYNC && !direct_io) || IS_SYNC(inode)) {
-		/*
-		 * The generic write paths have handled getting data
-		 * to disk, but since we don't make use of the dirty
-		 * inode list, a manual journal commit is necessary
-		 * here.
-		 */
-		if (old_size != i_size_read(inode) ||
-		    old_clusters != OCFS2_I(inode)->ip_clusters) {
+		ret = filemap_fdatawrite_range(file->f_mapping, pos,
+					       pos + count - 1);
+		if (ret < 0)
+			written = ret;
+
+		if (!ret && (old_size != i_size_read(inode) ||
+		    old_clusters != OCFS2_I(inode)->ip_clusters)) {
 			ret = jbd2_journal_force_commit(osb->journal->j_journal);
 			if (ret < 0)
 				written = ret;
 		}
+
+		if (!ret)
+			ret = filemap_fdatawait_range(file->f_mapping, pos,
+						      pos + count - 1);
 	}
 
 	/* 
-- 
cgit v1.2.3-70-g09d2


From eef99380679e20e7edc096aa4d8a98b875404d79 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 20 Aug 2009 17:43:41 +0200
Subject: vfs: Rename generic_file_aio_write_nolock

generic_file_aio_write_nolock() is now used only by block devices and raw
character device. Filesystems should use __generic_file_aio_write() in case
generic_file_aio_write() doesn't suit them. So rename the function to
blkdev_aio_write() and move it to fs/blockdev.c.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 drivers/char/raw.c |  2 +-
 fs/block_dev.c     | 29 ++++++++++++++++++++++++++++-
 include/linux/fs.h |  6 ++++--
 mm/filemap.c       | 39 ---------------------------------------
 4 files changed, 33 insertions(+), 43 deletions(-)

(limited to 'fs')

diff --git a/drivers/char/raw.c b/drivers/char/raw.c
index 05f9d18b9361..40268db02e22 100644
--- a/drivers/char/raw.c
+++ b/drivers/char/raw.c
@@ -246,7 +246,7 @@ static const struct file_operations raw_fops = {
 	.read	=	do_sync_read,
 	.aio_read = 	generic_file_aio_read,
 	.write	=	do_sync_write,
-	.aio_write = 	generic_file_aio_write_nolock,
+	.aio_write =	blkdev_aio_write,
 	.open	=	raw_open,
 	.release=	raw_release,
 	.ioctl	=	raw_ioctl,
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 94dfda24c06e..3581a4e53942 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1404,6 +1404,33 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 	return blkdev_ioctl(bdev, mode, cmd, arg);
 }
 
+/*
+ * Write data to the block device.  Only intended for the block device itself
+ * and the raw driver which basically is a fake block device.
+ *
+ * Does not take i_mutex for the write and thus is not for general purpose
+ * use.
+ */
+ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
+			 unsigned long nr_segs, loff_t pos)
+{
+	struct file *file = iocb->ki_filp;
+	ssize_t ret;
+
+	BUG_ON(iocb->ki_pos != pos);
+
+	ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
+	if (ret > 0 || ret == -EIOCBQUEUED) {
+		ssize_t err;
+
+		err = generic_write_sync(file, pos, ret);
+		if (err < 0 && ret > 0)
+			ret = err;
+	}
+	return ret;
+}
+EXPORT_SYMBOL_GPL(blkdev_aio_write);
+
 /*
  * Try to release a page associated with block device when the system
  * is under memory pressure.
@@ -1436,7 +1463,7 @@ const struct file_operations def_blk_fops = {
 	.read		= do_sync_read,
 	.write		= do_sync_write,
   	.aio_read	= generic_file_aio_read,
-  	.aio_write	= generic_file_aio_write_nolock,
+	.aio_write	= blkdev_aio_write,
 	.mmap		= generic_file_mmap,
 	.fsync		= block_fsync,
 	.unlocked_ioctl	= block_ioctl,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index ea099d3a18d9..6c1be3a4edea 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2207,8 +2207,6 @@ extern ssize_t generic_file_aio_read(struct kiocb *, const struct iovec *, unsig
 extern ssize_t __generic_file_aio_write(struct kiocb *, const struct iovec *, unsigned long,
 		loff_t *);
 extern ssize_t generic_file_aio_write(struct kiocb *, const struct iovec *, unsigned long, loff_t);
-extern ssize_t generic_file_aio_write_nolock(struct kiocb *, const struct iovec *,
-		unsigned long, loff_t);
 extern ssize_t generic_file_direct_write(struct kiocb *, const struct iovec *,
 		unsigned long *, loff_t, loff_t *, size_t, size_t);
 extern ssize_t generic_file_buffered_write(struct kiocb *, const struct iovec *,
@@ -2218,6 +2216,10 @@ extern ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t l
 extern int generic_segment_checks(const struct iovec *iov,
 		unsigned long *nr_segs, size_t *count, int access_flags);
 
+/* fs/block_dev.c */
+extern ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
+				unsigned long nr_segs, loff_t pos);
+
 /* fs/splice.c */
 extern ssize_t generic_file_splice_read(struct file *, loff_t *,
 		struct pipe_inode_info *, size_t, unsigned int);
diff --git a/mm/filemap.c b/mm/filemap.c
index f863e1d7e227..3587554f45ef 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2462,45 +2462,6 @@ out:
 }
 EXPORT_SYMBOL(__generic_file_aio_write);
 
-
-/**
- * generic_file_aio_write_nolock - write data, usually to a device
- * @iocb:	IO state structure
- * @iov:	vector with data to write
- * @nr_segs:	number of segments in the vector
- * @pos:	position in file where to write
- *
- * This is a wrapper around __generic_file_aio_write() which takes care of
- * syncing the file in case of O_SYNC file. It does not take i_mutex for the
- * write itself but may do so during syncing. It is meant for users like block
- * devices which do not need i_mutex during write. If your filesystem needs to
- * do a write but already holds i_mutex, use __generic_file_aio_write()
- * directly and then sync the file like generic_file_aio_write().
- */
-ssize_t generic_file_aio_write_nolock(struct kiocb *iocb,
-		const struct iovec *iov, unsigned long nr_segs, loff_t pos)
-{
-	struct file *file = iocb->ki_filp;
-	struct address_space *mapping = file->f_mapping;
-	struct inode *inode = mapping->host;
-	ssize_t ret;
-
-	BUG_ON(iocb->ki_pos != pos);
-
-	ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
-
-	if ((ret > 0 || ret == -EIOCBQUEUED) &&
-	    ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
-		ssize_t err;
-
-		err = sync_page_range_nolock(inode, mapping, pos, ret);
-		if (err < 0 && ret > 0)
-			ret = err;
-	}
-	return ret;
-}
-EXPORT_SYMBOL(generic_file_aio_write_nolock);
-
 /**
  * generic_file_aio_write - write data to a file
  * @iocb:	IO state structure
-- 
cgit v1.2.3-70-g09d2


From 148f948ba877f4d3cdef036b1ff6d9f68986706a Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 17 Aug 2009 19:52:36 +0200
Subject: vfs: Introduce new helpers for syncing after writing to O_SYNC file
 or IS_SYNC inode

Introduce new function for generic inode syncing (vfs_fsync_range) and use
it from fsync() path. Introduce also new helper for syncing after a sync
write (generic_write_sync) using the generic function.

Use these new helpers for syncing from generic VFS functions. This makes
O_SYNC writes to block devices acquire i_mutex for syncing. If we really
care about this, we can make block_fsync() drop the i_mutex and reacquire
it before it returns.

CC: Evgeniy Polyakov <zbr@ioremap.net>
CC: ocfs2-devel@oss.oracle.com
CC: Joel Becker <joel.becker@oracle.com>
CC: Felix Blyakher <felixb@sgi.com>
CC: xfs@oss.sgi.com
CC: Anton Altaparmakov <aia21@cantab.net>
CC: linux-ntfs-dev@lists.sourceforge.net
CC: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
CC: linux-ext4@vger.kernel.org
CC: tytso@mit.edu
Acked-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/splice.c        | 22 ++++++----------------
 fs/sync.c          | 55 +++++++++++++++++++++++++++++++++++++++++++++++-------
 include/linux/fs.h |  3 +++
 mm/filemap.c       | 11 ++++-------
 4 files changed, 61 insertions(+), 30 deletions(-)

(limited to 'fs')

diff --git a/fs/splice.c b/fs/splice.c
index 73766d24f97b..819023733f8e 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -976,25 +976,15 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
 
 	if (ret > 0) {
 		unsigned long nr_pages;
+		int err;
 
-		*ppos += ret;
 		nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 
-		/*
-		 * If file or inode is SYNC and we actually wrote some data,
-		 * sync it.
-		 */
-		if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
-			int err;
-
-			mutex_lock(&inode->i_mutex);
-			err = generic_osync_inode(inode, mapping,
-						  OSYNC_METADATA|OSYNC_DATA);
-			mutex_unlock(&inode->i_mutex);
-
-			if (err)
-				ret = err;
-		}
+		err = generic_write_sync(out, *ppos, ret);
+		if (err)
+			ret = err;
+		else
+			*ppos += ret;
 		balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
 	}
 
diff --git a/fs/sync.c b/fs/sync.c
index 103cc7fdd3df..4e15da01923c 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -178,19 +178,23 @@ int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
 }
 
 /**
- * vfs_fsync - perform a fsync or fdatasync on a file
+ * vfs_fsync_range - helper to sync a range of data & metadata to disk
  * @file:		file to sync
  * @dentry:		dentry of @file
- * @data:		only perform a fdatasync operation
+ * @start:		offset in bytes of the beginning of data range to sync
+ * @end:		offset in bytes of the end of data range (inclusive)
+ * @datasync:		perform only datasync
  *
- * Write back data and metadata for @file to disk.  If @datasync is
- * set only metadata needed to access modified file data is written.
+ * Write back data in range @start..@end and metadata for @file to disk.  If
+ * @datasync is set only metadata needed to access modified file data is
+ * written.
  *
  * In case this function is called from nfsd @file may be %NULL and
  * only @dentry is set.  This can only happen when the filesystem
  * implements the export_operations API.
  */
-int vfs_fsync(struct file *file, struct dentry *dentry, int datasync)
+int vfs_fsync_range(struct file *file, struct dentry *dentry, loff_t start,
+		    loff_t end, int datasync)
 {
 	const struct file_operations *fop;
 	struct address_space *mapping;
@@ -214,7 +218,7 @@ int vfs_fsync(struct file *file, struct dentry *dentry, int datasync)
 		goto out;
 	}
 
-	ret = filemap_fdatawrite(mapping);
+	ret = filemap_fdatawrite_range(mapping, start, end);
 
 	/*
 	 * We need to protect against concurrent writers, which could cause
@@ -225,12 +229,32 @@ int vfs_fsync(struct file *file, struct dentry *dentry, int datasync)
 	if (!ret)
 		ret = err;
 	mutex_unlock(&mapping->host->i_mutex);
-	err = filemap_fdatawait(mapping);
+
+	err = filemap_fdatawait_range(mapping, start, end);
 	if (!ret)
 		ret = err;
 out:
 	return ret;
 }
+EXPORT_SYMBOL(vfs_fsync_range);
+
+/**
+ * vfs_fsync - perform a fsync or fdatasync on a file
+ * @file:		file to sync
+ * @dentry:		dentry of @file
+ * @datasync:		only perform a fdatasync operation
+ *
+ * Write back data and metadata for @file to disk.  If @datasync is
+ * set only metadata needed to access modified file data is written.
+ *
+ * In case this function is called from nfsd @file may be %NULL and
+ * only @dentry is set.  This can only happen when the filesystem
+ * implements the export_operations API.
+ */
+int vfs_fsync(struct file *file, struct dentry *dentry, int datasync)
+{
+	return vfs_fsync_range(file, dentry, 0, LLONG_MAX, datasync);
+}
 EXPORT_SYMBOL(vfs_fsync);
 
 static int do_fsync(unsigned int fd, int datasync)
@@ -256,6 +280,23 @@ SYSCALL_DEFINE1(fdatasync, unsigned int, fd)
 	return do_fsync(fd, 1);
 }
 
+/**
+ * generic_write_sync - perform syncing after a write if file / inode is sync
+ * @file:	file to which the write happened
+ * @pos:	offset where the write started
+ * @count:	length of the write
+ *
+ * This is just a simple wrapper about our general syncing function.
+ */
+int generic_write_sync(struct file *file, loff_t pos, loff_t count)
+{
+	if (!(file->f_flags & O_SYNC) && !IS_SYNC(file->f_mapping->host))
+		return 0;
+	return vfs_fsync_range(file, file->f_path.dentry, pos,
+			       pos + count - 1, 1);
+}
+EXPORT_SYMBOL(generic_write_sync);
+
 /*
  * sys_sync_file_range() permits finely controlled syncing over a segment of
  * a file in the range offset .. (offset+nbytes-1) inclusive.  If nbytes is
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 6c1be3a4edea..e2c7f5167662 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2098,7 +2098,10 @@ extern int __filemap_fdatawrite_range(struct address_space *mapping,
 extern int filemap_fdatawrite_range(struct address_space *mapping,
 				loff_t start, loff_t end);
 
+extern int vfs_fsync_range(struct file *file, struct dentry *dentry,
+			   loff_t start, loff_t end, int datasync);
 extern int vfs_fsync(struct file *file, struct dentry *dentry, int datasync);
+extern int generic_write_sync(struct file *file, loff_t pos, loff_t count);
 extern void sync_supers(void);
 extern void emergency_sync(void);
 extern void emergency_remount(void);
diff --git a/mm/filemap.c b/mm/filemap.c
index 3587554f45ef..849293c4f418 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -39,11 +39,10 @@
 /*
  * FIXME: remove all knowledge of the buffer layer from the core VM
  */
-#include <linux/buffer_head.h> /* for generic_osync_inode */
+#include <linux/buffer_head.h> /* for try_to_free_buffers */
 
 #include <asm/mman.h>
 
-
 /*
  * Shared mappings implemented 30.11.1994. It's not fully working yet,
  * though.
@@ -2477,8 +2476,7 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 		unsigned long nr_segs, loff_t pos)
 {
 	struct file *file = iocb->ki_filp;
-	struct address_space *mapping = file->f_mapping;
-	struct inode *inode = mapping->host;
+	struct inode *inode = file->f_mapping->host;
 	ssize_t ret;
 
 	BUG_ON(iocb->ki_pos != pos);
@@ -2487,11 +2485,10 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 	ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
 	mutex_unlock(&inode->i_mutex);
 
-	if ((ret > 0 || ret == -EIOCBQUEUED) &&
-	    ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
+	if (ret > 0 || ret == -EIOCBQUEUED) {
 		ssize_t err;
 
-		err = sync_page_range(inode, mapping, pos, ret);
+		err = generic_write_sync(file, pos, ret);
 		if (err < 0 && ret > 0)
 			ret = err;
 	}
-- 
cgit v1.2.3-70-g09d2


From a2a735ad666a04306a708b5a0109cc1fe113f569 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 18 Aug 2009 17:54:11 +0200
Subject: ext2: Update comment about generic_osync_inode

We rely on generic_write_sync() now.

CC: linux-ext4@vger.kernel.org
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext2/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index e27130341d4f..1c1638f873a4 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -482,7 +482,7 @@ static int ext2_alloc_branch(struct inode *inode,
 		unlock_buffer(bh);
 		mark_buffer_dirty_inode(bh, inode);
 		/* We used to sync bh here if IS_SYNC(inode).
-		 * But we now rely upon generic_osync_inode()
+		 * But we now rely upon generic_write_sync()
 		 * and b_inode_buffers.  But not for directories.
 		 */
 		if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode))
-- 
cgit v1.2.3-70-g09d2


From e367626b6164aeecb97fb7c20509ed8696babc26 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 18 Aug 2009 17:51:23 +0200
Subject: ext3: Remove syncing logic from ext3_file_write

Syncing is now properly done by generic_file_aio_write() so no special logic is
needed in ext3.

CC: linux-ext4@vger.kernel.org
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext3/file.c | 61 +---------------------------------------------------------
 1 file changed, 1 insertion(+), 60 deletions(-)

(limited to 'fs')

diff --git a/fs/ext3/file.c b/fs/ext3/file.c
index 299253214789..388bbdfa0b4e 100644
--- a/fs/ext3/file.c
+++ b/fs/ext3/file.c
@@ -51,71 +51,12 @@ static int ext3_release_file (struct inode * inode, struct file * filp)
 	return 0;
 }
 
-static ssize_t
-ext3_file_write(struct kiocb *iocb, const struct iovec *iov,
-		unsigned long nr_segs, loff_t pos)
-{
-	struct file *file = iocb->ki_filp;
-	struct inode *inode = file->f_path.dentry->d_inode;
-	ssize_t ret;
-	int err;
-
-	ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
-
-	/*
-	 * Skip flushing if there was an error, or if nothing was written.
-	 */
-	if (ret <= 0)
-		return ret;
-
-	/*
-	 * If the inode is IS_SYNC, or is O_SYNC and we are doing data
-	 * journalling then we need to make sure that we force the transaction
-	 * to disk to keep all metadata uptodate synchronously.
-	 */
-	if (file->f_flags & O_SYNC) {
-		/*
-		 * If we are non-data-journaled, then the dirty data has
-		 * already been flushed to backing store by generic_osync_inode,
-		 * and the inode has been flushed too if there have been any
-		 * modifications other than mere timestamp updates.
-		 *
-		 * Open question --- do we care about flushing timestamps too
-		 * if the inode is IS_SYNC?
-		 */
-		if (!ext3_should_journal_data(inode))
-			return ret;
-
-		goto force_commit;
-	}
-
-	/*
-	 * So we know that there has been no forced data flush.  If the inode
-	 * is marked IS_SYNC, we need to force one ourselves.
-	 */
-	if (!IS_SYNC(inode))
-		return ret;
-
-	/*
-	 * Open question #2 --- should we force data to disk here too?  If we
-	 * don't, the only impact is that data=writeback filesystems won't
-	 * flush data to disk automatically on IS_SYNC, only metadata (but
-	 * historically, that is what ext2 has done.)
-	 */
-
-force_commit:
-	err = ext3_force_commit(inode->i_sb);
-	if (err)
-		return err;
-	return ret;
-}
-
 const struct file_operations ext3_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= do_sync_read,
 	.write		= do_sync_write,
 	.aio_read	= generic_file_aio_read,
-	.aio_write	= ext3_file_write,
+	.aio_write	= generic_file_aio_write,
 	.unlocked_ioctl	= ext3_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= ext3_compat_ioctl,
-- 
cgit v1.2.3-70-g09d2


From 0d34ec62e18984ac9476208660372306ef54e70d Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 18 Aug 2009 17:48:27 +0200
Subject: ext4: Remove syncing logic from ext4_file_write

The syncing is now properly handled by generic_file_aio_write() so
no special ext4 code is needed.

CC: linux-ext4@vger.kernel.org
CC: tytso@mit.edu
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext4/file.c | 53 ++---------------------------------------------------
 1 file changed, 2 insertions(+), 51 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 27f3c5354c0e..5ca3eca70a1e 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -58,10 +58,7 @@ static ssize_t
 ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
 		unsigned long nr_segs, loff_t pos)
 {
-	struct file *file = iocb->ki_filp;
-	struct inode *inode = file->f_path.dentry->d_inode;
-	ssize_t ret;
-	int err;
+	struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
 
 	/*
 	 * If we have encountered a bitmap-format file, the size limit
@@ -81,53 +78,7 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
 		}
 	}
 
-	ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
-	/*
-	 * Skip flushing if there was an error, or if nothing was written.
-	 */
-	if (ret <= 0)
-		return ret;
-
-	/*
-	 * If the inode is IS_SYNC, or is O_SYNC and we are doing data
-	 * journalling then we need to make sure that we force the transaction
-	 * to disk to keep all metadata uptodate synchronously.
-	 */
-	if (file->f_flags & O_SYNC) {
-		/*
-		 * If we are non-data-journaled, then the dirty data has
-		 * already been flushed to backing store by generic_osync_inode,
-		 * and the inode has been flushed too if there have been any
-		 * modifications other than mere timestamp updates.
-		 *
-		 * Open question --- do we care about flushing timestamps too
-		 * if the inode is IS_SYNC?
-		 */
-		if (!ext4_should_journal_data(inode))
-			return ret;
-
-		goto force_commit;
-	}
-
-	/*
-	 * So we know that there has been no forced data flush.  If the inode
-	 * is marked IS_SYNC, we need to force one ourselves.
-	 */
-	if (!IS_SYNC(inode))
-		return ret;
-
-	/*
-	 * Open question #2 --- should we force data to disk here too?  If we
-	 * don't, the only impact is that data=writeback filesystems won't
-	 * flush data to disk automatically on IS_SYNC, only metadata (but
-	 * historically, that is what ext2 has done.)
-	 */
-
-force_commit:
-	err = ext4_force_commit(inode->i_sb);
-	if (err)
-		return err;
-	return ret;
+	return generic_file_aio_write(iocb, iov, nr_segs, pos);
 }
 
 static struct vm_operations_struct ext4_file_vm_ops = {
-- 
cgit v1.2.3-70-g09d2


From ebbbf757c6b8577ac2fb6181c08c2059153bb0e2 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 18 Aug 2009 18:13:58 +0200
Subject: ntfs: Use new syncing helpers and update comments

Use new syncing helpers in .write and .aio_write functions. Also
remove superfluous syncing in ntfs_file_buffered_write() and update
comments about generic_osync_inode().

CC: Anton Altaparmakov <aia21@cantab.net>
CC: linux-ntfs-dev@lists.sourceforge.net
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ntfs/file.c | 16 ++++------------
 fs/ntfs/mft.c  | 13 ++++++-------
 2 files changed, 10 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 3140a4429af1..4350d4993b18 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -2076,14 +2076,6 @@ err_out:
 	*ppos = pos;
 	if (cached_page)
 		page_cache_release(cached_page);
-	/* For now, when the user asks for O_SYNC, we actually give O_DSYNC. */
-	if (likely(!status)) {
-		if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(vi))) {
-			if (!mapping->a_ops->writepage || !is_sync_kiocb(iocb))
-				status = generic_osync_inode(vi, mapping,
-						OSYNC_METADATA|OSYNC_DATA);
-		}
-  	}
 	pagevec_lru_add_file(&lru_pvec);
 	ntfs_debug("Done.  Returning %s (written 0x%lx, status %li).",
 			written ? "written" : "status", (unsigned long)written,
@@ -2145,8 +2137,8 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 	mutex_lock(&inode->i_mutex);
 	ret = ntfs_file_aio_write_nolock(iocb, iov, nr_segs, &iocb->ki_pos);
 	mutex_unlock(&inode->i_mutex);
-	if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
-		int err = sync_page_range(inode, mapping, pos, ret);
+	if (ret > 0) {
+		int err = generic_write_sync(file, pos, ret);
 		if (err < 0)
 			ret = err;
 	}
@@ -2173,8 +2165,8 @@ static ssize_t ntfs_file_writev(struct file *file, const struct iovec *iov,
 	if (ret == -EIOCBQUEUED)
 		ret = wait_on_sync_kiocb(&kiocb);
 	mutex_unlock(&inode->i_mutex);
-	if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
-		int err = sync_page_range(inode, mapping, *ppos - ret, ret);
+	if (ret > 0) {
+		int err = generic_write_sync(file, *ppos - ret, ret);
 		if (err < 0)
 			ret = err;
 	}
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index 23bf68453d7d..1caa0ef0b2bb 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -384,13 +384,12 @@ unm_err_out:
  * it is dirty in the inode meta data rather than the data page cache of the
  * inode, and thus there are no data pages that need writing out.  Therefore, a
  * full mark_inode_dirty() is overkill.  A mark_inode_dirty_sync(), on the
- * other hand, is not sufficient, because I_DIRTY_DATASYNC needs to be set to
- * ensure ->write_inode is called from generic_osync_inode() and this needs to
- * happen or the file data would not necessarily hit the device synchronously,
- * even though the vfs inode has the O_SYNC flag set.  Also, I_DIRTY_DATASYNC
- * simply "feels" better than just I_DIRTY_SYNC, since the file data has not
- * actually hit the block device yet, which is not what I_DIRTY_SYNC on its own
- * would suggest.
+ * other hand, is not sufficient, because ->write_inode needs to be called even
+ * in case of fdatasync. This needs to happen or the file data would not
+ * necessarily hit the device synchronously, even though the vfs inode has the
+ * O_SYNC flag set.  Also, I_DIRTY_DATASYNC simply "feels" better than just
+ * I_DIRTY_SYNC, since the file data has not actually hit the block device yet,
+ * which is not what I_DIRTY_SYNC on its own would suggest.
  */
 void __mark_mft_record_dirty(ntfs_inode *ni)
 {
-- 
cgit v1.2.3-70-g09d2


From d23c937b0f740888765676f6f82f509dbbb2bbad Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 18 Aug 2009 18:24:31 +0200
Subject: ocfs2: Update syncing after splicing to match generic version

Update ocfs2 specific splicing code to use generic syncing helper. The sync now
does not happen under rw_lock because generic_write_sync() acquires i_mutex
which ranks above rw_lock. That should not matter because standard fsync path
does not hold it either.

Acked-by: Joel Becker <Joel.Becker@oracle.com>
Acked-by: Mark Fasheh <mfasheh@suse.com>
CC: ocfs2-devel@oss.oracle.com
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ocfs2/file.c | 27 ++++++---------------------
 1 file changed, 6 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 600227389387..221c5e98957b 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1993,31 +1993,16 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
 
 	if (ret > 0) {
 		unsigned long nr_pages;
+		int err;
 
-		*ppos += ret;
 		nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 
-		/*
-		 * If file or inode is SYNC and we actually wrote some data,
-		 * sync it.
-		 */
-		if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
-			int err;
-
-			mutex_lock(&inode->i_mutex);
-			err = ocfs2_rw_lock(inode, 1);
-			if (err < 0) {
-				mlog_errno(err);
-			} else {
-				err = generic_osync_inode(inode, mapping,
-						  OSYNC_METADATA|OSYNC_DATA);
-				ocfs2_rw_unlock(inode, 1);
-			}
-			mutex_unlock(&inode->i_mutex);
+		err = generic_write_sync(out, *ppos, ret);
+		if (err)
+			ret = err;
+		else
+			*ppos += ret;
 
-			if (err)
-				ret = err;
-		}
 		balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
 	}
 
-- 
cgit v1.2.3-70-g09d2


From af0f4414f343429971d33b0dd8dccc85c1f3dcd2 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 18 Aug 2009 18:32:55 +0200
Subject: xfs: Convert sync_page_range() to simple
 filemap_write_and_wait_range()

Christoph Hellwig says that it is enough for XFS to call
filemap_write_and_wait_range() instead of sync_page_range() because we do
all the metadata syncing when forcing the log.

CC: Felix Blyakher <felixb@sgi.com>
CC: xfs@oss.sgi.com
CC: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/xfs/linux-2.6/xfs_lrw.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 7078974a6eee..fde63a3c4ecc 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -817,7 +817,8 @@ write_retry:
 		xfs_iunlock(xip, iolock);
 		if (need_i_mutex)
 			mutex_unlock(&inode->i_mutex);
-		error2 = sync_page_range(inode, mapping, pos, ret);
+		error2 = filemap_write_and_wait_range(mapping, pos,
+						      pos + ret - 1);
 		if (!error)
 			error = error2;
 		if (need_i_mutex)
-- 
cgit v1.2.3-70-g09d2


From 2f3d675bcd4a84251d6e8eea8096ec8fc795e5d6 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 17 Aug 2009 17:00:02 +0200
Subject: fat: Opencode sync_page_range_nolock()

fat_cont_expand() is the only user of sync_page_range_nolock(). It's also the
only user of generic_osync_inode() which does not have a file open.  So
opencode needed actions for FAT so that we can convert generic_osync_inode() to
a standard syncing path.

Update a comment about generic_osync_inode().

CC: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/fat/file.c | 22 ++++++++++++++++++++--
 fs/fat/misc.c |  4 ++--
 2 files changed, 22 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/fat/file.c b/fs/fat/file.c
index f042b965c95c..e8c159de236b 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -176,8 +176,26 @@ static int fat_cont_expand(struct inode *inode, loff_t size)
 
 	inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC;
 	mark_inode_dirty(inode);
-	if (IS_SYNC(inode))
-		err = sync_page_range_nolock(inode, mapping, start, count);
+	if (IS_SYNC(inode)) {
+		int err2;
+
+		/*
+		 * Opencode syncing since we don't have a file open to use
+		 * standard fsync path.
+		 */
+		err = filemap_fdatawrite_range(mapping, start,
+					       start + count - 1);
+		err2 = sync_mapping_buffers(mapping);
+		if (!err)
+			err = err2;
+		err2 = write_inode_now(inode, 1);
+		if (!err)
+			err = err2;
+		if (!err) {
+			err =  filemap_fdatawait_range(mapping, start,
+						       start + count - 1);
+		}
+	}
 out:
 	return err;
 }
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index a6c20473dfd7..4e35be873e09 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -119,8 +119,8 @@ int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster)
 		MSDOS_I(inode)->i_start = new_dclus;
 		MSDOS_I(inode)->i_logstart = new_dclus;
 		/*
-		 * Since generic_osync_inode() synchronize later if
-		 * this is not directory, we don't here.
+		 * Since generic_write_sync() synchronizes regular files later,
+		 * we sync here only directories.
 		 */
 		if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode)) {
 			ret = fat_sync_inode(inode);
-- 
cgit v1.2.3-70-g09d2


From 18f2ee705d98034b0f229a3202d827468d4bffd9 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 18 Aug 2009 18:43:15 +0200
Subject: vfs: Remove generic_osync_inode() and sync_page_range{_nolock}()

Remove these three functions since nobody uses them anymore.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/fs-writeback.c         | 54 ---------------------------------------
 include/linux/fs.h        |  5 ----
 include/linux/writeback.h |  4 ---
 mm/filemap.c              | 64 -----------------------------------------------
 4 files changed, 127 deletions(-)

(limited to 'fs')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index da86ef58e427..628235cf44b5 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1242,57 +1242,3 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc)
 	return ret;
 }
 EXPORT_SYMBOL(sync_inode);
-
-/**
- * generic_osync_inode - flush all dirty data for a given inode to disk
- * @inode: inode to write
- * @mapping: the address_space that should be flushed
- * @what:  what to write and wait upon
- *
- * This can be called by file_write functions for files which have the
- * O_SYNC flag set, to flush dirty writes to disk.
- *
- * @what is a bitmask, specifying which part of the inode's data should be
- * written and waited upon.
- *
- *    OSYNC_DATA:     i_mapping's dirty data
- *    OSYNC_METADATA: the buffers at i_mapping->private_list
- *    OSYNC_INODE:    the inode itself
- */
-
-int generic_osync_inode(struct inode *inode, struct address_space *mapping, int what)
-{
-	int err = 0;
-	int need_write_inode_now = 0;
-	int err2;
-
-	if (what & OSYNC_DATA)
-		err = filemap_fdatawrite(mapping);
-	if (what & (OSYNC_METADATA|OSYNC_DATA)) {
-		err2 = sync_mapping_buffers(mapping);
-		if (!err)
-			err = err2;
-	}
-	if (what & OSYNC_DATA) {
-		err2 = filemap_fdatawait(mapping);
-		if (!err)
-			err = err2;
-	}
-
-	spin_lock(&inode_lock);
-	if ((inode->i_state & I_DIRTY) &&
-	    ((what & OSYNC_INODE) || (inode->i_state & I_DIRTY_DATASYNC)))
-		need_write_inode_now = 1;
-	spin_unlock(&inode_lock);
-
-	if (need_write_inode_now) {
-		err2 = write_inode_now(inode, 1);
-		if (!err)
-			err = err2;
-	}
-	else
-		inode_sync_wait(inode);
-
-	return err;
-}
-EXPORT_SYMBOL(generic_osync_inode);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index e2c7f5167662..37f53216998a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1455,11 +1455,6 @@ int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags);
 #define DT_SOCK		12
 #define DT_WHT		14
 
-#define OSYNC_METADATA	(1<<0)
-#define OSYNC_DATA	(1<<1)
-#define OSYNC_INODE	(1<<2)
-int generic_osync_inode(struct inode *, struct address_space *, int);
-
 /*
  * This is the "filldir" function type, used by readdir() to let
  * the kernel specify what kind of dirent layout it wants to have.
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 78b1e4684cc9..d347632f1861 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -150,10 +150,6 @@ int write_cache_pages(struct address_space *mapping,
 		      struct writeback_control *wbc, writepage_t writepage,
 		      void *data);
 int do_writepages(struct address_space *mapping, struct writeback_control *wbc);
-int sync_page_range(struct inode *inode, struct address_space *mapping,
-			loff_t pos, loff_t count);
-int sync_page_range_nolock(struct inode *inode, struct address_space *mapping,
-			   loff_t pos, loff_t count);
 void set_page_dirty_balance(struct page *page, int page_mkwrite);
 void writeback_set_ratelimit(void);
 
diff --git a/mm/filemap.c b/mm/filemap.c
index 849293c4f418..dd51c68e2b86 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -325,70 +325,6 @@ int filemap_fdatawait_range(struct address_space *mapping, loff_t start,
 }
 EXPORT_SYMBOL(filemap_fdatawait_range);
 
-/**
- * sync_page_range - write and wait on all pages in the passed range
- * @inode:	target inode
- * @mapping:	target address_space
- * @pos:	beginning offset in pages to write
- * @count:	number of bytes to write
- *
- * Write and wait upon all the pages in the passed range.  This is a "data
- * integrity" operation.  It waits upon in-flight writeout before starting and
- * waiting upon new writeout.  If there was an IO error, return it.
- *
- * We need to re-take i_mutex during the generic_osync_inode list walk because
- * it is otherwise livelockable.
- */
-int sync_page_range(struct inode *inode, struct address_space *mapping,
-			loff_t pos, loff_t count)
-{
-	pgoff_t start = pos >> PAGE_CACHE_SHIFT;
-	pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
-	int ret;
-
-	if (!mapping_cap_writeback_dirty(mapping) || !count)
-		return 0;
-	ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
-	if (ret == 0) {
-		mutex_lock(&inode->i_mutex);
-		ret = generic_osync_inode(inode, mapping, OSYNC_METADATA);
-		mutex_unlock(&inode->i_mutex);
-	}
-	if (ret == 0)
-		ret = wait_on_page_writeback_range(mapping, start, end);
-	return ret;
-}
-EXPORT_SYMBOL(sync_page_range);
-
-/**
- * sync_page_range_nolock - write & wait on all pages in the passed range without locking
- * @inode:	target inode
- * @mapping:	target address_space
- * @pos:	beginning offset in pages to write
- * @count:	number of bytes to write
- *
- * Note: Holding i_mutex across sync_page_range_nolock() is not a good idea
- * as it forces O_SYNC writers to different parts of the same file
- * to be serialised right until io completion.
- */
-int sync_page_range_nolock(struct inode *inode, struct address_space *mapping,
-			   loff_t pos, loff_t count)
-{
-	pgoff_t start = pos >> PAGE_CACHE_SHIFT;
-	pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
-	int ret;
-
-	if (!mapping_cap_writeback_dirty(mapping) || !count)
-		return 0;
-	ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
-	if (ret == 0)
-		ret = generic_osync_inode(inode, mapping, OSYNC_METADATA);
-	if (ret == 0)
-		ret = wait_on_page_writeback_range(mapping, start, end);
-	return ret;
-}
-EXPORT_SYMBOL(sync_page_range_nolock);
-
 /**
  * filemap_fdatawait - wait for all under-writeback pages to complete
  * @mapping: address space structure to wait for
-- 
cgit v1.2.3-70-g09d2


From 2daea67e966dc0c42067ebea015ddac6834cef88 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 3 Sep 2009 12:39:39 +0200
Subject: fsync: wait for data writeout completion before calling ->fsync

Currenly vfs_fsync(_range) first calls filemap_fdatawrite to write out
the data, the calls into ->fsync to write out the metadata and then finally
calls filemap_fdatawait to wait for the data I/O to complete.  What sounds
like a clever micro-optimization actually is nast trap for many filesystems.

For many modern filesystems i_size or other inode information is only
updated on I/O completion and we need to wait for I/O to finish before
we can write out the metadata.  For old fashionen filesystems that
instanciate blocks during the actual write and also update the metadata
at that point it opens up a large window were we could expose uninitialized
blocks after a crash.  While a few filesystems that need it already wait
for the I/O to finish inside their ->fsync methods it is rather suboptimal
as it is done under the i_mutex and also always for the whole file instead
of just a part as we could do for O_SYNC handling.

Here is a small audit of all fsync instances in the tree:

 - spufs_mfc_fsync:
 - ps3flash_fsync:
 - vol_cdev_fsync:
 - printer_fsync:
 - fb_deferred_io_fsync:
 - bad_file_fsync:
 - simple_sync_file:

	don't care - filesystems/drivers do't use the page cache or are
	purely in-memory.

 - simple_fsync:
 - file_fsync:
 - affs_file_fsync:
 - fat_file_fsync:
 - jfs_fsync:
 - ubifs_fsync:
 - reiserfs_dir_fsync:
 - reiserfs_sync_file:

	never touch pagecache themselves.  We need to wait before if we do
	not want to expose stale data after an allocation.

 - afs_fsync:
 - fuse_fsync_common:

	do the waiting writeback itself in awkward ways, would benefit from
	proper semantics

 - block_fsync:

	Does a filemap_write_and_wait on the block device inode.  Because we
	now have f_mapping that is the same inode we call it on in vfs_fsync.
	So just removing it and letting the VFS do the work in one go would
	be an improvement.

 - btrfs_sync_file:
 - cifs_fsync:
 - xfs_file_fsync:

	need the wait first and currently do it themselves. would benefit from
	doing it outside i_mutex.

 - coda_fsync:
 - ecryptfs_fsync:
 - exofs_file_fsync:
 - shm_fsync:

	only passes the fsync through to the lower layer

 - ext3_sync_file:

	doesn't seem to care, comments are confusing.

 - ext4_sync_file:

	would need the wait to work correctly for delalloc mode with late
	i_size updates.  Otherwise the ext3 comment applies.

	currently implemens it's own writeback and wait in an odd way,
	could benefit from doing it properly.

 - gfs2_fsync:

	not needed for journaled data mode, but probably harmless there.
	Currently writes back data asynchronously itself.  Needs some
	major audit.

 - hostfs_fsync:

	just calls fsync/datasync on the host FD.  Without the wait before
	data might not even be inflight yet if we're unlucky.

 - hpfs_file_fsync:
 - ncp_fsync:

	no-ops.  Dangerous before and after.

 - jffs2_fsync:

	just calls jffs2_flush_wbuf_gc, not sure how this relates to data.

 - nfs_fsync_dir:

	just increments stats, claims all directory operations are synchronous

 - nfs_file_fsync:

	only writes out data???  Looks very odd.

 - nilfs_sync_file:

	looks like it expects all data done, but not sure from the code

 - ntfs_dir_fsync:
 - ntfs_file_fsync:

	appear to do their own data writeback.  Very convoluted code.

 - ocfs2_sync_file:

	does it's own data writeback, but no wait.  probably needs the wait.

 - smb_fsync:

	according to a comment expects all pages written already, probably needs
	the wait before.

This patch only changes vfs_fsync_range, removal of the wait in the methods
that have it is left to the filesystem maintainers.  Note that most
filesystems really do need an audit for their fsync methods given the
gems found in this very brief audit.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/sync.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/sync.c b/fs/sync.c
index 4e15da01923c..192340930bb4 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -218,7 +218,7 @@ int vfs_fsync_range(struct file *file, struct dentry *dentry, loff_t start,
 		goto out;
 	}
 
-	ret = filemap_fdatawrite_range(mapping, start, end);
+	ret = filemap_write_and_wait_range(mapping, start, end);
 
 	/*
 	 * We need to protect against concurrent writers, which could cause
@@ -230,9 +230,6 @@ int vfs_fsync_range(struct file *file, struct dentry *dentry, loff_t start,
 		ret = err;
 	mutex_unlock(&mapping->host->i_mutex);
 
-	err = filemap_fdatawait_range(mapping, start, end);
-	if (!ret)
-		ret = err;
 out:
 	return ret;
 }
-- 
cgit v1.2.3-70-g09d2


From 5891d9dd2a47d38c205115211841a3d82304628f Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 16 Jul 2009 17:35:11 +0200
Subject: udf: Remove dead code

Remove code that gets never used.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/udf/directory.c | 86 ------------------------------------------------------
 fs/udf/lowlevel.c  |  4 ---
 2 files changed, 90 deletions(-)

(limited to 'fs')

diff --git a/fs/udf/directory.c b/fs/udf/directory.c
index 1d2c570704c8..2ffdb6733af1 100644
--- a/fs/udf/directory.c
+++ b/fs/udf/directory.c
@@ -18,59 +18,6 @@
 #include <linux/string.h>
 #include <linux/buffer_head.h>
 
-#if 0
-static uint8_t *udf_filead_read(struct inode *dir, uint8_t *tmpad,
-				uint8_t ad_size, struct kernel_lb_addr fe_loc,
-				int *pos, int *offset, struct buffer_head **bh,
-				int *error)
-{
-	int loffset = *offset;
-	int block;
-	uint8_t *ad;
-	int remainder;
-
-	*error = 0;
-
-	ad = (uint8_t *)(*bh)->b_data + *offset;
-	*offset += ad_size;
-
-	if (!ad) {
-		brelse(*bh);
-		*error = 1;
-		return NULL;
-	}
-
-	if (*offset == dir->i_sb->s_blocksize) {
-		brelse(*bh);
-		block = udf_get_lb_pblock(dir->i_sb, fe_loc, ++*pos);
-		if (!block)
-			return NULL;
-		*bh = udf_tread(dir->i_sb, block);
-		if (!*bh)
-			return NULL;
-	} else if (*offset > dir->i_sb->s_blocksize) {
-		ad = tmpad;
-
-		remainder = dir->i_sb->s_blocksize - loffset;
-		memcpy((uint8_t *)ad, (*bh)->b_data + loffset, remainder);
-
-		brelse(*bh);
-		block = udf_get_lb_pblock(dir->i_sb, fe_loc, ++*pos);
-		if (!block)
-			return NULL;
-		(*bh) = udf_tread(dir->i_sb, block);
-		if (!*bh)
-			return NULL;
-
-		memcpy((uint8_t *)ad + remainder, (*bh)->b_data,
-			ad_size - remainder);
-		*offset = ad_size - remainder;
-	}
-
-	return ad;
-}
-#endif
-
 struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos,
 					 struct udf_fileident_bh *fibh,
 					 struct fileIdentDesc *cfi,
@@ -248,39 +195,6 @@ struct fileIdentDesc *udf_get_fileident(void *buffer, int bufsize, int *offset)
 	return fi;
 }
 
-#if 0
-static struct extent_ad *udf_get_fileextent(void *buffer, int bufsize, int *offset)
-{
-	struct extent_ad *ext;
-	struct fileEntry *fe;
-	uint8_t *ptr;
-
-	if ((!buffer) || (!offset)) {
-		printk(KERN_ERR "udf: udf_get_fileextent() invalidparms\n");
-		return NULL;
-	}
-
-	fe = (struct fileEntry *)buffer;
-
-	if (fe->descTag.tagIdent != cpu_to_le16(TAG_IDENT_FE)) {
-		udf_debug("0x%x != TAG_IDENT_FE\n",
-			  le16_to_cpu(fe->descTag.tagIdent));
-		return NULL;
-	}
-
-	ptr = (uint8_t *)(fe->extendedAttr) +
-		le32_to_cpu(fe->lengthExtendedAttr);
-
-	if ((*offset > 0) && (*offset < le32_to_cpu(fe->lengthAllocDescs)))
-		ptr += *offset;
-
-	ext = (struct extent_ad *)ptr;
-
-	*offset = *offset + sizeof(struct extent_ad);
-	return ext;
-}
-#endif
-
 struct short_ad *udf_get_fileshortad(uint8_t *ptr, int maxoffset, uint32_t *offset,
 			      int inc)
 {
diff --git a/fs/udf/lowlevel.c b/fs/udf/lowlevel.c
index 1b88fd5df05d..43e24a3b8e10 100644
--- a/fs/udf/lowlevel.c
+++ b/fs/udf/lowlevel.c
@@ -36,14 +36,10 @@ unsigned int udf_get_last_session(struct super_block *sb)
 	ms_info.addr_format = CDROM_LBA;
 	i = ioctl_by_bdev(bdev, CDROMMULTISESSION, (unsigned long)&ms_info);
 
-#define WE_OBEY_THE_WRITTEN_STANDARDS 1
-
 	if (i == 0) {
 		udf_debug("XA disk: %s, vol_desc_start=%d\n",
 			  (ms_info.xa_flag ? "yes" : "no"), ms_info.addr.lba);
-#if WE_OBEY_THE_WRITTEN_STANDARDS
 		if (ms_info.xa_flag) /* necessary for a valid ms_info.addr */
-#endif
 			vol_desc_start = ms_info.addr.lba;
 	} else {
 		udf_debug("CDROMMULTISESSION not supported: rc=%d\n", i);
-- 
cgit v1.2.3-70-g09d2


From 7c6e3d1aaeb3b25b49510d193000c27037004acb Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 16 Jul 2009 17:36:54 +0200
Subject: udf: Remove wrong assignment in udf_symlink

Recomputation of the pointer was wrong (it should have been just increment).
Luckily, we never use the computed value. Remove it.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/udf/namei.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 6a29fa34c478..21dad8c608f9 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -943,7 +943,6 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
 		pc->componentType = 1;
 		pc->lengthComponentIdent = 0;
 		pc->componentFileVersionNum = 0;
-		pc += sizeof(struct pathComponent);
 		elen += sizeof(struct pathComponent);
 	}
 
-- 
cgit v1.2.3-70-g09d2


From 81056dd04465902461b627169c4b4487a11acba1 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 16 Jul 2009 18:02:25 +0200
Subject: udf: Perform preallocation only for regular files

So far we preallocated blocks also for directories but that brings a
problem, when to get rid of preallocated blocks we don't need. So far
we removed them in udf_clear_inode() which has a disadvantage that
1) blocks are unavailable long after writing to a directory finished
   and thus one can get out of space unnecessarily early
2) releasing blocks from udf_clear_inode is problematic because VFS
   does not expect us to redirty inode there and it also slows down
   memory reclaim.

So preallocate blocks only for regular files where we can drop preallocation
in udf_release_file.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/udf/inode.c | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index e7533f785636..6d24c2c63f93 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -90,19 +90,16 @@ no_delete:
 }
 
 /*
- * If we are going to release inode from memory, we discard preallocation and
- * truncate last inode extent to proper length. We could use drop_inode() but
- * it's called under inode_lock and thus we cannot mark inode dirty there.  We
- * use clear_inode() but we have to make sure to write inode as it's not written
- * automatically.
+ * If we are going to release inode from memory, we truncate last inode extent
+ * to proper length. We could use drop_inode() but it's called under inode_lock
+ * and thus we cannot mark inode dirty there.  We use clear_inode() but we have
+ * to make sure to write inode as it's not written automatically.
  */
 void udf_clear_inode(struct inode *inode)
 {
 	struct udf_inode_info *iinfo;
 	if (!(inode->i_sb->s_flags & MS_RDONLY)) {
 		lock_kernel();
-		/* Discard preallocation for directories, symlinks, etc. */
-		udf_discard_prealloc(inode);
 		udf_truncate_tail_extent(inode);
 		unlock_kernel();
 		write_inode_now(inode, 0);
@@ -664,8 +661,12 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
 	udf_split_extents(inode, &c, offset, newblocknum, laarr, &endnum);
 
 #ifdef UDF_PREALLOCATE
-	/* preallocate blocks */
-	udf_prealloc_extents(inode, c, lastblock, laarr, &endnum);
+	/* We preallocate blocks only for regular files. It also makes sense
+	 * for directories but there's a problem when to drop the
+	 * preallocation. We might use some delayed work for that but I feel
+	 * it's overengineering for a filesystem like UDF. */
+	if (S_ISREG(inode->i_mode))
+		udf_prealloc_extents(inode, c, lastblock, laarr, &endnum);
 #endif
 
 	/* merge any continuous blocks in laarr */
-- 
cgit v1.2.3-70-g09d2


From cbc8cc33529b0e0e55ae0ff077b8cb0b71d54c7a Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Fri, 7 Aug 2009 00:27:27 +0200
Subject: udf: Fix possible corruption when close races with write

When we close a file, we remove preallocated blocks from it. But this
truncation was not protected by i_mutex and thus it could have raced with a
write through a different fd and cause crashes or even filesystem corruption.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/udf/file.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/udf/file.c b/fs/udf/file.c
index 7464305382b5..b80cbd78833c 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -193,9 +193,11 @@ int udf_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
 static int udf_release_file(struct inode *inode, struct file *filp)
 {
 	if (filp->f_mode & FMODE_WRITE) {
+		mutex_lock(&inode->i_mutex);
 		lock_kernel();
 		udf_discard_prealloc(inode);
 		unlock_kernel();
+		mutex_unlock(&inode->i_mutex);
 	}
 	return 0;
 }
-- 
cgit v1.2.3-70-g09d2


From 3661d28615ea580c1db02a972fd4d3898df1cb01 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 14 Sep 2009 22:59:50 -0400
Subject: ext4: Fix include/trace/events/ext4.h to work with Systemtap

Using relative pathnames in #include statements interacts badly with
SystemTap, since the fs/ext4/*.h header files are not packaged up as
part of a distribution kernel's header files.  Since systemtap doesn't
use TP_fast_assign(), we can use a blind structure definition and then
make sure the needed header files are defined before the ext4 source
files #include the trace/events/ext4.h header file.

https://bugzilla.redhat.com/show_bug.cgi?id=512478

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/super.c             | 1 +
 include/trace/events/ext4.h | 6 ++++--
 2 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 04c693357336..af95dd8ba54b 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -45,6 +45,7 @@
 #include "ext4_jbd2.h"
 #include "xattr.h"
 #include "acl.h"
+#include "mballoc.h"
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/ext4.h>
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index 68b53c7ef8a7..6fe6ce9ee071 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -5,10 +5,12 @@
 #define _TRACE_EXT4_H
 
 #include <linux/writeback.h>
-#include "../../../fs/ext4/ext4.h"
-#include "../../../fs/ext4/mballoc.h"
 #include <linux/tracepoint.h>
 
+struct ext4_allocation_context;
+struct ext4_allocation_request;
+struct ext4_prealloc_space;
+
 TRACE_EVENT(ext4_free_inode,
 	TP_PROTO(struct inode *inode),
 
-- 
cgit v1.2.3-70-g09d2


From a4dbd6740df0872cdf0a86841f75beec8381964d Mon Sep 17 00:00:00 2001
From: David Brownell <dbrownell@users.sourceforge.net>
Date: Wed, 24 Jun 2009 10:06:31 -0700
Subject: driver model: constify attribute groups

Let attribute group vectors be declared "const".  We'd
like to let most attribute metadata live in read-only
sections... this is a start.

Signed-off-by: David Brownell <dbrownell@users.sourceforge.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 block/genhd.c                              | 2 +-
 drivers/base/core.c                        | 4 ++--
 drivers/base/driver.c                      | 4 ++--
 drivers/block/cciss.c                      | 2 +-
 drivers/firewire/core-device.c             | 2 +-
 drivers/firmware/dmi-id.c                  | 2 +-
 drivers/infiniband/hw/ehca/ehca_main.c     | 2 +-
 drivers/infiniband/hw/ipath/ipath_kernel.h | 2 +-
 drivers/infiniband/hw/ipath/ipath_sysfs.c  | 2 +-
 drivers/input/input.c                      | 2 +-
 drivers/misc/enclosure.c                   | 4 ++--
 drivers/mmc/core/mmc.c                     | 2 +-
 drivers/mmc/core/sd.c                      | 2 +-
 drivers/mtd/mtdcore.c                      | 2 +-
 drivers/s390/cio/css.c                     | 2 +-
 drivers/s390/cio/device.c                  | 2 +-
 drivers/s390/net/netiucv.c                 | 2 +-
 drivers/scsi/scsi_priv.h                   | 2 +-
 drivers/scsi/scsi_sysfs.c                  | 4 ++--
 drivers/usb/core/endpoint.c                | 2 +-
 drivers/usb/core/sysfs.c                   | 4 ++--
 drivers/usb/core/usb.h                     | 4 ++--
 drivers/uwb/lc-dev.c                       | 2 +-
 fs/partitions/check.c                      | 2 +-
 include/linux/attribute_container.h        | 2 +-
 include/linux/device.h                     | 6 +++---
 include/linux/netdevice.h                  | 2 +-
 include/linux/transport_class.h            | 2 +-
 net/bluetooth/hci_sysfs.c                  | 4 ++--
 net/core/net-sysfs.c                       | 2 +-
 30 files changed, 39 insertions(+), 39 deletions(-)

(limited to 'fs')

diff --git a/block/genhd.c b/block/genhd.c
index 5b76bf55d05c..2ad91ddad8e2 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -903,7 +903,7 @@ static struct attribute_group disk_attr_group = {
 	.attrs = disk_attrs,
 };
 
-static struct attribute_group *disk_attr_groups[] = {
+static const struct attribute_group *disk_attr_groups[] = {
 	&disk_attr_group,
 	NULL
 };
diff --git a/drivers/base/core.c b/drivers/base/core.c
index 99dfe96fffcb..a992985d1fab 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -341,7 +341,7 @@ static void device_remove_attributes(struct device *dev,
 }
 
 static int device_add_groups(struct device *dev,
-			     struct attribute_group **groups)
+			     const struct attribute_group **groups)
 {
 	int error = 0;
 	int i;
@@ -361,7 +361,7 @@ static int device_add_groups(struct device *dev,
 }
 
 static void device_remove_groups(struct device *dev,
-				 struct attribute_group **groups)
+				 const struct attribute_group **groups)
 {
 	int i;
 
diff --git a/drivers/base/driver.c b/drivers/base/driver.c
index 8ae0f63602e0..ed2ebd3c287d 100644
--- a/drivers/base/driver.c
+++ b/drivers/base/driver.c
@@ -181,7 +181,7 @@ void put_driver(struct device_driver *drv)
 EXPORT_SYMBOL_GPL(put_driver);
 
 static int driver_add_groups(struct device_driver *drv,
-			     struct attribute_group **groups)
+			     const struct attribute_group **groups)
 {
 	int error = 0;
 	int i;
@@ -201,7 +201,7 @@ static int driver_add_groups(struct device_driver *drv,
 }
 
 static void driver_remove_groups(struct device_driver *drv,
-				 struct attribute_group **groups)
+				 const struct attribute_group **groups)
 {
 	int i;
 
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 0589dfbbd7db..d8372b432826 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -572,7 +572,7 @@ static struct attribute_group cciss_dev_attr_group = {
 	.attrs = cciss_dev_attrs,
 };
 
-static struct attribute_group *cciss_dev_attr_groups[] = {
+static const struct attribute_group *cciss_dev_attr_groups[] = {
 	&cciss_dev_attr_group,
 	NULL
 };
diff --git a/drivers/firewire/core-device.c b/drivers/firewire/core-device.c
index 97e656af2d22..9d0dfcbe2c1c 100644
--- a/drivers/firewire/core-device.c
+++ b/drivers/firewire/core-device.c
@@ -312,7 +312,7 @@ static void init_fw_attribute_group(struct device *dev,
 	group->groups[0] = &group->group;
 	group->groups[1] = NULL;
 	group->group.attrs = group->attrs;
-	dev->groups = group->groups;
+	dev->groups = (const struct attribute_group **) group->groups;
 }
 
 static ssize_t modalias_show(struct device *dev,
diff --git a/drivers/firmware/dmi-id.c b/drivers/firmware/dmi-id.c
index 5a76d056b9d0..dbdf6fadfc79 100644
--- a/drivers/firmware/dmi-id.c
+++ b/drivers/firmware/dmi-id.c
@@ -139,7 +139,7 @@ static struct attribute_group sys_dmi_attribute_group = {
 	.attrs = sys_dmi_attributes,
 };
 
-static struct attribute_group* sys_dmi_attribute_groups[] = {
+static const struct attribute_group* sys_dmi_attribute_groups[] = {
 	&sys_dmi_attribute_group,
 	NULL
 };
diff --git a/drivers/infiniband/hw/ehca/ehca_main.c b/drivers/infiniband/hw/ehca/ehca_main.c
index 5b635aa5947e..fb2d83c5bf01 100644
--- a/drivers/infiniband/hw/ehca/ehca_main.c
+++ b/drivers/infiniband/hw/ehca/ehca_main.c
@@ -623,7 +623,7 @@ static struct attribute_group ehca_drv_attr_grp = {
 	.attrs = ehca_drv_attrs
 };
 
-static struct attribute_group *ehca_drv_attr_groups[] = {
+static const struct attribute_group *ehca_drv_attr_groups[] = {
 	&ehca_drv_attr_grp,
 	NULL,
 };
diff --git a/drivers/infiniband/hw/ipath/ipath_kernel.h b/drivers/infiniband/hw/ipath/ipath_kernel.h
index 6ba4861dd6ac..b3d7efcdf021 100644
--- a/drivers/infiniband/hw/ipath/ipath_kernel.h
+++ b/drivers/infiniband/hw/ipath/ipath_kernel.h
@@ -1286,7 +1286,7 @@ struct device_driver;
 
 extern const char ib_ipath_version[];
 
-extern struct attribute_group *ipath_driver_attr_groups[];
+extern const struct attribute_group *ipath_driver_attr_groups[];
 
 int ipath_device_create_group(struct device *, struct ipath_devdata *);
 void ipath_device_remove_group(struct device *, struct ipath_devdata *);
diff --git a/drivers/infiniband/hw/ipath/ipath_sysfs.c b/drivers/infiniband/hw/ipath/ipath_sysfs.c
index a6c8efbdc0c9..b8cb2f145ae4 100644
--- a/drivers/infiniband/hw/ipath/ipath_sysfs.c
+++ b/drivers/infiniband/hw/ipath/ipath_sysfs.c
@@ -1069,7 +1069,7 @@ static ssize_t show_tempsense(struct device *dev,
 	return ret;
 }
 
-struct attribute_group *ipath_driver_attr_groups[] = {
+const struct attribute_group *ipath_driver_attr_groups[] = {
 	&driver_attr_group,
 	NULL,
 };
diff --git a/drivers/input/input.c b/drivers/input/input.c
index 7c237e6ac711..851791d955f3 100644
--- a/drivers/input/input.c
+++ b/drivers/input/input.c
@@ -1144,7 +1144,7 @@ static struct attribute_group input_dev_caps_attr_group = {
 	.attrs	= input_dev_caps_attrs,
 };
 
-static struct attribute_group *input_dev_attr_groups[] = {
+static const struct attribute_group *input_dev_attr_groups[] = {
 	&input_dev_attr_group,
 	&input_dev_id_attr_group,
 	&input_dev_caps_attr_group,
diff --git a/drivers/misc/enclosure.c b/drivers/misc/enclosure.c
index 7b039306037f..e9eae4a78402 100644
--- a/drivers/misc/enclosure.c
+++ b/drivers/misc/enclosure.c
@@ -238,7 +238,7 @@ static void enclosure_component_release(struct device *dev)
 	put_device(dev->parent);
 }
 
-static struct attribute_group *enclosure_groups[];
+static const struct attribute_group *enclosure_groups[];
 
 /**
  * enclosure_component_register - add a particular component to an enclosure
@@ -536,7 +536,7 @@ static struct attribute_group enclosure_group = {
 	.attrs = enclosure_component_attrs,
 };
 
-static struct attribute_group *enclosure_groups[] = {
+static const struct attribute_group *enclosure_groups[] = {
 	&enclosure_group,
 	NULL
 };
diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c
index 06084dbf1277..2fb9d5f271ea 100644
--- a/drivers/mmc/core/mmc.c
+++ b/drivers/mmc/core/mmc.c
@@ -276,7 +276,7 @@ static struct attribute_group mmc_std_attr_group = {
 	.attrs = mmc_std_attrs,
 };
 
-static struct attribute_group *mmc_attr_groups[] = {
+static const struct attribute_group *mmc_attr_groups[] = {
 	&mmc_std_attr_group,
 	NULL,
 };
diff --git a/drivers/mmc/core/sd.c b/drivers/mmc/core/sd.c
index cd81c395e164..7ad646fe077e 100644
--- a/drivers/mmc/core/sd.c
+++ b/drivers/mmc/core/sd.c
@@ -314,7 +314,7 @@ static struct attribute_group sd_std_attr_group = {
 	.attrs = sd_std_attrs,
 };
 
-static struct attribute_group *sd_attr_groups[] = {
+static const struct attribute_group *sd_attr_groups[] = {
 	&sd_std_attr_group,
 	NULL,
 };
diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c
index 00ebf7af7467..69007a6eff50 100644
--- a/drivers/mtd/mtdcore.c
+++ b/drivers/mtd/mtdcore.c
@@ -217,7 +217,7 @@ struct attribute_group mtd_group = {
 	.attrs		= mtd_attrs,
 };
 
-struct attribute_group *mtd_groups[] = {
+const struct attribute_group *mtd_groups[] = {
 	&mtd_group,
 	NULL,
 };
diff --git a/drivers/s390/cio/css.c b/drivers/s390/cio/css.c
index e995123fd805..393c73c47f87 100644
--- a/drivers/s390/cio/css.c
+++ b/drivers/s390/cio/css.c
@@ -266,7 +266,7 @@ static struct attribute_group subch_attr_group = {
 	.attrs = subch_attrs,
 };
 
-static struct attribute_group *default_subch_attr_groups[] = {
+static const struct attribute_group *default_subch_attr_groups[] = {
 	&subch_attr_group,
 	NULL,
 };
diff --git a/drivers/s390/cio/device.c b/drivers/s390/cio/device.c
index 0f95405c2c5e..6527f3f34493 100644
--- a/drivers/s390/cio/device.c
+++ b/drivers/s390/cio/device.c
@@ -656,7 +656,7 @@ static struct attribute_group ccwdev_attr_group = {
 	.attrs = ccwdev_attrs,
 };
 
-static struct attribute_group *ccwdev_attr_groups[] = {
+static const struct attribute_group *ccwdev_attr_groups[] = {
 	&ccwdev_attr_group,
 	NULL,
 };
diff --git a/drivers/s390/net/netiucv.c b/drivers/s390/net/netiucv.c
index 9215fbbccc08..a4b2c576144b 100644
--- a/drivers/s390/net/netiucv.c
+++ b/drivers/s390/net/netiucv.c
@@ -2159,7 +2159,7 @@ static struct attribute_group netiucv_drv_attr_group = {
 	.attrs = netiucv_drv_attrs,
 };
 
-static struct attribute_group *netiucv_drv_attr_groups[] = {
+static const struct attribute_group *netiucv_drv_attr_groups[] = {
 	&netiucv_drv_attr_group,
 	NULL,
 };
diff --git a/drivers/scsi/scsi_priv.h b/drivers/scsi/scsi_priv.h
index 021e503c8c44..1fbf7c78bba0 100644
--- a/drivers/scsi/scsi_priv.h
+++ b/drivers/scsi/scsi_priv.h
@@ -132,7 +132,7 @@ extern struct scsi_transport_template blank_transport_template;
 extern void __scsi_remove_device(struct scsi_device *);
 
 extern struct bus_type scsi_bus_type;
-extern struct attribute_group *scsi_sysfs_shost_attr_groups[];
+extern const struct attribute_group *scsi_sysfs_shost_attr_groups[];
 
 /* scsi_netlink.c */
 #ifdef CONFIG_SCSI_NETLINK
diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c
index 91482f2dcc50..fde54537d715 100644
--- a/drivers/scsi/scsi_sysfs.c
+++ b/drivers/scsi/scsi_sysfs.c
@@ -275,7 +275,7 @@ struct attribute_group scsi_shost_attr_group = {
 	.attrs =	scsi_sysfs_shost_attrs,
 };
 
-struct attribute_group *scsi_sysfs_shost_attr_groups[] = {
+const struct attribute_group *scsi_sysfs_shost_attr_groups[] = {
 	&scsi_shost_attr_group,
 	NULL
 };
@@ -745,7 +745,7 @@ static struct attribute_group scsi_sdev_attr_group = {
 	.attrs =	scsi_sdev_attrs,
 };
 
-static struct attribute_group *scsi_sdev_attr_groups[] = {
+static const struct attribute_group *scsi_sdev_attr_groups[] = {
 	&scsi_sdev_attr_group,
 	NULL
 };
diff --git a/drivers/usb/core/endpoint.c b/drivers/usb/core/endpoint.c
index bc39fc40bbde..fdfaa7885515 100644
--- a/drivers/usb/core/endpoint.c
+++ b/drivers/usb/core/endpoint.c
@@ -154,7 +154,7 @@ static struct attribute *ep_dev_attrs[] = {
 static struct attribute_group ep_dev_attr_grp = {
 	.attrs = ep_dev_attrs,
 };
-static struct attribute_group *ep_dev_groups[] = {
+static const struct attribute_group *ep_dev_groups[] = {
 	&ep_dev_attr_grp,
 	NULL
 };
diff --git a/drivers/usb/core/sysfs.c b/drivers/usb/core/sysfs.c
index b5c72e458943..7ec3041ae79e 100644
--- a/drivers/usb/core/sysfs.c
+++ b/drivers/usb/core/sysfs.c
@@ -573,7 +573,7 @@ static struct attribute_group dev_string_attr_grp = {
 	.is_visible =	dev_string_attrs_are_visible,
 };
 
-struct attribute_group *usb_device_groups[] = {
+const struct attribute_group *usb_device_groups[] = {
 	&dev_attr_grp,
 	&dev_string_attr_grp,
 	NULL
@@ -799,7 +799,7 @@ static struct attribute_group intf_assoc_attr_grp = {
 	.is_visible =	intf_assoc_attrs_are_visible,
 };
 
-struct attribute_group *usb_interface_groups[] = {
+const struct attribute_group *usb_interface_groups[] = {
 	&intf_attr_grp,
 	&intf_assoc_attr_grp,
 	NULL
diff --git a/drivers/usb/core/usb.h b/drivers/usb/core/usb.h
index e2a8cfaade1d..c0e0ae2bb8e7 100644
--- a/drivers/usb/core/usb.h
+++ b/drivers/usb/core/usb.h
@@ -152,8 +152,8 @@ static inline int is_active(const struct usb_interface *f)
 extern const char *usbcore_name;
 
 /* sysfs stuff */
-extern struct attribute_group *usb_device_groups[];
-extern struct attribute_group *usb_interface_groups[];
+extern const struct attribute_group *usb_device_groups[];
+extern const struct attribute_group *usb_interface_groups[];
 
 /* usbfs stuff */
 extern struct mutex usbfs_mutex;
diff --git a/drivers/uwb/lc-dev.c b/drivers/uwb/lc-dev.c
index e9fe1bb7eb23..1097e81b56d0 100644
--- a/drivers/uwb/lc-dev.c
+++ b/drivers/uwb/lc-dev.c
@@ -255,7 +255,7 @@ static struct attribute_group dev_attr_group = {
 	.attrs = dev_attrs,
 };
 
-static struct attribute_group *groups[] = {
+static const struct attribute_group *groups[] = {
 	&dev_attr_group,
 	NULL,
 };
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 619ba99dfe39..fbeaddf595d3 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -312,7 +312,7 @@ static struct attribute_group part_attr_group = {
 	.attrs = part_attrs,
 };
 
-static struct attribute_group *part_attr_groups[] = {
+static const struct attribute_group *part_attr_groups[] = {
 	&part_attr_group,
 #ifdef CONFIG_BLK_DEV_IO_TRACE
 	&blk_trace_attr_group,
diff --git a/include/linux/attribute_container.h b/include/linux/attribute_container.h
index 794ad74b1d61..c3ab81428c66 100644
--- a/include/linux/attribute_container.h
+++ b/include/linux/attribute_container.h
@@ -17,7 +17,7 @@ struct attribute_container {
 	struct list_head	node;
 	struct klist		containers;
 	struct class		*class;
-	struct attribute_group	*grp;
+	const struct attribute_group *grp;
 	struct device_attribute **attrs;
 	int (*match)(struct attribute_container *, struct device *);
 #define	ATTRIBUTE_CONTAINER_NO_CLASSDEVS	0x01
diff --git a/include/linux/device.h b/include/linux/device.h
index 3f33f17f556c..e19e40a3dcbe 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -131,7 +131,7 @@ struct device_driver {
 	void (*shutdown) (struct device *dev);
 	int (*suspend) (struct device *dev, pm_message_t state);
 	int (*resume) (struct device *dev);
-	struct attribute_group **groups;
+	const struct attribute_group **groups;
 
 	const struct dev_pm_ops *pm;
 
@@ -288,7 +288,7 @@ extern void class_destroy(struct class *cls);
  */
 struct device_type {
 	const char *name;
-	struct attribute_group **groups;
+	const struct attribute_group **groups;
 	int (*uevent)(struct device *dev, struct kobj_uevent_env *env);
 	char *(*nodename)(struct device *dev);
 	void (*release)(struct device *dev);
@@ -412,7 +412,7 @@ struct device {
 
 	struct klist_node	knode_class;
 	struct class		*class;
-	struct attribute_group	**groups;	/* optional groups */
+	const struct attribute_group **groups;	/* optional groups */
 
 	void	(*release)(struct device *dev);
 };
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 65ee1929b2b1..a9aa4b5917d7 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -895,7 +895,7 @@ struct net_device
 	/* class/net/name entry */
 	struct device		dev;
 	/* space for optional statistics and wireless sysfs groups */
-	struct attribute_group  *sysfs_groups[3];
+	const struct attribute_group *sysfs_groups[3];
 
 	/* rtnetlink link ops */
 	const struct rtnl_link_ops *rtnl_link_ops;
diff --git a/include/linux/transport_class.h b/include/linux/transport_class.h
index eaec1ea9558e..9ae8da3e6407 100644
--- a/include/linux/transport_class.h
+++ b/include/linux/transport_class.h
@@ -55,7 +55,7 @@ struct anon_transport_class cls = {				\
 
 struct transport_container {
 	struct attribute_container ac;
-	struct attribute_group *statistics;
+	const struct attribute_group *statistics;
 };
 
 #define attribute_container_to_transport_container(x) \
diff --git a/net/bluetooth/hci_sysfs.c b/net/bluetooth/hci_sysfs.c
index 95f7a7a544b4..7f939ce29801 100644
--- a/net/bluetooth/hci_sysfs.c
+++ b/net/bluetooth/hci_sysfs.c
@@ -68,7 +68,7 @@ static struct attribute_group bt_link_group = {
 	.attrs = bt_link_attrs,
 };
 
-static struct attribute_group *bt_link_groups[] = {
+static const struct attribute_group *bt_link_groups[] = {
 	&bt_link_group,
 	NULL
 };
@@ -392,7 +392,7 @@ static struct attribute_group bt_host_group = {
 	.attrs = bt_host_attrs,
 };
 
-static struct attribute_group *bt_host_groups[] = {
+static const struct attribute_group *bt_host_groups[] = {
 	&bt_host_group,
 	NULL
 };
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index ad91e9e5f475..7d4c57523b09 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -493,7 +493,7 @@ void netdev_unregister_kobject(struct net_device * net)
 int netdev_register_kobject(struct net_device *net)
 {
 	struct device *dev = &(net->dev);
-	struct attribute_group **groups = net->sysfs_groups;
+	const struct attribute_group **groups = net->sysfs_groups;
 
 	dev->class = &net_class;
 	dev->platform_data = net;
-- 
cgit v1.2.3-70-g09d2


From 361735fd8ff8f7d8b9f0e134d0d99d99ee193d92 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Fri, 28 Aug 2009 23:55:00 +0400
Subject: xfs: switch to seq_file

create_proc_read_entry() is getting deprecated.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Reviewed-by: Alex Elder <aelder@sgi.com>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/xfs_stats.c | 51 +++++++++++++----------------
 fs/xfs/quota/xfs_qm_stats.c  | 78 ++++++++++++++++++--------------------------
 2 files changed, 55 insertions(+), 74 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_stats.c b/fs/xfs/linux-2.6/xfs_stats.c
index c3526d445f6a..76fdc5861932 100644
--- a/fs/xfs/linux-2.6/xfs_stats.c
+++ b/fs/xfs/linux-2.6/xfs_stats.c
@@ -20,16 +20,9 @@
 
 DEFINE_PER_CPU(struct xfsstats, xfsstats);
 
-STATIC int
-xfs_read_xfsstats(
-	char		*buffer,
-	char		**start,
-	off_t		offset,
-	int		count,
-	int		*eof,
-	void		*data)
+static int xfs_stat_proc_show(struct seq_file *m, void *v)
 {
-	int		c, i, j, len, val;
+	int		c, i, j, val;
 	__uint64_t	xs_xstrat_bytes = 0;
 	__uint64_t	xs_write_bytes = 0;
 	__uint64_t	xs_read_bytes = 0;
@@ -60,18 +53,18 @@ xfs_read_xfsstats(
 	};
 
 	/* Loop over all stats groups */
-	for (i=j=len = 0; i < ARRAY_SIZE(xstats); i++) {
-		len += sprintf(buffer + len, "%s", xstats[i].desc);
+	for (i=j = 0; i < ARRAY_SIZE(xstats); i++) {
+		seq_printf(m, "%s", xstats[i].desc);
 		/* inner loop does each group */
 		while (j < xstats[i].endpoint) {
 			val = 0;
 			/* sum over all cpus */
 			for_each_possible_cpu(c)
 				val += *(((__u32*)&per_cpu(xfsstats, c) + j));
-			len += sprintf(buffer + len, " %u", val);
+			seq_printf(m, " %u", val);
 			j++;
 		}
-		buffer[len++] = '\n';
+		seq_putc(m, '\n');
 	}
 	/* extra precision counters */
 	for_each_possible_cpu(i) {
@@ -80,36 +73,38 @@ xfs_read_xfsstats(
 		xs_read_bytes += per_cpu(xfsstats, i).xs_read_bytes;
 	}
 
-	len += sprintf(buffer + len, "xpc %Lu %Lu %Lu\n",
+	seq_printf(m, "xpc %Lu %Lu %Lu\n",
 			xs_xstrat_bytes, xs_write_bytes, xs_read_bytes);
-	len += sprintf(buffer + len, "debug %u\n",
+	seq_printf(m, "debug %u\n",
 #if defined(DEBUG)
 		1);
 #else
 		0);
 #endif
+	return 0;
+}
 
-	if (offset >= len) {
-		*start = buffer;
-		*eof = 1;
-		return 0;
-	}
-	*start = buffer + offset;
-	if ((len -= offset) > count)
-		return count;
-	*eof = 1;
-
-	return len;
+static int xfs_stat_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, xfs_stat_proc_show, NULL);
 }
 
+static const struct file_operations xfs_stat_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= xfs_stat_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
 int
 xfs_init_procfs(void)
 {
 	if (!proc_mkdir("fs/xfs", NULL))
 		goto out;
 
-	if (!create_proc_read_entry("fs/xfs/stat", 0, NULL,
-			xfs_read_xfsstats, NULL))
+	if (!proc_create("fs/xfs/stat", 0, NULL,
+			 &xfs_stat_proc_fops))
 		goto out_remove_entry;
 	return 0;
 
diff --git a/fs/xfs/quota/xfs_qm_stats.c b/fs/xfs/quota/xfs_qm_stats.c
index 21b08c0396a1..83e7ea3e25fa 100644
--- a/fs/xfs/quota/xfs_qm_stats.c
+++ b/fs/xfs/quota/xfs_qm_stats.c
@@ -48,50 +48,34 @@
 
 struct xqmstats xqmstats;
 
-STATIC int
-xfs_qm_read_xfsquota(
-	char		*buffer,
-	char		**start,
-	off_t		offset,
-	int		count,
-	int		*eof,
-	void		*data)
+static int xqm_proc_show(struct seq_file *m, void *v)
 {
-	int		len;
-
 	/* maximum; incore; ratio free to inuse; freelist */
-	len = sprintf(buffer, "%d\t%d\t%d\t%u\n",
+	seq_printf(m, "%d\t%d\t%d\t%u\n",
 			ndquot,
 			xfs_Gqm? atomic_read(&xfs_Gqm->qm_totaldquots) : 0,
 			xfs_Gqm? xfs_Gqm->qm_dqfree_ratio : 0,
 			xfs_Gqm? xfs_Gqm->qm_dqfreelist.qh_nelems : 0);
-
-	if (offset >= len) {
-		*start = buffer;
-		*eof = 1;
-		return 0;
-	}
-	*start = buffer + offset;
-	if ((len -= offset) > count)
-		return count;
-	*eof = 1;
-
-	return len;
+	return 0;
 }
 
-STATIC int
-xfs_qm_read_stats(
-	char		*buffer,
-	char		**start,
-	off_t		offset,
-	int		count,
-	int		*eof,
-	void		*data)
+static int xqm_proc_open(struct inode *inode, struct file *file)
 {
-	int		len;
+	return single_open(file, xqm_proc_show, NULL);
+}
+
+static const struct file_operations xqm_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= xqm_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
 
+static int xqmstat_proc_show(struct seq_file *m, void *v)
+{
 	/* quota performance statistics */
-	len = sprintf(buffer, "qm %u %u %u %u %u %u %u %u\n",
+	seq_printf(m, "qm %u %u %u %u %u %u %u %u\n",
 			xqmstats.xs_qm_dqreclaims,
 			xqmstats.xs_qm_dqreclaim_misses,
 			xqmstats.xs_qm_dquot_dups,
@@ -100,25 +84,27 @@ xfs_qm_read_stats(
 			xqmstats.xs_qm_dqwants,
 			xqmstats.xs_qm_dqshake_reclaims,
 			xqmstats.xs_qm_dqinact_reclaims);
+	return 0;
+}
 
-	if (offset >= len) {
-		*start = buffer;
-		*eof = 1;
-		return 0;
-	}
-	*start = buffer + offset;
-	if ((len -= offset) > count)
-		return count;
-	*eof = 1;
-
-	return len;
+static int xqmstat_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, xqmstat_proc_show, NULL);
 }
 
+static const struct file_operations xqmstat_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= xqmstat_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
 void
 xfs_qm_init_procfs(void)
 {
-	create_proc_read_entry("fs/xfs/xqmstat", 0, NULL, xfs_qm_read_stats, NULL);
-	create_proc_read_entry("fs/xfs/xqm", 0, NULL, xfs_qm_read_xfsquota, NULL);
+	proc_create("fs/xfs/xqmstat", 0, NULL, &xqmstat_proc_fops);
+	proc_create("fs/xfs/xqm", 0, NULL, &xqm_proc_fops);
 }
 
 void
-- 
cgit v1.2.3-70-g09d2


From 9ef96da6ec5e1b4cf7eb8e30852cd88ec7d5fdc0 Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinder@kernel.org>
Date: Sun, 9 Aug 2009 20:23:12 +0530
Subject: xfs: includecheck fix for fs/xfs/xfs_iops.c

fix the following 'make includecheck' warning:

  fs/xfs/linux-2.6/xfs_iops.c: xfs_acl.h is included more than once.

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Reviewed-by: Alex Elder <aelder@sgi.com>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/xfs_iops.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 8070b34cc287..626b474b2cdd 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -43,7 +43,6 @@
 #include "xfs_error.h"
 #include "xfs_itable.h"
 #include "xfs_rw.h"
-#include "xfs_acl.h"
 #include "xfs_attr.h"
 #include "xfs_buf_item.h"
 #include "xfs_utils.h"
-- 
cgit v1.2.3-70-g09d2


From d6db07ded51c5fb4df2f4a32e6a41e9bb5db7fc4 Mon Sep 17 00:00:00 2001
From: Csaba Henk <csaba@gluster.com>
Date: Mon, 24 Aug 2009 06:14:07 +0200
Subject: fuse: use drop_nlink() instead of direct nlink manipulation

drop_nlink() is the API function to decrease the link count of an inode.
However, at a place the control filesystem used the decrement operator
on i_nlink directly. Fix this.

Cc: Anand Avati <avati@gluster.com>
Signed-off-by: Csaba Henk <csaba@gluster.com>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/fuse/control.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 99c99dfb0373..218d514924c0 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -156,7 +156,7 @@ void fuse_ctl_remove_conn(struct fuse_conn *fc)
 		d_drop(dentry);
 		dput(dentry);
 	}
-	fuse_control_sb->s_root->d_inode->i_nlink--;
+	drop_nlink(fuse_control_sb->s_root->d_inode);
 }
 
 static int fuse_ctl_fill_super(struct super_block *sb, void *data, int silent)
-- 
cgit v1.2.3-70-g09d2


From 487ea5af6358cb27c994e2cf056d4ee0872e43c3 Mon Sep 17 00:00:00 2001
From: Csaba Henk <csaba@gluster.com>
Date: Wed, 26 Aug 2009 19:17:22 +0200
Subject: fuse: limit user-specified values of max background requests

An untrusted user could DoS the system if s/he were allowed to accumulate an
arbitrary number of pending background requests by setting the above limits
to extremely high values in INIT. This patch excludes this possibility by
imposing global upper limits on the possible values of per-mount "max
background requests" and "congestion threshold" parameters for unprivileged
FUSE filesystems.

These global limits are implemented as module parameters.

Signed-off-by: Csaba Henk <csaba@gluster.com>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/fuse/inode.c | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 72 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 9aa6f46d0c32..e54ddbda208c 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -14,6 +14,7 @@
 #include <linux/seq_file.h>
 #include <linux/init.h>
 #include <linux/module.h>
+#include <linux/moduleparam.h>
 #include <linux/parser.h>
 #include <linux/statfs.h>
 #include <linux/random.h>
@@ -28,6 +29,24 @@ static struct kmem_cache *fuse_inode_cachep;
 struct list_head fuse_conn_list;
 DEFINE_MUTEX(fuse_mutex);
 
+static int set_global_limit(const char *val, struct kernel_param *kp);
+
+static unsigned max_user_bgreq;
+module_param_call(max_user_bgreq, set_global_limit, param_get_uint,
+		  &max_user_bgreq, 0644);
+__MODULE_PARM_TYPE(max_user_bgreq, "uint");
+MODULE_PARM_DESC(max_user_bgreq,
+ "Global limit for the maximum number of backgrounded requests an "
+ "unprivileged user can set");
+
+static unsigned max_user_congthresh;
+module_param_call(max_user_congthresh, set_global_limit, param_get_uint,
+		  &max_user_congthresh, 0644);
+__MODULE_PARM_TYPE(max_user_congthresh, "uint");
+MODULE_PARM_DESC(max_user_congthresh,
+ "Global limit for the maximum congestion threshold an "
+ "unprivileged user can set");
+
 #define FUSE_SUPER_MAGIC 0x65735546
 
 #define FUSE_DEFAULT_BLKSIZE 512
@@ -735,6 +754,54 @@ static const struct super_operations fuse_super_operations = {
 	.show_options	= fuse_show_options,
 };
 
+static void sanitize_global_limit(unsigned *limit)
+{
+	if (*limit == 0)
+		*limit = ((num_physpages << PAGE_SHIFT) >> 13) /
+			 sizeof(struct fuse_req);
+
+	if (*limit >= 1 << 16)
+		*limit = (1 << 16) - 1;
+}
+
+static int set_global_limit(const char *val, struct kernel_param *kp)
+{
+	int rv;
+
+	rv = param_set_uint(val, kp);
+	if (rv)
+		return rv;
+
+	sanitize_global_limit((unsigned *)kp->arg);
+
+	return 0;
+}
+
+static void process_init_limits(struct fuse_conn *fc, struct fuse_init_out *arg)
+{
+	int cap_sys_admin = capable(CAP_SYS_ADMIN);
+
+	if (arg->minor < 13)
+		return;
+
+	sanitize_global_limit(&max_user_bgreq);
+	sanitize_global_limit(&max_user_congthresh);
+
+	if (arg->max_background) {
+		fc->max_background = arg->max_background;
+
+		if (!cap_sys_admin && fc->max_background > max_user_bgreq)
+			fc->max_background = max_user_bgreq;
+	}
+	if (arg->congestion_threshold) {
+		fc->congestion_threshold = arg->congestion_threshold;
+
+		if (!cap_sys_admin &&
+		    fc->congestion_threshold > max_user_congthresh)
+			fc->congestion_threshold = max_user_congthresh;
+	}
+}
+
 static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
 {
 	struct fuse_init_out *arg = &req->misc.init_out;
@@ -744,12 +811,8 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
 	else {
 		unsigned long ra_pages;
 
-		if (arg->minor >= 13) {
-			if (arg->max_background)
-				fc->max_background = arg->max_background;
-			if (arg->congestion_threshold)
-				fc->congestion_threshold = arg->congestion_threshold;
-		}
+		process_init_limits(fc, arg);
+
 		if (arg->minor >= 6) {
 			ra_pages = arg->max_readahead / PAGE_CACHE_SIZE;
 			if (arg->flags & FUSE_ASYNC_READ)
@@ -1161,6 +1224,9 @@ static int __init fuse_init(void)
 	if (res)
 		goto err_sysfs_cleanup;
 
+	sanitize_global_limit(&max_user_bgreq);
+	sanitize_global_limit(&max_user_congthresh);
+
 	return 0;
 
  err_sysfs_cleanup:
-- 
cgit v1.2.3-70-g09d2


From 79a9d99434b104c562f30f21b75317667f444793 Mon Sep 17 00:00:00 2001
From: Csaba Henk <csaba@gluster.com>
Date: Wed, 26 Aug 2009 19:18:24 +0200
Subject: fuse: add fusectl interface to max_background

Make the max_background and congestion_threshold parameters of a FUSE
mount tunable at runtime by adding the respective knobs to its directory
within the fusectl filesystem.

Signed-off-by: Csaba Henk <csaba@gluster.com>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/fuse/control.c | 136 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/fuse/fuse_i.h  |   6 ++-
 fs/fuse/inode.c   |   4 +-
 3 files changed, 141 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 218d514924c0..3773fd63d2f9 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -61,6 +61,121 @@ static ssize_t fuse_conn_waiting_read(struct file *file, char __user *buf,
 	return simple_read_from_buffer(buf, len, ppos, tmp, size);
 }
 
+static ssize_t fuse_conn_limit_read(struct file *file, char __user *buf,
+				    size_t len, loff_t *ppos, unsigned val)
+{
+	char tmp[32];
+	size_t size = sprintf(tmp, "%u\n", val);
+
+	return simple_read_from_buffer(buf, len, ppos, tmp, size);
+}
+
+static ssize_t fuse_conn_limit_write(struct file *file, const char __user *buf,
+				     size_t count, loff_t *ppos, unsigned *val,
+				     unsigned global_limit)
+{
+	unsigned long t;
+	char tmp[32];
+	unsigned limit = (1 << 16) - 1;
+	int err;
+
+	if (*ppos || count >= sizeof(tmp) - 1)
+		return -EINVAL;
+
+	if (copy_from_user(tmp, buf, count))
+		return -EINVAL;
+
+	tmp[count] = '\0';
+
+	err = strict_strtoul(tmp, 0, &t);
+	if (err)
+		return err;
+
+	if (!capable(CAP_SYS_ADMIN))
+		limit = min(limit, global_limit);
+
+	if (t > limit)
+		return -EINVAL;
+
+	*val = t;
+
+	return count;
+}
+
+static ssize_t fuse_conn_max_background_read(struct file *file,
+					     char __user *buf, size_t len,
+					     loff_t *ppos)
+{
+	struct fuse_conn *fc;
+	unsigned val;
+
+	fc = fuse_ctl_file_conn_get(file);
+	if (!fc)
+		return 0;
+
+	val = fc->max_background;
+	fuse_conn_put(fc);
+
+	return fuse_conn_limit_read(file, buf, len, ppos, val);
+}
+
+static ssize_t fuse_conn_max_background_write(struct file *file,
+					      const char __user *buf,
+					      size_t count, loff_t *ppos)
+{
+	unsigned val;
+	ssize_t ret;
+
+	ret = fuse_conn_limit_write(file, buf, count, ppos, &val,
+				    max_user_bgreq);
+	if (ret > 0) {
+		struct fuse_conn *fc = fuse_ctl_file_conn_get(file);
+		if (fc) {
+			fc->max_background = val;
+			fuse_conn_put(fc);
+		}
+	}
+
+	return ret;
+}
+
+static ssize_t fuse_conn_congestion_threshold_read(struct file *file,
+						   char __user *buf, size_t len,
+						   loff_t *ppos)
+{
+	struct fuse_conn *fc;
+	unsigned val;
+
+	fc = fuse_ctl_file_conn_get(file);
+	if (!fc)
+		return 0;
+
+	val = fc->congestion_threshold;
+	fuse_conn_put(fc);
+
+	return fuse_conn_limit_read(file, buf, len, ppos, val);
+}
+
+static ssize_t fuse_conn_congestion_threshold_write(struct file *file,
+						    const char __user *buf,
+						    size_t count, loff_t *ppos)
+{
+	unsigned val;
+	ssize_t ret;
+
+	ret = fuse_conn_limit_write(file, buf, count, ppos, &val,
+				    max_user_congthresh);
+	if (ret > 0) {
+		struct fuse_conn *fc = fuse_ctl_file_conn_get(file);
+		if (fc) {
+			fc->congestion_threshold = val;
+			fuse_conn_put(fc);
+		}
+	}
+
+	return ret;
+}
+
 static const struct file_operations fuse_ctl_abort_ops = {
 	.open = nonseekable_open,
 	.write = fuse_conn_abort_write,
@@ -71,6 +186,18 @@ static const struct file_operations fuse_ctl_waiting_ops = {
 	.read = fuse_conn_waiting_read,
 };
 
+static const struct file_operations fuse_conn_max_background_ops = {
+	.open = nonseekable_open,
+	.read = fuse_conn_max_background_read,
+	.write = fuse_conn_max_background_write,
+};
+
+static const struct file_operations fuse_conn_congestion_threshold_ops = {
+	.open = nonseekable_open,
+	.read = fuse_conn_congestion_threshold_read,
+	.write = fuse_conn_congestion_threshold_write,
+};
+
 static struct dentry *fuse_ctl_add_dentry(struct dentry *parent,
 					  struct fuse_conn *fc,
 					  const char *name,
@@ -127,9 +254,14 @@ int fuse_ctl_add_conn(struct fuse_conn *fc)
 		goto err;
 
 	if (!fuse_ctl_add_dentry(parent, fc, "waiting", S_IFREG | 0400, 1,
-				NULL, &fuse_ctl_waiting_ops) ||
+				 NULL, &fuse_ctl_waiting_ops) ||
 	    !fuse_ctl_add_dentry(parent, fc, "abort", S_IFREG | 0200, 1,
-				 NULL, &fuse_ctl_abort_ops))
+				 NULL, &fuse_ctl_abort_ops) ||
+	    !fuse_ctl_add_dentry(parent, fc, "max_background", S_IFREG | 0600,
+				 1, NULL, &fuse_conn_max_background_ops) ||
+	    !fuse_ctl_add_dentry(parent, fc, "congestion_threshold",
+				 S_IFREG | 0600, 1, NULL,
+				 &fuse_conn_congestion_threshold_ops))
 		goto err;
 
 	return 0;
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 6bcfab04396f..fc9c79feb5f7 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -32,7 +32,7 @@
 #define FUSE_NAME_MAX 1024
 
 /** Number of dentries for each connection in the control filesystem */
-#define FUSE_CTL_NUM_DENTRIES 3
+#define FUSE_CTL_NUM_DENTRIES 5
 
 /** If the FUSE_DEFAULT_PERMISSIONS flag is given, the filesystem
     module will check permissions based on the file mode.  Otherwise no
@@ -49,6 +49,10 @@ extern struct list_head fuse_conn_list;
 /** Global mutex protecting fuse_conn_list and the control filesystem */
 extern struct mutex fuse_mutex;
 
+/** Module parameters */
+extern unsigned max_user_bgreq;
+extern unsigned max_user_congthresh;
+
 /** FUSE inode */
 struct fuse_inode {
 	/** Inode data */
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index e54ddbda208c..8f9aca8d4ad5 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -31,7 +31,7 @@ DEFINE_MUTEX(fuse_mutex);
 
 static int set_global_limit(const char *val, struct kernel_param *kp);
 
-static unsigned max_user_bgreq;
+unsigned max_user_bgreq;
 module_param_call(max_user_bgreq, set_global_limit, param_get_uint,
 		  &max_user_bgreq, 0644);
 __MODULE_PARM_TYPE(max_user_bgreq, "uint");
@@ -39,7 +39,7 @@ MODULE_PARM_DESC(max_user_bgreq,
  "Global limit for the maximum number of backgrounded requests an "
  "unprivileged user can set");
 
-static unsigned max_user_congthresh;
+unsigned max_user_congthresh;
 module_param_call(max_user_congthresh, set_global_limit, param_get_uint,
 		  &max_user_congthresh, 0644);
 __MODULE_PARM_TYPE(max_user_congthresh, "uint");
-- 
cgit v1.2.3-70-g09d2


From 2c96ce9f2084c1e04d02883e622f74a537a63aea Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Tue, 15 Sep 2009 09:43:56 +0200
Subject: fs: remove bdev->bd_inode_backing_dev_info

It has been unused since it was introduced in:

commit 520808bf20e90fdbdb320264ba7dd5cf9d47dcac
Author: Andrew Morton <akpm@osdl.org>
Date:   Fri May 21 00:46:17 2004 -0700

    [PATCH] block device layer: separate backing_dev_info infrastructure

So lets just kill it.

Acked-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/block_dev.c        | 1 -
 fs/inode.c            | 4 +---
 fs/nilfs2/the_nilfs.c | 4 +---
 include/linux/fs.h    | 1 -
 4 files changed, 2 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 3581a4e53942..71e7e03ac343 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -420,7 +420,6 @@ static void bdev_destroy_inode(struct inode *inode)
 {
 	struct bdev_inode *bdi = BDEV_I(inode);
 
-	bdi->bdev.bd_inode_backing_dev_info = NULL;
 	kmem_cache_free(bdev_cachep, bdi);
 }
 
diff --git a/fs/inode.c b/fs/inode.c
index ae7b67e48661..b2ba83d2c4e1 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -182,9 +182,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
 	if (sb->s_bdev) {
 		struct backing_dev_info *bdi;
 
-		bdi = sb->s_bdev->bd_inode_backing_dev_info;
-		if (!bdi)
-			bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
+		bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
 		mapping->backing_dev_info = bdi;
 	}
 	inode->i_private = NULL;
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index d4168e269c5d..ad391a8c3e7e 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -591,9 +591,7 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
 
 	nilfs->ns_mount_state = le16_to_cpu(sbp->s_state);
 
-	bdi = nilfs->ns_bdev->bd_inode_backing_dev_info;
-	if (!bdi)
-		bdi = nilfs->ns_bdev->bd_inode->i_mapping->backing_dev_info;
+	bdi = nilfs->ns_bdev->bd_inode->i_mapping->backing_dev_info;
 	nilfs->ns_bdi = bdi ? : &default_backing_dev_info;
 
 	/* Finding last segment */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b21cf6b9c80b..db29588874ac 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -655,7 +655,6 @@ struct block_device {
 	int			bd_invalidated;
 	struct gendisk *	bd_disk;
 	struct list_head	bd_list;
-	struct backing_dev_info *bd_inode_backing_dev_info;
 	/*
 	 * Private data.  You must have bd_claim'ed the block_device
 	 * to use this.  NOTE:  bd_claim allows an owner to claim
-- 
cgit v1.2.3-70-g09d2


From 1fe06ad89255c211fe100d7f690d10b161398df8 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Tue, 15 Sep 2009 15:10:20 +0200
Subject: writeback: get rid of wbc->for_writepages

It's only set, it's never checked. Kill it.

Acked-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/afs/write.c              | 1 -
 fs/btrfs/ordered-data.c     | 1 -
 fs/jbd2/commit.c            | 1 -
 fs/nfs/write.c              | 1 -
 include/linux/writeback.h   | 1 -
 include/trace/events/ext4.h | 6 ++----
 mm/page-writeback.c         | 2 --
 7 files changed, 2 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/afs/write.c b/fs/afs/write.c
index c2e7a7ff0080..c63a3c8beb73 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -712,7 +712,6 @@ int afs_writeback_all(struct afs_vnode *vnode)
 		.bdi		= mapping->backing_dev_info,
 		.sync_mode	= WB_SYNC_ALL,
 		.nr_to_write	= LONG_MAX,
-		.for_writepages = 1,
 		.range_cyclic	= 1,
 	};
 	int ret;
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index d6f0806c682f..7b2f401e604e 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -740,7 +740,6 @@ int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
 		.nr_to_write = mapping->nrpages * 2,
 		.range_start = start,
 		.range_end = end,
-		.for_writepages = 1,
 	};
 	return btrfs_writepages(mapping, &wbc);
 }
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 7b4088b2364d..0df600e9162d 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -220,7 +220,6 @@ static int journal_submit_inode_data_buffers(struct address_space *mapping)
 		.nr_to_write = mapping->nrpages * 2,
 		.range_start = 0,
 		.range_end = i_size_read(mapping->host),
-		.for_writepages = 1,
 	};
 
 	ret = generic_writepages(mapping, &wbc);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 120acadc6a84..53eb26c16b50 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1490,7 +1490,6 @@ static int nfs_write_mapping(struct address_space *mapping, int how)
 		.nr_to_write = LONG_MAX,
 		.range_start = 0,
 		.range_end = LLONG_MAX,
-		.for_writepages = 1,
 	};
 
 	return __nfs_write_mapping(mapping, &wbc, how);
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index d347632f1861..48a054e2b716 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -50,7 +50,6 @@ struct writeback_control {
 	unsigned encountered_congestion:1; /* An output: a queue is full */
 	unsigned for_kupdate:1;		/* A kupdate writeback */
 	unsigned for_reclaim:1;		/* Invoked from the page allocator */
-	unsigned for_writepages:1;	/* This is a writepages() call */
 	unsigned range_cyclic:1;	/* range_start is cyclic */
 	unsigned more_io:1;		/* more io to be dispatched */
 	/*
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index 7d8b5bc74185..8d433c4e3709 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -227,7 +227,6 @@ TRACE_EVENT(ext4_da_writepages,
 		__field(	char,	nonblocking		)
 		__field(	char,	for_kupdate		)
 		__field(	char,	for_reclaim		)
-		__field(	char,	for_writepages		)
 		__field(	char,	range_cyclic		)
 	),
 
@@ -241,16 +240,15 @@ TRACE_EVENT(ext4_da_writepages,
 		__entry->nonblocking	= wbc->nonblocking;
 		__entry->for_kupdate	= wbc->for_kupdate;
 		__entry->for_reclaim	= wbc->for_reclaim;
-		__entry->for_writepages	= wbc->for_writepages;
 		__entry->range_cyclic	= wbc->range_cyclic;
 	),
 
-	TP_printk("dev %s ino %lu nr_t_write %ld pages_skipped %ld range_start %llu range_end %llu nonblocking %d for_kupdate %d for_reclaim %d for_writepages %d range_cyclic %d",
+	TP_printk("dev %s ino %lu nr_t_write %ld pages_skipped %ld range_start %llu range_end %llu nonblocking %d for_kupdate %d for_reclaim %d range_cyclic %d",
 		  jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->nr_to_write,
 		  __entry->pages_skipped, __entry->range_start,
 		  __entry->range_end, __entry->nonblocking,
 		  __entry->for_kupdate, __entry->for_reclaim,
-		  __entry->for_writepages, __entry->range_cyclic)
+		  __entry->range_cyclic)
 );
 
 TRACE_EVENT(ext4_da_writepages_result,
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index dd73d29c15a8..abc648f5de00 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1020,12 +1020,10 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
 
 	if (wbc->nr_to_write <= 0)
 		return 0;
-	wbc->for_writepages = 1;
 	if (mapping->a_ops->writepages)
 		ret = mapping->a_ops->writepages(mapping, wbc);
 	else
 		ret = generic_writepages(mapping, wbc);
-	wbc->for_writepages = 0;
 	return ret;
 }
 
-- 
cgit v1.2.3-70-g09d2


From f0fad8a530e7cbad5f686dbca3079d1a626a3882 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Fri, 11 Sep 2009 09:47:56 +0200
Subject: writeback: merely wakeup flusher thread if work allocation fails for
 WB_SYNC_NONE

Since it's an opportunistic writeback and not a data integrity action,
don't punt to blocking writeback. Just wakeup the thread and it will
flush old data.

Acked-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/fs-writeback.c | 46 ++++++++++++++--------------------------------
 1 file changed, 14 insertions(+), 32 deletions(-)

(limited to 'fs')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 628235cf44b5..783ed44c7cfe 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -75,13 +75,6 @@ static inline void bdi_work_init(struct bdi_work *work,
 	work->state = WS_USED;
 }
 
-static inline void bdi_work_init_on_stack(struct bdi_work *work,
-					  struct writeback_control *wbc)
-{
-	bdi_work_init(work, wbc);
-	work->state |= WS_ONSTACK;
-}
-
 /**
  * writeback_in_progress - determine whether there is writeback in progress
  * @bdi: the device's backing_dev_info structure.
@@ -207,34 +200,23 @@ static struct bdi_work *bdi_alloc_work(struct writeback_control *wbc)
 
 void bdi_start_writeback(struct writeback_control *wbc)
 {
-	const bool must_wait = wbc->sync_mode == WB_SYNC_ALL;
-	struct bdi_work work_stack, *work = NULL;
-
-	if (!must_wait)
-		work = bdi_alloc_work(wbc);
+	/*
+	 * WB_SYNC_NONE is opportunistic writeback. If this allocation fails,
+	 * bdi_queue_work() will wake up the thread and flush old data. This
+	 * should ensure some amount of progress in freeing memory.
+	 */
+	if (wbc->sync_mode != WB_SYNC_ALL) {
+		struct bdi_work *w = bdi_alloc_work(wbc);
 
-	if (!work) {
-		work = &work_stack;
-		bdi_work_init_on_stack(work, wbc);
-	}
+		bdi_queue_work(wbc->bdi, w);
+	} else {
+		struct bdi_work work;
 
-	bdi_queue_work(wbc->bdi, work);
+		bdi_work_init(&work, wbc);
+		work.state |= WS_ONSTACK;
 
-	/*
-	 * If the sync mode is WB_SYNC_ALL, block waiting for the work to
-	 * complete. If not, we only need to wait for the work to be started,
-	 * if we allocated it on-stack. We use the same mechanism, if the
-	 * wait bit is set in the bdi_work struct, then threads will not
-	 * clear pending until after they are done.
-	 *
-	 * Note that work == &work_stack if must_wait is true, so we don't
-	 * need to do call_rcu() here ever, since the completion path will
-	 * have done that for us.
-	 */
-	if (must_wait || work == &work_stack) {
-		bdi_wait_on_work_clear(work);
-		if (work != &work_stack)
-			call_rcu(&work->rcu_head, bdi_work_free);
+		bdi_queue_work(wbc->bdi, &work);
+		bdi_wait_on_work_clear(&work);
 	}
 }
 
-- 
cgit v1.2.3-70-g09d2


From c4a77a6c7dcff04a2abc7fe4b6b2ae605be41c5b Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Wed, 16 Sep 2009 15:18:25 +0200
Subject: writeback: make wb_writeback() take an argument structure

We need to be able to pass in range_cyclic as well, so instead
of growing yet another argument, split the arguments into a
struct wb_writeback_args structure that we can use internally.
Also makes it easier to just copy all members to an on-stack
struct, since we can't access work after clearing the pending
bit.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/fs-writeback.c | 78 ++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 49 insertions(+), 29 deletions(-)

(limited to 'fs')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 783ed44c7cfe..c5e91225501d 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -34,6 +34,17 @@
  */
 int nr_pdflush_threads;
 
+/*
+ * Passed into wb_writeback(), essentially a subset of writeback_control
+ */
+struct wb_writeback_args {
+	long nr_pages;
+	struct super_block *sb;
+	enum writeback_sync_modes sync_mode;
+	int for_kupdate;
+	int range_cyclic;
+};
+
 /*
  * Work items for the bdi_writeback threads
  */
@@ -45,9 +56,7 @@ struct bdi_work {
 	unsigned long seen;
 	atomic_t pending;
 
-	struct super_block *sb;
-	unsigned long nr_pages;
-	enum writeback_sync_modes sync_mode;
+	struct wb_writeback_args args;
 
 	unsigned long state;
 };
@@ -69,9 +78,11 @@ static inline void bdi_work_init(struct bdi_work *work,
 				 struct writeback_control *wbc)
 {
 	INIT_RCU_HEAD(&work->rcu_head);
-	work->sb = wbc->sb;
-	work->nr_pages = wbc->nr_to_write;
-	work->sync_mode = wbc->sync_mode;
+	work->args.sb = wbc->sb;
+	work->args.nr_pages = wbc->nr_to_write;
+	work->args.sync_mode = wbc->sync_mode;
+	work->args.range_cyclic = wbc->range_cyclic;
+	work->args.for_kupdate = 0;
 	work->state = WS_USED;
 }
 
@@ -106,7 +117,7 @@ static void bdi_work_free(struct rcu_head *head)
 
 static void wb_work_complete(struct bdi_work *work)
 {
-	const enum writeback_sync_modes sync_mode = work->sync_mode;
+	const enum writeback_sync_modes sync_mode = work->args.sync_mode;
 
 	/*
 	 * For allocated work, we can clear the done/seen bit right here.
@@ -653,17 +664,16 @@ static inline bool over_bground_thresh(void)
  * older_than_this takes precedence over nr_to_write.  So we'll only write back
  * all dirty pages if they are all attached to "old" mappings.
  */
-static long wb_writeback(struct bdi_writeback *wb, long nr_pages,
-			 struct super_block *sb,
-			 enum writeback_sync_modes sync_mode, int for_kupdate)
+static long wb_writeback(struct bdi_writeback *wb,
+			 struct wb_writeback_args *args)
 {
 	struct writeback_control wbc = {
 		.bdi			= wb->bdi,
-		.sb			= sb,
-		.sync_mode		= sync_mode,
+		.sb			= args->sb,
+		.sync_mode		= args->sync_mode,
 		.older_than_this	= NULL,
-		.for_kupdate		= for_kupdate,
-		.range_cyclic		= 1,
+		.for_kupdate		= args->for_kupdate,
+		.range_cyclic		= args->range_cyclic,
 	};
 	unsigned long oldest_jif;
 	long wrote = 0;
@@ -673,13 +683,18 @@ static long wb_writeback(struct bdi_writeback *wb, long nr_pages,
 		oldest_jif = jiffies -
 				msecs_to_jiffies(dirty_expire_interval * 10);
 	}
+	if (!wbc.range_cyclic) {
+		wbc.range_start = 0;
+		wbc.range_end = LLONG_MAX;
+	}
 
 	for (;;) {
 		/*
 		 * Don't flush anything for non-integrity writeback where
 		 * no nr_pages was given
 		 */
-		if (!for_kupdate && nr_pages <= 0 && sync_mode == WB_SYNC_NONE)
+		if (!args->for_kupdate && args->nr_pages <= 0 &&
+		     args->sync_mode == WB_SYNC_NONE)
 			break;
 
 		/*
@@ -687,7 +702,8 @@ static long wb_writeback(struct bdi_writeback *wb, long nr_pages,
 		 * periodic background writeout and we are below the
 		 * background dirty threshold, don't do anything
 		 */
-		if (for_kupdate && nr_pages <= 0 && !over_bground_thresh())
+		if (args->for_kupdate && args->nr_pages <= 0 &&
+		    !over_bground_thresh())
 			break;
 
 		wbc.more_io = 0;
@@ -695,7 +711,7 @@ static long wb_writeback(struct bdi_writeback *wb, long nr_pages,
 		wbc.nr_to_write = MAX_WRITEBACK_PAGES;
 		wbc.pages_skipped = 0;
 		writeback_inodes_wb(wb, &wbc);
-		nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
+		args->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
 		wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;
 
 		/*
@@ -749,8 +765,16 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
 			global_page_state(NR_UNSTABLE_NFS) +
 			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
 
-	if (nr_pages)
-		return wb_writeback(wb, nr_pages, NULL, WB_SYNC_NONE, 1);
+	if (nr_pages) {
+		struct wb_writeback_args args = {
+			.nr_pages	= nr_pages,
+			.sync_mode	= WB_SYNC_NONE,
+			.for_kupdate	= 1,
+			.range_cyclic	= 1,
+		};
+
+		return wb_writeback(wb, &args);
+	}
 
 	return 0;
 }
@@ -762,35 +786,31 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
 {
 	struct backing_dev_info *bdi = wb->bdi;
 	struct bdi_work *work;
-	long nr_pages, wrote = 0;
+	long wrote = 0;
 
 	while ((work = get_next_work_item(bdi, wb)) != NULL) {
-		enum writeback_sync_modes sync_mode;
-
-		nr_pages = work->nr_pages;
+		struct wb_writeback_args args = work->args;
 
 		/*
 		 * Override sync mode, in case we must wait for completion
 		 */
 		if (force_wait)
-			work->sync_mode = sync_mode = WB_SYNC_ALL;
-		else
-			sync_mode = work->sync_mode;
+			work->args.sync_mode = args.sync_mode = WB_SYNC_ALL;
 
 		/*
 		 * If this isn't a data integrity operation, just notify
 		 * that we have seen this work and we are now starting it.
 		 */
-		if (sync_mode == WB_SYNC_NONE)
+		if (args.sync_mode == WB_SYNC_NONE)
 			wb_clear_pending(wb, work);
 
-		wrote += wb_writeback(wb, nr_pages, work->sb, sync_mode, 0);
+		wrote += wb_writeback(wb, &args);
 
 		/*
 		 * This is a data integrity writeback, so only do the
 		 * notification when we have completed the work.
 		 */
-		if (sync_mode == WB_SYNC_ALL)
+		if (args.sync_mode == WB_SYNC_ALL)
 			wb_clear_pending(wb, work);
 	}
 
-- 
cgit v1.2.3-70-g09d2


From 32a88aa1b6dfb901cec64e1898cac78d0f25028a Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Wed, 16 Sep 2009 15:02:33 +0200
Subject: fs: Assign bdi in super_block

We do this automatically in get_sb_bdev() from the set_bdev_super()
callback. Filesystems that have their own private backing_dev_info
must assign that in ->fill_super().

Note that ->s_bdi assignment is required for proper writeback!

Acked-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/btrfs/disk-io.c | 1 +
 fs/fuse/inode.c    | 2 ++
 fs/nfs/super.c     | 2 ++
 fs/super.c         | 6 ++++++
 fs/sync.c          | 9 ++++++++-
 fs/ubifs/super.c   | 1 +
 include/linux/fs.h | 1 +
 7 files changed, 21 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 15831d5c7367..8b8192790011 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1600,6 +1600,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
 	sb->s_blocksize = 4096;
 	sb->s_blocksize_bits = blksize_bits(4096);
+	sb->s_bdi = &fs_info->bdi;
 
 	/*
 	 * we set the i_size on the btree inode to the max possible int.
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 4567db6f9430..e5dbecd87b0f 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -894,6 +894,8 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 	if (err)
 		goto err_put_conn;
 
+	sb->s_bdi = &fc->bdi;
+
 	/* Handle umasking inside the fuse code */
 	if (sb->s_flags & MS_POSIXACL)
 		fc->dont_mask = 1;
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 867f70504531..de935692d40d 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1918,6 +1918,8 @@ static inline void nfs_initialise_sb(struct super_block *sb)
 	if (server->flags & NFS_MOUNT_NOAC)
 		sb->s_flags |= MS_SYNCHRONOUS;
 
+	sb->s_bdi = &server->backing_dev_info;
+
 	nfs_super_set_maxbytes(sb, server->maxfilesize);
 }
 
diff --git a/fs/super.c b/fs/super.c
index 9cda337ddae2..b03fea8fbfb6 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -707,6 +707,12 @@ static int set_bdev_super(struct super_block *s, void *data)
 {
 	s->s_bdev = data;
 	s->s_dev = s->s_bdev->bd_dev;
+
+	/*
+	 * We set the bdi here to the queue backing, file systems can
+	 * overwrite this in ->fill_super()
+	 */
+	s->s_bdi = &bdev_get_queue(s->s_bdev)->backing_dev_info;
 	return 0;
 }
 
diff --git a/fs/sync.c b/fs/sync.c
index 192340930bb4..c08467a5d7cb 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -27,6 +27,13 @@
  */
 static int __sync_filesystem(struct super_block *sb, int wait)
 {
+	/*
+	 * This should be safe, as we require bdi backing to actually
+	 * write out data in the first place
+	 */
+	if (!sb->s_bdi)
+		return 0;
+
 	/* Avoid doing twice syncing and cache pruning for quota sync */
 	if (!wait) {
 		writeout_quota_sb(sb, -1);
@@ -101,7 +108,7 @@ restart:
 		spin_unlock(&sb_lock);
 
 		down_read(&sb->s_umount);
-		if (!(sb->s_flags & MS_RDONLY) && sb->s_root)
+		if (!(sb->s_flags & MS_RDONLY) && sb->s_root && sb->s_bdi)
 			__sync_filesystem(sb, wait);
 		up_read(&sb->s_umount);
 
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 51763aa8f4de..c4af069df1ad 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1980,6 +1980,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
 	if (err)
 		goto out_bdi;
 
+	sb->s_bdi = &c->bdi;
 	sb->s_fs_info = c;
 	sb->s_magic = UBIFS_SUPER_MAGIC;
 	sb->s_blocksize = UBIFS_BLOCK_SIZE;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index db29588874ac..90162fb3bf04 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1342,6 +1342,7 @@ struct super_block {
 	int			s_nr_dentry_unused;	/* # of dentry on lru */
 
 	struct block_device	*s_bdev;
+	struct backing_dev_info *s_bdi;
 	struct mtd_info		*s_mtd;
 	struct list_head	s_instances;
 	struct quota_info	s_dquot;	/* Diskquota specific options */
-- 
cgit v1.2.3-70-g09d2


From f11fcae8401a3175f528e2f7917362645d570111 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Tue, 15 Sep 2009 09:53:35 +0200
Subject: writeback: only use bdi_writeback_all() for WB_SYNC_NONE writeout

Data integrity writeback must use bdi_start_writeback() and ensure
that wbc->sb and wbc->bdi are set.

Acked-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/fs-writeback.c | 70 +++++++++++--------------------------------------------
 1 file changed, 14 insertions(+), 56 deletions(-)

(limited to 'fs')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index c5e91225501d..14f06b459197 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -50,7 +50,6 @@ struct wb_writeback_args {
  */
 struct bdi_work {
 	struct list_head list;
-	struct list_head wait_list;
 	struct rcu_head rcu_head;
 
 	unsigned long seen;
@@ -198,7 +197,8 @@ static void bdi_wait_on_work_clear(struct bdi_work *work)
 		    TASK_UNINTERRUPTIBLE);
 }
 
-static struct bdi_work *bdi_alloc_work(struct writeback_control *wbc)
+static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
+				 struct writeback_control *wbc)
 {
 	struct bdi_work *work;
 
@@ -206,7 +206,7 @@ static struct bdi_work *bdi_alloc_work(struct writeback_control *wbc)
 	if (work)
 		bdi_work_init(work, wbc);
 
-	return work;
+	bdi_queue_work(bdi, work);
 }
 
 void bdi_start_writeback(struct writeback_control *wbc)
@@ -216,11 +216,9 @@ void bdi_start_writeback(struct writeback_control *wbc)
 	 * bdi_queue_work() will wake up the thread and flush old data. This
 	 * should ensure some amount of progress in freeing memory.
 	 */
-	if (wbc->sync_mode != WB_SYNC_ALL) {
-		struct bdi_work *w = bdi_alloc_work(wbc);
-
-		bdi_queue_work(wbc->bdi, w);
-	} else {
+	if (wbc->sync_mode != WB_SYNC_ALL)
+		bdi_alloc_queue_work(wbc->bdi, wbc);
+	else {
 		struct bdi_work work;
 
 		bdi_work_init(&work, wbc);
@@ -860,67 +858,26 @@ int bdi_writeback_task(struct bdi_writeback *wb)
 }
 
 /*
- * Schedule writeback for all backing devices. Expensive! If this is a data
- * integrity operation, writeback will be complete when this returns. If
- * we are simply called for WB_SYNC_NONE, then writeback will merely be
- * scheduled to run.
+ * Schedule writeback for all backing devices. Can only be used for
+ * WB_SYNC_NONE writeback, WB_SYNC_ALL should use bdi_start_writeback()
+ * and pass in the superblock.
  */
 static void bdi_writeback_all(struct writeback_control *wbc)
 {
-	const bool must_wait = wbc->sync_mode == WB_SYNC_ALL;
 	struct backing_dev_info *bdi;
-	struct bdi_work *work;
-	LIST_HEAD(list);
 
-restart:
+	WARN_ON(wbc->sync_mode == WB_SYNC_ALL);
+
 	spin_lock(&bdi_lock);
 
 	list_for_each_entry(bdi, &bdi_list, bdi_list) {
-		struct bdi_work *work;
-
 		if (!bdi_has_dirty_io(bdi))
 			continue;
 
-		/*
-		 * If work allocation fails, do the writes inline. We drop
-		 * the lock and restart the list writeout. This should be OK,
-		 * since this happens rarely and because the writeout should
-		 * eventually make more free memory available.
-		 */
-		work = bdi_alloc_work(wbc);
-		if (!work) {
-			struct writeback_control __wbc;
-
-			/*
-			 * Not a data integrity writeout, just continue
-			 */
-			if (!must_wait)
-				continue;
-
-			spin_unlock(&bdi_lock);
-			__wbc = *wbc;
-			__wbc.bdi = bdi;
-			writeback_inodes_wbc(&__wbc);
-			goto restart;
-		}
-		if (must_wait)
-			list_add_tail(&work->wait_list, &list);
-
-		bdi_queue_work(bdi, work);
+		bdi_alloc_queue_work(bdi, wbc);
 	}
 
 	spin_unlock(&bdi_lock);
-
-	/*
-	 * If this is for WB_SYNC_ALL, wait for pending work to complete
-	 * before returning.
-	 */
-	while (!list_empty(&list)) {
-		work = list_entry(list.next, struct bdi_work, wait_list);
-		list_del(&work->wait_list);
-		bdi_wait_on_work_clear(work);
-		call_rcu(&work->rcu_head, bdi_work_free);
-	}
 }
 
 /*
@@ -1177,6 +1134,7 @@ long sync_inodes_sb(struct super_block *sb)
 {
 	struct writeback_control wbc = {
 		.sb		= sb,
+		.bdi		= sb->s_bdi,
 		.sync_mode	= WB_SYNC_ALL,
 		.range_start	= 0,
 		.range_end	= LLONG_MAX,
@@ -1184,7 +1142,7 @@ long sync_inodes_sb(struct super_block *sb)
 	long nr_to_write = LONG_MAX; /* doesn't actually matter */
 
 	wbc.nr_to_write = nr_to_write;
-	bdi_writeback_all(&wbc);
+	bdi_start_writeback(&wbc);
 	wait_sb_inodes(&wbc);
 	return nr_to_write - wbc.nr_to_write;
 }
-- 
cgit v1.2.3-70-g09d2


From cfc4ba5365449cb6b5c9f68d755a142f17da1e47 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Mon, 14 Sep 2009 13:12:40 +0200
Subject: writeback: use RCU to protect bdi_list

Now that bdi_writeback_all() no longer handles integrity writeback,
it doesn't have to block anymore. This means that we can switch
bdi_list reader side protection to RCU.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/fs-writeback.c           |  6 ++--
 include/linux/backing-dev.h |  1 +
 mm/backing-dev.c            | 76 ++++++++++++++++++++++++++++++++-------------
 mm/page-writeback.c         |  8 ++---
 4 files changed, 63 insertions(+), 28 deletions(-)

(limited to 'fs')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 14f06b459197..f8cd7a97f5b7 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -868,16 +868,16 @@ static void bdi_writeback_all(struct writeback_control *wbc)
 
 	WARN_ON(wbc->sync_mode == WB_SYNC_ALL);
 
-	spin_lock(&bdi_lock);
+	rcu_read_lock();
 
-	list_for_each_entry(bdi, &bdi_list, bdi_list) {
+	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
 		if (!bdi_has_dirty_io(bdi))
 			continue;
 
 		bdi_alloc_queue_work(bdi, wbc);
 	}
 
-	spin_unlock(&bdi_lock);
+	rcu_read_unlock();
 }
 
 /*
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index f169bcb90b58..859e797f4576 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -59,6 +59,7 @@ struct bdi_writeback {
 
 struct backing_dev_info {
 	struct list_head bdi_list;
+	struct rcu_head rcu_head;
 	unsigned long ra_pages;	/* max readahead in PAGE_CACHE_SIZE units */
 	unsigned long state;	/* Always use atomic bitops on this */
 	unsigned int capabilities; /* Device capabilities */
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index d3ca0dac1111..fd93566345b6 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -26,6 +26,12 @@ struct backing_dev_info default_backing_dev_info = {
 EXPORT_SYMBOL_GPL(default_backing_dev_info);
 
 static struct class *bdi_class;
+
+/*
+ * bdi_lock protects updates to bdi_list and bdi_pending_list, as well as
+ * reader side protection for bdi_pending_list. bdi_list has RCU reader side
+ * locking.
+ */
 DEFINE_SPINLOCK(bdi_lock);
 LIST_HEAD(bdi_list);
 LIST_HEAD(bdi_pending_list);
@@ -284,9 +290,9 @@ static int bdi_start_fn(void *ptr)
 	/*
 	 * Add us to the active bdi_list
 	 */
-	spin_lock(&bdi_lock);
-	list_add(&bdi->bdi_list, &bdi_list);
-	spin_unlock(&bdi_lock);
+	spin_lock_bh(&bdi_lock);
+	list_add_rcu(&bdi->bdi_list, &bdi_list);
+	spin_unlock_bh(&bdi_lock);
 
 	bdi_task_init(bdi, wb);
 
@@ -389,7 +395,7 @@ static int bdi_forker_task(void *ptr)
 		if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list))
 			wb_do_writeback(me, 0);
 
-		spin_lock(&bdi_lock);
+		spin_lock_bh(&bdi_lock);
 
 		/*
 		 * Check if any existing bdi's have dirty data without
@@ -410,7 +416,7 @@ static int bdi_forker_task(void *ptr)
 		if (list_empty(&bdi_pending_list)) {
 			unsigned long wait;
 
-			spin_unlock(&bdi_lock);
+			spin_unlock_bh(&bdi_lock);
 			wait = msecs_to_jiffies(dirty_writeback_interval * 10);
 			schedule_timeout(wait);
 			try_to_freeze();
@@ -426,7 +432,7 @@ static int bdi_forker_task(void *ptr)
 		bdi = list_entry(bdi_pending_list.next, struct backing_dev_info,
 				 bdi_list);
 		list_del_init(&bdi->bdi_list);
-		spin_unlock(&bdi_lock);
+		spin_unlock_bh(&bdi_lock);
 
 		wb = &bdi->wb;
 		wb->task = kthread_run(bdi_start_fn, wb, "flush-%s",
@@ -445,9 +451,9 @@ static int bdi_forker_task(void *ptr)
 			 * a chance to flush other bdi's to free
 			 * memory.
 			 */
-			spin_lock(&bdi_lock);
+			spin_lock_bh(&bdi_lock);
 			list_add_tail(&bdi->bdi_list, &bdi_pending_list);
-			spin_unlock(&bdi_lock);
+			spin_unlock_bh(&bdi_lock);
 
 			bdi_flush_io(bdi);
 		}
@@ -456,6 +462,24 @@ static int bdi_forker_task(void *ptr)
 	return 0;
 }
 
+static void bdi_add_to_pending(struct rcu_head *head)
+{
+	struct backing_dev_info *bdi;
+
+	bdi = container_of(head, struct backing_dev_info, rcu_head);
+	INIT_LIST_HEAD(&bdi->bdi_list);
+
+	spin_lock(&bdi_lock);
+	list_add_tail(&bdi->bdi_list, &bdi_pending_list);
+	spin_unlock(&bdi_lock);
+
+	/*
+	 * We are now on the pending list, wake up bdi_forker_task()
+	 * to finish the job and add us back to the active bdi_list
+	 */
+	wake_up_process(default_backing_dev_info.wb.task);
+}
+
 /*
  * Add the default flusher task that gets created for any bdi
  * that has dirty data pending writeout
@@ -478,16 +502,29 @@ void static bdi_add_default_flusher_task(struct backing_dev_info *bdi)
 	 * waiting for previous additions to finish.
 	 */
 	if (!test_and_set_bit(BDI_pending, &bdi->state)) {
-		list_move_tail(&bdi->bdi_list, &bdi_pending_list);
+		list_del_rcu(&bdi->bdi_list);
 
 		/*
-		 * We are now on the pending list, wake up bdi_forker_task()
-		 * to finish the job and add us back to the active bdi_list
+		 * We must wait for the current RCU period to end before
+		 * moving to the pending list. So schedule that operation
+		 * from an RCU callback.
 		 */
-		wake_up_process(default_backing_dev_info.wb.task);
+		call_rcu(&bdi->rcu_head, bdi_add_to_pending);
 	}
 }
 
+/*
+ * Remove bdi from bdi_list, and ensure that it is no longer visible
+ */
+static void bdi_remove_from_list(struct backing_dev_info *bdi)
+{
+	spin_lock_bh(&bdi_lock);
+	list_del_rcu(&bdi->bdi_list);
+	spin_unlock_bh(&bdi_lock);
+
+	synchronize_rcu();
+}
+
 int bdi_register(struct backing_dev_info *bdi, struct device *parent,
 		const char *fmt, ...)
 {
@@ -506,9 +543,9 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
 		goto exit;
 	}
 
-	spin_lock(&bdi_lock);
-	list_add_tail(&bdi->bdi_list, &bdi_list);
-	spin_unlock(&bdi_lock);
+	spin_lock_bh(&bdi_lock);
+	list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
+	spin_unlock_bh(&bdi_lock);
 
 	bdi->dev = dev;
 
@@ -526,9 +563,7 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
 			wb->task = NULL;
 			ret = -ENOMEM;
 
-			spin_lock(&bdi_lock);
-			list_del(&bdi->bdi_list);
-			spin_unlock(&bdi_lock);
+			bdi_remove_from_list(bdi);
 			goto exit;
 		}
 	}
@@ -565,9 +600,7 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi)
 	/*
 	 * Make sure nobody finds us on the bdi_list anymore
 	 */
-	spin_lock(&bdi_lock);
-	list_del(&bdi->bdi_list);
-	spin_unlock(&bdi_lock);
+	bdi_remove_from_list(bdi);
 
 	/*
 	 * Finally, kill the kernel threads. We don't need to be RCU
@@ -599,6 +632,7 @@ int bdi_init(struct backing_dev_info *bdi)
 	bdi->max_ratio = 100;
 	bdi->max_prop_frac = PROP_FRAC_BASE;
 	spin_lock_init(&bdi->wb_lock);
+	INIT_RCU_HEAD(&bdi->rcu_head);
 	INIT_LIST_HEAD(&bdi->bdi_list);
 	INIT_LIST_HEAD(&bdi->wb_list);
 	INIT_LIST_HEAD(&bdi->work_list);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index abc648f5de00..12c3d843ce93 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -315,7 +315,7 @@ int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
 {
 	int ret = 0;
 
-	spin_lock(&bdi_lock);
+	spin_lock_bh(&bdi_lock);
 	if (min_ratio > bdi->max_ratio) {
 		ret = -EINVAL;
 	} else {
@@ -327,7 +327,7 @@ int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
 			ret = -EINVAL;
 		}
 	}
-	spin_unlock(&bdi_lock);
+	spin_unlock_bh(&bdi_lock);
 
 	return ret;
 }
@@ -339,14 +339,14 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
 	if (max_ratio > 100)
 		return -EINVAL;
 
-	spin_lock(&bdi_lock);
+	spin_lock_bh(&bdi_lock);
 	if (bdi->min_ratio > max_ratio) {
 		ret = -EINVAL;
 	} else {
 		bdi->max_ratio = max_ratio;
 		bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100;
 	}
-	spin_unlock(&bdi_lock);
+	spin_unlock_bh(&bdi_lock);
 
 	return ret;
 }
-- 
cgit v1.2.3-70-g09d2


From bcddc3f01c9122882c8b9f12ab94a934e55aef97 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Sun, 13 Sep 2009 20:07:36 +0200
Subject: writeback: inline allocation failure handling in
 bdi_alloc_queue_work()

This gets rid of work == NULL in bdi_queue_work() and puts the
OOM handling where it belongs.

Acked-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/fs-writeback.c | 49 +++++++++++++++++++++++++++----------------------
 1 file changed, 27 insertions(+), 22 deletions(-)

(limited to 'fs')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index f8cd7a97f5b7..59b3ee63b624 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -149,21 +149,19 @@ static void wb_clear_pending(struct bdi_writeback *wb, struct bdi_work *work)
 
 static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work)
 {
-	if (work) {
-		work->seen = bdi->wb_mask;
-		BUG_ON(!work->seen);
-		atomic_set(&work->pending, bdi->wb_cnt);
-		BUG_ON(!bdi->wb_cnt);
+	work->seen = bdi->wb_mask;
+	BUG_ON(!work->seen);
+	atomic_set(&work->pending, bdi->wb_cnt);
+	BUG_ON(!bdi->wb_cnt);
 
-		/*
-		 * Make sure stores are seen before it appears on the list
-		 */
-		smp_mb();
+	/*
+	 * Make sure stores are seen before it appears on the list
+	 */
+	smp_mb();
 
-		spin_lock(&bdi->wb_lock);
-		list_add_tail_rcu(&work->list, &bdi->work_list);
-		spin_unlock(&bdi->wb_lock);
-	}
+	spin_lock(&bdi->wb_lock);
+	list_add_tail_rcu(&work->list, &bdi->work_list);
+	spin_unlock(&bdi->wb_lock);
 
 	/*
 	 * If the default thread isn't there, make sure we add it. When
@@ -175,14 +173,12 @@ static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work)
 		struct bdi_writeback *wb = &bdi->wb;
 
 		/*
-		 * If we failed allocating the bdi work item, wake up the wb
-		 * thread always. As a safety precaution, it'll flush out
-		 * everything
+		 * End work now if this wb has no dirty IO pending. Otherwise
+		 * wakeup the handling thread
 		 */
-		if (!wb_has_dirty_io(wb)) {
-			if (work)
-				wb_clear_pending(wb, work);
-		} else if (wb->task)
+		if (!wb_has_dirty_io(wb))
+			wb_clear_pending(wb, work);
+		else if (wb->task)
 			wake_up_process(wb->task);
 	}
 }
@@ -202,11 +198,20 @@ static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
 {
 	struct bdi_work *work;
 
+	/*
+	 * This is WB_SYNC_NONE writeback, so if allocation fails just
+	 * wakeup the thread for old dirty data writeback
+	 */
 	work = kmalloc(sizeof(*work), GFP_ATOMIC);
-	if (work)
+	if (work) {
 		bdi_work_init(work, wbc);
+		bdi_queue_work(bdi, work);
+	} else {
+		struct bdi_writeback *wb = &bdi->wb;
 
-	bdi_queue_work(bdi, work);
+		if (wb->task)
+			wake_up_process(wb->task);
+	}
 }
 
 void bdi_start_writeback(struct writeback_control *wbc)
-- 
cgit v1.2.3-70-g09d2


From b6e51316daede0633e9274e1e30391cfa4747877 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Wed, 16 Sep 2009 15:13:54 +0200
Subject: writeback: separate starting of sync vs opportunistic writeback

bdi_start_writeback() is currently split into two paths, one for
WB_SYNC_NONE and one for WB_SYNC_ALL. Add bdi_sync_writeback()
for WB_SYNC_ALL writeback and let bdi_start_writeback() handle
only WB_SYNC_NONE.

Push down the writeback_control allocation and only accept the
parameters that make sense for each function. This cleans up
the API considerably.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/fs-writeback.c           | 132 ++++++++++++++++++++++----------------------
 fs/ubifs/budget.c           |  20 +------
 include/linux/backing-dev.h |   2 +-
 include/linux/writeback.h   |   4 +-
 mm/page-writeback.c         |  12 +---
 5 files changed, 75 insertions(+), 95 deletions(-)

(limited to 'fs')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 59b3ee63b624..5887328b5a06 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -74,14 +74,10 @@ static inline bool bdi_work_on_stack(struct bdi_work *work)
 }
 
 static inline void bdi_work_init(struct bdi_work *work,
-				 struct writeback_control *wbc)
+				 struct wb_writeback_args *args)
 {
 	INIT_RCU_HEAD(&work->rcu_head);
-	work->args.sb = wbc->sb;
-	work->args.nr_pages = wbc->nr_to_write;
-	work->args.sync_mode = wbc->sync_mode;
-	work->args.range_cyclic = wbc->range_cyclic;
-	work->args.for_kupdate = 0;
+	work->args = *args;
 	work->state = WS_USED;
 }
 
@@ -194,7 +190,7 @@ static void bdi_wait_on_work_clear(struct bdi_work *work)
 }
 
 static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
-				 struct writeback_control *wbc)
+				 struct wb_writeback_args *args)
 {
 	struct bdi_work *work;
 
@@ -204,7 +200,7 @@ static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
 	 */
 	work = kmalloc(sizeof(*work), GFP_ATOMIC);
 	if (work) {
-		bdi_work_init(work, wbc);
+		bdi_work_init(work, args);
 		bdi_queue_work(bdi, work);
 	} else {
 		struct bdi_writeback *wb = &bdi->wb;
@@ -214,24 +210,54 @@ static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
 	}
 }
 
-void bdi_start_writeback(struct writeback_control *wbc)
+/**
+ * bdi_sync_writeback - start and wait for writeback
+ * @bdi: the backing device to write from
+ * @sb: write inodes from this super_block
+ *
+ * Description:
+ *   This does WB_SYNC_ALL data integrity writeback and waits for the
+ *   IO to complete. Callers must hold the sb s_umount semaphore for
+ *   reading, to avoid having the super disappear before we are done.
+ */
+static void bdi_sync_writeback(struct backing_dev_info *bdi,
+			       struct super_block *sb)
 {
-	/*
-	 * WB_SYNC_NONE is opportunistic writeback. If this allocation fails,
-	 * bdi_queue_work() will wake up the thread and flush old data. This
-	 * should ensure some amount of progress in freeing memory.
-	 */
-	if (wbc->sync_mode != WB_SYNC_ALL)
-		bdi_alloc_queue_work(wbc->bdi, wbc);
-	else {
-		struct bdi_work work;
+	struct wb_writeback_args args = {
+		.sb		= sb,
+		.sync_mode	= WB_SYNC_ALL,
+		.nr_pages	= LONG_MAX,
+		.range_cyclic	= 0,
+	};
+	struct bdi_work work;
 
-		bdi_work_init(&work, wbc);
-		work.state |= WS_ONSTACK;
+	bdi_work_init(&work, &args);
+	work.state |= WS_ONSTACK;
 
-		bdi_queue_work(wbc->bdi, &work);
-		bdi_wait_on_work_clear(&work);
-	}
+	bdi_queue_work(bdi, &work);
+	bdi_wait_on_work_clear(&work);
+}
+
+/**
+ * bdi_start_writeback - start writeback
+ * @bdi: the backing device to write from
+ * @nr_pages: the number of pages to write
+ *
+ * Description:
+ *   This does WB_SYNC_NONE opportunistic writeback. The IO is only
+ *   started when this function returns, we make no guarentees on
+ *   completion. Caller need not hold sb s_umount semaphore.
+ *
+ */
+void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
+{
+	struct wb_writeback_args args = {
+		.sync_mode	= WB_SYNC_NONE,
+		.nr_pages	= nr_pages,
+		.range_cyclic	= 1,
+	};
+
+	bdi_alloc_queue_work(bdi, &args);
 }
 
 /*
@@ -863,23 +889,25 @@ int bdi_writeback_task(struct bdi_writeback *wb)
 }
 
 /*
- * Schedule writeback for all backing devices. Can only be used for
- * WB_SYNC_NONE writeback, WB_SYNC_ALL should use bdi_start_writeback()
- * and pass in the superblock.
+ * Schedule writeback for all backing devices. This does WB_SYNC_NONE
+ * writeback, for integrity writeback see bdi_sync_writeback().
  */
-static void bdi_writeback_all(struct writeback_control *wbc)
+static void bdi_writeback_all(struct super_block *sb, long nr_pages)
 {
+	struct wb_writeback_args args = {
+		.sb		= sb,
+		.nr_pages	= nr_pages,
+		.sync_mode	= WB_SYNC_NONE,
+	};
 	struct backing_dev_info *bdi;
 
-	WARN_ON(wbc->sync_mode == WB_SYNC_ALL);
-
 	rcu_read_lock();
 
 	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
 		if (!bdi_has_dirty_io(bdi))
 			continue;
 
-		bdi_alloc_queue_work(bdi, wbc);
+		bdi_alloc_queue_work(bdi, &args);
 	}
 
 	rcu_read_unlock();
@@ -891,17 +919,10 @@ static void bdi_writeback_all(struct writeback_control *wbc)
  */
 void wakeup_flusher_threads(long nr_pages)
 {
-	struct writeback_control wbc = {
-		.sync_mode	= WB_SYNC_NONE,
-		.older_than_this = NULL,
-		.range_cyclic	= 1,
-	};
-
 	if (nr_pages == 0)
 		nr_pages = global_page_state(NR_FILE_DIRTY) +
 				global_page_state(NR_UNSTABLE_NFS);
-	wbc.nr_to_write = nr_pages;
-	bdi_writeback_all(&wbc);
+	bdi_writeback_all(NULL, nr_pages);
 }
 
 static noinline void block_dump___mark_inode_dirty(struct inode *inode)
@@ -1048,7 +1069,7 @@ EXPORT_SYMBOL(__mark_inode_dirty);
  * on the writer throttling path, and we get decent balancing between many
  * throttled threads: we don't want them all piling up on inode_sync_wait.
  */
-static void wait_sb_inodes(struct writeback_control *wbc)
+static void wait_sb_inodes(struct super_block *sb)
 {
 	struct inode *inode, *old_inode = NULL;
 
@@ -1056,7 +1077,7 @@ static void wait_sb_inodes(struct writeback_control *wbc)
 	 * We need to be protected against the filesystem going from
 	 * r/o to r/w or vice versa.
 	 */
-	WARN_ON(!rwsem_is_locked(&wbc->sb->s_umount));
+	WARN_ON(!rwsem_is_locked(&sb->s_umount));
 
 	spin_lock(&inode_lock);
 
@@ -1067,7 +1088,7 @@ static void wait_sb_inodes(struct writeback_control *wbc)
 	 * In which case, the inode may not be on the dirty list, but
 	 * we still have to wait for that writeout.
 	 */
-	list_for_each_entry(inode, &wbc->sb->s_inodes, i_sb_list) {
+	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
 		struct address_space *mapping;
 
 		if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
@@ -1107,14 +1128,8 @@ static void wait_sb_inodes(struct writeback_control *wbc)
  * for IO completion of submitted IO. The number of pages submitted is
  * returned.
  */
-long writeback_inodes_sb(struct super_block *sb)
+void writeback_inodes_sb(struct super_block *sb)
 {
-	struct writeback_control wbc = {
-		.sb		= sb,
-		.sync_mode	= WB_SYNC_NONE,
-		.range_start	= 0,
-		.range_end	= LLONG_MAX,
-	};
 	unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
 	unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
 	long nr_to_write;
@@ -1122,9 +1137,7 @@ long writeback_inodes_sb(struct super_block *sb)
 	nr_to_write = nr_dirty + nr_unstable +
 			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
 
-	wbc.nr_to_write = nr_to_write;
-	bdi_writeback_all(&wbc);
-	return nr_to_write - wbc.nr_to_write;
+	bdi_writeback_all(sb, nr_to_write);
 }
 EXPORT_SYMBOL(writeback_inodes_sb);
 
@@ -1135,21 +1148,10 @@ EXPORT_SYMBOL(writeback_inodes_sb);
  * This function writes and waits on any dirty inode belonging to this
  * super_block. The number of pages synced is returned.
  */
-long sync_inodes_sb(struct super_block *sb)
+void sync_inodes_sb(struct super_block *sb)
 {
-	struct writeback_control wbc = {
-		.sb		= sb,
-		.bdi		= sb->s_bdi,
-		.sync_mode	= WB_SYNC_ALL,
-		.range_start	= 0,
-		.range_end	= LLONG_MAX,
-	};
-	long nr_to_write = LONG_MAX; /* doesn't actually matter */
-
-	wbc.nr_to_write = nr_to_write;
-	bdi_start_writeback(&wbc);
-	wait_sb_inodes(&wbc);
-	return nr_to_write - wbc.nr_to_write;
+	bdi_sync_writeback(sb->s_bdi, sb);
+	wait_sb_inodes(sb);
 }
 EXPORT_SYMBOL(sync_inodes_sb);
 
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index 1c8991b0db13..ee1ce68fd98b 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -54,29 +54,15 @@
  * @nr_to_write: how many dirty pages to write-back
  *
  * This function shrinks UBIFS liability by means of writing back some amount
- * of dirty inodes and their pages. Returns the amount of pages which were
- * written back. The returned value does not include dirty inodes which were
- * synchronized.
+ * of dirty inodes and their pages.
  *
  * Note, this function synchronizes even VFS inodes which are locked
  * (@i_mutex) by the caller of the budgeting function, because write-back does
  * not touch @i_mutex.
  */
-static int shrink_liability(struct ubifs_info *c, int nr_to_write)
+static void shrink_liability(struct ubifs_info *c, int nr_to_write)
 {
-	int nr_written;
-
-	nr_written = writeback_inodes_sb(c->vfs_sb);
-	if (!nr_written) {
-		/*
-		 * Re-try again but wait on pages/inodes which are being
-		 * written-back concurrently (e.g., by pdflush).
-		 */
-		nr_written = sync_inodes_sb(c->vfs_sb);
-	}
-
-	dbg_budg("%d pages were written back", nr_written);
-	return nr_written;
+	writeback_inodes_sb(c->vfs_sb);
 }
 
 /**
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 859e797f4576..0ee33c2e6129 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -101,7 +101,7 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
 		const char *fmt, ...);
 int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev);
 void bdi_unregister(struct backing_dev_info *bdi);
-void bdi_start_writeback(struct writeback_control *wbc);
+void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages);
 int bdi_writeback_task(struct bdi_writeback *wb);
 int bdi_has_dirty_io(struct backing_dev_info *bdi);
 
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 48a054e2b716..75cf58666ff9 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -68,8 +68,8 @@ struct writeback_control {
  */	
 struct bdi_writeback;
 int inode_wait(void *);
-long writeback_inodes_sb(struct super_block *);
-long sync_inodes_sb(struct super_block *);
+void writeback_inodes_sb(struct super_block *);
+void sync_inodes_sb(struct super_block *);
 void writeback_inodes_wbc(struct writeback_control *wbc);
 long wb_do_writeback(struct bdi_writeback *wb, int force_wait);
 void wakeup_flusher_threads(long nr_pages);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 12c3d843ce93..1eea4fa0d410 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -582,16 +582,8 @@ static void balance_dirty_pages(struct address_space *mapping)
 	if ((laptop_mode && pages_written) ||
 	    (!laptop_mode && ((nr_writeback = global_page_state(NR_FILE_DIRTY)
 					  + global_page_state(NR_UNSTABLE_NFS))
-					  > background_thresh))) {
-		struct writeback_control wbc = {
-			.bdi		= bdi,
-			.sync_mode	= WB_SYNC_NONE,
-			.nr_to_write	= nr_writeback,
-		};
-
-
-		bdi_start_writeback(&wbc);
-	}
+					  > background_thresh)))
+		bdi_start_writeback(bdi, nr_writeback);
 }
 
 void set_page_dirty_balance(struct page *page, int page_mkwrite)
-- 
cgit v1.2.3-70-g09d2


From 8010c3b6349b407f8f11b3f4d7e9f94cb00fe528 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Tue, 15 Sep 2009 20:04:57 +0200
Subject: writeback: add comments to bdi_work structure

And document its retriever, get_next_work_item().

Acked-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/fs-writeback.c | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 5887328b5a06..55f0d4e51b59 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -49,15 +49,15 @@ struct wb_writeback_args {
  * Work items for the bdi_writeback threads
  */
 struct bdi_work {
-	struct list_head list;
-	struct rcu_head rcu_head;
+	struct list_head list;		/* pending work list */
+	struct rcu_head rcu_head;	/* for RCU free/clear of work */
 
-	unsigned long seen;
-	atomic_t pending;
+	unsigned long seen;		/* threads that have seen this work */
+	atomic_t pending;		/* number of threads still to do work */
 
-	struct wb_writeback_args args;
+	struct wb_writeback_args args;	/* writeback arguments */
 
-	unsigned long state;
+	unsigned long state;		/* flag bits, see WS_* */
 };
 
 enum {
@@ -758,7 +758,11 @@ static long wb_writeback(struct bdi_writeback *wb,
 
 /*
  * Return the next bdi_work struct that hasn't been processed by this
- * wb thread yet
+ * wb thread yet. ->seen is initially set for each thread that exists
+ * for this device, when a thread first notices a piece of work it
+ * clears its bit. Depending on writeback type, the thread will notify
+ * completion on either receiving the work (WB_SYNC_NONE) or after
+ * it is done (WB_SYNC_ALL).
  */
 static struct bdi_work *get_next_work_item(struct backing_dev_info *bdi,
 					   struct bdi_writeback *wb)
-- 
cgit v1.2.3-70-g09d2


From 49db041430e8a856dbc3af15430bf068f1c74655 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Tue, 15 Sep 2009 21:27:40 +0200
Subject: writeback: use schedule_timeout_interruptible()

Gets rid of a manual set_current_state().

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/fs-writeback.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 55f0d4e51b59..34757758511a 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -884,8 +884,7 @@ int bdi_writeback_task(struct bdi_writeback *wb)
 		}
 
 		wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10);
-		set_current_state(TASK_INTERRUPTIBLE);
-		schedule_timeout(wait_jiffies);
+		schedule_timeout_interruptible(wait_jiffies);
 		try_to_freeze();
 	}
 
-- 
cgit v1.2.3-70-g09d2


From deed62edffe600bc5b379c872d3004116e001b66 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Tue, 15 Sep 2009 21:32:58 +0200
Subject: writeback: remove smp_mb(), it's not needed with list_add_tail_rcu()

list_add_tail_rcu contains required barriers.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/fs-writeback.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 34757758511a..59c99e729187 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -151,10 +151,10 @@ static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work)
 	BUG_ON(!bdi->wb_cnt);
 
 	/*
-	 * Make sure stores are seen before it appears on the list
+	 * list_add_tail_rcu() contains the necessary barriers to
+	 * make sure the above stores are seen before the item is
+	 * noticed on the list
 	 */
-	smp_mb();
-
 	spin_lock(&bdi->wb_lock);
 	list_add_tail_rcu(&work->list, &bdi->work_list);
 	spin_unlock(&bdi->wb_lock);
-- 
cgit v1.2.3-70-g09d2


From 77fad5e625e56eb31a343ae1d489979fdc61a2aa Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Tue, 15 Sep 2009 21:34:12 +0200
Subject: writeback: improve scalability of bdi writeback work queues

If you're going to do an atomic RMW on each list entry, there's not much
point in all the RCU complexities of the list walking. This is only going
to help the multi-thread case I guess, but it doesn't hurt to do now.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/fs-writeback.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 59c99e729187..6bca6f8176f0 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -772,8 +772,9 @@ static struct bdi_work *get_next_work_item(struct backing_dev_info *bdi,
 	rcu_read_lock();
 
 	list_for_each_entry_rcu(work, &bdi->work_list, list) {
-		if (!test_and_clear_bit(wb->nr, &work->seen))
+		if (!test_bit(wb->nr, &work->seen))
 			continue;
+		clear_bit(wb->nr, &work->seen);
 
 		ret = work;
 		break;
-- 
cgit v1.2.3-70-g09d2


From 77b9d059cb3ddb8b1246d5878e81d52926550b23 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Tue, 15 Sep 2009 21:34:51 +0200
Subject: writeback: Fix bdi use after free in wb_work_complete()

By the time bdi_work_on_stack gets evaluated again in bdi_work_free, it
can already have been deallocated and used for something else in the
!on stack case, giving a false positive in this test and causing
corruption.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/fs-writeback.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 6bca6f8176f0..7eba7326b9b9 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -113,6 +113,7 @@ static void bdi_work_free(struct rcu_head *head)
 static void wb_work_complete(struct bdi_work *work)
 {
 	const enum writeback_sync_modes sync_mode = work->args.sync_mode;
+	int onstack = bdi_work_on_stack(work);
 
 	/*
 	 * For allocated work, we can clear the done/seen bit right here.
@@ -120,9 +121,9 @@ static void wb_work_complete(struct bdi_work *work)
 	 * to after the RCU grace period, since the stack could be invalidated
 	 * as soon as bdi_work_clear() has done the wakeup.
 	 */
-	if (!bdi_work_on_stack(work))
+	if (!onstack)
 		bdi_work_clear(work);
-	if (sync_mode == WB_SYNC_NONE || bdi_work_on_stack(work))
+	if (sync_mode == WB_SYNC_NONE || onstack)
 		call_rcu(&work->rcu_head, bdi_work_free);
 }
 
-- 
cgit v1.2.3-70-g09d2


From 1ef7d9aa32a8ee054c4d4fdcd2ea537c04d61b2f Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Tue, 15 Sep 2009 21:37:55 +0200
Subject: writeback: fix possible bdi writeback refcounting problem

wb_clear_pending AFAIKS should not be called after the item has been
put on the list, except by the worker threads. It could lead to the
situation where the refcount is decremented below 0 and cause lots of
problems.

Presumably the !wb_has_dirty_io case is not a common one, so it can
be discovered when the thread wakes up to check?

Also add a comment in bdi_work_clear.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/fs-writeback.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 7eba7326b9b9..8e1e5e19d21e 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -97,6 +97,11 @@ static void bdi_work_clear(struct bdi_work *work)
 {
 	clear_bit(WS_USED_B, &work->state);
 	smp_mb__after_clear_bit();
+	/*
+	 * work can have disappeared at this point. bit waitq functions
+	 * should be able to tolerate this, provided bdi_sched_wait does
+	 * not dereference it's pointer argument.
+	*/
 	wake_up_bit(&work->state, WS_USED_B);
 }
 
@@ -169,13 +174,7 @@ static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work)
 	else {
 		struct bdi_writeback *wb = &bdi->wb;
 
-		/*
-		 * End work now if this wb has no dirty IO pending. Otherwise
-		 * wakeup the handling thread
-		 */
-		if (!wb_has_dirty_io(wb))
-			wb_clear_pending(wb, work);
-		else if (wb->task)
+		if (wb->task)
 			wake_up_process(wb->task);
 	}
 }
-- 
cgit v1.2.3-70-g09d2


From b449fc6fcc07a392c69f3c1db9a4ad4dda8cbcba Mon Sep 17 00:00:00 2001
From: Andreas Dilger <adilger@sun.com>
Date: Thu, 30 Jul 2009 20:09:46 +0200
Subject: JBD: round commit timer up to avoid uncommitted transaction

Fix jiffie rounding in jbd commit timer setup code.  Rounding down could cause
the timer to be fired before the corresponding transaction has expired.  That
transaction can stay not committed forever if no new transaction is created or
explicit sync/umount happens.

Signed-off-by: Andreas Dilger <adilger@sun.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/jbd/transaction.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index c03ac11f74be..833c1675a00c 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -56,7 +56,8 @@ get_transaction(journal_t *journal, transaction_t *transaction)
 	spin_lock_init(&transaction->t_handle_lock);
 
 	/* Set up the commit timer for the new transaction. */
-	journal->j_commit_timer.expires = round_jiffies(transaction->t_expires);
+	journal->j_commit_timer.expires =
+				round_jiffies_up(transaction->t_expires);
 	add_timer(&journal->j_commit_timer);
 
 	J_ASSERT(journal->j_running_transaction == NULL);
-- 
cgit v1.2.3-70-g09d2


From 9c28cbccec66a5ca292c6659bf5a0fe0c8459fa7 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 3 Aug 2009 19:21:00 +0200
Subject: jbd: Journal block numbers can ever be only 32-bit use unsigned int
 for them

It does not make sense to store block number for journal as unsigned long
since they can be only 32-bit (because of on-disk format limitation). So
change in-memory structures and variables to use unsigned int instead.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/jbd/checkpoint.c |  6 +++---
 fs/jbd/commit.c     |  2 +-
 fs/jbd/journal.c    | 30 +++++++++++++++---------------
 fs/jbd/recovery.c   | 18 +++++++++---------
 fs/jbd/revoke.c     | 16 ++++++++--------
 include/linux/jbd.h | 26 +++++++++++++-------------
 6 files changed, 49 insertions(+), 49 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
index 61f32f3868cd..b0435dd0654d 100644
--- a/fs/jbd/checkpoint.c
+++ b/fs/jbd/checkpoint.c
@@ -456,7 +456,7 @@ int cleanup_journal_tail(journal_t *journal)
 {
 	transaction_t * transaction;
 	tid_t		first_tid;
-	unsigned long	blocknr, freed;
+	unsigned int	blocknr, freed;
 
 	if (is_journal_aborted(journal))
 		return 1;
@@ -502,8 +502,8 @@ int cleanup_journal_tail(journal_t *journal)
 		freed = freed + journal->j_last - journal->j_first;
 
 	jbd_debug(1,
-		  "Cleaning journal tail from %d to %d (offset %lu), "
-		  "freeing %lu\n",
+		  "Cleaning journal tail from %d to %d (offset %u), "
+		  "freeing %u\n",
 		  journal->j_tail_sequence, first_tid, blocknr, freed);
 
 	journal->j_free += freed;
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 618e21c0b7a3..4bd882548c45 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -308,7 +308,7 @@ void journal_commit_transaction(journal_t *journal)
 	int bufs;
 	int flags;
 	int err;
-	unsigned long blocknr;
+	unsigned int blocknr;
 	ktime_t start_time;
 	u64 commit_time;
 	char *tagp = NULL;
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index f96f85092d1c..bd3c073b485d 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -276,7 +276,7 @@ static void journal_kill_thread(journal_t *journal)
 int journal_write_metadata_buffer(transaction_t *transaction,
 				  struct journal_head  *jh_in,
 				  struct journal_head **jh_out,
-				  unsigned long blocknr)
+				  unsigned int blocknr)
 {
 	int need_copy_out = 0;
 	int done_copy_out = 0;
@@ -567,9 +567,9 @@ int log_wait_commit(journal_t *journal, tid_t tid)
  * Log buffer allocation routines:
  */
 
-int journal_next_log_block(journal_t *journal, unsigned long *retp)
+int journal_next_log_block(journal_t *journal, unsigned int *retp)
 {
-	unsigned long blocknr;
+	unsigned int blocknr;
 
 	spin_lock(&journal->j_state_lock);
 	J_ASSERT(journal->j_free > 1);
@@ -590,11 +590,11 @@ int journal_next_log_block(journal_t *journal, unsigned long *retp)
  * this is a no-op.  If needed, we can use j_blk_offset - everything is
  * ready.
  */
-int journal_bmap(journal_t *journal, unsigned long blocknr,
-		 unsigned long *retp)
+int journal_bmap(journal_t *journal, unsigned int blocknr,
+		 unsigned int *retp)
 {
 	int err = 0;
-	unsigned long ret;
+	unsigned int ret;
 
 	if (journal->j_inode) {
 		ret = bmap(journal->j_inode, blocknr);
@@ -604,7 +604,7 @@ int journal_bmap(journal_t *journal, unsigned long blocknr,
 			char b[BDEVNAME_SIZE];
 
 			printk(KERN_ALERT "%s: journal block not found "
-					"at offset %lu on %s\n",
+					"at offset %u on %s\n",
 				__func__,
 				blocknr,
 				bdevname(journal->j_dev, b));
@@ -630,7 +630,7 @@ int journal_bmap(journal_t *journal, unsigned long blocknr,
 struct journal_head *journal_get_descriptor_buffer(journal_t *journal)
 {
 	struct buffer_head *bh;
-	unsigned long blocknr;
+	unsigned int blocknr;
 	int err;
 
 	err = journal_next_log_block(journal, &blocknr);
@@ -774,7 +774,7 @@ journal_t * journal_init_inode (struct inode *inode)
 	journal_t *journal = journal_init_common();
 	int err;
 	int n;
-	unsigned long blocknr;
+	unsigned int blocknr;
 
 	if (!journal)
 		return NULL;
@@ -846,12 +846,12 @@ static void journal_fail_superblock (journal_t *journal)
 static int journal_reset(journal_t *journal)
 {
 	journal_superblock_t *sb = journal->j_superblock;
-	unsigned long first, last;
+	unsigned int first, last;
 
 	first = be32_to_cpu(sb->s_first);
 	last = be32_to_cpu(sb->s_maxlen);
 	if (first + JFS_MIN_JOURNAL_BLOCKS > last + 1) {
-		printk(KERN_ERR "JBD: Journal too short (blocks %lu-%lu).\n",
+		printk(KERN_ERR "JBD: Journal too short (blocks %u-%u).\n",
 		       first, last);
 		journal_fail_superblock(journal);
 		return -EINVAL;
@@ -885,7 +885,7 @@ static int journal_reset(journal_t *journal)
  **/
 int journal_create(journal_t *journal)
 {
-	unsigned long blocknr;
+	unsigned int blocknr;
 	struct buffer_head *bh;
 	journal_superblock_t *sb;
 	int i, err;
@@ -969,14 +969,14 @@ void journal_update_superblock(journal_t *journal, int wait)
 	if (sb->s_start == 0 && journal->j_tail_sequence ==
 				journal->j_transaction_sequence) {
 		jbd_debug(1,"JBD: Skipping superblock update on recovered sb "
-			"(start %ld, seq %d, errno %d)\n",
+			"(start %u, seq %d, errno %d)\n",
 			journal->j_tail, journal->j_tail_sequence,
 			journal->j_errno);
 		goto out;
 	}
 
 	spin_lock(&journal->j_state_lock);
-	jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n",
+	jbd_debug(1,"JBD: updating superblock (start %u, seq %d, errno %d)\n",
 		  journal->j_tail, journal->j_tail_sequence, journal->j_errno);
 
 	sb->s_sequence = cpu_to_be32(journal->j_tail_sequence);
@@ -1371,7 +1371,7 @@ int journal_flush(journal_t *journal)
 {
 	int err = 0;
 	transaction_t *transaction = NULL;
-	unsigned long old_tail;
+	unsigned int old_tail;
 
 	spin_lock(&journal->j_state_lock);
 
diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c
index db5e982c5ddf..cb1a49ae605e 100644
--- a/fs/jbd/recovery.c
+++ b/fs/jbd/recovery.c
@@ -70,7 +70,7 @@ static int do_readahead(journal_t *journal, unsigned int start)
 {
 	int err;
 	unsigned int max, nbufs, next;
-	unsigned long blocknr;
+	unsigned int blocknr;
 	struct buffer_head *bh;
 
 	struct buffer_head * bufs[MAXBUF];
@@ -132,7 +132,7 @@ static int jread(struct buffer_head **bhp, journal_t *journal,
 		 unsigned int offset)
 {
 	int err;
-	unsigned long blocknr;
+	unsigned int blocknr;
 	struct buffer_head *bh;
 
 	*bhp = NULL;
@@ -314,7 +314,7 @@ static int do_one_pass(journal_t *journal,
 			struct recovery_info *info, enum passtype pass)
 {
 	unsigned int		first_commit_ID, next_commit_ID;
-	unsigned long		next_log_block;
+	unsigned int		next_log_block;
 	int			err, success = 0;
 	journal_superblock_t *	sb;
 	journal_header_t *	tmp;
@@ -367,14 +367,14 @@ static int do_one_pass(journal_t *journal,
 			if (tid_geq(next_commit_ID, info->end_transaction))
 				break;
 
-		jbd_debug(2, "Scanning for sequence ID %u at %lu/%lu\n",
+		jbd_debug(2, "Scanning for sequence ID %u at %u/%u\n",
 			  next_commit_ID, next_log_block, journal->j_last);
 
 		/* Skip over each chunk of the transaction looking
 		 * either the next descriptor block or the final commit
 		 * record. */
 
-		jbd_debug(3, "JBD: checking block %ld\n", next_log_block);
+		jbd_debug(3, "JBD: checking block %u\n", next_log_block);
 		err = jread(&bh, journal, next_log_block);
 		if (err)
 			goto failed;
@@ -429,7 +429,7 @@ static int do_one_pass(journal_t *journal,
 			tagp = &bh->b_data[sizeof(journal_header_t)];
 			while ((tagp - bh->b_data +sizeof(journal_block_tag_t))
 			       <= journal->j_blocksize) {
-				unsigned long io_block;
+				unsigned int io_block;
 
 				tag = (journal_block_tag_t *) tagp;
 				flags = be32_to_cpu(tag->t_flags);
@@ -443,10 +443,10 @@ static int do_one_pass(journal_t *journal,
 					success = err;
 					printk (KERN_ERR
 						"JBD: IO error %d recovering "
-						"block %ld in log\n",
+						"block %u in log\n",
 						err, io_block);
 				} else {
-					unsigned long blocknr;
+					unsigned int blocknr;
 
 					J_ASSERT(obh != NULL);
 					blocknr = be32_to_cpu(tag->t_blocknr);
@@ -581,7 +581,7 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
 	max = be32_to_cpu(header->r_count);
 
 	while (offset < max) {
-		unsigned long blocknr;
+		unsigned int blocknr;
 		int err;
 
 		blocknr = be32_to_cpu(* ((__be32 *) (bh->b_data+offset)));
diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c
index da6cd9bdaabc..ad717328343a 100644
--- a/fs/jbd/revoke.c
+++ b/fs/jbd/revoke.c
@@ -101,7 +101,7 @@ struct jbd_revoke_record_s
 {
 	struct list_head  hash;
 	tid_t		  sequence;	/* Used for recovery only */
-	unsigned long	  blocknr;
+	unsigned int	  blocknr;
 };
 
 
@@ -126,7 +126,7 @@ static void flush_descriptor(journal_t *, struct journal_head *, int, int);
 /* Utility functions to maintain the revoke table */
 
 /* Borrowed from buffer.c: this is a tried and tested block hash function */
-static inline int hash(journal_t *journal, unsigned long block)
+static inline int hash(journal_t *journal, unsigned int block)
 {
 	struct jbd_revoke_table_s *table = journal->j_revoke;
 	int hash_shift = table->hash_shift;
@@ -136,7 +136,7 @@ static inline int hash(journal_t *journal, unsigned long block)
 		(block << (hash_shift - 12))) & (table->hash_size - 1);
 }
 
-static int insert_revoke_hash(journal_t *journal, unsigned long blocknr,
+static int insert_revoke_hash(journal_t *journal, unsigned int blocknr,
 			      tid_t seq)
 {
 	struct list_head *hash_list;
@@ -166,7 +166,7 @@ oom:
 /* Find a revoke record in the journal's hash table. */
 
 static struct jbd_revoke_record_s *find_revoke_record(journal_t *journal,
-						      unsigned long blocknr)
+						      unsigned int blocknr)
 {
 	struct list_head *hash_list;
 	struct jbd_revoke_record_s *record;
@@ -332,7 +332,7 @@ void journal_destroy_revoke(journal_t *journal)
  * by one.
  */
 
-int journal_revoke(handle_t *handle, unsigned long blocknr,
+int journal_revoke(handle_t *handle, unsigned int blocknr,
 		   struct buffer_head *bh_in)
 {
 	struct buffer_head *bh = NULL;
@@ -401,7 +401,7 @@ int journal_revoke(handle_t *handle, unsigned long blocknr,
 		}
 	}
 
-	jbd_debug(2, "insert revoke for block %lu, bh_in=%p\n", blocknr, bh_in);
+	jbd_debug(2, "insert revoke for block %u, bh_in=%p\n", blocknr, bh_in);
 	err = insert_revoke_hash(journal, blocknr,
 				handle->h_transaction->t_tid);
 	BUFFER_TRACE(bh_in, "exit");
@@ -644,7 +644,7 @@ static void flush_descriptor(journal_t *journal,
  */
 
 int journal_set_revoke(journal_t *journal,
-		       unsigned long blocknr,
+		       unsigned int blocknr,
 		       tid_t sequence)
 {
 	struct jbd_revoke_record_s *record;
@@ -668,7 +668,7 @@ int journal_set_revoke(journal_t *journal,
  */
 
 int journal_test_revoke(journal_t *journal,
-			unsigned long blocknr,
+			unsigned int blocknr,
 			tid_t sequence)
 {
 	struct jbd_revoke_record_s *record;
diff --git a/include/linux/jbd.h b/include/linux/jbd.h
index c2049a04fa0b..a1187a0c99b4 100644
--- a/include/linux/jbd.h
+++ b/include/linux/jbd.h
@@ -446,7 +446,7 @@ struct transaction_s
 	/*
 	 * Where in the log does this transaction's commit start? [no locking]
 	 */
-	unsigned long		t_log_start;
+	unsigned int		t_log_start;
 
 	/* Number of buffers on the t_buffers list [j_list_lock] */
 	int			t_nr_buffers;
@@ -701,26 +701,26 @@ struct journal_s
 	 * Journal head: identifies the first unused block in the journal.
 	 * [j_state_lock]
 	 */
-	unsigned long		j_head;
+	unsigned int		j_head;
 
 	/*
 	 * Journal tail: identifies the oldest still-used block in the journal.
 	 * [j_state_lock]
 	 */
-	unsigned long		j_tail;
+	unsigned int		j_tail;
 
 	/*
 	 * Journal free: how many free blocks are there in the journal?
 	 * [j_state_lock]
 	 */
-	unsigned long		j_free;
+	unsigned int		j_free;
 
 	/*
 	 * Journal start and end: the block numbers of the first usable block
 	 * and one beyond the last usable block in the journal. [j_state_lock]
 	 */
-	unsigned long		j_first;
-	unsigned long		j_last;
+	unsigned int		j_first;
+	unsigned int		j_last;
 
 	/*
 	 * Device, blocksize and starting block offset for the location where we
@@ -728,7 +728,7 @@ struct journal_s
 	 */
 	struct block_device	*j_dev;
 	int			j_blocksize;
-	unsigned long		j_blk_offset;
+	unsigned int		j_blk_offset;
 
 	/*
 	 * Device which holds the client fs.  For internal journal this will be
@@ -859,7 +859,7 @@ extern void __journal_clean_data_list(transaction_t *transaction);
 
 /* Log buffer allocation */
 extern struct journal_head * journal_get_descriptor_buffer(journal_t *);
-int journal_next_log_block(journal_t *, unsigned long *);
+int journal_next_log_block(journal_t *, unsigned int *);
 
 /* Commit management */
 extern void journal_commit_transaction(journal_t *);
@@ -874,7 +874,7 @@ extern int
 journal_write_metadata_buffer(transaction_t	  *transaction,
 			      struct journal_head  *jh_in,
 			      struct journal_head **jh_out,
-			      unsigned long	   blocknr);
+			      unsigned int blocknr);
 
 /* Transaction locking */
 extern void		__wait_on_journal (journal_t *);
@@ -942,7 +942,7 @@ extern void	   journal_abort      (journal_t *, int);
 extern int	   journal_errno      (journal_t *);
 extern void	   journal_ack_err    (journal_t *);
 extern int	   journal_clear_err  (journal_t *);
-extern int	   journal_bmap(journal_t *, unsigned long, unsigned long *);
+extern int	   journal_bmap(journal_t *, unsigned int, unsigned int *);
 extern int	   journal_force_commit(journal_t *);
 
 /*
@@ -976,14 +976,14 @@ extern int	   journal_init_revoke_caches(void);
 
 extern void	   journal_destroy_revoke(journal_t *);
 extern int	   journal_revoke (handle_t *,
-				unsigned long, struct buffer_head *);
+				unsigned int, struct buffer_head *);
 extern int	   journal_cancel_revoke(handle_t *, struct journal_head *);
 extern void	   journal_write_revoke_records(journal_t *,
 						transaction_t *, int);
 
 /* Recovery revoke support */
-extern int	journal_set_revoke(journal_t *, unsigned long, tid_t);
-extern int	journal_test_revoke(journal_t *, unsigned long, tid_t);
+extern int	journal_set_revoke(journal_t *, unsigned int, tid_t);
+extern int	journal_test_revoke(journal_t *, unsigned int, tid_t);
 extern void	journal_clear_revoke(journal_t *);
 extern void	journal_switch_revoke_table(journal_t *journal);
 
-- 
cgit v1.2.3-70-g09d2


From 3adae9da0b35d2ca908039f42a1e90395c335181 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 11 Aug 2009 17:27:21 +0200
Subject: jbd: Annotate transaction start also for journal_restart()

lockdep annotation for a transaction start has been at the end of
journal_start(). But a transaction is also started from journal_restart(). Move
the lockdep annotation to start_this_handle() which covers both cases.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/jbd/transaction.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 833c1675a00c..006f9ad838a2 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -229,6 +229,8 @@ repeat_locked:
 		  __log_space_left(journal));
 	spin_unlock(&transaction->t_handle_lock);
 	spin_unlock(&journal->j_state_lock);
+
+	lock_map_acquire(&handle->h_lockdep_map);
 out:
 	if (unlikely(new_transaction))		/* It's usually NULL */
 		kfree(new_transaction);
@@ -293,9 +295,6 @@ handle_t *journal_start(journal_t *journal, int nblocks)
 		handle = ERR_PTR(err);
 		goto out;
 	}
-
-	lock_map_acquire(&handle->h_lockdep_map);
-
 out:
 	return handle;
 }
@@ -417,6 +416,7 @@ int journal_restart(handle_t *handle, int nblocks)
 	__log_start_commit(journal, transaction->t_tid);
 	spin_unlock(&journal->j_state_lock);
 
+	lock_map_release(&handle->h_lockdep_map);
 	handle->h_buffer_credits = nblocks;
 	ret = start_this_handle(journal, handle);
 	return ret;
-- 
cgit v1.2.3-70-g09d2


From 00171d3c7e3b738ba582c7a9b37408e796f49046 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 11 Aug 2009 19:06:10 +0200
Subject: ext3: Fix possible deadlock between ext3_truncate() and
 ext3_get_blocks()

During truncate we are sometimes forced to start a new transaction as the
amount of blocks to be journaled is both quite large and hard to predict. So
far we restarted a transaction while holding truncate_mutex and that violates
lock ordering because truncate_mutex ranks below transaction start (and it
can lead to a real deadlock with ext3_get_blocks() allocating new blocks
from ext3_writepage()).

Luckily, the problem is easy to fix: We just drop the truncate_mutex before
restarting the transaction and acquire it afterwards. We are safe to do this as
by the time ext3_truncate() is called, all the page cache for the truncated
part of the file is dropped and so writepage() cannot come and allocate new
blocks in the part of the file we are truncating. The rest of writers is
stopped by us holding i_mutex.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext3/inode.c | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index b49908a167ae..eb0b4e0d517b 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -172,10 +172,21 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
  * so before we call here everything must be consistently dirtied against
  * this transaction.
  */
-static int ext3_journal_test_restart(handle_t *handle, struct inode *inode)
+static int truncate_restart_transaction(handle_t *handle, struct inode *inode)
 {
+	int ret;
+
 	jbd_debug(2, "restarting handle %p\n", handle);
-	return ext3_journal_restart(handle, blocks_for_truncate(inode));
+	/*
+	 * Drop truncate_mutex to avoid deadlock with ext3_get_blocks_handle
+	 * At this moment, get_block can be called only for blocks inside
+	 * i_size since page cache has been already dropped and writes are
+	 * blocked by i_mutex. So we can safely drop the truncate_mutex.
+	 */
+	mutex_unlock(&EXT3_I(inode)->truncate_mutex);
+	ret = ext3_journal_restart(handle, blocks_for_truncate(inode));
+	mutex_lock(&EXT3_I(inode)->truncate_mutex);
+	return ret;
 }
 
 /*
@@ -2072,7 +2083,7 @@ static void ext3_clear_blocks(handle_t *handle, struct inode *inode,
 			ext3_journal_dirty_metadata(handle, bh);
 		}
 		ext3_mark_inode_dirty(handle, inode);
-		ext3_journal_test_restart(handle, inode);
+		truncate_restart_transaction(handle, inode);
 		if (bh) {
 			BUFFER_TRACE(bh, "retaking write access");
 			ext3_journal_get_write_access(handle, bh);
@@ -2282,7 +2293,7 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode,
 				return;
 			if (try_to_extend_transaction(handle, inode)) {
 				ext3_mark_inode_dirty(handle, inode);
-				ext3_journal_test_restart(handle, inode);
+				truncate_restart_transaction(handle, inode);
 			}
 
 			ext3_free_blocks(handle, inode, nr, 1);
-- 
cgit v1.2.3-70-g09d2


From 4f003fd32bc54ec438b8691795279844df27ce38 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 8 Sep 2009 00:22:14 +0200
Subject: ext3: Add locking to ext3_do_update_inode

I've been struggling with this off and on while I've been testing the
data=guarded work.  The symptom is corrupted orphan lists and inodes
with the wrong i_size stored on disk.  I was convinced the
data=guarded code was just missing a call to ext3_mark_inode_dirty, but
tracing showed the i_disksize I was sending to ext3_mark_inode_dirty
wasn't actually making it to the drive.

ext3_mark_inode_dirty can be called without locks held (atime updates
and a few others), so the data=guarded code uses locks while updating
the in-memory inode, and then calls ext3_mark_inode_dirty
without any locks held.

But, ext3_mark_inode_dirty has no internal locking to make sure that
only one CPU is updating the buffer head at a time.  Generally this
works out ok because everyone that changes the inode then calls
ext3_mark_inode_dirty themselves.  Even though it races, eventually
someone updates the buffer heads and things move on.

But there is still a risk of the wrong values getting in, and the
data=guarded code seems to hit the race very often.

Since everyone that changes the inode also logs it, it should be
possible to fix this with some memory barriers.  I'll leave that as an
exercise to the reader and lock the buffer head instead.

It it probably a good idea to have a different patch series for lockless
bit flipping on the ext3 i_state field.  ext3_do_update_inode &= clears
EXT3_STATE_NEW without any locks held.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext3/inode.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'fs')

diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index eb0b4e0d517b..cd098a7b77fc 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -2903,6 +2903,10 @@ static int ext3_do_update_inode(handle_t *handle,
 	struct buffer_head *bh = iloc->bh;
 	int err = 0, rc, block;
 
+again:
+	/* we can't allow multiple procs in here at once, its a bit racey */
+	lock_buffer(bh);
+
 	/* For fields not not tracking in the in-memory inode,
 	 * initialise them to zero for new inodes. */
 	if (ei->i_state & EXT3_STATE_NEW)
@@ -2962,16 +2966,20 @@ static int ext3_do_update_inode(handle_t *handle,
 			       /* If this is the first large file
 				* created, add a flag to the superblock.
 				*/
+				unlock_buffer(bh);
 				err = ext3_journal_get_write_access(handle,
 						EXT3_SB(sb)->s_sbh);
 				if (err)
 					goto out_brelse;
+
 				ext3_update_dynamic_rev(sb);
 				EXT3_SET_RO_COMPAT_FEATURE(sb,
 					EXT3_FEATURE_RO_COMPAT_LARGE_FILE);
 				handle->h_sync = 1;
 				err = ext3_journal_dirty_metadata(handle,
 						EXT3_SB(sb)->s_sbh);
+				/* get our lock and start over */
+				goto again;
 			}
 		}
 	}
@@ -2994,6 +3002,7 @@ static int ext3_do_update_inode(handle_t *handle,
 		raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
 
 	BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
+	unlock_buffer(bh);
 	rc = ext3_journal_dirty_metadata(handle, bh);
 	if (!err)
 		err = rc;
-- 
cgit v1.2.3-70-g09d2


From 56fcad29d4b3cbcbb2ed47a9d3ceca3f57175417 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 8 Sep 2009 14:59:42 +0200
Subject: ext3: Flush disk caches on fsync when needed

In case we fsync() a file and inode is not dirty, we don't force a transaction
to disk and hence don't flush disk caches. Thus file data could be just in disk
caches and not on persistent storage. Fix the problem by flushing disk caches
if we didn't force a transaction commit.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext3/fsync.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c
index d33634119e17..451d166bbe93 100644
--- a/fs/ext3/fsync.c
+++ b/fs/ext3/fsync.c
@@ -23,6 +23,7 @@
  */
 
 #include <linux/time.h>
+#include <linux/blkdev.h>
 #include <linux/fs.h>
 #include <linux/sched.h>
 #include <linux/writeback.h>
@@ -73,7 +74,7 @@ int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync)
 	}
 
 	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
-		goto out;
+		goto flush;
 
 	/*
 	 * The VFS has written the file data.  If the inode is unaltered
@@ -85,7 +86,16 @@ int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync)
 			.nr_to_write = 0, /* sys_fsync did this */
 		};
 		ret = sync_inode(inode, &wbc);
+		goto out;
 	}
+flush:
+	/*
+	 * In case we didn't commit a transaction, we have to flush
+	 * disk caches manually so that data really is on persistent
+	 * storage
+	 */
+	if (test_opt(inode->i_sb, BARRIER))
+		blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
 out:
 	return ret;
 }
-- 
cgit v1.2.3-70-g09d2


From e8505970af46658ece2545e9bc1fe594998fdcdf Mon Sep 17 00:00:00 2001
From: Akira Fujita <a-fujita@rs.jp.nec.com>
Date: Wed, 16 Sep 2009 13:46:38 -0400
Subject: ext4: Replace get_ext_path macro with an inline funciton

Replace get_ext_path macro with an inline function,
since this macro looks like a function call but its arguments
get modified. Ted pointed this out, thanks.

Signed-off-by: Akira Fujita <a-fujita@rs.jp.nec.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/move_extent.c | 55 +++++++++++++++++++++++++++++++--------------------
 1 file changed, 34 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 4c4491cef357..e4bd8761498a 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -19,14 +19,29 @@
 #include "ext4_extents.h"
 #include "ext4.h"
 
-#define get_ext_path(path, inode, block, ret)		\
-	do {								\
-		path = ext4_ext_find_extent(inode, block, path);	\
-		if (IS_ERR(path)) {					\
-			ret = PTR_ERR(path);				\
-			path = NULL;					\
-		}							\
-	} while (0)
+/**
+ * get_ext_path - Find an extent path for designated logical block number.
+ *
+ * @inode:	an inode which is searched
+ * @lblock:	logical block number to find an extent path
+ * @path:	pointer to an extent path pointer (for output)
+ *
+ * ext4_ext_find_extent wrapper. Return 0 on success, or a negative error value
+ * on failure.
+ */
+static inline int
+get_ext_path(struct inode *inode, ext4_lblk_t lblock,
+		struct ext4_ext_path **path)
+{
+	int ret = 0;
+
+	*path = ext4_ext_find_extent(inode, lblock, *path);
+	if (IS_ERR(*path)) {
+		ret = PTR_ERR(*path);
+		*path = NULL;
+	}
+	return ret;
+}
 
 /**
  * copy_extent_status - Copy the extent's initialization status
@@ -283,7 +298,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
 	}
 
 	if (new_flag) {
-		get_ext_path(orig_path, orig_inode, eblock, err);
+		err = get_ext_path(orig_inode, eblock, &orig_path);
 		if (orig_path == NULL)
 			goto out;
 
@@ -293,8 +308,8 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
 	}
 
 	if (end_flag) {
-		get_ext_path(orig_path, orig_inode,
-				      le32_to_cpu(end_ext->ee_block) - 1, err);
+		err = get_ext_path(orig_inode,
+				le32_to_cpu(end_ext->ee_block) - 1, &orig_path);
 		if (orig_path == NULL)
 			goto out;
 
@@ -631,12 +646,12 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
 	mext_double_down_write(orig_inode, donor_inode);
 
 	/* Get the original extent for the block "orig_off" */
-	get_ext_path(orig_path, orig_inode, orig_off, err);
+	err = get_ext_path(orig_inode, orig_off, &orig_path);
 	if (orig_path == NULL)
 		goto out;
 
 	/* Get the donor extent for the head */
-	get_ext_path(donor_path, donor_inode, donor_off, err);
+	err = get_ext_path(donor_inode, donor_off, &donor_path);
 	if (donor_path == NULL)
 		goto out;
 	depth = ext_depth(orig_inode);
@@ -678,7 +693,7 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
 
 		if (orig_path)
 			ext4_ext_drop_refs(orig_path);
-		get_ext_path(orig_path, orig_inode, orig_off, err);
+		err = get_ext_path(orig_inode, orig_off, &orig_path);
 		if (orig_path == NULL)
 			goto out;
 		depth = ext_depth(orig_inode);
@@ -692,8 +707,7 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
 
 		if (donor_path)
 			ext4_ext_drop_refs(donor_path);
-		get_ext_path(donor_path, donor_inode,
-				      donor_off, err);
+		err = get_ext_path(donor_inode, donor_off, &donor_path);
 		if (donor_path == NULL)
 			goto out;
 		depth = ext_depth(donor_inode);
@@ -1154,12 +1168,12 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
 	if (file_end < block_end)
 		len -= block_end - file_end;
 
-	get_ext_path(orig_path, orig_inode, block_start, ret);
+	ret = get_ext_path(orig_inode, block_start, &orig_path);
 	if (orig_path == NULL)
 		goto out2;
 
 	/* Get path structure to check the hole */
-	get_ext_path(holecheck_path, orig_inode, block_start, ret);
+	ret = get_ext_path(orig_inode, block_start, &holecheck_path);
 	if (holecheck_path == NULL)
 		goto out;
 
@@ -1289,8 +1303,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
 		/* Decrease buffer counter */
 		if (holecheck_path)
 			ext4_ext_drop_refs(holecheck_path);
-		get_ext_path(holecheck_path, orig_inode,
-				      seq_start, ret);
+		ret = get_ext_path(orig_inode, seq_start, &holecheck_path);
 		if (holecheck_path == NULL)
 			break;
 		depth = holecheck_path->p_depth;
@@ -1298,7 +1311,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
 		/* Decrease buffer counter */
 		if (orig_path)
 			ext4_ext_drop_refs(orig_path);
-		get_ext_path(orig_path, orig_inode, seq_start, ret);
+		ret = get_ext_path(orig_inode, seq_start, &orig_path);
 		if (orig_path == NULL)
 			break;
 
-- 
cgit v1.2.3-70-g09d2


From 2147b1a6a48e28399120ca51d4a91840a278611f Mon Sep 17 00:00:00 2001
From: Akira Fujita <a-fujita@rs.jp.nec.com>
Date: Wed, 16 Sep 2009 13:46:35 -0400
Subject: ext4: Replace BUG_ON() with ext4_error() in move_extents.c

Replace BUG_ON calls with a call to ext4_error()
to print an error message if EXT4_IOC_MOVE_EXT failed
with some kind of reasons.  This will help to debug.
Ted pointed this out, thanks.

Signed-off-by: Akira Fujita <a-fujita@rs.jp.nec.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/move_extent.c | 149 ++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 109 insertions(+), 40 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index e4bd8761498a..2258560e9722 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -127,6 +127,31 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
 	return 1;
 }
 
+/**
+ * mext_check_null_inode - NULL check for two inodes
+ *
+ * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0.
+ */
+static int
+mext_check_null_inode(struct inode *inode1, struct inode *inode2,
+		const char *function)
+{
+	int ret = 0;
+
+	if (inode1 == NULL) {
+		ext4_error(inode2->i_sb, function,
+			"Both inodes should not be NULL: "
+			"inode1 NULL inode2 %lu", inode2->i_ino);
+		ret = -EIO;
+	} else if (inode2 == NULL) {
+		ext4_error(inode1->i_sb, function,
+			"Both inodes should not be NULL: "
+			"inode1 %lu inode2 NULL", inode1->i_ino);
+		ret = -EIO;
+	}
+	return ret;
+}
+
 /**
  * mext_double_down_read - Acquire two inodes' read semaphore
  *
@@ -139,8 +164,6 @@ mext_double_down_read(struct inode *orig_inode, struct inode *donor_inode)
 {
 	struct inode *first = orig_inode, *second = donor_inode;
 
-	BUG_ON(orig_inode == NULL || donor_inode == NULL);
-
 	/*
 	 * Use the inode number to provide the stable locking order instead
 	 * of its address, because the C language doesn't guarantee you can
@@ -167,8 +190,6 @@ mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode)
 {
 	struct inode *first = orig_inode, *second = donor_inode;
 
-	BUG_ON(orig_inode == NULL || donor_inode == NULL);
-
 	/*
 	 * Use the inode number to provide the stable locking order instead
 	 * of its address, because the C language doesn't guarantee you can
@@ -193,8 +214,6 @@ mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode)
 static void
 mext_double_up_read(struct inode *orig_inode, struct inode *donor_inode)
 {
-	BUG_ON(orig_inode == NULL || donor_inode == NULL);
-
 	up_read(&EXT4_I(orig_inode)->i_data_sem);
 	up_read(&EXT4_I(donor_inode)->i_data_sem);
 }
@@ -209,8 +228,6 @@ mext_double_up_read(struct inode *orig_inode, struct inode *donor_inode)
 static void
 mext_double_up_write(struct inode *orig_inode, struct inode *donor_inode)
 {
-	BUG_ON(orig_inode == NULL || donor_inode == NULL);
-
 	up_write(&EXT4_I(orig_inode)->i_data_sem);
 	up_write(&EXT4_I(donor_inode)->i_data_sem);
 }
@@ -534,7 +551,15 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
 	 * oext      |-----------|
 	 * new_ext       |-------|
 	 */
-	BUG_ON(le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end);
+	if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) {
+		ext4_error(orig_inode->i_sb, __func__,
+			"new_ext_end(%u) should be less than or equal to "
+			"oext->ee_block(%u) + oext_alen(%d) - 1",
+			new_ext_end, le32_to_cpu(oext->ee_block),
+			oext_alen);
+		ret = -EIO;
+		goto out;
+	}
 
 	/*
 	 * Case: new_ext is smaller than original extent
@@ -558,6 +583,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
 
 	ret = mext_insert_extents(handle, orig_inode, orig_path, o_start,
 				o_end, &start_ext, &new_ext, &end_ext);
+out:
 	return ret;
 }
 
@@ -668,7 +694,20 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
 	/* Loop for the donor extents */
 	while (1) {
 		/* The extent for donor must be found. */
-		BUG_ON(!dext || donor_off != le32_to_cpu(tmp_dext.ee_block));
+		if (!dext) {
+			ext4_error(donor_inode->i_sb, __func__,
+				   "The extent for donor must be found");
+			err = -EIO;
+			goto out;
+		} else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) {
+			ext4_error(donor_inode->i_sb, __func__,
+				"Donor offset(%u) and the first block of donor "
+				"extent(%u) should be equal",
+				donor_off,
+				le32_to_cpu(tmp_dext.ee_block));
+			err = -EIO;
+			goto out;
+		}
 
 		/* Set donor extent to orig extent */
 		err = mext_leaf_block(handle, orig_inode,
@@ -1050,18 +1089,23 @@ mext_check_arguments(struct inode *orig_inode,
  * @inode1:	the inode structure
  * @inode2:	the inode structure
  *
- * Lock two inodes' i_mutex by i_ino order. This function is moved from
- * fs/inode.c.
+ * Lock two inodes' i_mutex by i_ino order.
+ * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0.
  */
-static void
+static int
 mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
 {
-	if (inode1 == NULL || inode2 == NULL || inode1 == inode2) {
-		if (inode1)
-			mutex_lock(&inode1->i_mutex);
-		else if (inode2)
-			mutex_lock(&inode2->i_mutex);
-		return;
+	int ret = 0;
+
+	BUG_ON(inode1 == NULL && inode2 == NULL);
+
+	ret = mext_check_null_inode(inode1, inode2, __func__);
+	if (ret < 0)
+		goto out;
+
+	if (inode1 == inode2) {
+		mutex_lock(&inode1->i_mutex);
+		goto out;
 	}
 
 	if (inode1->i_ino < inode2->i_ino) {
@@ -1071,6 +1115,9 @@ mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
 		mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT);
 		mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD);
 	}
+
+out:
+	return ret;
 }
 
 /**
@@ -1079,17 +1126,28 @@ mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
  * @inode1:     the inode that is released first
  * @inode2:     the inode that is released second
  *
- * This function is moved from fs/inode.c.
+ * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0.
  */
 
-static void
+static int
 mext_inode_double_unlock(struct inode *inode1, struct inode *inode2)
 {
+	int ret = 0;
+
+	BUG_ON(inode1 == NULL && inode2 == NULL);
+
+	ret = mext_check_null_inode(inode1, inode2, __func__);
+	if (ret < 0)
+		goto out;
+
 	if (inode1)
 		mutex_unlock(&inode1->i_mutex);
 
 	if (inode2 && inode2 != inode1)
 		mutex_unlock(&inode2->i_mutex);
+
+out:
+	return ret;
 }
 
 /**
@@ -1146,21 +1204,23 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
 	ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0;
 	ext4_lblk_t rest_blocks;
 	pgoff_t orig_page_offset = 0, seq_end_page;
-	int ret, depth, last_extent = 0;
+	int ret1, ret2, depth, last_extent = 0;
 	int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
 	int data_offset_in_page;
 	int block_len_in_page;
 	int uninit;
 
 	/* protect orig and donor against a truncate */
-	mext_inode_double_lock(orig_inode, donor_inode);
+	ret1 = mext_inode_double_lock(orig_inode, donor_inode);
+	if (ret1 < 0)
+		return ret1;
 
 	mext_double_down_read(orig_inode, donor_inode);
 	/* Check the filesystem environment whether move_extent can be done */
-	ret = mext_check_arguments(orig_inode, donor_inode, orig_start,
+	ret1 = mext_check_arguments(orig_inode, donor_inode, orig_start,
 					donor_start, &len, *moved_len);
 	mext_double_up_read(orig_inode, donor_inode);
-	if (ret)
+	if (ret1)
 		goto out2;
 
 	file_end = (i_size_read(orig_inode) - 1) >> orig_inode->i_blkbits;
@@ -1168,19 +1228,19 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
 	if (file_end < block_end)
 		len -= block_end - file_end;
 
-	ret = get_ext_path(orig_inode, block_start, &orig_path);
+	ret1 = get_ext_path(orig_inode, block_start, &orig_path);
 	if (orig_path == NULL)
 		goto out2;
 
 	/* Get path structure to check the hole */
-	ret = get_ext_path(orig_inode, block_start, &holecheck_path);
+	ret1 = get_ext_path(orig_inode, block_start, &holecheck_path);
 	if (holecheck_path == NULL)
 		goto out;
 
 	depth = ext_depth(orig_inode);
 	ext_cur = holecheck_path[depth].p_ext;
 	if (ext_cur == NULL) {
-		ret = -EINVAL;
+		ret1 = -EINVAL;
 		goto out;
 	}
 
@@ -1193,13 +1253,13 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
 		last_extent = mext_next_extent(orig_inode,
 					holecheck_path, &ext_cur);
 		if (last_extent < 0) {
-			ret = last_extent;
+			ret1 = last_extent;
 			goto out;
 		}
 		last_extent = mext_next_extent(orig_inode, orig_path,
 							&ext_dummy);
 		if (last_extent < 0) {
-			ret = last_extent;
+			ret1 = last_extent;
 			goto out;
 		}
 	}
@@ -1209,7 +1269,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
 	if (le32_to_cpu(ext_cur->ee_block) > block_end) {
 		ext4_debug("ext4 move extent: The specified range of file "
 							"may be the hole\n");
-		ret = -EINVAL;
+		ret1 = -EINVAL;
 		goto out;
 	}
 
@@ -1229,7 +1289,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
 		last_extent = mext_next_extent(orig_inode, holecheck_path,
 						&ext_cur);
 		if (last_extent < 0) {
-			ret = last_extent;
+			ret1 = last_extent;
 			break;
 		}
 		add_blocks = ext4_ext_get_actual_len(ext_cur);
@@ -1281,16 +1341,23 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
 		while (orig_page_offset <= seq_end_page) {
 
 			/* Swap original branches with new branches */
-			ret = move_extent_per_page(o_filp, donor_inode,
+			ret1 = move_extent_per_page(o_filp, donor_inode,
 						orig_page_offset,
 						data_offset_in_page,
 						block_len_in_page, uninit);
-			if (ret < 0)
+			if (ret1 < 0)
 				goto out;
 			orig_page_offset++;
 			/* Count how many blocks we have exchanged */
 			*moved_len += block_len_in_page;
-			BUG_ON(*moved_len > len);
+			if (*moved_len > len) {
+				ext4_error(orig_inode->i_sb, __func__,
+					"We replaced blocks too much! "
+					"sum of replaced: %llu requested: %llu",
+					*moved_len, len);
+				ret1 = -EIO;
+				goto out;
+			}
 
 			data_offset_in_page = 0;
 			rest_blocks -= block_len_in_page;
@@ -1303,7 +1370,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
 		/* Decrease buffer counter */
 		if (holecheck_path)
 			ext4_ext_drop_refs(holecheck_path);
-		ret = get_ext_path(orig_inode, seq_start, &holecheck_path);
+		ret1 = get_ext_path(orig_inode, seq_start, &holecheck_path);
 		if (holecheck_path == NULL)
 			break;
 		depth = holecheck_path->p_depth;
@@ -1311,7 +1378,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
 		/* Decrease buffer counter */
 		if (orig_path)
 			ext4_ext_drop_refs(orig_path);
-		ret = get_ext_path(orig_inode, seq_start, &orig_path);
+		ret1 = get_ext_path(orig_inode, seq_start, &orig_path);
 		if (orig_path == NULL)
 			break;
 
@@ -1330,10 +1397,12 @@ out:
 		kfree(holecheck_path);
 	}
 out2:
-	mext_inode_double_unlock(orig_inode, donor_inode);
+	ret2 = mext_inode_double_unlock(orig_inode, donor_inode);
 
-	if (ret)
-		return ret;
+	if (ret1)
+		return ret1;
+	else if (ret2)
+		return ret2;
 
 	return 0;
 }
-- 
cgit v1.2.3-70-g09d2


From 347fa6f1c7cb5df2b38d3c9167cfe242ce0cd1da Mon Sep 17 00:00:00 2001
From: Akira Fujita <a-fujita@rs.jp.nec.com>
Date: Wed, 16 Sep 2009 14:25:07 -0400
Subject: ext4: Add null extent check to ext_get_path

There is the possibility that path structure which is taken
by ext4_ext_find_extent() indicates null extents.
Because during data block exchanging in ext4_move_extents(),
constitution of an extent tree may be changed.
As a solution, the patch adds null extent check
to ext_get_path().

Reported-by: Peng Tao <bergwolf@gmail.com>
Signed-off-by: Akira Fujita <a-fujita@rs.jp.nec.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/move_extent.c | 34 ++++++++++++++++------------------
 1 file changed, 16 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 2258560e9722..1c509d549137 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -39,7 +39,9 @@ get_ext_path(struct inode *inode, ext4_lblk_t lblock,
 	if (IS_ERR(*path)) {
 		ret = PTR_ERR(*path);
 		*path = NULL;
-	}
+	} else if ((*path)[ext_depth(inode)].p_ext == NULL)
+		ret = -ENODATA;
+
 	return ret;
 }
 
@@ -316,7 +318,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
 
 	if (new_flag) {
 		err = get_ext_path(orig_inode, eblock, &orig_path);
-		if (orig_path == NULL)
+		if (err)
 			goto out;
 
 		if (ext4_ext_insert_extent(handle, orig_inode,
@@ -327,7 +329,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
 	if (end_flag) {
 		err = get_ext_path(orig_inode,
 				le32_to_cpu(end_ext->ee_block) - 1, &orig_path);
-		if (orig_path == NULL)
+		if (err)
 			goto out;
 
 		if (ext4_ext_insert_extent(handle, orig_inode,
@@ -673,12 +675,12 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
 
 	/* Get the original extent for the block "orig_off" */
 	err = get_ext_path(orig_inode, orig_off, &orig_path);
-	if (orig_path == NULL)
+	if (err)
 		goto out;
 
 	/* Get the donor extent for the head */
 	err = get_ext_path(donor_inode, donor_off, &donor_path);
-	if (donor_path == NULL)
+	if (err)
 		goto out;
 	depth = ext_depth(orig_inode);
 	oext = orig_path[depth].p_ext;
@@ -733,7 +735,7 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
 		if (orig_path)
 			ext4_ext_drop_refs(orig_path);
 		err = get_ext_path(orig_inode, orig_off, &orig_path);
-		if (orig_path == NULL)
+		if (err)
 			goto out;
 		depth = ext_depth(orig_inode);
 		oext = orig_path[depth].p_ext;
@@ -747,7 +749,7 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
 		if (donor_path)
 			ext4_ext_drop_refs(donor_path);
 		err = get_ext_path(donor_inode, donor_off, &donor_path);
-		if (donor_path == NULL)
+		if (err)
 			goto out;
 		depth = ext_depth(donor_inode);
 		dext = donor_path[depth].p_ext;
@@ -1221,7 +1223,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
 					donor_start, &len, *moved_len);
 	mext_double_up_read(orig_inode, donor_inode);
 	if (ret1)
-		goto out2;
+		goto out;
 
 	file_end = (i_size_read(orig_inode) - 1) >> orig_inode->i_blkbits;
 	block_end = block_start + len - 1;
@@ -1229,20 +1231,16 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
 		len -= block_end - file_end;
 
 	ret1 = get_ext_path(orig_inode, block_start, &orig_path);
-	if (orig_path == NULL)
-		goto out2;
+	if (ret1)
+		goto out;
 
 	/* Get path structure to check the hole */
 	ret1 = get_ext_path(orig_inode, block_start, &holecheck_path);
-	if (holecheck_path == NULL)
+	if (ret1)
 		goto out;
 
 	depth = ext_depth(orig_inode);
 	ext_cur = holecheck_path[depth].p_ext;
-	if (ext_cur == NULL) {
-		ret1 = -EINVAL;
-		goto out;
-	}
 
 	/*
 	 * Get proper extent whose ee_block is beyond block_start
@@ -1371,7 +1369,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
 		if (holecheck_path)
 			ext4_ext_drop_refs(holecheck_path);
 		ret1 = get_ext_path(orig_inode, seq_start, &holecheck_path);
-		if (holecheck_path == NULL)
+		if (ret1)
 			break;
 		depth = holecheck_path->p_depth;
 
@@ -1379,7 +1377,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
 		if (orig_path)
 			ext4_ext_drop_refs(orig_path);
 		ret1 = get_ext_path(orig_inode, seq_start, &orig_path);
-		if (orig_path == NULL)
+		if (ret1)
 			break;
 
 		ext_cur = holecheck_path[depth].p_ext;
@@ -1396,7 +1394,7 @@ out:
 		ext4_ext_drop_refs(holecheck_path);
 		kfree(holecheck_path);
 	}
-out2:
+
 	ret2 = mext_inode_double_unlock(orig_inode, donor_inode);
 
 	if (ret1)
-- 
cgit v1.2.3-70-g09d2


From c40ce3c9ea97425a12d7e44031a98fe50add6fc1 Mon Sep 17 00:00:00 2001
From: Akira Fujita <a-fujita@rs.jp.nec.com>
Date: Wed, 16 Sep 2009 14:25:39 -0400
Subject: ext4: Fix different block exchange issue in EXT4_IOC_MOVE_EXT

If logical block offset of original file which is passed to
EXT4_IOC_MOVE_EXT is different from donor file's,
a calculation error occurs in ext4_calc_swap_extents(),
therefore wrong block is exchanged between original file and donor file.
As a result, we hit ext4_error() in check_block_validity().
To detect the logical offset difference in EXT4_IOC_MOVE_EXT,
add checks to mext_calc_swap_extents() and handle it as error,
since data exchange must be done between the same blocks in EXT4_IOC_MOVE_EXT.

Reported-by: Peng Tao <bergwolf@gmail.com>
Signed-off-by: Akira Fujita <a-fujita@rs.jp.nec.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/move_extent.c | 46 +++++++++++++++++++++++++++++++++++++---------
 1 file changed, 37 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 1c509d549137..1f027b1ec430 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -597,8 +597,10 @@ out:
  * @orig_off:		block offset of original inode
  * @donor_off:		block offset of donor inode
  * @max_count:		the maximun length of extents
+ *
+ * Return 0 on success, or a negative error value on failure.
  */
-static void
+static int
 mext_calc_swap_extents(struct ext4_extent *tmp_dext,
 			      struct ext4_extent *tmp_oext,
 			      ext4_lblk_t orig_off, ext4_lblk_t donor_off,
@@ -607,6 +609,19 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
 	ext4_lblk_t diff, orig_diff;
 	struct ext4_extent dext_old, oext_old;
 
+	BUG_ON(orig_off != donor_off);
+
+	/* original and donor extents have to cover the same block offset */
+	if (orig_off < le32_to_cpu(tmp_oext->ee_block) ||
+	    le32_to_cpu(tmp_oext->ee_block) +
+			ext4_ext_get_actual_len(tmp_oext) - 1 < orig_off)
+		return -ENODATA;
+
+	if (orig_off < le32_to_cpu(tmp_dext->ee_block) ||
+	    le32_to_cpu(tmp_dext->ee_block) +
+			ext4_ext_get_actual_len(tmp_dext) - 1 < orig_off)
+		return -ENODATA;
+
 	dext_old = *tmp_dext;
 	oext_old = *tmp_oext;
 
@@ -634,6 +649,8 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
 
 	copy_extent_status(&oext_old, tmp_dext);
 	copy_extent_status(&dext_old, tmp_oext);
+
+	return 0;
 }
 
 /**
@@ -690,8 +707,10 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
 	dext = donor_path[depth].p_ext;
 	tmp_dext = *dext;
 
-	mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
+	err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
 				      donor_off, count);
+	if (err)
+		goto out;
 
 	/* Loop for the donor extents */
 	while (1) {
@@ -760,9 +779,10 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
 		}
 		tmp_dext = *dext;
 
-		mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
-					      donor_off,
-					      count - replaced_count);
+		err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
+					   donor_off, count - replaced_count);
+		if (err)
+			goto out;
 	}
 
 out:
@@ -1243,11 +1263,15 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
 	ext_cur = holecheck_path[depth].p_ext;
 
 	/*
-	 * Get proper extent whose ee_block is beyond block_start
-	 * if block_start was within the hole.
+	 * Get proper starting location of block replacement if block_start was
+	 * within the hole.
 	 */
 	if (le32_to_cpu(ext_cur->ee_block) +
 		ext4_ext_get_actual_len(ext_cur) - 1 < block_start) {
+		/*
+		 * The hole exists between extents or the tail of
+		 * original file.
+		 */
 		last_extent = mext_next_extent(orig_inode,
 					holecheck_path, &ext_cur);
 		if (last_extent < 0) {
@@ -1260,8 +1284,12 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
 			ret1 = last_extent;
 			goto out;
 		}
-	}
-	seq_start = block_start;
+		seq_start = le32_to_cpu(ext_cur->ee_block);
+	} else if (le32_to_cpu(ext_cur->ee_block) > block_start)
+		/* The hole exists at the beginning of original file. */
+		seq_start = le32_to_cpu(ext_cur->ee_block);
+	else
+		seq_start = block_start;
 
 	/* No blocks within the specified range. */
 	if (le32_to_cpu(ext_cur->ee_block) > block_end) {
-- 
cgit v1.2.3-70-g09d2


From fb0a387dcdcd21aab1b09ee7fd80b7c979bdbbfd Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Wed, 16 Sep 2009 14:45:10 -0400
Subject: ext4: limit block allocations for indirect-block files to < 2^32

Today, the ext4 allocator will happily allocate blocks past
2^32 for indirect-block files, which results in the block
numbers getting truncated, and corruption ensues.

This patch limits such allocations to < 2^32, and adds
BUG_ONs if we do get blocks larger than that.

This should address RH Bug 519471, ext4 bitmap allocator
must limit blocks to < 2^32

* ext4_find_goal() is modified to choose a goal < UINT_MAX,
  so that our starting point is in an acceptable range.

* ext4_xattr_block_set() is modified such that the goal block
  is < UINT_MAX, as above.

* ext4_mb_regular_allocator() is modified so that the group
  search does not continue into groups which are too high

* ext4_mb_use_preallocated() has a check that we don't use
  preallocated space which is too far out

* ext4_alloc_blocks() and ext4_xattr_block_set() add some BUG_ONs

No attempt has been made to limit inode locations to < 2^32,
so we may wind up with blocks far from their inodes.  Doing
this much already will lead to some odd ENOSPC issues when the
"lower 32" gets full, and further restricting inodes could
make that even weirder.

For high inodes, choosing a goal of the original, % UINT_MAX,
may be a bit odd, but then we're in an odd situation anyway,
and I don't know of a better heuristic.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h    |  4 ++++
 fs/ext4/inode.c   | 11 ++++++++++-
 fs/ext4/mballoc.c |  9 +++++++++
 fs/ext4/super.c   |  2 ++
 fs/ext4/xattr.c   | 15 +++++++++++++--
 5 files changed, 38 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 4dc64ed58d26..2a0f75d55fad 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -403,6 +403,9 @@ struct ext4_mount_options {
 #endif
 };
 
+/* Max physical block we can addres w/o extents */
+#define EXT4_MAX_BLOCK_FILE_PHYS	0xFFFFFFFF
+
 /*
  * Structure of an inode on the disk
  */
@@ -857,6 +860,7 @@ struct ext4_sb_info {
 	unsigned long s_gdb_count;	/* Number of group descriptor blocks */
 	unsigned long s_desc_per_block;	/* Number of group descriptors per block */
 	ext4_group_t s_groups_count;	/* Number of groups in the fs */
+	ext4_group_t s_blockfile_groups;/* Groups acceptable for non-extent files */
 	unsigned long s_overhead_last;  /* Last calculated overhead */
 	unsigned long s_blocks_last;    /* Last seen block count */
 	loff_t s_bitmap_maxbytes;	/* max bytes for bitmap files */
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index d04c8428bde2..5a8979259c9a 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -562,15 +562,21 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
  *
  *	Normally this function find the preferred place for block allocation,
  *	returns it.
+ *	Because this is only used for non-extent files, we limit the block nr
+ *	to 32 bits.
  */
 static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
 				   Indirect *partial)
 {
+	ext4_fsblk_t goal;
+
 	/*
 	 * XXX need to get goal block from mballoc's data structures
 	 */
 
-	return ext4_find_near(inode, partial);
+	goal = ext4_find_near(inode, partial);
+	goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
+	return goal;
 }
 
 /**
@@ -651,6 +657,8 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
 		if (*err)
 			goto failed_out;
 
+		BUG_ON(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS);
+
 		target -= count;
 		/* allocate blocks for indirect blocks */
 		while (index < indirect_blks && count) {
@@ -685,6 +693,7 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
 		ar.flags = EXT4_MB_HINT_DATA;
 
 	current_block = ext4_mb_new_blocks(handle, &ar, err);
+	BUG_ON(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS);
 
 	if (*err && (target == blks)) {
 		/*
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index d23056d375b3..e9c61896d605 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1965,6 +1965,10 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 	sb = ac->ac_sb;
 	sbi = EXT4_SB(sb);
 	ngroups = ext4_get_groups_count(sb);
+	/* non-extent files are limited to low blocks/groups */
+	if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL))
+		ngroups = sbi->s_blockfile_groups;
+
 	BUG_ON(ac->ac_status == AC_STATUS_FOUND);
 
 	/* first, try the goal */
@@ -3382,6 +3386,11 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
 			ac->ac_o_ex.fe_logical >= pa->pa_lstart + pa->pa_len)
 			continue;
 
+		/* non-extent files can't have physical blocks past 2^32 */
+		if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL) &&
+			pa->pa_pstart + pa->pa_len > EXT4_MAX_BLOCK_FILE_PHYS)
+			continue;
+
 		/* found preallocated blocks, use them */
 		spin_lock(&pa->pa_lock);
 		if (pa->pa_deleted == 0 && pa->pa_free) {
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index af95dd8ba54b..a6b1ab734728 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2616,6 +2616,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		goto failed_mount;
 	}
 	sbi->s_groups_count = blocks_count;
+	sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count,
+			(EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb)));
 	db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
 		   EXT4_DESC_PER_BLOCK(sb);
 	sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *),
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 62b31c246994..fed5b01d7a8d 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -810,12 +810,23 @@ inserted:
 			get_bh(new_bh);
 		} else {
 			/* We need to allocate a new block */
-			ext4_fsblk_t goal = ext4_group_first_block_no(sb,
+			ext4_fsblk_t goal, block;
+
+			goal = ext4_group_first_block_no(sb,
 						EXT4_I(inode)->i_block_group);
-			ext4_fsblk_t block = ext4_new_meta_blocks(handle, inode,
+
+			/* non-extent files can't have physical blocks past 2^32 */
+			if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+				goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
+
+			block = ext4_new_meta_blocks(handle, inode,
 						  goal, NULL, &error);
 			if (error)
 				goto cleanup;
+
+			if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+				BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS);
+
 			ea_idebug(inode, "creating block %d", block);
 
 			new_bh = sb_getblk(sb, block);
-- 
cgit v1.2.3-70-g09d2


From 1b9c12f44c1eb614fd3b8822bfe8f1f5d8e53737 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Thu, 17 Sep 2009 08:32:22 -0400
Subject: ext4: store EXT4_EXT_MIGRATE in i_state instead of i_flags

EXT4_EXT_MIGRATE is only intended to be used for an in-memory flag,
and the hex value assigned to it collides with FS_DIRECTIO_FL (which
is also stored in i_flags).  There's no reason for the
EXT4_EXT_MIGRATE bit to be stored in i_flags, so we switch it to use
i_state instead.

Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h    |  2 +-
 fs/ext4/inode.c   |  6 ++----
 fs/ext4/migrate.c | 20 ++++++++++----------
 3 files changed, 13 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 2a0f75d55fad..84e7f1d00a83 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -268,7 +268,6 @@ struct flex_groups {
 #define EXT4_TOPDIR_FL			0x00020000 /* Top of directory hierarchies*/
 #define EXT4_HUGE_FILE_FL               0x00040000 /* Set to each huge file */
 #define EXT4_EXTENTS_FL			0x00080000 /* Inode uses extents */
-#define EXT4_EXT_MIGRATE		0x00100000 /* Inode is migrating */
 #define EXT4_RESERVED_FL		0x80000000 /* reserved for ext4 lib */
 
 #define EXT4_FL_USER_VISIBLE		0x000BDFFF /* User visible flags */
@@ -306,6 +305,7 @@ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
 #define EXT4_STATE_XATTR		0x00000004 /* has in-inode xattrs */
 #define EXT4_STATE_NO_EXPAND		0x00000008 /* No space for expansion */
 #define EXT4_STATE_DA_ALLOC_CLOSE	0x00000010 /* Alloc DA blks on close */
+#define EXT4_STATE_EXT_MIGRATE		0x00000020 /* Inode is migrating */
 
 /* Used to pass group descriptor data when online resize is done */
 struct ext4_new_group_input {
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 5a8979259c9a..a5b4ce40cc66 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1255,8 +1255,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
 			 * i_data's format changing.  Force the migrate
 			 * to fail by clearing migrate flags
 			 */
-			EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags &
-							~EXT4_EXT_MIGRATE;
+			EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE;
 		}
 	}
 
@@ -4596,8 +4595,7 @@ static int ext4_do_update_inode(handle_t *handle,
 	if (ext4_inode_blocks_set(handle, raw_inode, ei))
 		goto out_brelse;
 	raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
-	/* clear the migrate flag in the raw_inode */
-	raw_inode->i_flags = cpu_to_le32(ei->i_flags & ~EXT4_EXT_MIGRATE);
+	raw_inode->i_flags = cpu_to_le32(ei->i_flags);
 	if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
 	    cpu_to_le32(EXT4_OS_HURD))
 		raw_inode->i_file_acl_high =
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 05361ad5b80a..bf519f239ae6 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -353,17 +353,16 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
 
 	down_write(&EXT4_I(inode)->i_data_sem);
 	/*
-	 * if EXT4_EXT_MIGRATE is cleared a block allocation
+	 * if EXT4_STATE_EXT_MIGRATE is cleared a block allocation
 	 * happened after we started the migrate. We need to
 	 * fail the migrate
 	 */
-	if (!(EXT4_I(inode)->i_flags & EXT4_EXT_MIGRATE)) {
+	if (!(EXT4_I(inode)->i_state & EXT4_STATE_EXT_MIGRATE)) {
 		retval = -EAGAIN;
 		up_write(&EXT4_I(inode)->i_data_sem);
 		goto err_out;
 	} else
-		EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags &
-							~EXT4_EXT_MIGRATE;
+		EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE;
 	/*
 	 * We have the extent map build with the tmp inode.
 	 * Now copy the i_data across
@@ -517,14 +516,15 @@ int ext4_ext_migrate(struct inode *inode)
 	 * when we add extents we extent the journal
 	 */
 	/*
-	 * Even though we take i_mutex we can still cause block allocation
-	 * via mmap write to holes. If we have allocated new blocks we fail
-	 * migrate.  New block allocation will clear EXT4_EXT_MIGRATE flag.
-	 * The flag is updated with i_data_sem held to prevent racing with
-	 * block allocation.
+	 * Even though we take i_mutex we can still cause block
+	 * allocation via mmap write to holes. If we have allocated
+	 * new blocks we fail migrate.  New block allocation will
+	 * clear EXT4_STATE_EXT_MIGRATE flag.  The flag is updated
+	 * with i_data_sem held to prevent racing with block
+	 * allocation.
 	 */
 	down_read((&EXT4_I(inode)->i_data_sem));
-	EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags | EXT4_EXT_MIGRATE;
+	EXT4_I(inode)->i_state |= EXT4_STATE_EXT_MIGRATE;
 	up_read((&EXT4_I(inode)->i_data_sem));
 
 	handle = ext4_journal_start(inode, 1);
-- 
cgit v1.2.3-70-g09d2


From fb40ba0d98968bc3454731360363d725b4f1064c Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Wed, 16 Sep 2009 19:30:40 -0400
Subject: ext4: Add a tracepoint for ext4_alloc_da_blocks()

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c             |  2 ++
 include/trace/events/ext4.h | 27 +++++++++++++++++++++++++++
 2 files changed, 29 insertions(+)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index a5b4ce40cc66..9887a0c562d5 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3128,6 +3128,8 @@ out:
  */
 int ext4_alloc_da_blocks(struct inode *inode)
 {
+	trace_ext4_alloc_da_blocks(inode);
+
 	if (!EXT4_I(inode)->i_reserved_data_blocks &&
 	    !EXT4_I(inode)->i_reserved_meta_blocks)
 		return 0;
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index 6fe6ce9ee071..c1bd8f1e8b94 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -10,6 +10,9 @@
 struct ext4_allocation_context;
 struct ext4_allocation_request;
 struct ext4_prealloc_space;
+struct ext4_inode_info;
+
+#define EXT4_I(inode) (container_of(inode, struct ext4_inode_info, vfs_inode))
 
 TRACE_EVENT(ext4_free_inode,
 	TP_PROTO(struct inode *inode),
@@ -710,6 +713,30 @@ TRACE_EVENT(ext4_sync_fs,
 		  __entry->wait)
 );
 
+TRACE_EVENT(ext4_alloc_da_blocks,
+	TP_PROTO(struct inode *inode),
+
+	TP_ARGS(inode),
+
+	TP_STRUCT__entry(
+		__field(	dev_t,	dev			)
+		__field(	ino_t,	ino			)
+		__field( unsigned int,	data_blocks	)
+		__field( unsigned int,	meta_blocks	)
+	),
+
+	TP_fast_assign(
+		__entry->dev	= inode->i_sb->s_dev;
+		__entry->ino	= inode->i_ino;
+		__entry->data_blocks = EXT4_I(inode)->i_reserved_data_blocks;
+		__entry->meta_blocks = EXT4_I(inode)->i_reserved_meta_blocks;
+	),
+
+	TP_printk("dev %s ino %lu data_blocks %u meta_blocks %u",
+		  jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
+		  __entry->data_blocks, __entry->meta_blocks)
+);
+
 #endif /* _TRACE_EXT4_H */
 
 /* This part must be outside protection */
-- 
cgit v1.2.3-70-g09d2


From 5534fb5bb35a62a94e0bd1fa2421f7fb6e894f10 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Thu, 17 Sep 2009 09:34:16 -0400
Subject: ext4: Fix the alloc on close after a truncate hueristic

In an attempt to avoid doing an unneeded flush after opening a
(previously non-existent) file with O_CREAT|O_TRUNC, the code only
triggered the hueristic if ei->disksize was non-zero.  Turns out that
the VFS doesn't call ->truncate() if the file doesn't exist, and
ei->disksize is always zero even if the file previously existed.  So
remove the test, since it isn't necessary and in fact disabled the
hueristic.

Thanks to Clemens Eisserer that he was seeing problems with files
written using kwrite and eclipse after sudden crashes caused by a
buggy Intel video driver.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 9887a0c562d5..4abd683b963d 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3973,8 +3973,7 @@ void ext4_truncate(struct inode *inode)
 	if (!ext4_can_truncate(inode))
 		return;
 
-	if (ei->i_disksize && inode->i_size == 0 &&
-	    !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
+	if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
 		ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE;
 
 	if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
-- 
cgit v1.2.3-70-g09d2


From 0a80e9867db154966b2a771042e10452ac110e1e Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Thu, 17 Sep 2009 11:55:58 -0400
Subject: ext4: replace MAX_DEFRAG_SIZE with EXT_MAX_BLOCK

There's no reason to redefine the maximum allowable offset
in an extent-based file just for defrag;
EXT_MAX_BLOCK already does this.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h         |  1 -
 fs/ext4/ext4_extents.h |  1 +
 fs/ext4/move_extent.c  | 12 ++++++------
 3 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 84e7f1d00a83..e227eea23f05 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -476,7 +476,6 @@ struct move_extent {
 	__u64 len;		/* block length to be moved */
 	__u64 moved_len;	/* moved block length */
 };
-#define MAX_DEFRAG_SIZE         ((1UL<<31) - 1)
 
 #define EXT4_EPOCH_BITS 2
 #define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1)
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 9538633879d7..61652f1d15e6 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -137,6 +137,7 @@ typedef int (*ext_prepare_callback)(struct inode *, struct ext4_ext_path *,
 #define EXT_BREAK      1
 #define EXT_REPEAT     2
 
+/* Maximum logical block in a file; ext4_extent's ee_block is __le32 */
 #define EXT_MAX_BLOCK	0xffffffff
 
 /*
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 1f027b1ec430..c07a2915e40b 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -1040,12 +1040,12 @@ mext_check_arguments(struct inode *orig_inode,
 		return -EINVAL;
 	}
 
-	if ((orig_start > MAX_DEFRAG_SIZE) ||
-	    (donor_start > MAX_DEFRAG_SIZE) ||
-	    (*len > MAX_DEFRAG_SIZE) ||
-	    (orig_start + *len > MAX_DEFRAG_SIZE))  {
-		ext4_debug("ext4 move extent: Can't handle over [%lu] blocks "
-			"[ino:orig %lu, donor %lu]\n", MAX_DEFRAG_SIZE,
+	if ((orig_start > EXT_MAX_BLOCK) ||
+	    (donor_start > EXT_MAX_BLOCK) ||
+	    (*len > EXT_MAX_BLOCK) ||
+	    (orig_start + *len > EXT_MAX_BLOCK))  {
+		ext4_debug("ext4 move extent: Can't handle over [%u] blocks "
+			"[ino:orig %lu, donor %lu]\n", EXT_MAX_BLOCK,
 			orig_inode->i_ino, donor_inode->i_ino);
 		return -EINVAL;
 	}
-- 
cgit v1.2.3-70-g09d2