From a0da84f35b25875870270d16b6eccda4884d61a7 Mon Sep 17 00:00:00 2001
From: Neil Brown <neilb@notabene.brown>
Date: Sat, 28 Jun 2008 08:31:22 +1000
Subject: Improve setting of "events_cleared" for write-intent bitmaps.

When an array is degraded, bits in the write-intent bitmap are not
cleared, so that if the missing device is re-added, it can be synced
by only updated those parts of the device that have changed since
it was removed.

The enable this a 'events_cleared' value is stored. It is the event
counter for the array the last time that any bits were cleared.

Sometimes - if a device disappears from an array while it is 'clean' -
the events_cleared value gets updated incorrectly (there are subtle
ordering issues between updateing events in the main metadata and the
bitmap metadata) resulting in the missing device appearing to require
a full resync when it is re-added.

With this patch, we update events_cleared precisely when we are about
to clear a bit in the bitmap.  We record events_cleared when we clear
the bit internally, and copy that to the superblock which is written
out before the bit on storage.  This makes it more "obviously correct".

We also need to update events_cleared when the event_count is going
backwards (as happens on a dirty->clean transition of a non-degraded
array).

Thanks to Mike Snitzer for identifying this problem and testing early
"fixes".

Cc:  "Mike Snitzer" <snitzer@gmail.com>
Signed-off-by: Neil Brown <neilb@suse.de>
---
 include/linux/raid/bitmap.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/raid/bitmap.h b/include/linux/raid/bitmap.h
index 78bfdea24a8e..e98900671ca9 100644
--- a/include/linux/raid/bitmap.h
+++ b/include/linux/raid/bitmap.h
@@ -221,6 +221,7 @@ struct bitmap {
 	unsigned long syncchunk;
 
 	__u64	events_cleared;
+	int need_sync;
 
 	/* bitmap spinlock */
 	spinlock_t lock;
-- 
cgit v1.2.3-70-g09d2


From 5e96ee65c8bd629ce093da67a066d3946468298a Mon Sep 17 00:00:00 2001
From: Neil Brown <neilb@notabene.brown>
Date: Sat, 28 Jun 2008 08:31:24 +1000
Subject: Allow setting start point for requested check/repair

This makes it possible to just resync a small part of an array.
e.g. if a drive reports that it has questionable sectors,
a 'repair' of just the region covering those sectors will
cause them to be read and, if there is an error, re-written
with correct data.

Signed-off-by: Neil Brown <neilb@suse.de>
---
 drivers/md/md.c           | 47 ++++++++++++++++++++++++++++++++++++++++++-----
 include/linux/raid/md_k.h |  2 ++
 2 files changed, 44 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 2580ac1b9b0f..261322722c19 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -278,6 +278,7 @@ static mddev_t * mddev_find(dev_t unit)
 	init_waitqueue_head(&new->sb_wait);
 	init_waitqueue_head(&new->recovery_wait);
 	new->reshape_position = MaxSector;
+	new->resync_min = 0;
 	new->resync_max = MaxSector;
 	new->level = LEVEL_NONE;
 
@@ -3074,6 +3075,36 @@ sync_completed_show(mddev_t *mddev, char *page)
 
 static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed);
 
+static ssize_t
+min_sync_show(mddev_t *mddev, char *page)
+{
+	return sprintf(page, "%llu\n",
+		       (unsigned long long)mddev->resync_min);
+}
+static ssize_t
+min_sync_store(mddev_t *mddev, const char *buf, size_t len)
+{
+	unsigned long long min;
+	if (strict_strtoull(buf, 10, &min))
+		return -EINVAL;
+	if (min > mddev->resync_max)
+		return -EINVAL;
+	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
+		return -EBUSY;
+
+	/* Must be a multiple of chunk_size */
+	if (mddev->chunk_size) {
+		if (min & (sector_t)((mddev->chunk_size>>9)-1))
+			return -EINVAL;
+	}
+	mddev->resync_min = min;
+
+	return len;
+}
+
+static struct md_sysfs_entry md_min_sync =
+__ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store);
+
 static ssize_t
 max_sync_show(mddev_t *mddev, char *page)
 {
@@ -3089,9 +3120,10 @@ max_sync_store(mddev_t *mddev, const char *buf, size_t len)
 	if (strncmp(buf, "max", 3) == 0)
 		mddev->resync_max = MaxSector;
 	else {
-		char *ep;
-		unsigned long long max = simple_strtoull(buf, &ep, 10);
-		if (ep == buf || (*ep != 0 && *ep != '\n'))
+		unsigned long long max;
+		if (strict_strtoull(buf, 10, &max))
+			return -EINVAL;
+		if (max < mddev->resync_min)
 			return -EINVAL;
 		if (max < mddev->resync_max &&
 		    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
@@ -3222,6 +3254,7 @@ static struct attribute *md_redundancy_attrs[] = {
 	&md_sync_speed.attr,
 	&md_sync_force_parallel.attr,
 	&md_sync_completed.attr,
+	&md_min_sync.attr,
 	&md_max_sync.attr,
 	&md_suspend_lo.attr,
 	&md_suspend_hi.attr,
@@ -3777,6 +3810,7 @@ static int do_md_stop(mddev_t * mddev, int mode)
 		mddev->size = 0;
 		mddev->raid_disks = 0;
 		mddev->recovery_cp = 0;
+		mddev->resync_min = 0;
 		mddev->resync_max = MaxSector;
 		mddev->reshape_position = MaxSector;
 		mddev->external = 0;
@@ -5625,9 +5659,11 @@ void md_do_sync(mddev_t *mddev)
 		max_sectors = mddev->resync_max_sectors;
 		mddev->resync_mismatches = 0;
 		/* we don't use the checkpoint if there's a bitmap */
-		if (!mddev->bitmap &&
-		    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
+		if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
+			j = mddev->resync_min;
+		else if (!mddev->bitmap)
 			j = mddev->recovery_cp;
+
 	} else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
 		max_sectors = mddev->size << 1;
 	else {
@@ -5796,6 +5832,7 @@ void md_do_sync(mddev_t *mddev)
 
  skip:
 	mddev->curr_resync = 0;
+	mddev->resync_min = 0;
 	mddev->resync_max = MaxSector;
 	sysfs_notify(&mddev->kobj, NULL, "sync_completed");
 	wake_up(&resync_wait);
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h
index 3dea9f545c8f..780e0613e6d5 100644
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -227,6 +227,8 @@ struct mddev_s
 	atomic_t			recovery_active; /* blocks scheduled, but not written */
 	wait_queue_head_t		recovery_wait;
 	sector_t			recovery_cp;
+	sector_t			resync_min;	/* user requested sync
+							 * starts here */
 	sector_t			resync_max;	/* resync should pause
 							 * when it gets here */
 
-- 
cgit v1.2.3-70-g09d2


From 72a23c211e4587859d5bf61ac4962d76e593fb02 Mon Sep 17 00:00:00 2001
From: Neil Brown <neilb@notabene.brown>
Date: Sat, 28 Jun 2008 08:31:41 +1000
Subject: Make sure all changes to md/sync_action are notified.

When the 'resync' thread starts or stops, when we explicitly
set sync_action, or when we determine that there is definitely nothing
to do, we notify sync_action.

To stop "sync_action" from occasionally showing the wrong value,
we introduce a new flags - MD_RECOVERY_RECOVER - to say that a
recovery is probably needed or happening, and we make sure
that we set MD_RECOVERY_RUNNING before clearing MD_RECOVERY_NEEDED.

Signed-off-by: Neil Brown <neilb@suse.de>
---
 Documentation/md.txt      |  6 ++++++
 drivers/md/md.c           | 34 ++++++++++++++++++++++++++++------
 include/linux/raid/md_k.h |  2 ++
 3 files changed, 36 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/md.txt b/Documentation/md.txt
index dca97ba4944a..c05bfb55659e 100644
--- a/Documentation/md.txt
+++ b/Documentation/md.txt
@@ -386,6 +386,12 @@ also have
 	'check' and 'repair' will start the appropriate process
            providing the current state is 'idle'.
 
+      This file responds to select/poll.  Any important change in the value
+      triggers a poll event.  Sometimes the value will briefly be
+      "recover" if a recovery seems to be needed, but cannot be
+      achieved. In that case, the transition to "recover" isn't
+      notified, but the transition away is.
+
    mismatch_count
       When performing 'check' and 'repair', and possibly when
       performing 'resync', md will count the number of errors that are
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 5b9d4fe4e6e4..c26dcad8a3ac 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -169,7 +169,6 @@ void md_new_event(mddev_t *mddev)
 {
 	atomic_inc(&md_event_count);
 	wake_up(&md_event_waiters);
-	sysfs_notify(&mddev->kobj, NULL, "sync_action");
 }
 EXPORT_SYMBOL_GPL(md_new_event);
 
@@ -2936,7 +2935,7 @@ action_show(mddev_t *mddev, char *page)
 				type = "check";
 			else
 				type = "repair";
-		} else
+		} else if (test_bit(MD_RECOVERY_RECOVER, &mddev->recovery))
 			type = "recover";
 	}
 	return sprintf(page, "%s\n", type);
@@ -2958,9 +2957,12 @@ action_store(mddev_t *mddev, const char *page, size_t len)
 	} else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
 		   test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
 		return -EBUSY;
-	else if (cmd_match(page, "resync") || cmd_match(page, "recover"))
+	else if (cmd_match(page, "resync"))
+		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+	else if (cmd_match(page, "recover")) {
+		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
 		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
-	else if (cmd_match(page, "reshape")) {
+	} else if (cmd_match(page, "reshape")) {
 		int err;
 		if (mddev->pers->start_reshape == NULL)
 			return -EINVAL;
@@ -2977,6 +2979,7 @@ action_store(mddev_t *mddev, const char *page, size_t len)
 	}
 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 	md_wakeup_thread(mddev->thread);
+	sysfs_notify(&mddev->kobj, NULL, "sync_action");
 	return len;
 }
 
@@ -3682,6 +3685,7 @@ static int do_md_run(mddev_t * mddev)
 	mddev->changed = 1;
 	md_new_event(mddev);
 	sysfs_notify(&mddev->kobj, NULL, "array_state");
+	sysfs_notify(&mddev->kobj, NULL, "sync_action");
 	kobject_uevent(&mddev->gendisk->dev.kobj, KOBJ_CHANGE);
 	return 0;
 }
@@ -4252,6 +4256,8 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
 			export_rdev(rdev);
 
 		md_update_sb(mddev, 1);
+		if (mddev->degraded)
+			set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
 		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 		md_wakeup_thread(mddev->thread);
 		return err;
@@ -5105,6 +5111,8 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
 	if (!mddev->pers->error_handler)
 		return;
 	mddev->pers->error_handler(mddev,rdev);
+	if (mddev->degraded)
+		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
 	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 	md_wakeup_thread(mddev->thread);
@@ -6055,13 +6063,18 @@ void md_check_recovery(mddev_t *mddev)
 			mddev->recovery = 0;
 			/* flag recovery needed just to double check */
 			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+			sysfs_notify(&mddev->kobj, NULL, "sync_action");
 			md_new_event(mddev);
 			goto unlock;
 		}
+		/* Set RUNNING before clearing NEEDED to avoid
+		 * any transients in the value of "sync_action".
+		 */
+		set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+		clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 		/* Clear some bits that don't mean anything, but
 		 * might be left set
 		 */
-		clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 		clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
 		clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
 
@@ -6079,17 +6092,19 @@ void md_check_recovery(mddev_t *mddev)
 				/* Cannot proceed */
 				goto unlock;
 			set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
+			clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
 		} else if ((spares = remove_and_add_spares(mddev))) {
 			clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
 			clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
+			set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
 		} else if (mddev->recovery_cp < MaxSector) {
 			set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+			clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
 		} else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
 			/* nothing to be done ... */
 			goto unlock;
 
 		if (mddev->pers->sync_request) {
-			set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
 			if (spares && mddev->bitmap && ! mddev->bitmap->file) {
 				/* We are adding a device or devices to an array
 				 * which has the bitmap stored on all devices.
@@ -6108,9 +6123,16 @@ void md_check_recovery(mddev_t *mddev)
 				mddev->recovery = 0;
 			} else
 				md_wakeup_thread(mddev->sync_thread);
+			sysfs_notify(&mddev->kobj, NULL, "sync_action");
 			md_new_event(mddev);
 		}
 	unlock:
+		if (!mddev->sync_thread) {
+			clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+			if (test_and_clear_bit(MD_RECOVERY_RECOVER,
+					       &mddev->recovery))
+				sysfs_notify(&mddev->kobj, NULL, "sync_action");
+		}
 		mddev_unlock(mddev);
 	}
 }
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h
index 780e0613e6d5..62aa9c9a6ddc 100644
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -188,6 +188,7 @@ struct mddev_s
 	 * NEEDED:   we might need to start a resync/recover
 	 * RUNNING:  a thread is running, or about to be started
 	 * SYNC:     actually doing a resync, not a recovery
+	 * RECOVER:  doing recovery, or need to try it.
 	 * INTR:     resync needs to be aborted for some reason
 	 * DONE:     thread is done and is waiting to be reaped
 	 * REQUEST:  user-space has requested a sync (used with SYNC)
@@ -198,6 +199,7 @@ struct mddev_s
 	 */
 #define	MD_RECOVERY_RUNNING	0
 #define	MD_RECOVERY_SYNC	1
+#define	MD_RECOVERY_RECOVER	2
 #define	MD_RECOVERY_INTR	3
 #define	MD_RECOVERY_DONE	4
 #define	MD_RECOVERY_NEEDED	5
-- 
cgit v1.2.3-70-g09d2


From 526647320e696f434647f38421a6ecf65b859c43 Mon Sep 17 00:00:00 2001
From: Neil Brown <neilb@notabene.brown>
Date: Sat, 28 Jun 2008 08:31:44 +1000
Subject: Make sure all changes to md/dev-XX/state are notified

The important state change happens during an interrupt
in md_error.  So just set a flag there and call sysfs_notify
later in process context.

Signed-off-by: Neil Brown <neilb@suse.de>
---
 Documentation/md.txt      | 10 ++++++++++
 drivers/md/md.c           | 14 +++++++++++++-
 include/linux/raid/md_k.h |  3 +++
 3 files changed, 26 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/Documentation/md.txt b/Documentation/md.txt
index eb6e69e3732e..e06cc59437e4 100644
--- a/Documentation/md.txt
+++ b/Documentation/md.txt
@@ -297,6 +297,10 @@ Each directory contains:
 	      writemostly - device will only be subject to read
 		         requests if there are no other options.
 			 This applies only to raid1 arrays.
+	      blocked  - device has failed, metadata is "external",
+	                 and the failure hasn't been acknowledged yet.
+			 Writes that would write to this device if
+			 it were not faulty are blocked.
 	      spare    - device is working, but not a full member.
 			 This includes spares that are in the process
 			 of being recovered to
@@ -306,6 +310,12 @@ Each directory contains:
 	Writing "remove" removes the device from the array.
 	Writing "writemostly" sets the writemostly flag.
 	Writing "-writemostly" clears the writemostly flag.
+	Writing "blocked" sets the "blocked" flag.
+	Writing "-blocked" clear the "blocked" flag and allows writes
+		to complete.
+
+	This file responds to select/poll. Any change to 'faulty'
+	or 'blocked' causes an event.
 
       errors
 	An approximate count of read errors that have been detected on
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 60d4cad88c20..dc99d95a1b6d 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1886,6 +1886,8 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 
 		err = 0;
 	}
+	if (!err)
+		sysfs_notify(&rdev->kobj, NULL, "state");
 	return err ? err : len;
 }
 static struct rdev_sysfs_entry rdev_state =
@@ -1979,7 +1981,8 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 		if (err) {
 			rdev->raid_disk = -1;
 			return err;
-		}
+		} else
+			sysfs_notify(&rdev->kobj, NULL, "state");
 		sprintf(nm, "rd%d", rdev->raid_disk);
 		if (sysfs_create_link(&rdev->mddev->kobj, &rdev->kobj, nm))
 			printk(KERN_WARNING
@@ -1996,6 +1999,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 		clear_bit(Faulty, &rdev->flags);
 		clear_bit(WriteMostly, &rdev->flags);
 		set_bit(In_sync, &rdev->flags);
+		sysfs_notify(&rdev->kobj, NULL, "state");
 	}
 	return len;
 }
@@ -3525,6 +3529,7 @@ static int do_md_run(mddev_t * mddev)
 				return -EINVAL;
 			}
 		}
+		sysfs_notify(&rdev->kobj, NULL, "state");
 	}
 
 	md_probe(mddev->unit, NULL, NULL);
@@ -4256,6 +4261,8 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
 		}
 		if (err)
 			export_rdev(rdev);
+		else
+			sysfs_notify(&rdev->kobj, NULL, "state");
 
 		md_update_sb(mddev, 1);
 		if (mddev->degraded)
@@ -5115,6 +5122,7 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
 	mddev->pers->error_handler(mddev,rdev);
 	if (mddev->degraded)
 		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
+	set_bit(StateChanged, &rdev->flags);
 	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 	md_wakeup_thread(mddev->thread);
@@ -6037,6 +6045,10 @@ void md_check_recovery(mddev_t *mddev)
 		if (mddev->flags)
 			md_update_sb(mddev, 0);
 
+		rdev_for_each(rdev, rtmp, mddev)
+			if (test_and_clear_bit(StateChanged, &rdev->flags))
+				sysfs_notify(&rdev->kobj, NULL, "state");
+
 
 		if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
 		    !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h
index 62aa9c9a6ddc..df30c4395875 100644
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -87,6 +87,9 @@ struct mdk_rdev_s
 #define Blocked		8		/* An error occured on an externally
 					 * managed array, don't allow writes
 					 * until it is cleared */
+#define StateChanged	9		/* Faulty or Blocked has changed during
+					 * interrupt, so it needs to be
+					 * notified by the thread */
 	wait_queue_head_t blocked_wait;
 
 	int desc_nr;			/* descriptor index in the superblock */
-- 
cgit v1.2.3-70-g09d2


From b203886edbcaac3ca427cf4dbcb50b18bdb346fd Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Sat, 28 Jun 2008 08:31:50 +1000
Subject: md: kill STRIPE_OP_MOD_DMA in raid5 offload

From: Dan Williams <dan.j.williams@intel.com>

This micro-optimization allowed the raid code to skip a re-read of the
parity block after checking parity.  It took advantage of the fact that
xor-offload-engines have their own internal result buffer and can check
parity without writing to memory.  Remove it for the following reasons:

1/ It is a layering violation for MD to need to manage the DMA and
   non-DMA paths within async_xor_zero_sum
2/ Bad precedent to toggle the 'ops' flags outside the lock
3/ Hard to realize a performance gain as reads will not need an updated
   parity block and writes will dirty it anyways.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Neil Brown <neilb@suse.de>
---
 drivers/md/raid5.c         | 10 ----------
 include/linux/raid/raid5.h |  2 --
 2 files changed, 12 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 8c4e6149daea..60e61d2464b5 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -837,15 +837,10 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
 static void ops_complete_check(void *stripe_head_ref)
 {
 	struct stripe_head *sh = stripe_head_ref;
-	int pd_idx = sh->pd_idx;
 
 	pr_debug("%s: stripe %llu\n", __func__,
 		(unsigned long long)sh->sector);
 
-	if (test_and_clear_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending) &&
-		sh->ops.zero_sum_result == 0)
-		set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
-
 	set_bit(STRIPE_OP_CHECK, &sh->ops.complete);
 	set_bit(STRIPE_HANDLE, &sh->state);
 	release_stripe(sh);
@@ -873,11 +868,6 @@ static void ops_run_check(struct stripe_head *sh)
 	tx = async_xor_zero_sum(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
 		&sh->ops.zero_sum_result, 0, NULL, NULL, NULL);
 
-	if (tx)
-		set_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending);
-	else
-		clear_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending);
-
 	atomic_inc(&sh->count);
 	tx = async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx,
 		ops_complete_check, sh);
diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h
index f0827d31ae6f..4ecae31a3dcb 100644
--- a/include/linux/raid/raid5.h
+++ b/include/linux/raid/raid5.h
@@ -267,10 +267,8 @@ struct r6_state {
 
 /* modifiers to the base operations
  * STRIPE_OP_MOD_REPAIR_PD - compute the parity block and write it back
- * STRIPE_OP_MOD_DMA_CHECK - parity is not corrupted by the check
  */
 #define STRIPE_OP_MOD_REPAIR_PD 7
-#define STRIPE_OP_MOD_DMA_CHECK 8
 
 /*
  * Plugging:
-- 
cgit v1.2.3-70-g09d2


From 2b7497f0e0a0b9cf21d822e427d5399b2056501a Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Sat, 28 Jun 2008 08:31:52 +1000
Subject: md: kill STRIPE_OP_IO flag

From: Dan Williams <dan.j.williams@intel.com>

The R5_Want{Read,Write} flags already gate i/o.  So, this flag is
superfluous and we can unconditionally call ops_run_io().

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Neil Brown <neilb@suse.de>
---
 drivers/md/raid5.c         | 32 +++++---------------------------
 include/linux/raid/raid5.h |  1 -
 2 files changed, 5 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 60e61d2464b5..cac97080b278 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -373,8 +373,6 @@ static unsigned long get_stripe_work(struct stripe_head *sh)
 	test_and_ack_op(STRIPE_OP_BIODRAIN, pending);
 	test_and_ack_op(STRIPE_OP_POSTXOR, pending);
 	test_and_ack_op(STRIPE_OP_CHECK, pending);
-	if (test_and_clear_bit(STRIPE_OP_IO, &sh->ops.pending))
-		ack++;
 
 	sh->ops.count -= ack;
 	if (unlikely(sh->ops.count < 0)) {
@@ -399,7 +397,6 @@ static void ops_run_io(struct stripe_head *sh)
 
 	might_sleep();
 
-	set_bit(STRIPE_IO_STARTED, &sh->state);
 	for (i = disks; i--; ) {
 		int rw;
 		struct bio *bi;
@@ -433,6 +430,8 @@ static void ops_run_io(struct stripe_head *sh)
 				test_bit(STRIPE_EXPAND_READY, &sh->state))
 				md_sync_acct(rdev->bdev, STRIPE_SECTORS);
 
+			set_bit(STRIPE_IO_STARTED, &sh->state);
+
 			bi->bi_bdev = rdev->bdev;
 			pr_debug("%s: for %llu schedule op %ld on disc %d\n",
 				__func__, (unsigned long long)sh->sector,
@@ -900,9 +899,6 @@ static void raid5_run_ops(struct stripe_head *sh, unsigned long pending)
 	if (test_bit(STRIPE_OP_CHECK, &pending))
 		ops_run_check(sh);
 
-	if (test_bit(STRIPE_OP_IO, &pending))
-		ops_run_io(sh);
-
 	if (overlap_clear)
 		for (i = disks; i--; ) {
 			struct r5dev *dev = &sh->dev[i];
@@ -2013,8 +2009,6 @@ static int __handle_issuing_new_read_requests5(struct stripe_head *sh,
 			 */
 			set_bit(R5_LOCKED, &dev->flags);
 			set_bit(R5_Wantread, &dev->flags);
-			if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
-				sh->ops.count++;
 			s->locked++;
 			pr_debug("Reading block %d (sync=%d)\n", disk_idx,
 				s->syncing);
@@ -2208,9 +2202,6 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf,
 						"%d for r-m-w\n", i);
 					set_bit(R5_LOCKED, &dev->flags);
 					set_bit(R5_Wantread, &dev->flags);
-					if (!test_and_set_bit(
-						STRIPE_OP_IO, &sh->ops.pending))
-						sh->ops.count++;
 					s->locked++;
 				} else {
 					set_bit(STRIPE_DELAYED, &sh->state);
@@ -2234,9 +2225,6 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf,
 						"%d for Reconstruct\n", i);
 					set_bit(R5_LOCKED, &dev->flags);
 					set_bit(R5_Wantread, &dev->flags);
-					if (!test_and_set_bit(
-						STRIPE_OP_IO, &sh->ops.pending))
-						sh->ops.count++;
 					s->locked++;
 				} else {
 					set_bit(STRIPE_DELAYED, &sh->state);
@@ -2444,8 +2432,6 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
 
 		set_bit(R5_LOCKED, &dev->flags);
 		set_bit(R5_Wantwrite, &dev->flags);
-		if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
-			sh->ops.count++;
 
 		clear_bit(STRIPE_DEGRADED, &sh->state);
 		s->locked++;
@@ -2801,9 +2787,6 @@ static void handle_stripe5(struct stripe_head *sh)
 				(i == sh->pd_idx || dev->written)) {
 				pr_debug("Writing block %d\n", i);
 				set_bit(R5_Wantwrite, &dev->flags);
-				if (!test_and_set_bit(
-				    STRIPE_OP_IO, &sh->ops.pending))
-					sh->ops.count++;
 				if (prexor)
 					continue;
 				if (!test_bit(R5_Insync, &dev->flags) ||
@@ -2857,16 +2840,12 @@ static void handle_stripe5(struct stripe_head *sh)
 		dev = &sh->dev[s.failed_num];
 		if (!test_bit(R5_ReWrite, &dev->flags)) {
 			set_bit(R5_Wantwrite, &dev->flags);
-			if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
-				sh->ops.count++;
 			set_bit(R5_ReWrite, &dev->flags);
 			set_bit(R5_LOCKED, &dev->flags);
 			s.locked++;
 		} else {
 			/* let's read it back */
 			set_bit(R5_Wantread, &dev->flags);
-			if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
-				sh->ops.count++;
 			set_bit(R5_LOCKED, &dev->flags);
 			s.locked++;
 		}
@@ -2884,13 +2863,10 @@ static void handle_stripe5(struct stripe_head *sh)
 		clear_bit(STRIPE_OP_POSTXOR, &sh->ops.ack);
 		clear_bit(STRIPE_OP_POSTXOR, &sh->ops.complete);
 
-		for (i = conf->raid_disks; i--; ) {
+		for (i = conf->raid_disks; i--; )
 			set_bit(R5_Wantwrite, &sh->dev[i].flags);
 			set_bit(R5_LOCKED, &dev->flags);
 			s.locked++;
-			if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
-				sh->ops.count++;
-		}
 	}
 
 	if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
@@ -2926,6 +2902,8 @@ static void handle_stripe5(struct stripe_head *sh)
 	if (pending)
 		raid5_run_ops(sh, pending);
 
+	ops_run_io(sh);
+
 	return_io(return_bi);
 
 }
diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h
index 4ecae31a3dcb..1301195abf4b 100644
--- a/include/linux/raid/raid5.h
+++ b/include/linux/raid/raid5.h
@@ -263,7 +263,6 @@ struct r6_state {
 #define STRIPE_OP_BIODRAIN	3
 #define STRIPE_OP_POSTXOR	4
 #define STRIPE_OP_CHECK	5
-#define STRIPE_OP_IO		6
 
 /* modifiers to the base operations
  * STRIPE_OP_MOD_REPAIR_PD - compute the parity block and write it back
-- 
cgit v1.2.3-70-g09d2


From ecc65c9b3f9b9d740a5deade3d85b39be56401b6 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Sat, 28 Jun 2008 08:31:57 +1000
Subject: md: replace STRIPE_OP_CHECK with 'check_states'

From: Dan Williams <dan.j.williams@intel.com>

The STRIPE_OP_* flags record the state of stripe operations which are
performed outside the stripe lock.  Their use in indicating which
operations need to be run is straightforward; however, interpolating what
the next state of the stripe should be based on a given combination of
these flags is not straightforward, and has led to bugs.  An easier to read
implementation with minimal degrees of freedom is needed.

Towards this goal, this patch introduces explicit states to replace what was
previously interpolated from the STRIPE_OP_* flags.  For now this only converts
the handle_parity_checks5 path, removing a user of the
ops.{pending,ack,complete,count} fields of struct stripe_operations.

This conversion also found a remaining issue with the current code.  There is
a small window for a drive to fail between when we schedule a repair and when
the parity calculation for that repair completes.  When this happens we will
writeback to 'failed_num' when we really want to write back to 'pd_idx'.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Neil Brown <neilb@suse.de>
---
 drivers/md/raid5.c         | 172 ++++++++++++++++++++++-----------------------
 include/linux/raid/raid5.h |  46 ++++++++++--
 2 files changed, 123 insertions(+), 95 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 6f3dd12dd3a4..544e1600f208 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -605,7 +605,11 @@ static void ops_complete_compute5(void *stripe_head_ref)
 	set_bit(R5_UPTODATE, &tgt->flags);
 	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
 	clear_bit(R5_Wantcompute, &tgt->flags);
-	set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete);
+	clear_bit(STRIPE_COMPUTE_RUN, &sh->state);
+	if (sh->check_state == check_state_compute_run)
+		sh->check_state = check_state_compute_result;
+	else
+		set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete);
 	set_bit(STRIPE_HANDLE, &sh->state);
 	release_stripe(sh);
 }
@@ -838,7 +842,7 @@ static void ops_complete_check(void *stripe_head_ref)
 	pr_debug("%s: stripe %llu\n", __func__,
 		(unsigned long long)sh->sector);
 
-	set_bit(STRIPE_OP_CHECK, &sh->ops.complete);
+	sh->check_state = check_state_check_result;
 	set_bit(STRIPE_HANDLE, &sh->state);
 	release_stripe(sh);
 }
@@ -870,7 +874,8 @@ static void ops_run_check(struct stripe_head *sh)
 		ops_complete_check, sh);
 }
 
-static void raid5_run_ops(struct stripe_head *sh, unsigned long pending)
+static void raid5_run_ops(struct stripe_head *sh, unsigned long pending,
+			  unsigned long ops_request)
 {
 	int overlap_clear = 0, i, disks = sh->disks;
 	struct dma_async_tx_descriptor *tx = NULL;
@@ -880,7 +885,8 @@ static void raid5_run_ops(struct stripe_head *sh, unsigned long pending)
 		overlap_clear++;
 	}
 
-	if (test_bit(STRIPE_OP_COMPUTE_BLK, &pending))
+	if (test_bit(STRIPE_OP_COMPUTE_BLK, &pending) ||
+	    test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request))
 		tx = ops_run_compute5(sh, pending);
 
 	if (test_bit(STRIPE_OP_PREXOR, &pending))
@@ -894,7 +900,7 @@ static void raid5_run_ops(struct stripe_head *sh, unsigned long pending)
 	if (test_bit(STRIPE_OP_POSTXOR, &pending))
 		ops_run_postxor(sh, tx, pending);
 
-	if (test_bit(STRIPE_OP_CHECK, &pending))
+	if (test_bit(STRIPE_OP_CHECK, &ops_request))
 		ops_run_check(sh);
 
 	if (overlap_clear)
@@ -1961,8 +1967,7 @@ static int __handle_issuing_new_read_requests5(struct stripe_head *sh,
 	/* don't schedule compute operations or reads on the parity block while
 	 * a check is in flight
 	 */
-	if ((disk_idx == sh->pd_idx) &&
-	     test_bit(STRIPE_OP_CHECK, &sh->ops.pending))
+	if (disk_idx == sh->pd_idx && sh->check_state)
 		return ~0;
 
 	/* is the data in this block needed, and can we get it? */
@@ -1983,9 +1988,8 @@ static int __handle_issuing_new_read_requests5(struct stripe_head *sh,
 		 * 3/ We hold off parity block re-reads until check operations
 		 * have quiesced.
 		 */
-		if ((s->uptodate == disks - 1) &&
-		    (s->failed && disk_idx == s->failed_num) &&
-		    !test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) {
+		if ((s->uptodate == disks - 1) && !sh->check_state &&
+		    (s->failed && disk_idx == s->failed_num)) {
 			set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending);
 			set_bit(R5_Wantcompute, &dev->flags);
 			sh->ops.target = disk_idx;
@@ -2021,12 +2025,8 @@ static void handle_issuing_new_read_requests5(struct stripe_head *sh,
 {
 	int i;
 
-	/* Clear completed compute operations.  Parity recovery
-	 * (STRIPE_OP_MOD_REPAIR_PD) implies a write-back which is handled
-	 * later on in this routine
-	 */
-	if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) &&
-		!test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) {
+	/* Clear completed compute operations */
+	if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete)) {
 		clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete);
 		clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack);
 		clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending);
@@ -2350,90 +2350,85 @@ static void handle_issuing_new_write_requests6(raid5_conf_t *conf,
 static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
 				struct stripe_head_state *s, int disks)
 {
-	int canceled_check = 0;
+	struct r5dev *dev = NULL;
 
 	set_bit(STRIPE_HANDLE, &sh->state);
 
-	/* complete a check operation */
-	if (test_and_clear_bit(STRIPE_OP_CHECK, &sh->ops.complete)) {
-		clear_bit(STRIPE_OP_CHECK, &sh->ops.ack);
-		clear_bit(STRIPE_OP_CHECK, &sh->ops.pending);
+	switch (sh->check_state) {
+	case check_state_idle:
+		/* start a new check operation if there are no failures */
 		if (s->failed == 0) {
-			if (sh->ops.zero_sum_result == 0)
-				/* parity is correct (on disc,
-				 * not in buffer any more)
-				 */
-				set_bit(STRIPE_INSYNC, &sh->state);
-			else {
-				conf->mddev->resync_mismatches +=
-					STRIPE_SECTORS;
-				if (test_bit(
-				     MD_RECOVERY_CHECK, &conf->mddev->recovery))
-					/* don't try to repair!! */
-					set_bit(STRIPE_INSYNC, &sh->state);
-				else {
-					set_bit(STRIPE_OP_COMPUTE_BLK,
-						&sh->ops.pending);
-					set_bit(STRIPE_OP_MOD_REPAIR_PD,
-						&sh->ops.pending);
-					set_bit(R5_Wantcompute,
-						&sh->dev[sh->pd_idx].flags);
-					sh->ops.target = sh->pd_idx;
-					sh->ops.count++;
-					s->uptodate++;
-				}
-			}
-		} else
-			canceled_check = 1; /* STRIPE_INSYNC is not set */
-	}
-
-	/* start a new check operation if there are no failures, the stripe is
-	 * not insync, and a repair is not in flight
-	 */
-	if (s->failed == 0 &&
-	    !test_bit(STRIPE_INSYNC, &sh->state) &&
-	    !test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) {
-		if (!test_and_set_bit(STRIPE_OP_CHECK, &sh->ops.pending)) {
 			BUG_ON(s->uptodate != disks);
+			sh->check_state = check_state_run;
+			set_bit(STRIPE_OP_CHECK, &s->ops_request);
 			clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
-			sh->ops.count++;
 			s->uptodate--;
+			break;
 		}
-	}
-
-	/* check if we can clear a parity disk reconstruct */
-	if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) &&
-	    test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) {
-
-		clear_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending);
-		clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete);
-		clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack);
-		clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending);
-	}
-
+		dev = &sh->dev[s->failed_num];
+		/* fall through */
+	case check_state_compute_result:
+		sh->check_state = check_state_idle;
+		if (!dev)
+			dev = &sh->dev[sh->pd_idx];
+
+		/* check that a write has not made the stripe insync */
+		if (test_bit(STRIPE_INSYNC, &sh->state))
+			break;
 
-	/* Wait for check parity and compute block operations to complete
-	 * before write-back.  If a failure occurred while the check operation
-	 * was in flight we need to cycle this stripe through handle_stripe
-	 * since the parity block may not be uptodate
-	 */
-	if (!canceled_check && !test_bit(STRIPE_INSYNC, &sh->state) &&
-	    !test_bit(STRIPE_OP_CHECK, &sh->ops.pending) &&
-	    !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) {
-		struct r5dev *dev;
 		/* either failed parity check, or recovery is happening */
-		if (s->failed == 0)
-			s->failed_num = sh->pd_idx;
-		dev = &sh->dev[s->failed_num];
 		BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
 		BUG_ON(s->uptodate != disks);
 
 		set_bit(R5_LOCKED, &dev->flags);
+		s->locked++;
 		set_bit(R5_Wantwrite, &dev->flags);
 
 		clear_bit(STRIPE_DEGRADED, &sh->state);
-		s->locked++;
 		set_bit(STRIPE_INSYNC, &sh->state);
+		break;
+	case check_state_run:
+		break; /* we will be called again upon completion */
+	case check_state_check_result:
+		sh->check_state = check_state_idle;
+
+		/* if a failure occurred during the check operation, leave
+		 * STRIPE_INSYNC not set and let the stripe be handled again
+		 */
+		if (s->failed)
+			break;
+
+		/* handle a successful check operation, if parity is correct
+		 * we are done.  Otherwise update the mismatch count and repair
+		 * parity if !MD_RECOVERY_CHECK
+		 */
+		if (sh->ops.zero_sum_result == 0)
+			/* parity is correct (on disc,
+			 * not in buffer any more)
+			 */
+			set_bit(STRIPE_INSYNC, &sh->state);
+		else {
+			conf->mddev->resync_mismatches += STRIPE_SECTORS;
+			if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
+				/* don't try to repair!! */
+				set_bit(STRIPE_INSYNC, &sh->state);
+			else {
+				sh->check_state = check_state_compute_run;
+				set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
+				set_bit(R5_Wantcompute,
+					&sh->dev[sh->pd_idx].flags);
+				sh->ops.target = sh->pd_idx;
+				s->uptodate++;
+			}
+		}
+		break;
+	case check_state_compute_run:
+		break;
+	default:
+		printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n",
+		       __func__, sh->check_state,
+		       (unsigned long long) sh->sector);
+		BUG();
 	}
 }
 
@@ -2807,7 +2802,7 @@ static void handle_stripe5(struct stripe_head *sh)
 	 *    block.
 	 */
 	if (s.to_write && !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending) &&
-			  !test_bit(STRIPE_OP_CHECK, &sh->ops.pending))
+	    !sh->check_state)
 		handle_issuing_new_write_requests5(conf, sh, &s, disks);
 
 	/* maybe we need to check and possibly fix the parity for this stripe
@@ -2815,11 +2810,10 @@ static void handle_stripe5(struct stripe_head *sh)
 	 * data is available.  The parity check is held off while parity
 	 * dependent operations are in flight.
 	 */
-	if ((s.syncing && s.locked == 0 &&
+	if (sh->check_state ||
+	    (s.syncing && s.locked == 0 &&
 	     !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) &&
-	     !test_bit(STRIPE_INSYNC, &sh->state)) ||
-	      test_bit(STRIPE_OP_CHECK, &sh->ops.pending) ||
-	      test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending))
+	     !test_bit(STRIPE_INSYNC, &sh->state)))
 		handle_parity_checks5(conf, sh, &s, disks);
 
 	if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
@@ -2897,8 +2891,8 @@ static void handle_stripe5(struct stripe_head *sh)
 	if (unlikely(blocked_rdev))
 		md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);
 
-	if (pending)
-		raid5_run_ops(sh, pending);
+	if (pending || s.ops_request)
+		raid5_run_ops(sh, pending, s.ops_request);
 
 	ops_run_io(sh, &s);
 
diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h
index 1301195abf4b..2c96d5fd54bf 100644
--- a/include/linux/raid/raid5.h
+++ b/include/linux/raid/raid5.h
@@ -158,6 +158,41 @@
  *    the compute block completes.
  */
 
+/*
+ * Operations state - intermediate states that are visible outside of sh->lock
+ * In general _idle indicates nothing is running, _run indicates a data
+ * processing operation is active, and _result means the data processing result
+ * is stable and can be acted upon.  For simple operations like biofill and
+ * compute that only have an _idle and _run state they are indicated with
+ * sh->state flags (STRIPE_BIOFILL_RUN and STRIPE_COMPUTE_RUN)
+ */
+/**
+ * enum check_states - handles syncing / repairing a stripe
+ * @check_state_idle - check operations are quiesced
+ * @check_state_run - check operation is running
+ * @check_state_result - set outside lock when check result is valid
+ * @check_state_compute_run - check failed and we are repairing
+ * @check_state_compute_result - set outside lock when compute result is valid
+ */
+enum check_states {
+	check_state_idle = 0,
+	check_state_run, /* parity check */
+	check_state_check_result,
+	check_state_compute_run, /* parity repair */
+	check_state_compute_result,
+};
+
+/**
+ * enum reconstruct_states - handles writing or expanding a stripe
+ */
+enum reconstruct_states {
+	reconstruct_state_idle = 0,
+	reconstruct_state_drain_run,		/* write */
+	reconstruct_state_run,			/* expand */
+	reconstruct_state_drain_result,
+	reconstruct_state_result,
+};
+
 struct stripe_head {
 	struct hlist_node	hash;
 	struct list_head	lru;			/* inactive_list or handle_list */
@@ -169,6 +204,7 @@ struct stripe_head {
 	spinlock_t		lock;
 	int			bm_seq;	/* sequence number for bitmap flushes */
 	int			disks;			/* disks in stripe */
+	enum check_states	check_state;
 	/* stripe_operations
 	 * @pending - pending ops flags (set for request->issue->complete)
 	 * @ack - submitted ops flags (set for issue->complete)
@@ -202,6 +238,7 @@ struct stripe_head_state {
 	int locked, uptodate, to_read, to_write, failed, written;
 	int to_fill, compute, req_compute, non_overwrite;
 	int failed_num;
+	unsigned long ops_request;
 };
 
 /* r6_state - extra state data only relevant to r6 */
@@ -254,8 +291,10 @@ struct r6_state {
 #define	STRIPE_EXPAND_READY	11
 #define	STRIPE_IO_STARTED	12 /* do not count towards 'bypass_count' */
 #define	STRIPE_FULL_WRITE	13 /* all blocks are set to be overwritten */
+#define	STRIPE_BIOFILL_RUN	14
+#define	STRIPE_COMPUTE_RUN	15
 /*
- * Operations flags (in issue order)
+ * Operation request flags
  */
 #define STRIPE_OP_BIOFILL	0
 #define STRIPE_OP_COMPUTE_BLK	1
@@ -264,11 +303,6 @@ struct r6_state {
 #define STRIPE_OP_POSTXOR	4
 #define STRIPE_OP_CHECK	5
 
-/* modifiers to the base operations
- * STRIPE_OP_MOD_REPAIR_PD - compute the parity block and write it back
- */
-#define STRIPE_OP_MOD_REPAIR_PD 7
-
 /*
  * Plugging:
  *
-- 
cgit v1.2.3-70-g09d2


From 600aa10993012ff2dd5617720dac081e4f992017 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Sat, 28 Jun 2008 08:32:05 +1000
Subject: md: replace STRIPE_OP_{BIODRAIN,PREXOR,POSTXOR} with
 'reconstruct_states'

From: Dan Williams <dan.j.williams@intel.com>

Track the state of reconstruct operations (recalculating the parity block
usually due to incoming writes, or as part of array expansion)  Reduces the
scope of the STRIPE_OP_{BIODRAIN,PREXOR,POSTXOR} flags to only tracking whether
a reconstruct operation has been requested via the ops_request field of struct
stripe_head_state.

This is the final step in the removal of ops.{pending,ack,complete,count}, i.e.
the STRIPE_OP_{BIODRAIN,PREXOR,POSTXOR} flags only request an operation and do
not track the state of the operation.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Neil Brown <neilb@suse.de>
---
 drivers/md/raid5.c         | 204 ++++++++++++++-------------------------------
 include/linux/raid/raid5.h |   9 +-
 2 files changed, 63 insertions(+), 150 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 835046bf384e..b9159367491a 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -122,6 +122,13 @@ static void return_io(struct bio *return_bi)
 
 static void print_raid5_conf (raid5_conf_t *conf);
 
+static int stripe_operations_active(struct stripe_head *sh)
+{
+	return sh->check_state || sh->reconstruct_state ||
+	       test_bit(STRIPE_BIOFILL_RUN, &sh->state) ||
+	       test_bit(STRIPE_COMPUTE_RUN, &sh->state);
+}
+
 static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
 {
 	if (atomic_dec_and_test(&sh->count)) {
@@ -141,7 +148,7 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
 			}
 			md_wakeup_thread(conf->mddev->thread);
 		} else {
-			BUG_ON(sh->ops.pending);
+			BUG_ON(stripe_operations_active(sh));
 			if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
 				atomic_dec(&conf->preread_active_stripes);
 				if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
@@ -243,7 +250,7 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int
 
 	BUG_ON(atomic_read(&sh->count) != 0);
 	BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
-	BUG_ON(sh->ops.pending || sh->ops.ack || sh->ops.complete);
+	BUG_ON(stripe_operations_active(sh));
 
 	CHECK_DEVLOCK();
 	pr_debug("init_stripe called, stripe %llu\n",
@@ -344,47 +351,6 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
 	return sh;
 }
 
-/* test_and_ack_op() ensures that we only dequeue an operation once */
-#define test_and_ack_op(op, pend) \
-do {							\
-	if (test_bit(op, &sh->ops.pending) &&		\
-		!test_bit(op, &sh->ops.complete)) {	\
-		if (test_and_set_bit(op, &sh->ops.ack)) \
-			clear_bit(op, &pend);		\
-		else					\
-			ack++;				\
-	} else						\
-		clear_bit(op, &pend);			\
-} while (0)
-
-/* find new work to run, do not resubmit work that is already
- * in flight
- */
-static unsigned long get_stripe_work(struct stripe_head *sh)
-{
-	unsigned long pending;
-	int ack = 0;
-
-	pending = sh->ops.pending;
-
-	test_and_ack_op(STRIPE_OP_BIOFILL, pending);
-	test_and_ack_op(STRIPE_OP_COMPUTE_BLK, pending);
-	test_and_ack_op(STRIPE_OP_PREXOR, pending);
-	test_and_ack_op(STRIPE_OP_BIODRAIN, pending);
-	test_and_ack_op(STRIPE_OP_POSTXOR, pending);
-	test_and_ack_op(STRIPE_OP_CHECK, pending);
-
-	sh->ops.count -= ack;
-	if (unlikely(sh->ops.count < 0)) {
-		printk(KERN_ERR "pending: %#lx ops.pending: %#lx ops.ack: %#lx "
-			"ops.complete: %#lx\n", pending, sh->ops.pending,
-			sh->ops.ack, sh->ops.complete);
-		BUG();
-	}
-
-	return pending;
-}
-
 static void
 raid5_end_read_request(struct bio *bi, int error);
 static void
@@ -609,7 +575,7 @@ static void ops_complete_compute5(void *stripe_head_ref)
 }
 
 static struct dma_async_tx_descriptor *
-ops_run_compute5(struct stripe_head *sh, unsigned long pending)
+ops_run_compute5(struct stripe_head *sh, unsigned long ops_request)
 {
 	/* kernel stack size limits the total number of disks */
 	int disks = sh->disks;
@@ -640,7 +606,7 @@ ops_run_compute5(struct stripe_head *sh, unsigned long pending)
 			ops_complete_compute5, sh);
 
 	/* ack now if postxor is not set to be run */
-	if (tx && !test_bit(STRIPE_OP_POSTXOR, &pending))
+	if (tx && !test_bit(STRIPE_OP_POSTXOR, &ops_request))
 		async_tx_ack(tx);
 
 	return tx;
@@ -652,8 +618,6 @@ static void ops_complete_prexor(void *stripe_head_ref)
 
 	pr_debug("%s: stripe %llu\n", __func__,
 		(unsigned long long)sh->sector);
-
-	set_bit(STRIPE_OP_PREXOR, &sh->ops.complete);
 }
 
 static struct dma_async_tx_descriptor *
@@ -686,7 +650,7 @@ ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 
 static struct dma_async_tx_descriptor *
 ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
-		 unsigned long pending)
+		 unsigned long ops_request)
 {
 	int disks = sh->disks;
 	int pd_idx = sh->pd_idx, i;
@@ -694,7 +658,7 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
 	/* check if prexor is active which means only process blocks
 	 * that are part of a read-modify-write (Wantprexor)
 	 */
-	int prexor = test_bit(STRIPE_OP_PREXOR, &pending);
+	int prexor = test_bit(STRIPE_OP_PREXOR, &ops_request);
 
 	pr_debug("%s: stripe %llu\n", __func__,
 		(unsigned long long)sh->sector);
@@ -744,7 +708,7 @@ static void ops_complete_postxor(void *stripe_head_ref)
 	pr_debug("%s: stripe %llu\n", __func__,
 		(unsigned long long)sh->sector);
 
-	set_bit(STRIPE_OP_POSTXOR, &sh->ops.complete);
+	sh->reconstruct_state = reconstruct_state_result;
 	set_bit(STRIPE_HANDLE, &sh->state);
 	release_stripe(sh);
 }
@@ -763,16 +727,14 @@ static void ops_complete_write(void *stripe_head_ref)
 			set_bit(R5_UPTODATE, &dev->flags);
 	}
 
-	set_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete);
-	set_bit(STRIPE_OP_POSTXOR, &sh->ops.complete);
-
+	sh->reconstruct_state = reconstruct_state_drain_result;
 	set_bit(STRIPE_HANDLE, &sh->state);
 	release_stripe(sh);
 }
 
 static void
 ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
-		unsigned long pending)
+		unsigned long ops_request)
 {
 	/* kernel stack size limits the total number of disks */
 	int disks = sh->disks;
@@ -780,7 +742,7 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
 
 	int count = 0, pd_idx = sh->pd_idx, i;
 	struct page *xor_dest;
-	int prexor = test_bit(STRIPE_OP_PREXOR, &pending);
+	int prexor = test_bit(STRIPE_OP_PREXOR, &ops_request);
 	unsigned long flags;
 	dma_async_tx_callback callback;
 
@@ -807,7 +769,7 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
 	}
 
 	/* check whether this postxor is part of a write */
-	callback = test_bit(STRIPE_OP_BIODRAIN, &pending) ?
+	callback = test_bit(STRIPE_OP_BIODRAIN, &ops_request) ?
 		ops_complete_write : ops_complete_postxor;
 
 	/* 1/ if we prexor'd then the dest is reused as a source
@@ -868,8 +830,7 @@ static void ops_run_check(struct stripe_head *sh)
 		ops_complete_check, sh);
 }
 
-static void raid5_run_ops(struct stripe_head *sh, unsigned long pending,
-			  unsigned long ops_request)
+static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request)
 {
 	int overlap_clear = 0, i, disks = sh->disks;
 	struct dma_async_tx_descriptor *tx = NULL;
@@ -880,18 +841,18 @@ static void raid5_run_ops(struct stripe_head *sh, unsigned long pending,
 	}
 
 	if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request))
-		tx = ops_run_compute5(sh, pending);
+		tx = ops_run_compute5(sh, ops_request);
 
-	if (test_bit(STRIPE_OP_PREXOR, &pending))
+	if (test_bit(STRIPE_OP_PREXOR, &ops_request))
 		tx = ops_run_prexor(sh, tx);
 
-	if (test_bit(STRIPE_OP_BIODRAIN, &pending)) {
-		tx = ops_run_biodrain(sh, tx, pending);
+	if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
+		tx = ops_run_biodrain(sh, tx, ops_request);
 		overlap_clear++;
 	}
 
-	if (test_bit(STRIPE_OP_POSTXOR, &pending))
-		ops_run_postxor(sh, tx, pending);
+	if (test_bit(STRIPE_OP_POSTXOR, &ops_request))
+		ops_run_postxor(sh, tx, ops_request);
 
 	if (test_bit(STRIPE_OP_CHECK, &ops_request))
 		ops_run_check(sh);
@@ -1684,11 +1645,11 @@ static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2)
 	}
 }
 
-static int
-handle_write_operations5(struct stripe_head *sh, int rcw, int expand)
+static void
+handle_write_operations5(struct stripe_head *sh, struct stripe_head_state *s,
+			 int rcw, int expand)
 {
 	int i, pd_idx = sh->pd_idx, disks = sh->disks;
-	int locked = 0;
 
 	if (rcw) {
 		/* if we are not expanding this is a proper write request, and
@@ -1696,12 +1657,12 @@ handle_write_operations5(struct stripe_head *sh, int rcw, int expand)
 		 * stripe cache
 		 */
 		if (!expand) {
-			set_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending);
-			sh->ops.count++;
-		}
+			sh->reconstruct_state = reconstruct_state_drain_run;
+			set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
+		} else
+			sh->reconstruct_state = reconstruct_state_run;
 
-		set_bit(STRIPE_OP_POSTXOR, &sh->ops.pending);
-		sh->ops.count++;
+		set_bit(STRIPE_OP_POSTXOR, &s->ops_request);
 
 		for (i = disks; i--; ) {
 			struct r5dev *dev = &sh->dev[i];
@@ -1710,21 +1671,20 @@ handle_write_operations5(struct stripe_head *sh, int rcw, int expand)
 				set_bit(R5_LOCKED, &dev->flags);
 				if (!expand)
 					clear_bit(R5_UPTODATE, &dev->flags);
-				locked++;
+				s->locked++;
 			}
 		}
-		if (locked + 1 == disks)
+		if (s->locked + 1 == disks)
 			if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
 				atomic_inc(&sh->raid_conf->pending_full_writes);
 	} else {
 		BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
 			test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
 
-		set_bit(STRIPE_OP_PREXOR, &sh->ops.pending);
-		set_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending);
-		set_bit(STRIPE_OP_POSTXOR, &sh->ops.pending);
-
-		sh->ops.count += 3;
+		sh->reconstruct_state = reconstruct_state_drain_run;
+		set_bit(STRIPE_OP_PREXOR, &s->ops_request);
+		set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
+		set_bit(STRIPE_OP_POSTXOR, &s->ops_request);
 
 		for (i = disks; i--; ) {
 			struct r5dev *dev = &sh->dev[i];
@@ -1742,7 +1702,7 @@ handle_write_operations5(struct stripe_head *sh, int rcw, int expand)
 				set_bit(R5_Wantprexor, &dev->flags);
 				set_bit(R5_LOCKED, &dev->flags);
 				clear_bit(R5_UPTODATE, &dev->flags);
-				locked++;
+				s->locked++;
 			}
 		}
 	}
@@ -1752,13 +1712,11 @@ handle_write_operations5(struct stripe_head *sh, int rcw, int expand)
 	 */
 	set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
 	clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
-	locked++;
+	s->locked++;
 
-	pr_debug("%s: stripe %llu locked: %d pending: %lx\n",
+	pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n",
 		__func__, (unsigned long long)sh->sector,
-		locked, sh->ops.pending);
-
-	return locked;
+		s->locked, s->ops_request);
 }
 
 /*
@@ -2005,8 +1963,7 @@ static void handle_issuing_new_read_requests5(struct stripe_head *sh,
 	 * midst of changing due to a write
 	 */
 	if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
-	    !test_bit(STRIPE_OP_PREXOR, &sh->ops.pending) &&
-	    !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) {
+	    !sh->reconstruct_state) {
 		for (i = disks; i--; )
 			if (__handle_issuing_new_read_requests5(
 				sh, s, i, disks) == 0)
@@ -2211,7 +2168,7 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf,
 	if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
 	    (s->locked == 0 && (rcw == 0 || rmw == 0) &&
 	    !test_bit(STRIPE_BIT_DELAY, &sh->state)))
-		s->locked += handle_write_operations5(sh, rcw == 0, 0);
+		handle_write_operations5(sh, s, rcw == 0, 0);
 }
 
 static void handle_issuing_new_write_requests6(raid5_conf_t *conf,
@@ -2581,15 +2538,14 @@ static void handle_stripe5(struct stripe_head *sh)
 	struct bio *return_bi = NULL;
 	struct stripe_head_state s;
 	struct r5dev *dev;
-	unsigned long pending = 0;
 	mdk_rdev_t *blocked_rdev = NULL;
 	int prexor;
 
 	memset(&s, 0, sizeof(s));
-	pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d "
-		"ops=%lx:%lx:%lx\n", (unsigned long long)sh->sector, sh->state,
-		atomic_read(&sh->count), sh->pd_idx,
-		sh->ops.pending, sh->ops.ack, sh->ops.complete);
+	pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d check:%d "
+		 "reconstruct:%d\n", (unsigned long long)sh->sector, sh->state,
+		 atomic_read(&sh->count), sh->pd_idx, sh->check_state,
+		 sh->reconstruct_state);
 
 	spin_lock(&sh->lock);
 	clear_bit(STRIPE_HANDLE, &sh->state);
@@ -2703,34 +2659,12 @@ static void handle_stripe5(struct stripe_head *sh)
 	/* Now we check to see if any write operations have recently
 	 * completed
 	 */
-
-	/* leave prexor set until postxor is done, allows us to distinguish
-	 * a rmw from a rcw during biodrain
-	 */
 	prexor = 0;
-	if (test_bit(STRIPE_OP_PREXOR, &sh->ops.complete) &&
-		test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) {
-
-		prexor = 1;
-		clear_bit(STRIPE_OP_PREXOR, &sh->ops.complete);
-		clear_bit(STRIPE_OP_PREXOR, &sh->ops.ack);
-		clear_bit(STRIPE_OP_PREXOR, &sh->ops.pending);
-
+	if (sh->reconstruct_state == reconstruct_state_drain_result) {
+		sh->reconstruct_state = reconstruct_state_idle;
 		for (i = disks; i--; )
-			clear_bit(R5_Wantprexor, &sh->dev[i].flags);
-	}
-
-	/* if only POSTXOR is set then this is an 'expand' postxor */
-	if (test_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete) &&
-		test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) {
-
-		clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete);
-		clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.ack);
-		clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending);
-
-		clear_bit(STRIPE_OP_POSTXOR, &sh->ops.complete);
-		clear_bit(STRIPE_OP_POSTXOR, &sh->ops.ack);
-		clear_bit(STRIPE_OP_POSTXOR, &sh->ops.pending);
+			prexor += test_and_clear_bit(R5_Wantprexor,
+						     &sh->dev[i].flags);
 
 		/* All the 'written' buffers and the parity block are ready to
 		 * be written back to disk
@@ -2763,8 +2697,7 @@ static void handle_stripe5(struct stripe_head *sh)
 	 * 2/ A 'check' operation is in flight, as it may clobber the parity
 	 *    block.
 	 */
-	if (s.to_write && !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending) &&
-	    !sh->check_state)
+	if (s.to_write && !sh->reconstruct_state && !sh->check_state)
 		handle_issuing_new_write_requests5(conf, sh, &s, disks);
 
 	/* maybe we need to check and possibly fix the parity for this stripe
@@ -2805,18 +2738,10 @@ static void handle_stripe5(struct stripe_head *sh)
 		}
 	}
 
-	/* Finish postxor operations initiated by the expansion
-	 * process
-	 */
-	if (test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete) &&
-		!test_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending)) {
-
+	/* Finish reconstruct operations initiated by the expansion process */
+	if (sh->reconstruct_state == reconstruct_state_result) {
+		sh->reconstruct_state = reconstruct_state_idle;
 		clear_bit(STRIPE_EXPANDING, &sh->state);
-
-		clear_bit(STRIPE_OP_POSTXOR, &sh->ops.pending);
-		clear_bit(STRIPE_OP_POSTXOR, &sh->ops.ack);
-		clear_bit(STRIPE_OP_POSTXOR, &sh->ops.complete);
-
 		for (i = conf->raid_disks; i--; )
 			set_bit(R5_Wantwrite, &sh->dev[i].flags);
 			set_bit(R5_LOCKED, &dev->flags);
@@ -2824,15 +2749,13 @@ static void handle_stripe5(struct stripe_head *sh)
 	}
 
 	if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
-		!test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) {
+	    !sh->reconstruct_state) {
 		/* Need to write out all blocks after computing parity */
 		sh->disks = conf->raid_disks;
 		sh->pd_idx = stripe_to_pdidx(sh->sector, conf,
 			conf->raid_disks);
-		s.locked += handle_write_operations5(sh, 1, 1);
-	} else if (s.expanded &&
-		   s.locked == 0 &&
-		!test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) {
+		handle_write_operations5(sh, &s, 1, 1);
+	} else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
 		clear_bit(STRIPE_EXPAND_READY, &sh->state);
 		atomic_dec(&conf->reshape_stripes);
 		wake_up(&conf->wait_for_overlap);
@@ -2843,9 +2766,6 @@ static void handle_stripe5(struct stripe_head *sh)
 	    !test_bit(STRIPE_COMPUTE_RUN, &sh->state))
 		handle_stripe_expansion(conf, sh, NULL);
 
-	if (sh->ops.count)
-		pending = get_stripe_work(sh);
-
  unlock:
 	spin_unlock(&sh->lock);
 
@@ -2853,8 +2773,8 @@ static void handle_stripe5(struct stripe_head *sh)
 	if (unlikely(blocked_rdev))
 		md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);
 
-	if (pending || s.ops_request)
-		raid5_run_ops(sh, pending, s.ops_request);
+	if (s.ops_request)
+		raid5_run_ops(sh, s.ops_request);
 
 	ops_run_io(sh, &s);
 
diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h
index 2c96d5fd54bf..5f3e674b87dd 100644
--- a/include/linux/raid/raid5.h
+++ b/include/linux/raid/raid5.h
@@ -205,19 +205,12 @@ struct stripe_head {
 	int			bm_seq;	/* sequence number for bitmap flushes */
 	int			disks;			/* disks in stripe */
 	enum check_states	check_state;
+	enum reconstruct_states reconstruct_state;
 	/* stripe_operations
-	 * @pending - pending ops flags (set for request->issue->complete)
-	 * @ack - submitted ops flags (set for issue->complete)
-	 * @complete - completed ops flags (set for complete)
 	 * @target - STRIPE_OP_COMPUTE_BLK target
-	 * @count - raid5_runs_ops is set to run when this is non-zero
 	 */
 	struct stripe_operations {
-		unsigned long	   pending;
-		unsigned long	   ack;
-		unsigned long	   complete;
 		int		   target;
-		int		   count;
 		u32		   zero_sum_result;
 	} ops;
 	struct r5dev {
-- 
cgit v1.2.3-70-g09d2


From d8ee0728b5b30d7a6f62c399a95e953616d31f23 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Sat, 28 Jun 2008 08:32:06 +1000
Subject: md: replace R5_WantPrexor with R5_WantDrain, add 'prexor'
 reconstruct_states

From: Dan Williams <dan.j.williams@intel.com>

Currently ops_run_biodrain and other locations have extra logic to determine
which blocks are processed in the prexor and non-prexor cases.  This can be
eliminated if handle_write_operations5 flags the blocks to be processed in all
cases via R5_Wantdrain.  The presence of the prexor operation is tracked in
sh->reconstruct_state.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Neil Brown <neilb@suse.de>
---
 drivers/md/raid5.c         | 89 +++++++++++++++-------------------------------
 include/linux/raid/raid5.h |  6 ++--
 2 files changed, 32 insertions(+), 63 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index b9159367491a..c71246061c0e 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -637,7 +637,7 @@ ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 	for (i = disks; i--; ) {
 		struct r5dev *dev = &sh->dev[i];
 		/* Only process blocks that are known to be uptodate */
-		if (dev->towrite && test_bit(R5_Wantprexor, &dev->flags))
+		if (test_bit(R5_Wantdrain, &dev->flags))
 			xor_srcs[count++] = dev->page;
 	}
 
@@ -649,16 +649,10 @@ ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 }
 
 static struct dma_async_tx_descriptor *
-ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
-		 unsigned long ops_request)
+ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 {
 	int disks = sh->disks;
-	int pd_idx = sh->pd_idx, i;
-
-	/* check if prexor is active which means only process blocks
-	 * that are part of a read-modify-write (Wantprexor)
-	 */
-	int prexor = test_bit(STRIPE_OP_PREXOR, &ops_request);
+	int i;
 
 	pr_debug("%s: stripe %llu\n", __func__,
 		(unsigned long long)sh->sector);
@@ -666,20 +660,8 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
 	for (i = disks; i--; ) {
 		struct r5dev *dev = &sh->dev[i];
 		struct bio *chosen;
-		int towrite;
 
-		towrite = 0;
-		if (prexor) { /* rmw */
-			if (dev->towrite &&
-			    test_bit(R5_Wantprexor, &dev->flags))
-				towrite = 1;
-		} else { /* rcw */
-			if (i != pd_idx && dev->towrite &&
-				test_bit(R5_LOCKED, &dev->flags))
-				towrite = 1;
-		}
-
-		if (towrite) {
+		if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) {
 			struct bio *wbi;
 
 			spin_lock(&sh->lock);
@@ -702,18 +684,6 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
 }
 
 static void ops_complete_postxor(void *stripe_head_ref)
-{
-	struct stripe_head *sh = stripe_head_ref;
-
-	pr_debug("%s: stripe %llu\n", __func__,
-		(unsigned long long)sh->sector);
-
-	sh->reconstruct_state = reconstruct_state_result;
-	set_bit(STRIPE_HANDLE, &sh->state);
-	release_stripe(sh);
-}
-
-static void ops_complete_write(void *stripe_head_ref)
 {
 	struct stripe_head *sh = stripe_head_ref;
 	int disks = sh->disks, i, pd_idx = sh->pd_idx;
@@ -727,14 +697,21 @@ static void ops_complete_write(void *stripe_head_ref)
 			set_bit(R5_UPTODATE, &dev->flags);
 	}
 
-	sh->reconstruct_state = reconstruct_state_drain_result;
+	if (sh->reconstruct_state == reconstruct_state_drain_run)
+		sh->reconstruct_state = reconstruct_state_drain_result;
+	else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run)
+		sh->reconstruct_state = reconstruct_state_prexor_drain_result;
+	else {
+		BUG_ON(sh->reconstruct_state != reconstruct_state_run);
+		sh->reconstruct_state = reconstruct_state_result;
+	}
+
 	set_bit(STRIPE_HANDLE, &sh->state);
 	release_stripe(sh);
 }
 
 static void
-ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
-		unsigned long ops_request)
+ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 {
 	/* kernel stack size limits the total number of disks */
 	int disks = sh->disks;
@@ -742,9 +719,8 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
 
 	int count = 0, pd_idx = sh->pd_idx, i;
 	struct page *xor_dest;
-	int prexor = test_bit(STRIPE_OP_PREXOR, &ops_request);
+	int prexor = 0;
 	unsigned long flags;
-	dma_async_tx_callback callback;
 
 	pr_debug("%s: stripe %llu\n", __func__,
 		(unsigned long long)sh->sector);
@@ -752,7 +728,8 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
 	/* check if prexor is active which means only process blocks
 	 * that are part of a read-modify-write (written)
 	 */
-	if (prexor) {
+	if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
+		prexor = 1;
 		xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
 		for (i = disks; i--; ) {
 			struct r5dev *dev = &sh->dev[i];
@@ -768,10 +745,6 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
 		}
 	}
 
-	/* check whether this postxor is part of a write */
-	callback = test_bit(STRIPE_OP_BIODRAIN, &ops_request) ?
-		ops_complete_write : ops_complete_postxor;
-
 	/* 1/ if we prexor'd then the dest is reused as a source
 	 * 2/ if we did not prexor then we are redoing the parity
 	 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST
@@ -785,10 +758,10 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
 	if (unlikely(count == 1)) {
 		flags &= ~(ASYNC_TX_XOR_DROP_DST | ASYNC_TX_XOR_ZERO_DST);
 		tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE,
-			flags, tx, callback, sh);
+			flags, tx, ops_complete_postxor, sh);
 	} else
 		tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
-			flags, tx, callback, sh);
+			flags, tx, ops_complete_postxor, sh);
 }
 
 static void ops_complete_check(void *stripe_head_ref)
@@ -847,12 +820,12 @@ static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request)
 		tx = ops_run_prexor(sh, tx);
 
 	if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
-		tx = ops_run_biodrain(sh, tx, ops_request);
+		tx = ops_run_biodrain(sh, tx);
 		overlap_clear++;
 	}
 
 	if (test_bit(STRIPE_OP_POSTXOR, &ops_request))
-		ops_run_postxor(sh, tx, ops_request);
+		ops_run_postxor(sh, tx);
 
 	if (test_bit(STRIPE_OP_CHECK, &ops_request))
 		ops_run_check(sh);
@@ -1669,6 +1642,7 @@ handle_write_operations5(struct stripe_head *sh, struct stripe_head_state *s,
 
 			if (dev->towrite) {
 				set_bit(R5_LOCKED, &dev->flags);
+				set_bit(R5_Wantdrain, &dev->flags);
 				if (!expand)
 					clear_bit(R5_UPTODATE, &dev->flags);
 				s->locked++;
@@ -1681,7 +1655,7 @@ handle_write_operations5(struct stripe_head *sh, struct stripe_head_state *s,
 		BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
 			test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
 
-		sh->reconstruct_state = reconstruct_state_drain_run;
+		sh->reconstruct_state = reconstruct_state_prexor_drain_run;
 		set_bit(STRIPE_OP_PREXOR, &s->ops_request);
 		set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
 		set_bit(STRIPE_OP_POSTXOR, &s->ops_request);
@@ -1691,15 +1665,10 @@ handle_write_operations5(struct stripe_head *sh, struct stripe_head_state *s,
 			if (i == pd_idx)
 				continue;
 
-			/* For a read-modify write there may be blocks that are
-			 * locked for reading while others are ready to be
-			 * written so we distinguish these blocks by the
-			 * R5_Wantprexor bit
-			 */
 			if (dev->towrite &&
 			    (test_bit(R5_UPTODATE, &dev->flags) ||
-			    test_bit(R5_Wantcompute, &dev->flags))) {
-				set_bit(R5_Wantprexor, &dev->flags);
+			     test_bit(R5_Wantcompute, &dev->flags))) {
+				set_bit(R5_Wantdrain, &dev->flags);
 				set_bit(R5_LOCKED, &dev->flags);
 				clear_bit(R5_UPTODATE, &dev->flags);
 				s->locked++;
@@ -2660,11 +2629,11 @@ static void handle_stripe5(struct stripe_head *sh)
 	 * completed
 	 */
 	prexor = 0;
-	if (sh->reconstruct_state == reconstruct_state_drain_result) {
+	if (sh->reconstruct_state == reconstruct_state_prexor_drain_result)
+		prexor = 1;
+	if (sh->reconstruct_state == reconstruct_state_drain_result ||
+	    sh->reconstruct_state == reconstruct_state_prexor_drain_result) {
 		sh->reconstruct_state = reconstruct_state_idle;
-		for (i = disks; i--; )
-			prexor += test_and_clear_bit(R5_Wantprexor,
-						     &sh->dev[i].flags);
 
 		/* All the 'written' buffers and the parity block are ready to
 		 * be written back to disk
diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h
index 5f3e674b87dd..3b2672792457 100644
--- a/include/linux/raid/raid5.h
+++ b/include/linux/raid/raid5.h
@@ -187,8 +187,10 @@ enum check_states {
  */
 enum reconstruct_states {
 	reconstruct_state_idle = 0,
+	reconstruct_state_prexor_drain_run,	/* prexor-write */
 	reconstruct_state_drain_run,		/* write */
 	reconstruct_state_run,			/* expand */
+	reconstruct_state_prexor_drain_result,
 	reconstruct_state_drain_result,
 	reconstruct_state_result,
 };
@@ -258,9 +260,7 @@ struct r6_state {
 #define	R5_Wantfill	12 /* dev->toread contains a bio that needs
 				    * filling
 				    */
-#define	R5_Wantprexor	13 /* distinguish blocks ready for rmw from
-				    * other "towrites"
-				    */
+#define R5_Wantdrain	13 /* dev->towrite needs to be drained */
 /*
  * Write method
  */
-- 
cgit v1.2.3-70-g09d2


From b5470dc5fc18a8ff6517c3bb538d1479e58ecb02 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 27 Jun 2008 21:44:04 -0700
Subject: md: resolve external metadata handling deadlock in md_allow_write

md_allow_write() marks the metadata dirty while holding mddev->lock and then
waits for the write to complete.  For externally managed metadata this causes a
deadlock as userspace needs to take the lock to communicate that the metadata
update has completed.

Change md_allow_write() in the 'external' case to start the 'mark active'
operation and then return -EAGAIN.  The expected side effects while waiting for
userspace to write 'active' to 'array_state' are holding off reshape (code
currently handles -ENOMEM), cause some 'stripe_cache_size' change requests to
fail, cause some GET_BITMAP_FILE ioctl requests to fall back to GFP_NOIO, and
cause updates to 'raid_disks' to fail.  Except for 'stripe_cache_size' changes
these failures can be mitigated by coordinating with mdmon.

md_write_start() still prevents writes from occurring until the metadata
handler has had a chance to take action as it unconditionally waits for
MD_CHANGE_CLEAN to be cleared.

[neilb@suse.de: return -EAGAIN, try GFP_NOIO]
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/md/md.c         | 27 ++++++++++++++++-----------
 drivers/md/raid1.c      |  6 ++++--
 drivers/md/raid5.c      | 12 +++++++++---
 include/linux/raid/md.h |  2 +-
 4 files changed, 30 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index df1230af02cd..43d033d9a05a 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -4172,9 +4172,11 @@ static int get_bitmap_file(mddev_t * mddev, void __user * arg)
 	char *ptr, *buf = NULL;
 	int err = -ENOMEM;
 
-	md_allow_write(mddev);
+	if (md_allow_write(mddev))
+		file = kmalloc(sizeof(*file), GFP_NOIO);
+	else
+		file = kmalloc(sizeof(*file), GFP_KERNEL);
 
-	file = kmalloc(sizeof(*file), GFP_KERNEL);
 	if (!file)
 		goto out;
 
@@ -5667,15 +5669,18 @@ void md_write_end(mddev_t *mddev)
  * may proceed without blocking.  It is important to call this before
  * attempting a GFP_KERNEL allocation while holding the mddev lock.
  * Must be called with mddev_lock held.
+ *
+ * In the ->external case MD_CHANGE_CLEAN can not be cleared until mddev->lock
+ * is dropped, so return -EAGAIN after notifying userspace.
  */
-void md_allow_write(mddev_t *mddev)
+int md_allow_write(mddev_t *mddev)
 {
 	if (!mddev->pers)
-		return;
+		return 0;
 	if (mddev->ro)
-		return;
+		return 0;
 	if (!mddev->pers->sync_request)
-		return;
+		return 0;
 
 	spin_lock_irq(&mddev->write_lock);
 	if (mddev->in_sync) {
@@ -5686,14 +5691,14 @@ void md_allow_write(mddev_t *mddev)
 			mddev->safemode = 1;
 		spin_unlock_irq(&mddev->write_lock);
 		md_update_sb(mddev, 0);
-
 		sysfs_notify(&mddev->kobj, NULL, "array_state");
-		/* wait for the dirty state to be recorded in the metadata */
-		wait_event(mddev->sb_wait,
-			   !test_bit(MD_CHANGE_CLEAN, &mddev->flags) &&
-			   !test_bit(MD_CHANGE_PENDING, &mddev->flags));
 	} else
 		spin_unlock_irq(&mddev->write_lock);
+
+	if (test_bit(MD_CHANGE_CLEAN, &mddev->flags))
+		return -EAGAIN;
+	else
+		return 0;
 }
 EXPORT_SYMBOL_GPL(md_allow_write);
 
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index f05d5983efb6..491dc2d4ad5f 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2136,7 +2136,7 @@ static int raid1_reshape(mddev_t *mddev)
 	conf_t *conf = mddev_to_conf(mddev);
 	int cnt, raid_disks;
 	unsigned long flags;
-	int d, d2;
+	int d, d2, err;
 
 	/* Cannot change chunk_size, layout, or level */
 	if (mddev->chunk_size != mddev->new_chunk ||
@@ -2148,7 +2148,9 @@ static int raid1_reshape(mddev_t *mddev)
 		return -EINVAL;
 	}
 
-	md_allow_write(mddev);
+	err = md_allow_write(mddev);
+	if (err)
+		return err;
 
 	raid_disks = mddev->raid_disks + mddev->delta_disks;
 
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 442622067cae..8f4c70a53210 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -911,14 +911,16 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
 	struct stripe_head *osh, *nsh;
 	LIST_HEAD(newstripes);
 	struct disk_info *ndisks;
-	int err = 0;
+	int err;
 	struct kmem_cache *sc;
 	int i;
 
 	if (newsize <= conf->pool_size)
 		return 0; /* never bother to shrink */
 
-	md_allow_write(conf->mddev);
+	err = md_allow_write(conf->mddev);
+	if (err)
+		return err;
 
 	/* Step 1 */
 	sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
@@ -3843,6 +3845,8 @@ raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len)
 {
 	raid5_conf_t *conf = mddev_to_conf(mddev);
 	unsigned long new;
+	int err;
+
 	if (len >= PAGE_SIZE)
 		return -EINVAL;
 	if (!conf)
@@ -3858,7 +3862,9 @@ raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len)
 		else
 			break;
 	}
-	md_allow_write(mddev);
+	err = md_allow_write(mddev);
+	if (err)
+		return err;
 	while (new > conf->max_nr_stripes) {
 		if (grow_one_stripe(conf))
 			conf->max_nr_stripes++;
diff --git a/include/linux/raid/md.h b/include/linux/raid/md.h
index b7386ae9d288..dc0e3fcb9f28 100644
--- a/include/linux/raid/md.h
+++ b/include/linux/raid/md.h
@@ -95,7 +95,7 @@ extern int sync_page_io(struct block_device *bdev, sector_t sector, int size,
 			struct page *page, int rw);
 extern void md_do_sync(mddev_t *mddev);
 extern void md_new_event(mddev_t *mddev);
-extern void md_allow_write(mddev_t *mddev);
+extern int md_allow_write(mddev_t *mddev);
 extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
 
 #endif /* CONFIG_MD */
-- 
cgit v1.2.3-70-g09d2


From 0f420358e3a2abc028320ace7783e2e38cae77bf Mon Sep 17 00:00:00 2001
From: Andre Noll <maan@systemlinux.org>
Date: Fri, 11 Jul 2008 22:02:23 +1000
Subject: md: Turn rdev->sb_offset into a sector-based quantity.

Rename it to sb_start to make sure all users have been converted.

Signed-off-by: Andre Noll <maan@systemlinux.org>
Signed-off-by: Neil Brown <neilb@suse.de>
---
 drivers/md/bitmap.c       | 10 +++---
 drivers/md/md.c           | 81 ++++++++++++++++++++++-------------------------
 include/linux/raid/md_k.h |  2 +-
 3 files changed, 44 insertions(+), 49 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index dedba16d42f7..eba83e25b678 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -225,7 +225,7 @@ static struct page *read_sb_page(mddev_t *mddev, long offset, unsigned long inde
 		    || test_bit(Faulty, &rdev->flags))
 			continue;
 
-		target = (rdev->sb_offset << 1) + offset + index * (PAGE_SIZE/512);
+		target = rdev->sb_start + offset + index * (PAGE_SIZE/512);
 
 		if (sync_page_io(rdev->bdev, target, PAGE_SIZE, page, READ)) {
 			page->index = index;
@@ -262,12 +262,12 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
 					/* bitmap runs in to metadata */
 					return -EINVAL;
 				if (rdev->data_offset + mddev->size*2
-				    > rdev->sb_offset*2 + bitmap->offset)
+				    > rdev->sb_start + bitmap->offset)
 					/* data runs in to bitmap */
 					return -EINVAL;
-			} else if (rdev->sb_offset*2 < rdev->data_offset) {
+			} else if (rdev->sb_start < rdev->data_offset) {
 				/* METADATA BITMAP DATA */
-				if (rdev->sb_offset*2
+				if (rdev->sb_start
 				    + bitmap->offset
 				    + page->index*(PAGE_SIZE/512) + size/512
 				    > rdev->data_offset)
@@ -277,7 +277,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
 				/* DATA METADATA BITMAP - no problems */
 			}
 			md_super_write(mddev, rdev,
-				       (rdev->sb_offset<<1) + bitmap->offset
+				       rdev->sb_start + bitmap->offset
 				       + page->index * (PAGE_SIZE/512),
 				       size,
 				       page);
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 3276edde7576..5590cb54b584 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -356,7 +356,7 @@ static inline sector_t calc_dev_sboffset(struct block_device *bdev)
 
 static sector_t calc_num_sectors(mdk_rdev_t *rdev, unsigned chunk_size)
 {
-	sector_t num_sectors = rdev->sb_offset * 2;
+	sector_t num_sectors = rdev->sb_start;
 
 	if (chunk_size)
 		num_sectors &= ~((sector_t)chunk_size/512 - 1);
@@ -383,7 +383,7 @@ static void free_disk_sb(mdk_rdev_t * rdev)
 		put_page(rdev->sb_page);
 		rdev->sb_loaded = 0;
 		rdev->sb_page = NULL;
-		rdev->sb_offset = 0;
+		rdev->sb_start = 0;
 		rdev->size = 0;
 	}
 }
@@ -529,7 +529,7 @@ static int read_disk_sb(mdk_rdev_t * rdev, int size)
 		return 0;
 
 
-	if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, size, rdev->sb_page, READ))
+	if (!sync_page_io(rdev->bdev, rdev->sb_start, size, rdev->sb_page, READ))
 		goto fail;
 	rdev->sb_loaded = 1;
 	return 0;
@@ -666,16 +666,14 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
 	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
 	mdp_super_t *sb;
 	int ret;
-	sector_t sb_offset;
 
 	/*
-	 * Calculate the position of the superblock,
+	 * Calculate the position of the superblock (512byte sectors),
 	 * it's at the end of the disk.
 	 *
 	 * It also happens to be a multiple of 4Kb.
 	 */
-	sb_offset = calc_dev_sboffset(rdev->bdev) / 2;
-	rdev->sb_offset = sb_offset;
+	rdev->sb_start = calc_dev_sboffset(rdev->bdev);
 
 	ret = read_disk_sb(rdev, MD_SB_BYTES);
 	if (ret) return ret;
@@ -1007,10 +1005,10 @@ super_90_rdev_size_change(mdk_rdev_t *rdev, unsigned long long size)
 	size *= 2; /* convert to sectors */
 	if (rdev->mddev->bitmap_offset)
 		return 0; /* can't move bitmap */
-	rdev->sb_offset = calc_dev_sboffset(rdev->bdev) / 2;
-	if (!size || size > rdev->sb_offset*2)
-		size = rdev->sb_offset*2;
-	md_super_write(rdev->mddev, rdev, rdev->sb_offset << 1, rdev->sb_size,
+	rdev->sb_start = calc_dev_sboffset(rdev->bdev);
+	if (!size || size > rdev->sb_start)
+		size = rdev->sb_start;
+	md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
 		       rdev->sb_page);
 	md_super_wait(rdev->mddev);
 	return size/2; /* kB for sysfs */
@@ -1048,12 +1046,12 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
 {
 	struct mdp_superblock_1 *sb;
 	int ret;
-	sector_t sb_offset;
+	sector_t sb_start;
 	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
 	int bmask;
 
 	/*
-	 * Calculate the position of the superblock.
+	 * Calculate the position of the superblock in 512byte sectors.
 	 * It is always aligned to a 4K boundary and
 	 * depeding on minor_version, it can be:
 	 * 0: At least 8K, but less than 12K, from end of device
@@ -1062,22 +1060,20 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
 	 */
 	switch(minor_version) {
 	case 0:
-		sb_offset = rdev->bdev->bd_inode->i_size >> 9;
-		sb_offset -= 8*2;
-		sb_offset &= ~(sector_t)(4*2-1);
-		/* convert from sectors to K */
-		sb_offset /= 2;
+		sb_start = rdev->bdev->bd_inode->i_size >> 9;
+		sb_start -= 8*2;
+		sb_start &= ~(sector_t)(4*2-1);
 		break;
 	case 1:
-		sb_offset = 0;
+		sb_start = 0;
 		break;
 	case 2:
-		sb_offset = 4;
+		sb_start = 8;
 		break;
 	default:
 		return -EINVAL;
 	}
-	rdev->sb_offset = sb_offset;
+	rdev->sb_start = sb_start;
 
 	/* superblock is rarely larger than 1K, but it can be larger,
 	 * and it is safe to read 4k, so we do that
@@ -1091,7 +1087,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
 	if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
 	    sb->major_version != cpu_to_le32(1) ||
 	    le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
-	    le64_to_cpu(sb->super_offset) != (rdev->sb_offset<<1) ||
+	    le64_to_cpu(sb->super_offset) != rdev->sb_start ||
 	    (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
 		return -EINVAL;
 
@@ -1127,7 +1123,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
 		rdev->sb_size = (rdev->sb_size | bmask) + 1;
 
 	if (minor_version
-	    && rdev->data_offset < sb_offset + (rdev->sb_size/512))
+	    && rdev->data_offset < sb_start + (rdev->sb_size/512))
 		return -EINVAL;
 
 	if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
@@ -1163,7 +1159,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
 	if (minor_version)
 		rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2;
 	else
-		rdev->size = rdev->sb_offset;
+		rdev->size = rdev->sb_start / 2;
 	if (rdev->size < le64_to_cpu(sb->data_size)/2)
 		return -EINVAL;
 	rdev->size = le64_to_cpu(sb->data_size)/2;
@@ -1350,7 +1346,7 @@ super_1_rdev_size_change(mdk_rdev_t *rdev, unsigned long long size)
 	if (size && size < rdev->mddev->size)
 		return 0; /* component must fit device */
 	size *= 2; /* convert to sectors */
-	if (rdev->sb_offset < rdev->data_offset/2) {
+	if (rdev->sb_start < rdev->data_offset) {
 		/* minor versions 1 and 2; superblock before data */
 		max_size = (rdev->bdev->bd_inode->i_size >> 9);
 		max_size -= rdev->data_offset;
@@ -1361,19 +1357,19 @@ super_1_rdev_size_change(mdk_rdev_t *rdev, unsigned long long size)
 		return 0;
 	} else {
 		/* minor version 0; superblock after data */
-		sector_t sb_offset;
-		sb_offset = (rdev->bdev->bd_inode->i_size >> 9) - 8*2;
-		sb_offset &= ~(sector_t)(4*2 - 1);
-		max_size = rdev->size*2 + sb_offset - rdev->sb_offset*2;
+		sector_t sb_start;
+		sb_start = (rdev->bdev->bd_inode->i_size >> 9) - 8*2;
+		sb_start &= ~(sector_t)(4*2 - 1);
+		max_size = rdev->size*2 + sb_start - rdev->sb_start;
 		if (!size || size > max_size)
 			size = max_size;
-		rdev->sb_offset = sb_offset/2;
+		rdev->sb_start = sb_start;
 	}
 	sb = (struct mdp_superblock_1 *) page_address(rdev->sb_page);
 	sb->data_size = cpu_to_le64(size);
-	sb->super_offset = rdev->sb_offset*2;
+	sb->super_offset = rdev->sb_start;
 	sb->sb_csum = calc_sb_1_csum(sb);
-	md_super_write(rdev->mddev, rdev, rdev->sb_offset << 1, rdev->sb_size,
+	md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
 		       rdev->sb_page);
 	md_super_wait(rdev->mddev);
 	return size/2; /* kB for sysfs */
@@ -1810,11 +1806,11 @@ repeat:
 		dprintk("%s ", bdevname(rdev->bdev,b));
 		if (!test_bit(Faulty, &rdev->flags)) {
 			md_super_write(mddev,rdev,
-				       rdev->sb_offset<<1, rdev->sb_size,
+				       rdev->sb_start, rdev->sb_size,
 				       rdev->sb_page);
 			dprintk(KERN_INFO "(write) %s's sb offset: %llu\n",
 				bdevname(rdev->bdev,b),
-				(unsigned long long)rdev->sb_offset);
+				(unsigned long long)rdev->sb_start);
 			rdev->sb_events = mddev->events;
 
 		} else
@@ -3577,16 +3573,16 @@ static int do_md_run(mddev_t * mddev)
 		 * We don't want the data to overlap the metadata,
 		 * Internal Bitmap issues has handled elsewhere.
 		 */
-		if (rdev->data_offset < rdev->sb_offset) {
+		if (rdev->data_offset < rdev->sb_start) {
 			if (mddev->size &&
 			    rdev->data_offset + mddev->size*2
-			    > rdev->sb_offset*2) {
+			    > rdev->sb_start) {
 				printk("md: %s: data overlaps metadata\n",
 				       mdname(mddev));
 				return -EINVAL;
 			}
 		} else {
-			if (rdev->sb_offset*2 + rdev->sb_size/512
+			if (rdev->sb_start + rdev->sb_size/512
 			    > rdev->data_offset) {
 				printk("md: %s: metadata overlaps data\n",
 				       mdname(mddev));
@@ -4355,9 +4351,9 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
 
 		if (!mddev->persistent) {
 			printk(KERN_INFO "md: nonpersistent superblock ...\n");
-			rdev->sb_offset = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
+			rdev->sb_start = rdev->bdev->bd_inode->i_size / 512;
 		} else 
-			rdev->sb_offset = calc_dev_sboffset(rdev->bdev) / 2;
+			rdev->sb_start = calc_dev_sboffset(rdev->bdev);
 		rdev->size = calc_num_sectors(rdev, mddev->chunk_size) / 2;
 
 		err = bind_rdev_to_array(rdev, mddev);
@@ -4424,10 +4420,9 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev)
 	}
 
 	if (mddev->persistent)
-		rdev->sb_offset = calc_dev_sboffset(rdev->bdev) / 2;
+		rdev->sb_start = calc_dev_sboffset(rdev->bdev);
 	else
-		rdev->sb_offset =
-			rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
+		rdev->sb_start = rdev->bdev->bd_inode->i_size / 512;
 
 	rdev->size = calc_num_sectors(rdev, mddev->chunk_size) / 2;
 
@@ -4628,7 +4623,7 @@ static int update_size(mddev_t *mddev, sector_t num_sectors)
 	 * linear and raid0 always use whatever space is available. We can only
 	 * consider changing this number if no resync or reconstruction is
 	 * happening, and if the new size is acceptable. It must fit before the
-	 * sb_offset or, if that is <data_offset, it must fit before the size
+	 * sb_start or, if that is <data_offset, it must fit before the size
 	 * of each device.  If num_sectors is zero, we find the largest size
 	 * that fits.
 
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h
index df30c4395875..e37aaa41efc6 100644
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -59,7 +59,7 @@ struct mdk_rdev_s
 	int		sb_loaded;
 	__u64		sb_events;
 	sector_t	data_offset;	/* start of data in array */
-	sector_t	sb_offset;
+	sector_t 	sb_start;	/* offset of the super block (in 512byte sectors) */
 	int		sb_size;	/* bytes in the superblock */
 	int		preferred_minor;	/* autorun support */
 
-- 
cgit v1.2.3-70-g09d2


From 7e93a89251d4ed7bd4475db62616ccd03ddfd01a Mon Sep 17 00:00:00 2001
From: Andre Noll <maan@systemlinux.org>
Date: Fri, 11 Jul 2008 22:02:23 +1000
Subject: md: Remove some unused macros.

Signed-off-by: Andre Noll <maan@systemlinux.org>
Signed-off-by: Neil Brown <neilb@suse.de>
---
 include/linux/raid/md_p.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/raid/md_p.h b/include/linux/raid/md_p.h
index 3f2cd98c508b..8b4de4a41ff1 100644
--- a/include/linux/raid/md_p.h
+++ b/include/linux/raid/md_p.h
@@ -43,14 +43,11 @@
  */
 #define MD_RESERVED_BYTES		(64 * 1024)
 #define MD_RESERVED_SECTORS		(MD_RESERVED_BYTES / 512)
-#define MD_RESERVED_BLOCKS		(MD_RESERVED_BYTES / BLOCK_SIZE)
 
 #define MD_NEW_SIZE_SECTORS(x)		((x & ~(MD_RESERVED_SECTORS - 1)) - MD_RESERVED_SECTORS)
-#define MD_NEW_SIZE_BLOCKS(x)		((x & ~(MD_RESERVED_BLOCKS - 1)) - MD_RESERVED_BLOCKS)
 
 #define MD_SB_BYTES			4096
 #define MD_SB_WORDS			(MD_SB_BYTES / 4)
-#define MD_SB_BLOCKS			(MD_SB_BYTES / BLOCK_SIZE)
 #define MD_SB_SECTORS			(MD_SB_BYTES / 512)
 
 /*
-- 
cgit v1.2.3-70-g09d2


From f233ea5c9e0d8b95e4283bf6a3436b88f6fd3586 Mon Sep 17 00:00:00 2001
From: Andre Noll <maan@systemlinux.org>
Date: Mon, 21 Jul 2008 17:05:22 +1000
Subject: md: Make mddev->array_size sector-based.

This patch renames the array_size field of struct mddev_s to array_sectors
and converts all instances to use units of 512 byte sectors instead of 1k
blocks.

Signed-off-by: Andre Noll <maan@systemlinux.org>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/faulty.c       |  2 +-
 drivers/md/linear.c       |  6 +++---
 drivers/md/md.c           | 12 +++++++-----
 drivers/md/multipath.c    |  2 +-
 drivers/md/raid0.c        |  8 ++++----
 drivers/md/raid1.c        | 11 ++++++-----
 drivers/md/raid10.c       |  2 +-
 drivers/md/raid5.c        | 16 +++++++++-------
 include/linux/raid/md_k.h |  2 +-
 9 files changed, 33 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c
index d107ddceefcd..268547dbfbd3 100644
--- a/drivers/md/faulty.c
+++ b/drivers/md/faulty.c
@@ -297,7 +297,7 @@ static int run(mddev_t *mddev)
 	rdev_for_each(rdev, tmp, mddev)
 		conf->rdev = rdev;
 
-	mddev->array_size = mddev->size;
+	mddev->array_sectors = mddev->size * 2;
 	mddev->private = conf;
 
 	reconfig(mddev, mddev->layout, -1);
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index ec921f58fbb8..57644a780f16 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -256,7 +256,7 @@ static int linear_run (mddev_t *mddev)
 	if (!conf)
 		return 1;
 	mddev->private = conf;
-	mddev->array_size = conf->array_size;
+	mddev->array_sectors = conf->array_size * 2;
 
 	blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec);
 	mddev->queue->unplug_fn = linear_unplug;
@@ -290,8 +290,8 @@ static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev)
 	newconf->prev = mddev_to_conf(mddev);
 	mddev->private = newconf;
 	mddev->raid_disks++;
-	mddev->array_size = newconf->array_size;
-	set_capacity(mddev->gendisk, mddev->array_size << 1);
+	mddev->array_sectors = newconf->array_size * 2;
+	set_capacity(mddev->gendisk, mddev->array_sectors);
 	return 0;
 }
 
diff --git a/drivers/md/md.c b/drivers/md/md.c
index df13a17a9627..4bfbc1982cda 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -3704,7 +3704,7 @@ static int do_md_run(mddev_t * mddev)
 	if (mddev->flags)
 		md_update_sb(mddev, 0);
 
-	set_capacity(disk, mddev->array_size<<1);
+	set_capacity(disk, mddev->array_sectors);
 
 	/* If we call blk_queue_make_request here, it will
 	 * re-initialise max_sectors etc which may have been
@@ -3905,7 +3905,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
 
 		export_array(mddev);
 
-		mddev->array_size = 0;
+		mddev->array_sectors = 0;
 		mddev->size = 0;
 		mddev->raid_disks = 0;
 		mddev->recovery_cp = 0;
@@ -4644,7 +4644,8 @@ static int update_size(mddev_t *mddev, sector_t num_sectors)
 		bdev = bdget_disk(mddev->gendisk, 0);
 		if (bdev) {
 			mutex_lock(&bdev->bd_inode->i_mutex);
-			i_size_write(bdev->bd_inode, (loff_t)mddev->array_size << 10);
+			i_size_write(bdev->bd_inode,
+				     (loff_t)mddev->array_sectors << 9);
 			mutex_unlock(&bdev->bd_inode->i_mutex);
 			bdput(bdev);
 		}
@@ -5391,10 +5392,11 @@ static int md_seq_show(struct seq_file *seq, void *v)
 		if (!list_empty(&mddev->disks)) {
 			if (mddev->pers)
 				seq_printf(seq, "\n      %llu blocks",
-					(unsigned long long)mddev->array_size);
+					   (unsigned long long)
+					   mddev->array_sectors / 2);
 			else
 				seq_printf(seq, "\n      %llu blocks",
-					(unsigned long long)size);
+					   (unsigned long long)size);
 		}
 		if (mddev->persistent) {
 			if (mddev->major_version != 0 ||
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 541cbe3414bd..c4779ccba1c3 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -504,7 +504,7 @@ static int multipath_run (mddev_t *mddev)
 	/*
 	 * Ok, everything is just fine now
 	 */
-	mddev->array_size = mddev->size;
+	mddev->array_sectors = mddev->size * 2;
 
 	mddev->queue->unplug_fn = multipath_unplug;
 	mddev->queue->backing_dev_info.congested_fn = multipath_congested;
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 914c04ddec7c..2f30ebd8b7ab 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -293,16 +293,16 @@ static int raid0_run (mddev_t *mddev)
 		goto out_free_conf;
 
 	/* calculate array device size */
-	mddev->array_size = 0;
+	mddev->array_sectors = 0;
 	rdev_for_each(rdev, tmp, mddev)
-		mddev->array_size += rdev->size;
+		mddev->array_sectors += rdev->size * 2;
 
 	printk("raid0 : md_size is %llu blocks.\n", 
-		(unsigned long long)mddev->array_size);
+		(unsigned long long)mddev->array_sectors / 2);
 	printk("raid0 : conf->hash_spacing is %llu blocks.\n",
 		(unsigned long long)conf->hash_spacing);
 	{
-		sector_t s = mddev->array_size;
+		sector_t s = mddev->array_sectors / 2;
 		sector_t space = conf->hash_spacing;
 		int round;
 		conf->preshift = 0;
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 491dc2d4ad5f..03a5ab705c20 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2043,7 +2043,7 @@ static int run(mddev_t *mddev)
 	/*
 	 * Ok, everything is just fine now
 	 */
-	mddev->array_size = mddev->size;
+	mddev->array_sectors = mddev->size * 2;
 
 	mddev->queue->unplug_fn = raid1_unplug;
 	mddev->queue->backing_dev_info.congested_fn = raid1_congested;
@@ -2105,14 +2105,15 @@ static int raid1_resize(mddev_t *mddev, sector_t sectors)
 	 * any io in the removed space completes, but it hardly seems
 	 * worth it.
 	 */
-	mddev->array_size = sectors>>1;
-	set_capacity(mddev->gendisk, mddev->array_size << 1);
+	mddev->array_sectors = sectors;
+	set_capacity(mddev->gendisk, mddev->array_sectors);
 	mddev->changed = 1;
-	if (mddev->array_size > mddev->size && mddev->recovery_cp == MaxSector) {
+	if (mddev->array_sectors / 2 > mddev->size &&
+	    mddev->recovery_cp == MaxSector) {
 		mddev->recovery_cp = mddev->size << 1;
 		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 	}
-	mddev->size = mddev->array_size;
+	mddev->size = mddev->array_sectors / 2;
 	mddev->resync_max_sectors = sectors;
 	return 0;
 }
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index df08a9fa3a1f..2acea4025243 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -2164,7 +2164,7 @@ static int run(mddev_t *mddev)
 	/*
 	 * Ok, everything is just fine now
 	 */
-	mddev->array_size = size << (conf->chunk_shift-1);
+	mddev->array_sectors = size << conf->chunk_shift;
 	mddev->resync_max_sectors = size << conf->chunk_shift;
 
 	mddev->queue->unplug_fn = raid10_unplug;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 8f4c70a53210..42a480ba767b 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3540,7 +3540,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
 			    j == raid6_next_disk(sh->pd_idx, sh->disks))
 				continue;
 			s = compute_blocknr(sh, j);
-			if (s < (mddev->array_size<<1)) {
+			if (s < mddev->array_sectors) {
 				skipped = 1;
 				continue;
 			}
@@ -4189,7 +4189,7 @@ static int run(mddev_t *mddev)
 	mddev->queue->backing_dev_info.congested_data = mddev;
 	mddev->queue->backing_dev_info.congested_fn = raid5_congested;
 
-	mddev->array_size =  mddev->size * (conf->previous_raid_disks -
+	mddev->array_sectors = 2 * mddev->size * (conf->previous_raid_disks -
 					    conf->max_degraded);
 
 	blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec);
@@ -4413,8 +4413,9 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors)
 	raid5_conf_t *conf = mddev_to_conf(mddev);
 
 	sectors &= ~((sector_t)mddev->chunk_size/512 - 1);
-	mddev->array_size = (sectors * (mddev->raid_disks-conf->max_degraded))>>1;
-	set_capacity(mddev->gendisk, mddev->array_size << 1);
+	mddev->array_sectors = sectors * (mddev->raid_disks
+					  - conf->max_degraded);
+	set_capacity(mddev->gendisk, mddev->array_sectors);
 	mddev->changed = 1;
 	if (sectors/2  > mddev->size && mddev->recovery_cp == MaxSector) {
 		mddev->recovery_cp = mddev->size << 1;
@@ -4547,15 +4548,16 @@ static void end_reshape(raid5_conf_t *conf)
 	struct block_device *bdev;
 
 	if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
-		conf->mddev->array_size = conf->mddev->size *
+		conf->mddev->array_sectors = 2 * conf->mddev->size *
 			(conf->raid_disks - conf->max_degraded);
-		set_capacity(conf->mddev->gendisk, conf->mddev->array_size << 1);
+		set_capacity(conf->mddev->gendisk, conf->mddev->array_sectors);
 		conf->mddev->changed = 1;
 
 		bdev = bdget_disk(conf->mddev->gendisk, 0);
 		if (bdev) {
 			mutex_lock(&bdev->bd_inode->i_mutex);
-			i_size_write(bdev->bd_inode, (loff_t)conf->mddev->array_size << 10);
+			i_size_write(bdev->bd_inode,
+				     (loff_t)conf->mddev->array_sectors << 9);
 			mutex_unlock(&bdev->bd_inode->i_mutex);
 			bdput(bdev);
 		}
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h
index e37aaa41efc6..6f72b47ae41c 100644
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -150,7 +150,7 @@ struct mddev_s
 	int				raid_disks;
 	int				max_disks;
 	sector_t			size; /* used size of component devices */
-	sector_t			array_size; /* exported array size */
+	sector_t			array_sectors; /* exported array size */
 	__u64				events;
 
 	char				uuid[16];
-- 
cgit v1.2.3-70-g09d2


From d6e2215052810678bc9782fd980b52706fc71f50 Mon Sep 17 00:00:00 2001
From: Andre Noll <maan@systemlinux.org>
Date: Mon, 21 Jul 2008 17:05:25 +1000
Subject: md: linear: Make array_size sector-based and rename it to
 array_sectors.

Signed-off-by: Andre Noll <maan@systemlinux.org>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/linear.c         | 16 ++++++++--------
 include/linux/raid/linear.h |  2 +-
 2 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 57644a780f16..1cafaa959443 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -120,7 +120,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
 		return NULL;
 
 	cnt = 0;
-	conf->array_size = 0;
+	conf->array_sectors = 0;
 
 	rdev_for_each(rdev, tmp, mddev) {
 		int j = rdev->raid_disk;
@@ -144,7 +144,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
 			blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
 
 		disk->size = rdev->size;
-		conf->array_size += rdev->size;
+		conf->array_sectors += rdev->size * 2;
 
 		cnt++;
 	}
@@ -153,7 +153,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
 		goto out;
 	}
 
-	min_spacing = conf->array_size;
+	min_spacing = conf->array_sectors / 2;
 	sector_div(min_spacing, PAGE_SIZE/sizeof(struct dev_info *));
 
 	/* min_spacing is the minimum spacing that will fit the hash
@@ -162,7 +162,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
 	 * that is larger than min_spacing as use the size of that as
 	 * the actual spacing
 	 */
-	conf->hash_spacing = conf->array_size;
+	conf->hash_spacing = conf->array_sectors / 2;
 	for (i=0; i < cnt-1 ; i++) {
 		sector_t sz = 0;
 		int j;
@@ -192,7 +192,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
 		unsigned round;
 		unsigned long base;
 
-		sz = conf->array_size >> conf->preshift;
+		sz = conf->array_sectors >> (conf->preshift + 1);
 		sz += 1; /* force round-up */
 		base = conf->hash_spacing >> conf->preshift;
 		round = sector_div(sz, base);
@@ -219,7 +219,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
 	curr_offset = 0;
 	i = 0;
 	for (curr_offset = 0;
-	     curr_offset < conf->array_size;
+	     curr_offset < conf->array_sectors / 2;
 	     curr_offset += conf->hash_spacing) {
 
 		while (i < raid_disks-1 &&
@@ -256,7 +256,7 @@ static int linear_run (mddev_t *mddev)
 	if (!conf)
 		return 1;
 	mddev->private = conf;
-	mddev->array_sectors = conf->array_size * 2;
+	mddev->array_sectors = conf->array_sectors;
 
 	blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec);
 	mddev->queue->unplug_fn = linear_unplug;
@@ -290,7 +290,7 @@ static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev)
 	newconf->prev = mddev_to_conf(mddev);
 	mddev->private = newconf;
 	mddev->raid_disks++;
-	mddev->array_sectors = newconf->array_size * 2;
+	mddev->array_sectors = newconf->array_sectors;
 	set_capacity(mddev->gendisk, mddev->array_sectors);
 	return 0;
 }
diff --git a/include/linux/raid/linear.h b/include/linux/raid/linear.h
index ba15469daf11..7e375111d007 100644
--- a/include/linux/raid/linear.h
+++ b/include/linux/raid/linear.h
@@ -16,7 +16,7 @@ struct linear_private_data
 	struct linear_private_data *prev;	/* earlier version */
 	dev_info_t		**hash_table;
 	sector_t		hash_spacing;
-	sector_t		array_size;
+	sector_t		array_sectors;
 	int			preshift; /* shift before dividing by hash_spacing */
 	dev_info_t		disks[0];
 };
-- 
cgit v1.2.3-70-g09d2


From f2ea68cf42aafdd93393b6b8b20fc3c2b5f4390c Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 21 Jul 2008 17:05:25 +1000
Subject: md: only count actual openers as access which prevent a 'stop'

Open isn't the only thing that increments ->active.  e.g. reading
/proc/mdstat will increment it briefly.  So to avoid false positives
in testing for concurrent access, introduce a new counter that counts
just the number of times the md device it open.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c           | 9 ++++++---
 include/linux/raid/md_k.h | 3 ++-
 2 files changed, 8 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 4bfbc1982cda..450f30b6edf9 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -273,6 +273,7 @@ static mddev_t * mddev_find(dev_t unit)
 	INIT_LIST_HEAD(&new->all_mddevs);
 	init_timer(&new->safemode_timer);
 	atomic_set(&new->active, 1);
+	atomic_set(&new->openers, 0);
 	spin_lock_init(&new->write_lock);
 	init_waitqueue_head(&new->sb_wait);
 	init_waitqueue_head(&new->recovery_wait);
@@ -2695,14 +2696,14 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)
 		break;
 	case clear:
 		/* stopping an active array */
-		if (atomic_read(&mddev->active) > 1)
+		if (atomic_read(&mddev->openers) > 0)
 			return -EBUSY;
 		err = do_md_stop(mddev, 0, 0);
 		break;
 	case inactive:
 		/* stopping an active array */
 		if (mddev->pers) {
-			if (atomic_read(&mddev->active) > 1)
+			if (atomic_read(&mddev->openers) > 0)
 				return -EBUSY;
 			err = do_md_stop(mddev, 2, 0);
 		} else
@@ -3816,7 +3817,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
 	int err = 0;
 	struct gendisk *disk = mddev->gendisk;
 
-	if (atomic_read(&mddev->active) > 1 + is_open) {
+	if (atomic_read(&mddev->openers) > is_open) {
 		printk("md: %s still in use.\n",mdname(mddev));
 		return -EBUSY;
 	}
@@ -5014,6 +5015,7 @@ static int md_open(struct inode *inode, struct file *file)
 
 	err = 0;
 	mddev_get(mddev);
+	atomic_inc(&mddev->openers);
 	mddev_unlock(mddev);
 
 	check_disk_change(inode->i_bdev);
@@ -5026,6 +5028,7 @@ static int md_release(struct inode *inode, struct file * file)
  	mddev_t *mddev = inode->i_bdev->bd_disk->private_data;
 
 	BUG_ON(!mddev);
+	atomic_dec(&mddev->openers);
 	mddev_put(mddev);
 
 	return 0;
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h
index 6f72b47ae41c..4bef4791d80d 100644
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -215,7 +215,8 @@ struct mddev_s
 
 	int				in_sync;	/* know to not need resync */
 	struct mutex			reconfig_mutex;
-	atomic_t			active;
+	atomic_t			active;		/* general refcount */
+	atomic_t			openers;	/* number of active opens */
 
 	int				changed;	/* true if we might need to reread partition info */
 	int				degraded;	/* whether md should consider
-- 
cgit v1.2.3-70-g09d2


From 4b80991c6cb9efa607bc4fd6f3ecdf5511c31bb0 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 21 Jul 2008 17:05:25 +1000
Subject: md: Protect access to mddev->disks list using RCU

All modifications and most access to the mddev->disks list are made
under the reconfig_mutex lock.  However there are three places where
the list is walked without any locking.  If a reconfig happens at this
time, havoc (and oops) can ensue.

So use RCU to protect these accesses:
  - wrap them in rcu_read_{,un}lock()
  - use list_for_each_entry_rcu
  - add to the list with list_add_rcu
  - delete from the list with list_del_rcu
  - delay the 'free' with call_rcu rather than schedule_work

Note that export_rdev did a list_del_init on this list.  In almost all
cases the entry was not in the list anymore so it was a no-op and so
safe.  It is no longer safe as after list_del_rcu we may not touch
the list_head.
An audit shows that export_rdev is called:
  - after unbind_rdev_from_array, in which case the delete has
     already been done,
  - after bind_rdev_to_array fails, in which case the delete isn't needed.
  - before the device has been put on a list at all (e.g. in
      add_new_disk where reading the superblock fails).
  - and in autorun devices after a failure when the device is on a
      different list.

So remove the list_del_init call from export_rdev, and add it back
immediately before the called to export_rdev for that last case.

Note also that ->same_set is sometimes used for lists other than
mddev->list (e.g. candidates).  In these cases rcu is not needed.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/bitmap.c       | 15 ++++++++++-----
 drivers/md/md.c           | 30 ++++++++++++++++++------------
 include/linux/raid/md_k.h |  3 +++
 3 files changed, 31 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index eba83e25b678..621a272a2c74 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -241,10 +241,10 @@ static struct page *read_sb_page(mddev_t *mddev, long offset, unsigned long inde
 static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
 {
 	mdk_rdev_t *rdev;
-	struct list_head *tmp;
 	mddev_t *mddev = bitmap->mddev;
 
-	rdev_for_each(rdev, tmp, mddev)
+	rcu_read_lock();
+	rdev_for_each_rcu(rdev, mddev)
 		if (test_bit(In_sync, &rdev->flags)
 		    && !test_bit(Faulty, &rdev->flags)) {
 			int size = PAGE_SIZE;
@@ -260,11 +260,11 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
 				    + (long)(page->index * (PAGE_SIZE/512))
 				    + size/512 > 0)
 					/* bitmap runs in to metadata */
-					return -EINVAL;
+					goto bad_alignment;
 				if (rdev->data_offset + mddev->size*2
 				    > rdev->sb_start + bitmap->offset)
 					/* data runs in to bitmap */
-					return -EINVAL;
+					goto bad_alignment;
 			} else if (rdev->sb_start < rdev->data_offset) {
 				/* METADATA BITMAP DATA */
 				if (rdev->sb_start
@@ -272,7 +272,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
 				    + page->index*(PAGE_SIZE/512) + size/512
 				    > rdev->data_offset)
 					/* bitmap runs in to data */
-					return -EINVAL;
+					goto bad_alignment;
 			} else {
 				/* DATA METADATA BITMAP - no problems */
 			}
@@ -282,10 +282,15 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
 				       size,
 				       page);
 		}
+	rcu_read_unlock();
 
 	if (wait)
 		md_super_wait(mddev);
 	return 0;
+
+ bad_alignment:
+	rcu_read_unlock();
+	return -EINVAL;
 }
 
 static void bitmap_file_kick(struct bitmap *bitmap);
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 450f30b6edf9..c2ff77ccec50 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1395,15 +1395,17 @@ static struct super_type super_types[] = {
 
 static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
 {
-	struct list_head *tmp, *tmp2;
 	mdk_rdev_t *rdev, *rdev2;
 
-	rdev_for_each(rdev, tmp, mddev1)
-		rdev_for_each(rdev2, tmp2, mddev2)
+	rcu_read_lock();
+	rdev_for_each_rcu(rdev, mddev1)
+		rdev_for_each_rcu(rdev2, mddev2)
 			if (rdev->bdev->bd_contains ==
-			    rdev2->bdev->bd_contains)
+			    rdev2->bdev->bd_contains) {
+				rcu_read_unlock();
 				return 1;
-
+			}
+	rcu_read_unlock();
 	return 0;
 }
 
@@ -1470,7 +1472,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
 		kobject_del(&rdev->kobj);
 		goto fail;
 	}
-	list_add(&rdev->same_set, &mddev->disks);
+	list_add_rcu(&rdev->same_set, &mddev->disks);
 	bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk);
 	return 0;
 
@@ -1495,14 +1497,16 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev)
 		return;
 	}
 	bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk);
-	list_del_init(&rdev->same_set);
+	list_del_rcu(&rdev->same_set);
 	printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b));
 	rdev->mddev = NULL;
 	sysfs_remove_link(&rdev->kobj, "block");
 
 	/* We need to delay this, otherwise we can deadlock when
-	 * writing to 'remove' to "dev/state"
+	 * writing to 'remove' to "dev/state".  We also need
+	 * to delay it due to rcu usage.
 	 */
+	synchronize_rcu();
 	INIT_WORK(&rdev->del_work, md_delayed_delete);
 	kobject_get(&rdev->kobj);
 	schedule_work(&rdev->del_work);
@@ -1558,7 +1562,6 @@ static void export_rdev(mdk_rdev_t * rdev)
 	if (rdev->mddev)
 		MD_BUG();
 	free_disk_sb(rdev);
-	list_del_init(&rdev->same_set);
 #ifndef MODULE
 	if (test_bit(AutoDetected, &rdev->flags))
 		md_autodetect_dev(rdev->bdev->bd_dev);
@@ -4062,8 +4065,10 @@ static void autorun_devices(int part)
 		/* on success, candidates will be empty, on error
 		 * it won't...
 		 */
-		rdev_for_each_list(rdev, tmp, candidates)
+		rdev_for_each_list(rdev, tmp, candidates) {
+			list_del_init(&rdev->same_set);
 			export_rdev(rdev);
+		}
 		mddev_put(mddev);
 	}
 	printk(KERN_INFO "md: ... autorun DONE.\n");
@@ -5529,12 +5534,12 @@ int unregister_md_personality(struct mdk_personality *p)
 static int is_mddev_idle(mddev_t *mddev)
 {
 	mdk_rdev_t * rdev;
-	struct list_head *tmp;
 	int idle;
 	long curr_events;
 
 	idle = 1;
-	rdev_for_each(rdev, tmp, mddev) {
+	rcu_read_lock();
+	rdev_for_each_rcu(rdev, mddev) {
 		struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
 		curr_events = disk_stat_read(disk, sectors[0]) + 
 				disk_stat_read(disk, sectors[1]) - 
@@ -5566,6 +5571,7 @@ static int is_mddev_idle(mddev_t *mddev)
 			idle = 0;
 		}
 	}
+	rcu_read_unlock();
 	return idle;
 }
 
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h
index 4bef4791d80d..9f2549ac0e2d 100644
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -339,6 +339,9 @@ static inline char * mdname (mddev_t * mddev)
 #define rdev_for_each(rdev, tmp, mddev)				\
 	rdev_for_each_list(rdev, tmp, (mddev)->disks)
 
+#define rdev_for_each_rcu(rdev, mddev)				\
+	list_for_each_entry_rcu(rdev, &((mddev)->disks), same_set)
+
 typedef struct mdk_thread_s {
 	void			(*run) (mddev_t *mddev);
 	mddev_t			*mddev;
-- 
cgit v1.2.3-70-g09d2