From 4cc96131afce3eaae7c13dff41c6ba771cf10e96 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Thu, 12 May 2016 16:28:10 -0400
Subject: dm: move request-based code out to dm-rq.[hc]

Add some separation between bio-based and request-based DM core code.

'struct mapped_device' and other DM core only structures and functions
have been moved to dm-core.h and all relevant DM core .c files have been
updated to include dm-core.h rather than dm.h

DM targets should _never_ include dm-core.h!

[block core merge conflict resolution from Stephen Rothwell]
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
---
 drivers/md/Makefile     |    3 +-
 drivers/md/dm-builtin.c |    2 +-
 drivers/md/dm-core.h    |  149 +++++++
 drivers/md/dm-io.c      |    2 +-
 drivers/md/dm-ioctl.c   |    2 +-
 drivers/md/dm-kcopyd.c  |    2 +-
 drivers/md/dm-mpath.c   |    4 +-
 drivers/md/dm-rq.c      |  959 ++++++++++++++++++++++++++++++++++++++++
 drivers/md/dm-rq.h      |   64 +++
 drivers/md/dm-stats.c   |    2 +-
 drivers/md/dm-sysfs.c   |    3 +-
 drivers/md/dm-table.c   |    2 +-
 drivers/md/dm-target.c  |    2 +-
 drivers/md/dm.c         | 1110 +----------------------------------------------
 drivers/md/dm.h         |   25 +-
 15 files changed, 1200 insertions(+), 1131 deletions(-)
 create mode 100644 drivers/md/dm-core.h
 create mode 100644 drivers/md/dm-rq.c
 create mode 100644 drivers/md/dm-rq.h

(limited to 'drivers/md')

diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 52ba8dd82821..3cbda1af87a0 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -3,7 +3,8 @@
 #
 
 dm-mod-y	+= dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \
-		   dm-ioctl.o dm-io.o dm-kcopyd.o dm-sysfs.o dm-stats.o
+		   dm-ioctl.o dm-io.o dm-kcopyd.o dm-sysfs.o dm-stats.o \
+		   dm-rq.o
 dm-multipath-y	+= dm-path-selector.o dm-mpath.o
 dm-snapshot-y	+= dm-snap.o dm-exception-store.o dm-snap-transient.o \
 		    dm-snap-persistent.o
diff --git a/drivers/md/dm-builtin.c b/drivers/md/dm-builtin.c
index 6c9049c51b2b..f092771878c2 100644
--- a/drivers/md/dm-builtin.c
+++ b/drivers/md/dm-builtin.c
@@ -1,4 +1,4 @@
-#include "dm.h"
+#include "dm-core.h"
 
 /*
  * The kobject release method must not be placed in the module itself,
diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h
new file mode 100644
index 000000000000..40ceba1fe8be
--- /dev/null
+++ b/drivers/md/dm-core.h
@@ -0,0 +1,149 @@
+/*
+ * Internal header file _only_ for device mapper core
+ *
+ * Copyright (C) 2016 Red Hat, Inc. All rights reserved.
+ *
+ * This file is released under the LGPL.
+ */
+
+#ifndef DM_CORE_INTERNAL_H
+#define DM_CORE_INTERNAL_H
+
+#include <linux/kthread.h>
+#include <linux/ktime.h>
+#include <linux/blk-mq.h>
+
+#include <trace/events/block.h>
+
+#include "dm.h"
+
+#define DM_RESERVED_MAX_IOS		1024
+
+struct dm_kobject_holder {
+	struct kobject kobj;
+	struct completion completion;
+};
+
+/*
+ * DM core internal structure that is used directly by dm.c and dm-rq.c
+ * DM targets must _not_ dereference a mapped_device to directly access its members!
+ */
+struct mapped_device {
+	struct srcu_struct io_barrier;
+	struct mutex suspend_lock;
+
+	/*
+	 * The current mapping (struct dm_table *).
+	 * Use dm_get_live_table{_fast} or take suspend_lock for
+	 * dereference.
+	 */
+	void __rcu *map;
+
+	struct list_head table_devices;
+	struct mutex table_devices_lock;
+
+	unsigned long flags;
+
+	struct request_queue *queue;
+	int numa_node_id;
+
+	unsigned type;
+	/* Protect queue and type against concurrent access. */
+	struct mutex type_lock;
+
+	atomic_t holders;
+	atomic_t open_count;
+
+	struct dm_target *immutable_target;
+	struct target_type *immutable_target_type;
+
+	struct gendisk *disk;
+	char name[16];
+
+	void *interface_ptr;
+
+	/*
+	 * A list of ios that arrived while we were suspended.
+	 */
+	atomic_t pending[2];
+	wait_queue_head_t wait;
+	struct work_struct work;
+	spinlock_t deferred_lock;
+	struct bio_list deferred;
+
+	/*
+	 * Event handling.
+	 */
+	wait_queue_head_t eventq;
+	atomic_t event_nr;
+	atomic_t uevent_seq;
+	struct list_head uevent_list;
+	spinlock_t uevent_lock; /* Protect access to uevent_list */
+
+	/* the number of internal suspends */
+	unsigned internal_suspend_count;
+
+	/*
+	 * Processing queue (flush)
+	 */
+	struct workqueue_struct *wq;
+
+	/*
+	 * io objects are allocated from here.
+	 */
+	mempool_t *io_pool;
+	mempool_t *rq_pool;
+
+	struct bio_set *bs;
+
+	/*
+	 * freeze/thaw support requires holding onto a super block
+	 */
+	struct super_block *frozen_sb;
+
+	/* forced geometry settings */
+	struct hd_geometry geometry;
+
+	struct block_device *bdev;
+
+	/* kobject and completion */
+	struct dm_kobject_holder kobj_holder;
+
+	/* zero-length flush that will be cloned and submitted to targets */
+	struct bio flush_bio;
+
+	struct dm_stats stats;
+
+	struct kthread_worker kworker;
+	struct task_struct *kworker_task;
+
+	/* for request-based merge heuristic in dm_request_fn() */
+	unsigned seq_rq_merge_deadline_usecs;
+	int last_rq_rw;
+	sector_t last_rq_pos;
+	ktime_t last_rq_start_time;
+
+	/* for blk-mq request-based DM support */
+	struct blk_mq_tag_set *tag_set;
+	bool use_blk_mq:1;
+	bool init_tio_pdu:1;
+};
+
+void dm_init_md_queue(struct mapped_device *md);
+void dm_init_normal_md_queue(struct mapped_device *md);
+int md_in_flight(struct mapped_device *md);
+void disable_write_same(struct mapped_device *md);
+
+static inline struct completion *dm_get_completion_from_kobject(struct kobject *kobj)
+{
+	return &container_of(kobj, struct dm_kobject_holder, kobj)->completion;
+}
+
+unsigned __dm_get_module_param(unsigned *module_param, unsigned def, unsigned max);
+
+static inline bool dm_message_test_buffer_overflow(char *result, unsigned maxlen)
+{
+	return !maxlen || strlen(result) + 1 >= maxlen;
+}
+
+#endif
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 0e225fd4a8d1..daa03e41654a 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -5,7 +5,7 @@
  * This file is released under the GPL.
  */
 
-#include "dm.h"
+#include "dm-core.h"
 
 #include <linux/device-mapper.h>
 
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 2c7ca258c4e4..b59e34595ad8 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -5,7 +5,7 @@
  * This file is released under the GPL.
  */
 
-#include "dm.h"
+#include "dm-core.h"
 
 #include <linux/module.h>
 #include <linux/vmalloc.h>
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index 9da1d54ac6cb..9e9d04cb7d51 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -26,7 +26,7 @@
 #include <linux/device-mapper.h>
 #include <linux/dm-kcopyd.h>
 
-#include "dm.h"
+#include "dm-core.h"
 
 #define SUB_JOB_SIZE	128
 #define SPLIT_COUNT	8
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 52baf8a5b0f4..e1c07d1ec80b 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -7,7 +7,7 @@
 
 #include <linux/device-mapper.h>
 
-#include "dm.h"
+#include "dm-rq.h"
 #include "dm-path-selector.h"
 #include "dm-uevent.h"
 
@@ -1328,7 +1328,7 @@ static int do_end_io(struct multipath *m, struct request *clone,
 	 * during end I/O handling, since those clone requests don't have
 	 * bio clones.  If we queue them inside the multipath target,
 	 * we need to make bio clones, that requires memory allocation.
-	 * (See drivers/md/dm.c:end_clone_bio() about why the clone requests
+	 * (See drivers/md/dm-rq.c:end_clone_bio() about why the clone requests
 	 *  don't have bio clones.)
 	 * Instead of queueing the clone request here, we queue the original
 	 * request into dm core, which will remake a clone request and
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
new file mode 100644
index 000000000000..787c81b16a26
--- /dev/null
+++ b/drivers/md/dm-rq.c
@@ -0,0 +1,959 @@
+/*
+ * Copyright (C) 2016 Red Hat, Inc. All rights reserved.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-core.h"
+#include "dm-rq.h"
+
+#include <linux/elevator.h> /* for rq_end_sector() */
+#include <linux/blk-mq.h>
+
+#define DM_MSG_PREFIX "core-rq"
+
+#define DM_MQ_NR_HW_QUEUES 1
+#define DM_MQ_QUEUE_DEPTH 2048
+static unsigned dm_mq_nr_hw_queues = DM_MQ_NR_HW_QUEUES;
+static unsigned dm_mq_queue_depth = DM_MQ_QUEUE_DEPTH;
+
+/*
+ * Request-based DM's mempools' reserved IOs set by the user.
+ */
+#define RESERVED_REQUEST_BASED_IOS	256
+static unsigned reserved_rq_based_ios = RESERVED_REQUEST_BASED_IOS;
+
+#ifdef CONFIG_DM_MQ_DEFAULT
+static bool use_blk_mq = true;
+#else
+static bool use_blk_mq = false;
+#endif
+
+bool dm_use_blk_mq_default(void)
+{
+	return use_blk_mq;
+}
+
+bool dm_use_blk_mq(struct mapped_device *md)
+{
+	return md->use_blk_mq;
+}
+EXPORT_SYMBOL_GPL(dm_use_blk_mq);
+
+unsigned dm_get_reserved_rq_based_ios(void)
+{
+	return __dm_get_module_param(&reserved_rq_based_ios,
+				     RESERVED_REQUEST_BASED_IOS, DM_RESERVED_MAX_IOS);
+}
+EXPORT_SYMBOL_GPL(dm_get_reserved_rq_based_ios);
+
+static unsigned dm_get_blk_mq_nr_hw_queues(void)
+{
+	return __dm_get_module_param(&dm_mq_nr_hw_queues, DM_MQ_NR_HW_QUEUES, 32); /* (param, default, max) */
+}
+
+static unsigned dm_get_blk_mq_queue_depth(void)
+{
+	return __dm_get_module_param(&dm_mq_queue_depth,
+				     DM_MQ_QUEUE_DEPTH, BLK_MQ_MAX_DEPTH);
+}
+
+int dm_request_based(struct mapped_device *md)
+{
+	return blk_queue_stackable(md->queue);
+}
+
+static void dm_old_start_queue(struct request_queue *q)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(q->queue_lock, flags);
+	if (blk_queue_stopped(q))
+		blk_start_queue(q);
+	spin_unlock_irqrestore(q->queue_lock, flags);
+}
+
+void dm_start_queue(struct request_queue *q)
+{
+	if (!q->mq_ops)
+		dm_old_start_queue(q);
+	else {
+		blk_mq_start_stopped_hw_queues(q, true);
+		blk_mq_kick_requeue_list(q);
+	}
+}
+
+/*
+ * Stop a .request_fn queue unless it is already stopped; the check and
+ * the stop both happen under q->queue_lock.
+ */
+static void dm_old_stop_queue(struct request_queue *q)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(q->queue_lock, flags);
+	if (!blk_queue_stopped(q))
+		blk_stop_queue(q);
+	spin_unlock_irqrestore(q->queue_lock, flags);
+}
+
+void dm_stop_queue(struct request_queue *q)
+{
+	if (!q->mq_ops)
+		dm_old_stop_queue(q);
+	else
+		blk_mq_stop_hw_queues(q);
+}
+
+static struct dm_rq_target_io *alloc_old_rq_tio(struct mapped_device *md,
+						gfp_t gfp_mask)
+{
+	return mempool_alloc(md->io_pool, gfp_mask);
+}
+
+static void free_old_rq_tio(struct dm_rq_target_io *tio)
+{
+	mempool_free(tio, tio->md->io_pool);
+}
+
+static struct request *alloc_old_clone_request(struct mapped_device *md,
+					       gfp_t gfp_mask)
+{
+	return mempool_alloc(md->rq_pool, gfp_mask);
+}
+
+static void free_old_clone_request(struct mapped_device *md, struct request *rq)
+{
+	mempool_free(rq, md->rq_pool);
+}
+
+/*
+ * Partial completion handling for request-based dm (bi_end_io of clone bios)
+ */
+static void end_clone_bio(struct bio *clone)
+{
+	struct dm_rq_clone_bio_info *info =
+		container_of(clone, struct dm_rq_clone_bio_info, clone);
+	struct dm_rq_target_io *tio = info->tio;
+	struct bio *bio = info->orig;
+	unsigned int nr_bytes = info->orig->bi_iter.bi_size;
+	int error = clone->bi_error;
+
+	bio_put(clone);
+
+	if (tio->error)
+		/*
+		 * An error has already been detected on the request.
+		 * Once an error has occurred, just let clone->end_io() handle
+		 * the remainder.
+		 */
+		return;
+	else if (error) {
+		/*
+		 * Don't report the error to the upper layer yet.
+		 * The error handling decision is made by the target driver,
+		 * when the request is completed.
+		 */
+		tio->error = error;
+		return;
+	}
+
+	/*
+	 * I/O for the bio successfully completed.
+	 * Notify the upper layer of the data completion.
+	 */
+
+	/*
+	 * bios are processed from the head of the list.
+	 * So the completing bio should always be rq->bio.
+	 * If it's not, something is wrong.
+	 */
+	if (tio->orig->bio != bio)
+		DMERR("bio completion is going in the middle of the request");
+
+	/*
+	 * Update the original request.
+	 * Do not use blk_end_request() here, because it may complete
+	 * the original request before the clone, and break the ordering.
+	 */
+	blk_update_request(tio->orig, 0, nr_bytes);
+}
+
+static struct dm_rq_target_io *tio_from_request(struct request *rq)
+{
+	return (rq->q->mq_ops ? blk_mq_rq_to_pdu(rq) : rq->special);
+}
+
+static void rq_end_stats(struct mapped_device *md, struct request *orig)
+{
+	if (unlikely(dm_stats_used(&md->stats))) {
+		struct dm_rq_target_io *tio = tio_from_request(orig);
+		tio->duration_jiffies = jiffies - tio->duration_jiffies;
+		dm_stats_account_io(&md->stats, rq_data_dir(orig),
+				    blk_rq_pos(orig), tio->n_sectors, true,
+				    tio->duration_jiffies, &tio->stats_aux);
+	}
+}
+
+/*
+ * Don't touch any member of the md after calling this function because
+ * the md may be freed in dm_put() at the end of this function.
+ * Or do dm_get() before calling this function and dm_put() later.
+ */
+static void rq_completed(struct mapped_device *md, int rw, bool run_queue)
+{
+	atomic_dec(&md->pending[rw]);
+
+	/* nudge anyone waiting on suspend queue */
+	if (!md_in_flight(md))
+		wake_up(&md->wait);
+
+	/*
+	 * Run this off this callpath, as drivers could invoke end_io while
+	 * inside their request_fn (and holding the queue lock). Calling
+	 * back into ->request_fn() could deadlock attempting to grab the
+	 * queue lock again.
+	 */
+	if (!md->queue->mq_ops && run_queue)
+		blk_run_queue_async(md->queue);
+
+	/*
+	 * dm_put() must be at the end of this function. See the comment above
+	 */
+	dm_put(md);
+}
+
+static void free_rq_clone(struct request *clone)
+{
+	struct dm_rq_target_io *tio = clone->end_io_data;
+	struct mapped_device *md = tio->md;
+
+	blk_rq_unprep_clone(clone);
+
+	if (md->type == DM_TYPE_MQ_REQUEST_BASED)
+		/* stacked on blk-mq queue(s) */
+		tio->ti->type->release_clone_rq(clone);
+	else if (!md->queue->mq_ops)
+		/* request_fn queue stacked on request_fn queue(s) */
+		free_old_clone_request(md, clone);
+
+	if (!md->queue->mq_ops)
+		free_old_rq_tio(tio);
+}
+
+/*
+ * Complete the clone and the original request.
+ * Must be called without clone's queue lock held,
+ * see end_clone_request() for more details.
+ */
+static void dm_end_request(struct request *clone, int error)
+{
+	int rw = rq_data_dir(clone);
+	struct dm_rq_target_io *tio = clone->end_io_data;
+	struct mapped_device *md = tio->md;
+	struct request *rq = tio->orig;
+
+	if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
+		rq->errors = clone->errors;
+		rq->resid_len = clone->resid_len;
+
+		if (rq->sense)
+			/*
+			 * We are using the sense buffer of the original
+			 * request.
+			 * So setting the length of the sense data is enough.
+			 */
+			rq->sense_len = clone->sense_len;
+	}
+
+	free_rq_clone(clone);
+	rq_end_stats(md, rq);
+	if (!rq->q->mq_ops)
+		blk_end_request_all(rq, error);
+	else
+		blk_mq_end_request(rq, error);
+	rq_completed(md, rw, true);
+}
+
+static void dm_unprep_request(struct request *rq)
+{
+	struct dm_rq_target_io *tio = tio_from_request(rq);
+	struct request *clone = tio->clone;
+
+	if (!rq->q->mq_ops) {
+		rq->special = NULL;
+		rq->cmd_flags &= ~REQ_DONTPREP;
+	}
+
+	if (clone)
+		free_rq_clone(clone);
+	else if (!tio->md->queue->mq_ops)
+		free_old_rq_tio(tio);
+}
+
+/*
+ * Requeue the original request of a clone.
+ */
+static void dm_old_requeue_request(struct request *rq)
+{
+	struct request_queue *q = rq->q;
+	unsigned long flags;
+
+	spin_lock_irqsave(q->queue_lock, flags);
+	blk_requeue_request(q, rq);
+	blk_run_queue_async(q);
+	spin_unlock_irqrestore(q->queue_lock, flags);
+}
+
+static void dm_mq_requeue_request(struct request *rq)
+{
+	struct request_queue *q = rq->q;
+	unsigned long flags;
+
+	blk_mq_requeue_request(rq);
+	spin_lock_irqsave(q->queue_lock, flags);
+	if (!blk_queue_stopped(q))
+		blk_mq_kick_requeue_list(q);
+	spin_unlock_irqrestore(q->queue_lock, flags);
+}
+
+static void dm_requeue_original_request(struct mapped_device *md,
+					struct request *rq)
+{
+	int rw = rq_data_dir(rq);
+
+	rq_end_stats(md, rq);
+	dm_unprep_request(rq);
+
+	if (!rq->q->mq_ops)
+		dm_old_requeue_request(rq);
+	else
+		dm_mq_requeue_request(rq);
+
+	rq_completed(md, rw, false);
+}
+
+static void dm_done(struct request *clone, int error, bool mapped)
+{
+	int r = error;
+	struct dm_rq_target_io *tio = clone->end_io_data;
+	dm_request_endio_fn rq_end_io = NULL;
+
+	if (tio->ti) {
+		rq_end_io = tio->ti->type->rq_end_io;
+
+		if (mapped && rq_end_io)
+			r = rq_end_io(tio->ti, clone, error, &tio->info);
+	}
+
+	if (unlikely(r == -EREMOTEIO && (req_op(clone) == REQ_OP_WRITE_SAME) &&
+		     !clone->q->limits.max_write_same_sectors))
+		disable_write_same(tio->md);
+
+	if (r <= 0)
+		/* The target wants to complete the I/O */
+		dm_end_request(clone, r);
+	else if (r == DM_ENDIO_INCOMPLETE)
+		/* The target will handle the I/O */
+		return;
+	else if (r == DM_ENDIO_REQUEUE)
+		/* The target wants to requeue the I/O */
+		dm_requeue_original_request(tio->md, tio->orig);
+	else {
+		DMWARN("unimplemented target endio return value: %d", r);
+		BUG();
+	}
+}
+
+/*
+ * Request completion handler for request-based dm (softirq_done / .complete hook)
+ */
+static void dm_softirq_done(struct request *rq)
+{
+	bool mapped = true;
+	struct dm_rq_target_io *tio = tio_from_request(rq);
+	struct request *clone = tio->clone;
+	struct mapped_device *md = tio->md;
+	int rw = rq_data_dir(rq);
+
+	if (!clone) {
+		rq_end_stats(md, rq);
+		if (!rq->q->mq_ops) {
+			blk_end_request_all(rq, tio->error);
+			free_old_rq_tio(tio);
+		} else {
+			blk_mq_end_request(rq, tio->error);
+		}
+		/* Last: rq_completed() may drop the final md ref, so tio is freed first */
+		rq_completed(md, rw, false);
+		return;
+	}
+
+	if (rq->cmd_flags & REQ_FAILED)
+		mapped = false;
+
+	dm_done(clone, tio->error, mapped);
+}
+
+/*
+ * Complete the clone and the original request with the error status
+ * through softirq context.
+ */
+static void dm_complete_request(struct request *rq, int error)
+{
+	struct dm_rq_target_io *tio = tio_from_request(rq);
+
+	tio->error = error;
+	if (!rq->q->mq_ops)
+		blk_complete_request(rq);
+	else
+		blk_mq_complete_request(rq, error);
+}
+
+/*
+ * Complete the not-mapped clone and the original request with the error status
+ * through softirq context.
+ * Target's rq_end_io() function isn't called.
+ * This may be used when the target's map_rq() or clone_and_map_rq() functions fail.
+ */
+static void dm_kill_unmapped_request(struct request *rq, int error)
+{
+	rq->cmd_flags |= REQ_FAILED;
+	dm_complete_request(rq, error);
+}
+
+/*
+ * Called with the clone's queue lock held (in the case of .request_fn)
+ */
+static void end_clone_request(struct request *clone, int error)
+{
+	struct dm_rq_target_io *tio = clone->end_io_data;
+
+	if (!clone->q->mq_ops) {
+		/*
+		 * This only cleans up the bookkeeping of the queue to which
+		 * the clone was dispatched.
+		 * The clone is *NOT* actually freed here because it is allocated
+		 * from dm's own mempool (REQ_ALLOCED isn't set).
+		 */
+		__blk_put_request(clone->q, clone);
+	}
+
+	/*
+	 * Actual request completion is done in a softirq context which doesn't
+	 * hold the clone's queue lock.  Otherwise, deadlock could occur because:
+	 *     - another request may be submitted by the upper level driver
+	 *       of the stacking during the completion
+	 *     - the submission which requires queue lock may be done
+	 *       against this clone's queue
+	 */
+	dm_complete_request(tio->orig, error);
+}
+
+static void dm_dispatch_clone_request(struct request *clone, struct request *rq)
+{
+	int r;
+
+	if (blk_queue_io_stat(clone->q))
+		clone->cmd_flags |= REQ_IO_STAT;
+
+	clone->start_time = jiffies;
+	r = blk_insert_cloned_request(clone->q, clone);
+	if (r)
+		/* must complete clone in terms of original request */
+		dm_complete_request(rq, r);
+}
+
+static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,
+				 void *data)
+{
+	struct dm_rq_target_io *tio = data;
+	struct dm_rq_clone_bio_info *info =
+		container_of(bio, struct dm_rq_clone_bio_info, clone);
+
+	info->orig = bio_orig;
+	info->tio = tio;
+	bio->bi_end_io = end_clone_bio;
+
+	return 0;
+}
+
+static int setup_clone(struct request *clone, struct request *rq,
+		       struct dm_rq_target_io *tio, gfp_t gfp_mask)
+{
+	int r;
+
+	r = blk_rq_prep_clone(clone, rq, tio->md->bs, gfp_mask,
+			      dm_rq_bio_constructor, tio);
+	if (r)
+		return r;
+
+	clone->cmd = rq->cmd;
+	clone->cmd_len = rq->cmd_len;
+	clone->sense = rq->sense;
+	clone->end_io = end_clone_request;
+	clone->end_io_data = tio;
+
+	tio->clone = clone;
+
+	return 0;
+}
+
+static struct request *clone_old_rq(struct request *rq, struct mapped_device *md,
+				    struct dm_rq_target_io *tio, gfp_t gfp_mask)
+{
+	/*
+	 * Create clone for use with .request_fn request_queue
+	 */
+	struct request *clone;
+
+	clone = alloc_old_clone_request(md, gfp_mask);
+	if (!clone)
+		return NULL;
+
+	blk_rq_init(NULL, clone);
+	if (setup_clone(clone, rq, tio, gfp_mask)) {
+		/* -ENOMEM */
+		free_old_clone_request(md, clone);
+		return NULL;
+	}
+
+	return clone;
+}
+
+static void map_tio_request(struct kthread_work *work);
+
+static void init_tio(struct dm_rq_target_io *tio, struct request *rq,
+		     struct mapped_device *md)
+{
+	tio->md = md;
+	tio->ti = NULL;
+	tio->clone = NULL;
+	tio->orig = rq;
+	tio->error = 0;
+	/*
+	 * Avoid initializing info for blk-mq; it passes
+	 * target-specific data through info.ptr
+	 * (see: dm_mq_init_request)
+	 */
+	if (!md->init_tio_pdu)
+		memset(&tio->info, 0, sizeof(tio->info));
+	if (md->kworker_task)
+		init_kthread_work(&tio->work, map_tio_request);
+}
+
+static struct dm_rq_target_io *dm_old_prep_tio(struct request *rq,
+					       struct mapped_device *md,
+					       gfp_t gfp_mask)
+{
+	struct dm_rq_target_io *tio;
+	int srcu_idx;
+	struct dm_table *table;
+
+	tio = alloc_old_rq_tio(md, gfp_mask);
+	if (!tio)
+		return NULL;
+
+	init_tio(tio, rq, md);
+
+	table = dm_get_live_table(md, &srcu_idx);
+	/*
+	 * Must clone a request if this .request_fn DM device
+	 * is stacked on .request_fn device(s).
+	 */
+	if (!dm_table_mq_request_based(table)) {
+		if (!clone_old_rq(rq, md, tio, gfp_mask)) {
+			dm_put_live_table(md, srcu_idx);
+			free_old_rq_tio(tio);
+			return NULL;
+		}
+	}
+	dm_put_live_table(md, srcu_idx);
+
+	return tio;
+}
+
+/*
+ * Called with the queue lock held.
+ */
+static int dm_old_prep_fn(struct request_queue *q, struct request *rq)
+{
+	struct mapped_device *md = q->queuedata;
+	struct dm_rq_target_io *tio;
+
+	if (unlikely(rq->special)) {
+		DMWARN("Already has something in rq->special.");
+		return BLKPREP_KILL;
+	}
+
+	tio = dm_old_prep_tio(rq, md, GFP_ATOMIC);
+	if (!tio)
+		return BLKPREP_DEFER;
+
+	rq->special = tio;
+	rq->cmd_flags |= REQ_DONTPREP;
+
+	return BLKPREP_OK;
+}
+
+/*
+ * Returns:
+ * 0                : the request has been processed
+ * DM_MAPIO_REQUEUE : the original request needs to be requeued
+ * < 0              : the request was completed due to failure
+ */
+static int map_request(struct dm_rq_target_io *tio, struct request *rq,
+		       struct mapped_device *md)
+{
+	int r;
+	struct dm_target *ti = tio->ti;
+	struct request *clone = NULL;
+
+	if (tio->clone) {
+		clone = tio->clone;
+		r = ti->type->map_rq(ti, clone, &tio->info);
+	} else {
+		r = ti->type->clone_and_map_rq(ti, rq, &tio->info, &clone);
+		if (r < 0) {
+			/* The target wants to complete the I/O */
+			dm_kill_unmapped_request(rq, r);
+			return r;
+		}
+		if (r != DM_MAPIO_REMAPPED)
+			return r;
+		if (setup_clone(clone, rq, tio, GFP_ATOMIC)) {
+			/* -ENOMEM */
+			ti->type->release_clone_rq(clone);
+			return DM_MAPIO_REQUEUE;
+		}
+	}
+
+	switch (r) {
+	case DM_MAPIO_SUBMITTED:
+		/* The target has taken the I/O to submit by itself later */
+		break;
+	case DM_MAPIO_REMAPPED:
+		/* The target has remapped the I/O so dispatch it */
+		trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)),
+				     blk_rq_pos(rq));
+		dm_dispatch_clone_request(clone, rq);
+		break;
+	case DM_MAPIO_REQUEUE:
+		/* The target wants to requeue the I/O */
+		dm_requeue_original_request(md, tio->orig);
+		break;
+	default:
+		if (r > 0) {
+			DMWARN("unimplemented target map return value: %d", r);
+			BUG();
+		}
+
+		/* The target wants to complete the I/O */
+		dm_kill_unmapped_request(rq, r);
+		return r;
+	}
+
+	return 0;
+}
+
+static void dm_start_request(struct mapped_device *md, struct request *orig)
+{
+	if (!orig->q->mq_ops)
+		blk_start_request(orig);
+	else
+		blk_mq_start_request(orig);
+	atomic_inc(&md->pending[rq_data_dir(orig)]);
+
+	if (md->seq_rq_merge_deadline_usecs) {
+		md->last_rq_pos = rq_end_sector(orig);
+		md->last_rq_rw = rq_data_dir(orig);
+		md->last_rq_start_time = ktime_get();
+	}
+
+	if (unlikely(dm_stats_used(&md->stats))) {
+		struct dm_rq_target_io *tio = tio_from_request(orig);
+		tio->duration_jiffies = jiffies;
+		tio->n_sectors = blk_rq_sectors(orig);
+		dm_stats_account_io(&md->stats, rq_data_dir(orig),
+				    blk_rq_pos(orig), tio->n_sectors, false, 0,
+				    &tio->stats_aux);
+	}
+
+	/*
+	 * Hold the md reference here for the in-flight I/O.
+	 * We can't rely on the reference count by device opener,
+	 * because the device may be closed during the request completion
+	 * when all bios are completed.
+	 * See the comment in rq_completed() too.
+	 */
+	dm_get(md);
+}
+
+static void map_tio_request(struct kthread_work *work)
+{
+	struct dm_rq_target_io *tio = container_of(work, struct dm_rq_target_io, work);
+	struct request *rq = tio->orig;
+	struct mapped_device *md = tio->md;
+
+	if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE)
+		dm_requeue_original_request(md, rq);
+}
+
+ssize_t dm_attr_rq_based_seq_io_merge_deadline_show(struct mapped_device *md, char *buf)
+{
+	return sprintf(buf, "%u\n", md->seq_rq_merge_deadline_usecs);
+}
+
+#define MAX_SEQ_RQ_MERGE_DEADLINE_USECS 100000
+
+ssize_t dm_attr_rq_based_seq_io_merge_deadline_store(struct mapped_device *md,
+						     const char *buf, size_t count)
+{
+	unsigned deadline;
+
+	if (!dm_request_based(md) || md->use_blk_mq)
+		return count;
+
+	if (kstrtouint(buf, 10, &deadline))
+		return -EINVAL;
+
+	if (deadline > MAX_SEQ_RQ_MERGE_DEADLINE_USECS)
+		deadline = MAX_SEQ_RQ_MERGE_DEADLINE_USECS;
+
+	md->seq_rq_merge_deadline_usecs = deadline;
+
+	return count;
+}
+
+static bool dm_old_request_peeked_before_merge_deadline(struct mapped_device *md)
+{
+	ktime_t kt_deadline;
+
+	if (!md->seq_rq_merge_deadline_usecs)
+		return false;
+
+	kt_deadline = ns_to_ktime((u64)md->seq_rq_merge_deadline_usecs * NSEC_PER_USEC);
+	kt_deadline = ktime_add_safe(md->last_rq_start_time, kt_deadline);
+
+	return !ktime_after(ktime_get(), kt_deadline);
+}
+
+/*
+ * q->request_fn for old request-based dm.
+ * Called with the queue lock held.
+ */
+static void dm_old_request_fn(struct request_queue *q)
+{
+	struct mapped_device *md = q->queuedata;
+	struct dm_target *ti = md->immutable_target;
+	struct request *rq;
+	struct dm_rq_target_io *tio;
+	sector_t pos = 0;
+
+	if (unlikely(!ti)) {
+		int srcu_idx;
+		struct dm_table *map = dm_get_live_table(md, &srcu_idx);
+
+		ti = dm_table_find_target(map, pos);
+		dm_put_live_table(md, srcu_idx);
+	}
+
+	/*
+	 * For suspend, check blk_queue_stopped() and increment
+	 * ->pending within a single queue_lock not to increment the
+	 * number of in-flight I/Os after the queue is stopped in
+	 * dm_suspend().
+	 */
+	while (!blk_queue_stopped(q)) {
+		rq = blk_peek_request(q);
+		if (!rq)
+			return;
+
+		/* always use block 0 to find the target for flushes for now */
+		pos = 0;
+		if (req_op(rq) != REQ_OP_FLUSH)
+			pos = blk_rq_pos(rq);
+
+		if ((dm_old_request_peeked_before_merge_deadline(md) &&
+		     md_in_flight(md) && rq->bio && rq->bio->bi_vcnt == 1 &&
+		     md->last_rq_pos == pos && md->last_rq_rw == rq_data_dir(rq)) ||
+		    (ti->type->busy && ti->type->busy(ti))) {
+			blk_delay_queue(q, HZ / 100);
+			return;
+		}
+
+		dm_start_request(md, rq);
+
+		tio = tio_from_request(rq);
+		/* Establish tio->ti before queuing work (map_tio_request) */
+		tio->ti = ti;
+		queue_kthread_work(&md->kworker, &tio->work);
+		BUG_ON(!irqs_disabled());
+	}
+}
+
+/*
+ * Fully initialize a .request_fn request-based queue; returns 0 or -errno.
+ */
+int dm_old_init_request_queue(struct mapped_device *md)
+{
+	struct task_struct *worker;
+
+	if (!blk_init_allocated_queue(md->queue, dm_old_request_fn, NULL))
+		return -EINVAL;
+	/* disable dm_old_request_fn's merge heuristic by default */
+	md->seq_rq_merge_deadline_usecs = 0;
+	dm_init_normal_md_queue(md);
+	blk_queue_softirq_done(md->queue, dm_softirq_done);
+	blk_queue_prep_rq(md->queue, dm_old_prep_fn);
+
+	/* Initialize the request-based DM worker thread */
+	init_kthread_worker(&md->kworker);
+	worker = kthread_run(kthread_worker_fn, &md->kworker,
+			     "kdmwork-%s", dm_device_name(md));
+	if (IS_ERR(worker))
+		return PTR_ERR(worker);
+	md->kworker_task = worker; /* never store an ERR_PTR: init_tio() only NULL-checks it */
+	elv_register_queue(md->queue);
+	return 0;
+}
+
+static int dm_mq_init_request(void *data, struct request *rq,
+		       unsigned int hctx_idx, unsigned int request_idx,
+		       unsigned int numa_node)
+{
+	struct mapped_device *md = data;
+	struct dm_rq_target_io *tio = blk_mq_rq_to_pdu(rq);
+
+	/*
+	 * Must initialize md member of tio, otherwise it won't
+	 * be available in dm_mq_queue_rq.
+	 */
+	tio->md = md;
+
+	if (md->init_tio_pdu) {
+		/* target-specific per-io data is immediately after the tio */
+		tio->info.ptr = tio + 1;
+	}
+
+	return 0;
+}
+
+static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
+			  const struct blk_mq_queue_data *bd)
+{
+	struct request *rq = bd->rq;
+	struct dm_rq_target_io *tio = blk_mq_rq_to_pdu(rq);
+	struct mapped_device *md = tio->md;
+	struct dm_target *ti = md->immutable_target;
+
+	if (unlikely(!ti)) {
+		int srcu_idx;
+		struct dm_table *map = dm_get_live_table(md, &srcu_idx);
+
+		ti = dm_table_find_target(map, 0);
+		dm_put_live_table(md, srcu_idx);
+	}
+
+	if (ti->type->busy && ti->type->busy(ti))
+		return BLK_MQ_RQ_QUEUE_BUSY;
+
+	dm_start_request(md, rq);
+
+	/* Init tio using md established in .init_request */
+	init_tio(tio, rq, md);
+
+	/*
+	 * Establish tio->ti before calling map_request().
+	 */
+	tio->ti = ti;
+
+	/* Direct call is fine since .queue_rq allows allocations */
+	if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE) {
+		/* Undo dm_start_request() before requeuing */
+		rq_end_stats(md, rq);
+		rq_completed(md, rq_data_dir(rq), false);
+		return BLK_MQ_RQ_QUEUE_BUSY;
+	}
+
+	return BLK_MQ_RQ_QUEUE_OK;
+}
+
+static struct blk_mq_ops dm_mq_ops = {
+	.queue_rq = dm_mq_queue_rq,
+	.map_queue = blk_mq_map_queue,
+	.complete = dm_softirq_done,
+	.init_request = dm_mq_init_request,
+};
+
+/* Initialize md->queue as a blk-mq queue; returns 0 or a negative errno. */
+int dm_mq_init_request_queue(struct mapped_device *md, struct dm_target *immutable_tgt)
+{
+	struct request_queue *q;
+	int err;
+
+	if (dm_get_md_type(md) == DM_TYPE_REQUEST_BASED) {
+		DMERR("request-based dm-mq may only be stacked on blk-mq device(s)");
+		return -EINVAL;
+	}
+
+	md->tag_set = kzalloc_node(sizeof(struct blk_mq_tag_set), GFP_KERNEL, md->numa_node_id);
+	if (!md->tag_set)
+		return -ENOMEM;
+
+	md->tag_set->ops = &dm_mq_ops;
+	md->tag_set->queue_depth = dm_get_blk_mq_queue_depth();
+	md->tag_set->numa_node = md->numa_node_id;
+	md->tag_set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
+	md->tag_set->nr_hw_queues = dm_get_blk_mq_nr_hw_queues();
+	md->tag_set->driver_data = md;
+
+	md->tag_set->cmd_size = sizeof(struct dm_rq_target_io);
+	if (immutable_tgt && immutable_tgt->per_io_data_size) {
+		/* any target-specific per-io data is immediately after the tio */
+		md->tag_set->cmd_size += immutable_tgt->per_io_data_size;
+		md->init_tio_pdu = true;
+	}
+
+	err = blk_mq_alloc_tag_set(md->tag_set);
+	if (err)
+		goto out_kfree_tag_set;
+
+	q = blk_mq_init_allocated_queue(md->tag_set, md->queue);
+	if (IS_ERR(q)) {
+		err = PTR_ERR(q);
+		goto out_tag_set;
+	}
+	dm_init_md_queue(md);
+	/* backfill 'mq' sysfs registration normally done in blk_register_queue */
+	blk_mq_register_disk(md->disk);
+
+	return 0;
+
+out_tag_set:
+	blk_mq_free_tag_set(md->tag_set);
+out_kfree_tag_set:
+	kfree(md->tag_set);
+	md->tag_set = NULL;	/* dm_mq_cleanup_mapped_device() must not free it again */
+	return err;
+}
+
+void dm_mq_cleanup_mapped_device(struct mapped_device *md)
+{
+	if (md->tag_set) {
+		blk_mq_free_tag_set(md->tag_set);
+		kfree(md->tag_set);
+	}
+}
+
+module_param(reserved_rq_based_ios, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(reserved_rq_based_ios, "Reserved IOs in request-based mempools");
+
+module_param(use_blk_mq, bool, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(use_blk_mq, "Use block multiqueue for request-based DM devices");
+
+module_param(dm_mq_nr_hw_queues, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(dm_mq_nr_hw_queues, "Number of hardware queues for request-based dm-mq devices");
+
+module_param(dm_mq_queue_depth, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(dm_mq_queue_depth, "Queue depth for request-based dm-mq devices");
diff --git a/drivers/md/dm-rq.h b/drivers/md/dm-rq.h
new file mode 100644
index 000000000000..1559f6486024
--- /dev/null
+++ b/drivers/md/dm-rq.h
@@ -0,0 +1,64 @@
+/*
+ * Internal header file for request-based device mapper
+ *
+ * Copyright (C) 2016 Red Hat, Inc. All rights reserved.
+ *
+ * This file is released under the LGPL.
+ */
+
+#ifndef DM_RQ_INTERNAL_H
+#define DM_RQ_INTERNAL_H
+
+#include <linux/bio.h>
+#include <linux/kthread.h>
+
+#include "dm-stats.h"
+
+struct mapped_device;
+
+/*
+ * Per-request state for request-based dm; one of these is allocated per request.
+ */
+struct dm_rq_target_io {
+	struct mapped_device *md;	/* device the request was issued to */
+	struct dm_target *ti;		/* target chosen to map the request */
+	struct request *orig, *clone;	/* original request and its clone (NULL if none) */
+	struct kthread_work work;	/* kthread work queued to md->kworker to map the request */
+	int error;			/* error status recorded for the request */
+	union map_info info;		/* per-request context handed to the target's hooks */
+	struct dm_stats_aux stats_aux;	/* scratch handed to dm_stats_account_io() */
+	unsigned long duration_jiffies;	/* stats: start jiffies, then elapsed jiffies at end */
+	unsigned n_sectors;		/* stats: request size in sectors, sampled at start */
+};
+
+/*
+ * For request-based dm - the bio clones we allocate are embedded in these
+ * structs.
+ *
+ * We allocate these with bio_alloc_bioset, using the front_pad parameter when
+ * the bioset is created - this means the bio has to come at the end of the
+ * struct.
+ */
+struct dm_rq_clone_bio_info {
+	struct bio *orig;		/* bio of the original request that was cloned */
+	struct dm_rq_target_io *tio;	/* per-request state the clone belongs to */
+	struct bio clone;		/* the clone itself; must remain the last member */
+};
+
+bool dm_use_blk_mq_default(void);
+bool dm_use_blk_mq(struct mapped_device *md);
+
+int dm_old_init_request_queue(struct mapped_device *md);
+int dm_mq_init_request_queue(struct mapped_device *md, struct dm_target *immutable_tgt);
+void dm_mq_cleanup_mapped_device(struct mapped_device *md);
+
+void dm_start_queue(struct request_queue *q);
+void dm_stop_queue(struct request_queue *q);
+
+unsigned dm_get_reserved_rq_based_ios(void);
+
+ssize_t dm_attr_rq_based_seq_io_merge_deadline_show(struct mapped_device *md, char *buf);
+ssize_t dm_attr_rq_based_seq_io_merge_deadline_store(struct mapped_device *md,
+						     const char *buf, size_t count);
+
+#endif
diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c
index 4fba26cd6bdb..38b05f23b96c 100644
--- a/drivers/md/dm-stats.c
+++ b/drivers/md/dm-stats.c
@@ -10,7 +10,7 @@
 #include <linux/module.h>
 #include <linux/device-mapper.h>
 
-#include "dm.h"
+#include "dm-core.h"
 #include "dm-stats.h"
 
 #define DM_MSG_PREFIX "stats"
diff --git a/drivers/md/dm-sysfs.c b/drivers/md/dm-sysfs.c
index 7e818f5f1dc4..c209b8a19b84 100644
--- a/drivers/md/dm-sysfs.c
+++ b/drivers/md/dm-sysfs.c
@@ -6,7 +6,8 @@
 
 #include <linux/sysfs.h>
 #include <linux/dm-ioctl.h>
-#include "dm.h"
+#include "dm-core.h"
+#include "dm-rq.h"
 
 struct dm_sysfs_attr {
 	struct attribute attr;
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 626a5ec04466..a682d51111dd 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -5,7 +5,7 @@
  * This file is released under the GPL.
  */
 
-#include "dm.h"
+#include "dm-core.h"
 
 #include <linux/module.h>
 #include <linux/vmalloc.h>
diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c
index a317dd884ba6..5c826b450aad 100644
--- a/drivers/md/dm-target.c
+++ b/drivers/md/dm-target.c
@@ -4,7 +4,7 @@
  * This file is released under the GPL.
  */
 
-#include "dm.h"
+#include "dm-core.h"
 
 #include <linux/module.h>
 #include <linux/init.h>
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index aba7ed9abb3a..8f22527134e9 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -5,13 +5,13 @@
  * This file is released under the GPL.
  */
 
-#include "dm.h"
+#include "dm-core.h"
+#include "dm-rq.h"
 #include "dm-uevent.h"
 
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/mutex.h>
-#include <linux/moduleparam.h>
 #include <linux/blkpg.h>
 #include <linux/bio.h>
 #include <linux/mempool.h>
@@ -20,14 +20,8 @@
 #include <linux/hdreg.h>
 #include <linux/delay.h>
 #include <linux/wait.h>
-#include <linux/kthread.h>
-#include <linux/ktime.h>
-#include <linux/elevator.h> /* for rq_end_sector() */
-#include <linux/blk-mq.h>
 #include <linux/pr.h>
 
-#include <trace/events/block.h>
-
 #define DM_MSG_PREFIX "core"
 
 #ifdef CONFIG_PRINTK
@@ -63,7 +57,6 @@ static DECLARE_WORK(deferred_remove_work, do_deferred_remove);
 static struct workqueue_struct *deferred_remove_workqueue;
 
 /*
- * For bio-based dm.
  * One of these is allocated per bio.
  */
 struct dm_io {
@@ -76,36 +69,6 @@ struct dm_io {
 	struct dm_stats_aux stats_aux;
 };
 
-/*
- * For request-based dm.
- * One of these is allocated per request.
- */
-struct dm_rq_target_io {
-	struct mapped_device *md;
-	struct dm_target *ti;
-	struct request *orig, *clone;
-	struct kthread_work work;
-	int error;
-	union map_info info;
-	struct dm_stats_aux stats_aux;
-	unsigned long duration_jiffies;
-	unsigned n_sectors;
-};
-
-/*
- * For request-based dm - the bio clones we allocate are embedded in these
- * structs.
- *
- * We allocate these with bio_alloc_bioset, using the front_pad parameter when
- * the bioset is created - this means the bio has to come at the end of the
- * struct.
- */
-struct dm_rq_clone_bio_info {
-	struct bio *orig;
-	struct dm_rq_target_io *tio;
-	struct bio clone;
-};
-
 #define MINOR_ALLOCED ((void *)-1)
 
 /*
@@ -120,130 +83,9 @@ struct dm_rq_clone_bio_info {
 #define DMF_DEFERRED_REMOVE 6
 #define DMF_SUSPENDED_INTERNALLY 7
 
-/*
- * Work processed by per-device workqueue.
- */
-struct mapped_device {
-	struct srcu_struct io_barrier;
-	struct mutex suspend_lock;
-
-	/*
-	 * The current mapping (struct dm_table *).
-	 * Use dm_get_live_table{_fast} or take suspend_lock for
-	 * dereference.
-	 */
-	void __rcu *map;
-
-	struct list_head table_devices;
-	struct mutex table_devices_lock;
-
-	unsigned long flags;
-
-	struct request_queue *queue;
-	int numa_node_id;
-
-	unsigned type;
-	/* Protect queue and type against concurrent access. */
-	struct mutex type_lock;
-
-	atomic_t holders;
-	atomic_t open_count;
-
-	struct dm_target *immutable_target;
-	struct target_type *immutable_target_type;
-
-	struct gendisk *disk;
-	char name[16];
-
-	void *interface_ptr;
-
-	/*
-	 * A list of ios that arrived while we were suspended.
-	 */
-	atomic_t pending[2];
-	wait_queue_head_t wait;
-	struct work_struct work;
-	spinlock_t deferred_lock;
-	struct bio_list deferred;
-
-	/*
-	 * Event handling.
-	 */
-	wait_queue_head_t eventq;
-	atomic_t event_nr;
-	atomic_t uevent_seq;
-	struct list_head uevent_list;
-	spinlock_t uevent_lock; /* Protect access to uevent_list */
-
-	/* the number of internal suspends */
-	unsigned internal_suspend_count;
-
-	/*
-	 * Processing queue (flush)
-	 */
-	struct workqueue_struct *wq;
-
-	/*
-	 * io objects are allocated from here.
-	 */
-	mempool_t *io_pool;
-	mempool_t *rq_pool;
-
-	struct bio_set *bs;
-
-	/*
-	 * freeze/thaw support require holding onto a super block
-	 */
-	struct super_block *frozen_sb;
-
-	/* forced geometry settings */
-	struct hd_geometry geometry;
-
-	struct block_device *bdev;
-
-	/* kobject and completion */
-	struct dm_kobject_holder kobj_holder;
-
-	/* zero-length flush that will be cloned and submitted to targets */
-	struct bio flush_bio;
-
-	struct dm_stats stats;
-
-	struct kthread_worker kworker;
-	struct task_struct *kworker_task;
-
-	/* for request-based merge heuristic in dm_request_fn() */
-	unsigned seq_rq_merge_deadline_usecs;
-	int last_rq_rw;
-	sector_t last_rq_pos;
-	ktime_t last_rq_start_time;
-
-	/* for blk-mq request-based DM support */
-	struct blk_mq_tag_set *tag_set;
-	bool use_blk_mq:1;
-	bool init_tio_pdu:1;
-};
-
-#ifdef CONFIG_DM_MQ_DEFAULT
-static bool use_blk_mq = true;
-#else
-static bool use_blk_mq = false;
-#endif
-
-#define DM_MQ_NR_HW_QUEUES 1
-#define DM_MQ_QUEUE_DEPTH 2048
 #define DM_NUMA_NODE NUMA_NO_NODE
-
-static unsigned dm_mq_nr_hw_queues = DM_MQ_NR_HW_QUEUES;
-static unsigned dm_mq_queue_depth = DM_MQ_QUEUE_DEPTH;
 static int dm_numa_node = DM_NUMA_NODE;
 
-bool dm_use_blk_mq(struct mapped_device *md)
-{
-	return md->use_blk_mq;
-}
-EXPORT_SYMBOL_GPL(dm_use_blk_mq);
-
 /*
  * For mempools pre-allocation at the table loading time.
  */
@@ -259,9 +101,6 @@ struct table_device {
 	struct dm_dev dm_dev;
 };
 
-#define RESERVED_BIO_BASED_IOS		16
-#define RESERVED_REQUEST_BASED_IOS	256
-#define RESERVED_MAX_IOS		1024
 static struct kmem_cache *_io_cache;
 static struct kmem_cache *_rq_tio_cache;
 static struct kmem_cache *_rq_cache;
@@ -269,13 +108,9 @@ static struct kmem_cache *_rq_cache;
 /*
  * Bio-based DM's mempools' reserved IOs set by the user.
  */
+#define RESERVED_BIO_BASED_IOS		16
 static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS;
 
-/*
- * Request-based DM's mempools' reserved IOs set by the user.
- */
-static unsigned reserved_rq_based_ios = RESERVED_REQUEST_BASED_IOS;
-
 static int __dm_get_module_param_int(int *module_param, int min, int max)
 {
 	int param = ACCESS_ONCE(*module_param);
@@ -297,8 +132,8 @@ static int __dm_get_module_param_int(int *module_param, int min, int max)
 	return param;
 }
 
-static unsigned __dm_get_module_param(unsigned *module_param,
-				      unsigned def, unsigned max)
+unsigned __dm_get_module_param(unsigned *module_param,
+			       unsigned def, unsigned max)
 {
 	unsigned param = ACCESS_ONCE(*module_param);
 	unsigned modified_param = 0;
@@ -319,28 +154,10 @@ static unsigned __dm_get_module_param(unsigned *module_param,
 unsigned dm_get_reserved_bio_based_ios(void)
 {
 	return __dm_get_module_param(&reserved_bio_based_ios,
-				     RESERVED_BIO_BASED_IOS, RESERVED_MAX_IOS);
+				     RESERVED_BIO_BASED_IOS, DM_RESERVED_MAX_IOS);
 }
 EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios);
 
-unsigned dm_get_reserved_rq_based_ios(void)
-{
-	return __dm_get_module_param(&reserved_rq_based_ios,
-				     RESERVED_REQUEST_BASED_IOS, RESERVED_MAX_IOS);
-}
-EXPORT_SYMBOL_GPL(dm_get_reserved_rq_based_ios);
-
-static unsigned dm_get_blk_mq_nr_hw_queues(void)
-{
-	return __dm_get_module_param(&dm_mq_nr_hw_queues, 1, 32);
-}
-
-static unsigned dm_get_blk_mq_queue_depth(void)
-{
-	return __dm_get_module_param(&dm_mq_queue_depth,
-				     DM_MQ_QUEUE_DEPTH, BLK_MQ_MAX_DEPTH);
-}
-
 static unsigned dm_get_numa_node(void)
 {
 	return __dm_get_module_param_int(&dm_numa_node,
@@ -679,29 +496,7 @@ static void free_tio(struct dm_target_io *tio)
 	bio_put(&tio->clone);
 }
 
-static struct dm_rq_target_io *alloc_old_rq_tio(struct mapped_device *md,
-						gfp_t gfp_mask)
-{
-	return mempool_alloc(md->io_pool, gfp_mask);
-}
-
-static void free_old_rq_tio(struct dm_rq_target_io *tio)
-{
-	mempool_free(tio, tio->md->io_pool);
-}
-
-static struct request *alloc_old_clone_request(struct mapped_device *md,
-					       gfp_t gfp_mask)
-{
-	return mempool_alloc(md->rq_pool, gfp_mask);
-}
-
-static void free_old_clone_request(struct mapped_device *md, struct request *rq)
-{
-	mempool_free(rq, md->rq_pool);
-}
-
-static int md_in_flight(struct mapped_device *md)
+int md_in_flight(struct mapped_device *md)
 {
 	return atomic_read(&md->pending[READ]) +
 	       atomic_read(&md->pending[WRITE]);
@@ -1019,7 +814,7 @@ static void dec_pending(struct dm_io *io, int error)
 	}
 }
 
-static void disable_write_same(struct mapped_device *md)
+void disable_write_same(struct mapped_device *md)
 {
 	struct queue_limits *limits = dm_get_queue_limits(md);
 
@@ -1061,371 +856,6 @@ static void clone_endio(struct bio *bio)
 	dec_pending(io, error);
 }
 
-/*
- * Partial completion handling for request-based dm
- */
-static void end_clone_bio(struct bio *clone)
-{
-	struct dm_rq_clone_bio_info *info =
-		container_of(clone, struct dm_rq_clone_bio_info, clone);
-	struct dm_rq_target_io *tio = info->tio;
-	struct bio *bio = info->orig;
-	unsigned int nr_bytes = info->orig->bi_iter.bi_size;
-	int error = clone->bi_error;
-
-	bio_put(clone);
-
-	if (tio->error)
-		/*
-		 * An error has already been detected on the request.
-		 * Once error occurred, just let clone->end_io() handle
-		 * the remainder.
-		 */
-		return;
-	else if (error) {
-		/*
-		 * Don't notice the error to the upper layer yet.
-		 * The error handling decision is made by the target driver,
-		 * when the request is completed.
-		 */
-		tio->error = error;
-		return;
-	}
-
-	/*
-	 * I/O for the bio successfully completed.
-	 * Notice the data completion to the upper layer.
-	 */
-
-	/*
-	 * bios are processed from the head of the list.
-	 * So the completing bio should always be rq->bio.
-	 * If it's not, something wrong is happening.
-	 */
-	if (tio->orig->bio != bio)
-		DMERR("bio completion is going in the middle of the request");
-
-	/*
-	 * Update the original request.
-	 * Do not use blk_end_request() here, because it may complete
-	 * the original request before the clone, and break the ordering.
-	 */
-	blk_update_request(tio->orig, 0, nr_bytes);
-}
-
-static struct dm_rq_target_io *tio_from_request(struct request *rq)
-{
-	return (rq->q->mq_ops ? blk_mq_rq_to_pdu(rq) : rq->special);
-}
-
-static void rq_end_stats(struct mapped_device *md, struct request *orig)
-{
-	if (unlikely(dm_stats_used(&md->stats))) {
-		struct dm_rq_target_io *tio = tio_from_request(orig);
-		tio->duration_jiffies = jiffies - tio->duration_jiffies;
-		dm_stats_account_io(&md->stats, rq_data_dir(orig),
-				    blk_rq_pos(orig), tio->n_sectors, true,
-				    tio->duration_jiffies, &tio->stats_aux);
-	}
-}
-
-/*
- * Don't touch any member of the md after calling this function because
- * the md may be freed in dm_put() at the end of this function.
- * Or do dm_get() before calling this function and dm_put() later.
- */
-static void rq_completed(struct mapped_device *md, int rw, bool run_queue)
-{
-	atomic_dec(&md->pending[rw]);
-
-	/* nudge anyone waiting on suspend queue */
-	if (!md_in_flight(md))
-		wake_up(&md->wait);
-
-	/*
-	 * Run this off this callpath, as drivers could invoke end_io while
-	 * inside their request_fn (and holding the queue lock). Calling
-	 * back into ->request_fn() could deadlock attempting to grab the
-	 * queue lock again.
-	 */
-	if (!md->queue->mq_ops && run_queue)
-		blk_run_queue_async(md->queue);
-
-	/*
-	 * dm_put() must be at the end of this function. See the comment above
-	 */
-	dm_put(md);
-}
-
-static void free_rq_clone(struct request *clone)
-{
-	struct dm_rq_target_io *tio = clone->end_io_data;
-	struct mapped_device *md = tio->md;
-
-	blk_rq_unprep_clone(clone);
-
-	if (md->type == DM_TYPE_MQ_REQUEST_BASED)
-		/* stacked on blk-mq queue(s) */
-		tio->ti->type->release_clone_rq(clone);
-	else if (!md->queue->mq_ops)
-		/* request_fn queue stacked on request_fn queue(s) */
-		free_old_clone_request(md, clone);
-
-	if (!md->queue->mq_ops)
-		free_old_rq_tio(tio);
-}
-
-/*
- * Complete the clone and the original request.
- * Must be called without clone's queue lock held,
- * see end_clone_request() for more details.
- */
-static void dm_end_request(struct request *clone, int error)
-{
-	int rw = rq_data_dir(clone);
-	struct dm_rq_target_io *tio = clone->end_io_data;
-	struct mapped_device *md = tio->md;
-	struct request *rq = tio->orig;
-
-	if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
-		rq->errors = clone->errors;
-		rq->resid_len = clone->resid_len;
-
-		if (rq->sense)
-			/*
-			 * We are using the sense buffer of the original
-			 * request.
-			 * So setting the length of the sense data is enough.
-			 */
-			rq->sense_len = clone->sense_len;
-	}
-
-	free_rq_clone(clone);
-	rq_end_stats(md, rq);
-	if (!rq->q->mq_ops)
-		blk_end_request_all(rq, error);
-	else
-		blk_mq_end_request(rq, error);
-	rq_completed(md, rw, true);
-}
-
-static void dm_unprep_request(struct request *rq)
-{
-	struct dm_rq_target_io *tio = tio_from_request(rq);
-	struct request *clone = tio->clone;
-
-	if (!rq->q->mq_ops) {
-		rq->special = NULL;
-		rq->cmd_flags &= ~REQ_DONTPREP;
-	}
-
-	if (clone)
-		free_rq_clone(clone);
-	else if (!tio->md->queue->mq_ops)
-		free_old_rq_tio(tio);
-}
-
-/*
- * Requeue the original request of a clone.
- */
-static void dm_old_requeue_request(struct request *rq)
-{
-	struct request_queue *q = rq->q;
-	unsigned long flags;
-
-	spin_lock_irqsave(q->queue_lock, flags);
-	blk_requeue_request(q, rq);
-	blk_run_queue_async(q);
-	spin_unlock_irqrestore(q->queue_lock, flags);
-}
-
-static void dm_mq_requeue_request(struct request *rq)
-{
-	struct request_queue *q = rq->q;
-	unsigned long flags;
-
-	blk_mq_requeue_request(rq);
-	spin_lock_irqsave(q->queue_lock, flags);
-	if (!blk_queue_stopped(q))
-		blk_mq_kick_requeue_list(q);
-	spin_unlock_irqrestore(q->queue_lock, flags);
-}
-
-static void dm_requeue_original_request(struct mapped_device *md,
-					struct request *rq)
-{
-	int rw = rq_data_dir(rq);
-
-	rq_end_stats(md, rq);
-	dm_unprep_request(rq);
-
-	if (!rq->q->mq_ops)
-		dm_old_requeue_request(rq);
-	else
-		dm_mq_requeue_request(rq);
-
-	rq_completed(md, rw, false);
-}
-
-static void dm_old_stop_queue(struct request_queue *q)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(q->queue_lock, flags);
-	if (blk_queue_stopped(q)) {
-		spin_unlock_irqrestore(q->queue_lock, flags);
-		return;
-	}
-
-	blk_stop_queue(q);
-	spin_unlock_irqrestore(q->queue_lock, flags);
-}
-
-static void dm_stop_queue(struct request_queue *q)
-{
-	if (!q->mq_ops)
-		dm_old_stop_queue(q);
-	else
-		blk_mq_stop_hw_queues(q);
-}
-
-static void dm_old_start_queue(struct request_queue *q)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(q->queue_lock, flags);
-	if (blk_queue_stopped(q))
-		blk_start_queue(q);
-	spin_unlock_irqrestore(q->queue_lock, flags);
-}
-
-static void dm_start_queue(struct request_queue *q)
-{
-	if (!q->mq_ops)
-		dm_old_start_queue(q);
-	else {
-		blk_mq_start_stopped_hw_queues(q, true);
-		blk_mq_kick_requeue_list(q);
-	}
-}
-
-static void dm_done(struct request *clone, int error, bool mapped)
-{
-	int r = error;
-	struct dm_rq_target_io *tio = clone->end_io_data;
-	dm_request_endio_fn rq_end_io = NULL;
-
-	if (tio->ti) {
-		rq_end_io = tio->ti->type->rq_end_io;
-
-		if (mapped && rq_end_io)
-			r = rq_end_io(tio->ti, clone, error, &tio->info);
-	}
-
-	if (unlikely(r == -EREMOTEIO && (req_op(clone) == REQ_OP_WRITE_SAME) &&
-		     !clone->q->limits.max_write_same_sectors))
-		disable_write_same(tio->md);
-
-	if (r <= 0)
-		/* The target wants to complete the I/O */
-		dm_end_request(clone, r);
-	else if (r == DM_ENDIO_INCOMPLETE)
-		/* The target will handle the I/O */
-		return;
-	else if (r == DM_ENDIO_REQUEUE)
-		/* The target wants to requeue the I/O */
-		dm_requeue_original_request(tio->md, tio->orig);
-	else {
-		DMWARN("unimplemented target endio return value: %d", r);
-		BUG();
-	}
-}
-
-/*
- * Request completion handler for request-based dm
- */
-static void dm_softirq_done(struct request *rq)
-{
-	bool mapped = true;
-	struct dm_rq_target_io *tio = tio_from_request(rq);
-	struct request *clone = tio->clone;
-	int rw;
-
-	if (!clone) {
-		rq_end_stats(tio->md, rq);
-		rw = rq_data_dir(rq);
-		if (!rq->q->mq_ops) {
-			blk_end_request_all(rq, tio->error);
-			rq_completed(tio->md, rw, false);
-			free_old_rq_tio(tio);
-		} else {
-			blk_mq_end_request(rq, tio->error);
-			rq_completed(tio->md, rw, false);
-		}
-		return;
-	}
-
-	if (rq->cmd_flags & REQ_FAILED)
-		mapped = false;
-
-	dm_done(clone, tio->error, mapped);
-}
-
-/*
- * Complete the clone and the original request with the error status
- * through softirq context.
- */
-static void dm_complete_request(struct request *rq, int error)
-{
-	struct dm_rq_target_io *tio = tio_from_request(rq);
-
-	tio->error = error;
-	if (!rq->q->mq_ops)
-		blk_complete_request(rq);
-	else
-		blk_mq_complete_request(rq, error);
-}
-
-/*
- * Complete the not-mapped clone and the original request with the error status
- * through softirq context.
- * Target's rq_end_io() function isn't called.
- * This may be used when the target's map_rq() or clone_and_map_rq() functions fail.
- */
-static void dm_kill_unmapped_request(struct request *rq, int error)
-{
-	rq->cmd_flags |= REQ_FAILED;
-	dm_complete_request(rq, error);
-}
-
-/*
- * Called with the clone's queue lock held (in the case of .request_fn)
- */
-static void end_clone_request(struct request *clone, int error)
-{
-	struct dm_rq_target_io *tio = clone->end_io_data;
-
-	if (!clone->q->mq_ops) {
-		/*
-		 * For just cleaning up the information of the queue in which
-		 * the clone was dispatched.
-		 * The clone is *NOT* freed actually here because it is alloced
-		 * from dm own mempool (REQ_ALLOCED isn't set).
-		 */
-		__blk_put_request(clone->q, clone);
-	}
-
-	/*
-	 * Actual request completion is done in a softirq context which doesn't
-	 * hold the clone's queue lock.  Otherwise, deadlock could occur because:
-	 *     - another request may be submitted by the upper level driver
-	 *       of the stacking during the completion
-	 *     - the submission which requires queue lock may be done
-	 *       against this clone's queue
-	 */
-	dm_complete_request(tio->orig, error);
-}
-
 /*
  * Return maximum size of I/O possible at the supplied sector up to the current
  * target boundary.
@@ -1845,353 +1275,6 @@ static blk_qc_t dm_make_request(struct request_queue *q, struct bio *bio)
 	return BLK_QC_T_NONE;
 }
 
-int dm_request_based(struct mapped_device *md)
-{
-	return blk_queue_stackable(md->queue);
-}
-
-static void dm_dispatch_clone_request(struct request *clone, struct request *rq)
-{
-	int r;
-
-	if (blk_queue_io_stat(clone->q))
-		clone->cmd_flags |= REQ_IO_STAT;
-
-	clone->start_time = jiffies;
-	r = blk_insert_cloned_request(clone->q, clone);
-	if (r)
-		/* must complete clone in terms of original request */
-		dm_complete_request(rq, r);
-}
-
-static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,
-				 void *data)
-{
-	struct dm_rq_target_io *tio = data;
-	struct dm_rq_clone_bio_info *info =
-		container_of(bio, struct dm_rq_clone_bio_info, clone);
-
-	info->orig = bio_orig;
-	info->tio = tio;
-	bio->bi_end_io = end_clone_bio;
-
-	return 0;
-}
-
-static int setup_clone(struct request *clone, struct request *rq,
-		       struct dm_rq_target_io *tio, gfp_t gfp_mask)
-{
-	int r;
-
-	r = blk_rq_prep_clone(clone, rq, tio->md->bs, gfp_mask,
-			      dm_rq_bio_constructor, tio);
-	if (r)
-		return r;
-
-	clone->cmd = rq->cmd;
-	clone->cmd_len = rq->cmd_len;
-	clone->sense = rq->sense;
-	clone->end_io = end_clone_request;
-	clone->end_io_data = tio;
-
-	tio->clone = clone;
-
-	return 0;
-}
-
-static struct request *clone_old_rq(struct request *rq, struct mapped_device *md,
-				    struct dm_rq_target_io *tio, gfp_t gfp_mask)
-{
-	/*
-	 * Create clone for use with .request_fn request_queue
-	 */
-	struct request *clone;
-
-	clone = alloc_old_clone_request(md, gfp_mask);
-	if (!clone)
-		return NULL;
-
-	blk_rq_init(NULL, clone);
-	if (setup_clone(clone, rq, tio, gfp_mask)) {
-		/* -ENOMEM */
-		free_old_clone_request(md, clone);
-		return NULL;
-	}
-
-	return clone;
-}
-
-static void map_tio_request(struct kthread_work *work);
-
-static void init_tio(struct dm_rq_target_io *tio, struct request *rq,
-		     struct mapped_device *md)
-{
-	tio->md = md;
-	tio->ti = NULL;
-	tio->clone = NULL;
-	tio->orig = rq;
-	tio->error = 0;
-	/*
-	 * Avoid initializing info for blk-mq; it passes
-	 * target-specific data through info.ptr
-	 * (see: dm_mq_init_request)
-	 */
-	if (!md->init_tio_pdu)
-		memset(&tio->info, 0, sizeof(tio->info));
-	if (md->kworker_task)
-		init_kthread_work(&tio->work, map_tio_request);
-}
-
-static struct dm_rq_target_io *dm_old_prep_tio(struct request *rq,
-					       struct mapped_device *md,
-					       gfp_t gfp_mask)
-{
-	struct dm_rq_target_io *tio;
-	int srcu_idx;
-	struct dm_table *table;
-
-	tio = alloc_old_rq_tio(md, gfp_mask);
-	if (!tio)
-		return NULL;
-
-	init_tio(tio, rq, md);
-
-	table = dm_get_live_table(md, &srcu_idx);
-	/*
-	 * Must clone a request if this .request_fn DM device
-	 * is stacked on .request_fn device(s).
-	 */
-	if (!dm_table_mq_request_based(table)) {
-		if (!clone_old_rq(rq, md, tio, gfp_mask)) {
-			dm_put_live_table(md, srcu_idx);
-			free_old_rq_tio(tio);
-			return NULL;
-		}
-	}
-	dm_put_live_table(md, srcu_idx);
-
-	return tio;
-}
-
-/*
- * Called with the queue lock held.
- */
-static int dm_old_prep_fn(struct request_queue *q, struct request *rq)
-{
-	struct mapped_device *md = q->queuedata;
-	struct dm_rq_target_io *tio;
-
-	if (unlikely(rq->special)) {
-		DMWARN("Already has something in rq->special.");
-		return BLKPREP_KILL;
-	}
-
-	tio = dm_old_prep_tio(rq, md, GFP_ATOMIC);
-	if (!tio)
-		return BLKPREP_DEFER;
-
-	rq->special = tio;
-	rq->cmd_flags |= REQ_DONTPREP;
-
-	return BLKPREP_OK;
-}
-
-/*
- * Returns:
- * 0                : the request has been processed
- * DM_MAPIO_REQUEUE : the original request needs to be requeued
- * < 0              : the request was completed due to failure
- */
-static int map_request(struct dm_rq_target_io *tio, struct request *rq,
-		       struct mapped_device *md)
-{
-	int r;
-	struct dm_target *ti = tio->ti;
-	struct request *clone = NULL;
-
-	if (tio->clone) {
-		clone = tio->clone;
-		r = ti->type->map_rq(ti, clone, &tio->info);
-	} else {
-		r = ti->type->clone_and_map_rq(ti, rq, &tio->info, &clone);
-		if (r < 0) {
-			/* The target wants to complete the I/O */
-			dm_kill_unmapped_request(rq, r);
-			return r;
-		}
-		if (r != DM_MAPIO_REMAPPED)
-			return r;
-		if (setup_clone(clone, rq, tio, GFP_ATOMIC)) {
-			/* -ENOMEM */
-			ti->type->release_clone_rq(clone);
-			return DM_MAPIO_REQUEUE;
-		}
-	}
-
-	switch (r) {
-	case DM_MAPIO_SUBMITTED:
-		/* The target has taken the I/O to submit by itself later */
-		break;
-	case DM_MAPIO_REMAPPED:
-		/* The target has remapped the I/O so dispatch it */
-		trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)),
-				     blk_rq_pos(rq));
-		dm_dispatch_clone_request(clone, rq);
-		break;
-	case DM_MAPIO_REQUEUE:
-		/* The target wants to requeue the I/O */
-		dm_requeue_original_request(md, tio->orig);
-		break;
-	default:
-		if (r > 0) {
-			DMWARN("unimplemented target map return value: %d", r);
-			BUG();
-		}
-
-		/* The target wants to complete the I/O */
-		dm_kill_unmapped_request(rq, r);
-		return r;
-	}
-
-	return 0;
-}
-
-static void map_tio_request(struct kthread_work *work)
-{
-	struct dm_rq_target_io *tio = container_of(work, struct dm_rq_target_io, work);
-	struct request *rq = tio->orig;
-	struct mapped_device *md = tio->md;
-
-	if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE)
-		dm_requeue_original_request(md, rq);
-}
-
-static void dm_start_request(struct mapped_device *md, struct request *orig)
-{
-	if (!orig->q->mq_ops)
-		blk_start_request(orig);
-	else
-		blk_mq_start_request(orig);
-	atomic_inc(&md->pending[rq_data_dir(orig)]);
-
-	if (md->seq_rq_merge_deadline_usecs) {
-		md->last_rq_pos = rq_end_sector(orig);
-		md->last_rq_rw = rq_data_dir(orig);
-		md->last_rq_start_time = ktime_get();
-	}
-
-	if (unlikely(dm_stats_used(&md->stats))) {
-		struct dm_rq_target_io *tio = tio_from_request(orig);
-		tio->duration_jiffies = jiffies;
-		tio->n_sectors = blk_rq_sectors(orig);
-		dm_stats_account_io(&md->stats, rq_data_dir(orig),
-				    blk_rq_pos(orig), tio->n_sectors, false, 0,
-				    &tio->stats_aux);
-	}
-
-	/*
-	 * Hold the md reference here for the in-flight I/O.
-	 * We can't rely on the reference count by device opener,
-	 * because the device may be closed during the request completion
-	 * when all bios are completed.
-	 * See the comment in rq_completed() too.
-	 */
-	dm_get(md);
-}
-
-#define MAX_SEQ_RQ_MERGE_DEADLINE_USECS 100000
-
-ssize_t dm_attr_rq_based_seq_io_merge_deadline_show(struct mapped_device *md, char *buf)
-{
-	return sprintf(buf, "%u\n", md->seq_rq_merge_deadline_usecs);
-}
-
-ssize_t dm_attr_rq_based_seq_io_merge_deadline_store(struct mapped_device *md,
-						     const char *buf, size_t count)
-{
-	unsigned deadline;
-
-	if (!dm_request_based(md) || md->use_blk_mq)
-		return count;
-
-	if (kstrtouint(buf, 10, &deadline))
-		return -EINVAL;
-
-	if (deadline > MAX_SEQ_RQ_MERGE_DEADLINE_USECS)
-		deadline = MAX_SEQ_RQ_MERGE_DEADLINE_USECS;
-
-	md->seq_rq_merge_deadline_usecs = deadline;
-
-	return count;
-}
-
-static bool dm_request_peeked_before_merge_deadline(struct mapped_device *md)
-{
-	ktime_t kt_deadline;
-
-	if (!md->seq_rq_merge_deadline_usecs)
-		return false;
-
-	kt_deadline = ns_to_ktime((u64)md->seq_rq_merge_deadline_usecs * NSEC_PER_USEC);
-	kt_deadline = ktime_add_safe(md->last_rq_start_time, kt_deadline);
-
-	return !ktime_after(ktime_get(), kt_deadline);
-}
-
-/*
- * q->request_fn for request-based dm.
- * Called with the queue lock held.
- */
-static void dm_request_fn(struct request_queue *q)
-{
-	struct mapped_device *md = q->queuedata;
-	struct dm_target *ti = md->immutable_target;
-	struct request *rq;
-	struct dm_rq_target_io *tio;
-	sector_t pos = 0;
-
-	if (unlikely(!ti)) {
-		int srcu_idx;
-		struct dm_table *map = dm_get_live_table(md, &srcu_idx);
-
-		ti = dm_table_find_target(map, pos);
-		dm_put_live_table(md, srcu_idx);
-	}
-
-	/*
-	 * For suspend, check blk_queue_stopped() and increment
-	 * ->pending within a single queue_lock not to increment the
-	 * number of in-flight I/Os after the queue is stopped in
-	 * dm_suspend().
-	 */
-	while (!blk_queue_stopped(q)) {
-		rq = blk_peek_request(q);
-		if (!rq)
-			return;
-
-		/* always use block 0 to find the target for flushes for now */
-		pos = 0;
-		if (req_op(rq) != REQ_OP_FLUSH)
-			pos = blk_rq_pos(rq);
-
-		if ((dm_request_peeked_before_merge_deadline(md) &&
-		     md_in_flight(md) && rq->bio && rq->bio->bi_vcnt == 1 &&
-		     md->last_rq_pos == pos && md->last_rq_rw == rq_data_dir(rq)) ||
-		    (ti->type->busy && ti->type->busy(ti))) {
-			blk_delay_queue(q, HZ / 100);
-			return;
-		}
-
-		dm_start_request(md, rq);
-
-		tio = tio_from_request(rq);
-		/* Establish tio->ti before queuing work (map_tio_request) */
-		tio->ti = ti;
-		queue_kthread_work(&md->kworker, &tio->work);
-		BUG_ON(!irqs_disabled());
-	}
-}
-
 static int dm_any_congested(void *congested_data, int bdi_bits)
 {
 	int r = bdi_bits;
@@ -2269,7 +1352,7 @@ static const struct block_device_operations dm_blk_dops;
 
 static void dm_wq_work(struct work_struct *work);
 
-static void dm_init_md_queue(struct mapped_device *md)
+void dm_init_md_queue(struct mapped_device *md)
 {
 	/*
 	 * Request-based dm devices cannot be stacked on top of bio-based dm
@@ -2290,7 +1373,7 @@ static void dm_init_md_queue(struct mapped_device *md)
 	md->queue->backing_dev_info.congested_data = md;
 }
 
-static void dm_init_normal_md_queue(struct mapped_device *md)
+void dm_init_normal_md_queue(struct mapped_device *md)
 {
 	md->use_blk_mq = false;
 	dm_init_md_queue(md);
@@ -2330,6 +1413,8 @@ static void cleanup_mapped_device(struct mapped_device *md)
 		bdput(md->bdev);
 		md->bdev = NULL;
 	}
+
+	dm_mq_cleanup_mapped_device(md);
 }
 
 /*
@@ -2363,7 +1448,7 @@ static struct mapped_device *alloc_dev(int minor)
 		goto bad_io_barrier;
 
 	md->numa_node_id = numa_node_id;
-	md->use_blk_mq = use_blk_mq;
+	md->use_blk_mq = dm_use_blk_mq_default();
 	md->init_tio_pdu = false;
 	md->type = DM_TYPE_NONE;
 	mutex_init(&md->suspend_lock);
@@ -2448,10 +1533,6 @@ static void free_dev(struct mapped_device *md)
 	unlock_fs(md);
 
 	cleanup_mapped_device(md);
-	if (md->tag_set) {
-		blk_mq_free_tag_set(md->tag_set);
-		kfree(md->tag_set);
-	}
 
 	free_table_devices(&md->table_devices);
 	dm_stats_cleanup(&md->stats);
@@ -2657,159 +1738,6 @@ struct queue_limits *dm_get_queue_limits(struct mapped_device *md)
 }
 EXPORT_SYMBOL_GPL(dm_get_queue_limits);
 
-static void dm_old_init_rq_based_worker_thread(struct mapped_device *md)
-{
-	/* Initialize the request-based DM worker thread */
-	init_kthread_worker(&md->kworker);
-	md->kworker_task = kthread_run(kthread_worker_fn, &md->kworker,
-				       "kdmwork-%s", dm_device_name(md));
-}
-
-/*
- * Fully initialize a .request_fn request-based queue.
- */
-static int dm_old_init_request_queue(struct mapped_device *md)
-{
-	/* Fully initialize the queue */
-	if (!blk_init_allocated_queue(md->queue, dm_request_fn, NULL))
-		return -EINVAL;
-
-	/* disable dm_request_fn's merge heuristic by default */
-	md->seq_rq_merge_deadline_usecs = 0;
-
-	dm_init_normal_md_queue(md);
-	blk_queue_softirq_done(md->queue, dm_softirq_done);
-	blk_queue_prep_rq(md->queue, dm_old_prep_fn);
-
-	dm_old_init_rq_based_worker_thread(md);
-
-	elv_register_queue(md->queue);
-
-	return 0;
-}
-
-static int dm_mq_init_request(void *data, struct request *rq,
-			      unsigned int hctx_idx, unsigned int request_idx,
-			      unsigned int numa_node)
-{
-	struct mapped_device *md = data;
-	struct dm_rq_target_io *tio = blk_mq_rq_to_pdu(rq);
-
-	/*
-	 * Must initialize md member of tio, otherwise it won't
-	 * be available in dm_mq_queue_rq.
-	 */
-	tio->md = md;
-
-	if (md->init_tio_pdu) {
-		/* target-specific per-io data is immediately after the tio */
-		tio->info.ptr = tio + 1;
-	}
-
-	return 0;
-}
-
-static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
-			  const struct blk_mq_queue_data *bd)
-{
-	struct request *rq = bd->rq;
-	struct dm_rq_target_io *tio = blk_mq_rq_to_pdu(rq);
-	struct mapped_device *md = tio->md;
-	struct dm_target *ti = md->immutable_target;
-
-	if (unlikely(!ti)) {
-		int srcu_idx;
-		struct dm_table *map = dm_get_live_table(md, &srcu_idx);
-
-		ti = dm_table_find_target(map, 0);
-		dm_put_live_table(md, srcu_idx);
-	}
-
-	if (ti->type->busy && ti->type->busy(ti))
-		return BLK_MQ_RQ_QUEUE_BUSY;
-
-	dm_start_request(md, rq);
-
-	/* Init tio using md established in .init_request */
-	init_tio(tio, rq, md);
-
-	/*
-	 * Establish tio->ti before queuing work (map_tio_request)
-	 * or making direct call to map_request().
-	 */
-	tio->ti = ti;
-
-	/* Direct call is fine since .queue_rq allows allocations */
-	if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE) {
-		/* Undo dm_start_request() before requeuing */
-		rq_end_stats(md, rq);
-		rq_completed(md, rq_data_dir(rq), false);
-		return BLK_MQ_RQ_QUEUE_BUSY;
-	}
-
-	return BLK_MQ_RQ_QUEUE_OK;
-}
-
-static struct blk_mq_ops dm_mq_ops = {
-	.queue_rq = dm_mq_queue_rq,
-	.map_queue = blk_mq_map_queue,
-	.complete = dm_softirq_done,
-	.init_request = dm_mq_init_request,
-};
-
-static int dm_mq_init_request_queue(struct mapped_device *md,
-				    struct dm_target *immutable_tgt)
-{
-	struct request_queue *q;
-	int err;
-
-	if (dm_get_md_type(md) == DM_TYPE_REQUEST_BASED) {
-		DMERR("request-based dm-mq may only be stacked on blk-mq device(s)");
-		return -EINVAL;
-	}
-
-	md->tag_set = kzalloc_node(sizeof(struct blk_mq_tag_set), GFP_KERNEL, md->numa_node_id);
-	if (!md->tag_set)
-		return -ENOMEM;
-
-	md->tag_set->ops = &dm_mq_ops;
-	md->tag_set->queue_depth = dm_get_blk_mq_queue_depth();
-	md->tag_set->numa_node = md->numa_node_id;
-	md->tag_set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
-	md->tag_set->nr_hw_queues = dm_get_blk_mq_nr_hw_queues();
-	md->tag_set->driver_data = md;
-
-	md->tag_set->cmd_size = sizeof(struct dm_rq_target_io);
-	if (immutable_tgt && immutable_tgt->per_io_data_size) {
-		/* any target-specific per-io data is immediately after the tio */
-		md->tag_set->cmd_size += immutable_tgt->per_io_data_size;
-		md->init_tio_pdu = true;
-	}
-
-	err = blk_mq_alloc_tag_set(md->tag_set);
-	if (err)
-		goto out_kfree_tag_set;
-
-	q = blk_mq_init_allocated_queue(md->tag_set, md->queue);
-	if (IS_ERR(q)) {
-		err = PTR_ERR(q);
-		goto out_tag_set;
-	}
-	dm_init_md_queue(md);
-
-	/* backfill 'mq' sysfs registration normally done in blk_register_queue */
-	blk_mq_register_disk(md->disk);
-
-	return 0;
-
-out_tag_set:
-	blk_mq_free_tag_set(md->tag_set);
-out_kfree_tag_set:
-	kfree(md->tag_set);
-
-	return err;
-}
-
 static unsigned filter_md_type(unsigned type, struct mapped_device *md)
 {
 	if (type == DM_TYPE_BIO_BASED)
@@ -3741,18 +2669,6 @@ MODULE_PARM_DESC(major, "The major number of the device mapper");
 module_param(reserved_bio_based_ios, uint, S_IRUGO | S_IWUSR);
 MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools");
 
-module_param(reserved_rq_based_ios, uint, S_IRUGO | S_IWUSR);
-MODULE_PARM_DESC(reserved_rq_based_ios, "Reserved IOs in request-based mempools");
-
-module_param(use_blk_mq, bool, S_IRUGO | S_IWUSR);
-MODULE_PARM_DESC(use_blk_mq, "Use block multiqueue for request-based DM devices");
-
-module_param(dm_mq_nr_hw_queues, uint, S_IRUGO | S_IWUSR);
-MODULE_PARM_DESC(dm_mq_nr_hw_queues, "Number of hardware queues for request-based dm-mq devices");
-
-module_param(dm_mq_queue_depth, uint, S_IRUGO | S_IWUSR);
-MODULE_PARM_DESC(dm_mq_queue_depth, "Queue depth for request-based dm-mq devices");
-
 module_param(dm_numa_node, int, S_IRUGO | S_IWUSR);
 MODULE_PARM_DESC(dm_numa_node, "NUMA node for DM device memory allocations");
 
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 13a758ec0f88..b611b3064a7c 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -13,6 +13,7 @@
 #include <linux/fs.h>
 #include <linux/device-mapper.h>
 #include <linux/list.h>
+#include <linux/moduleparam.h>
 #include <linux/blkdev.h>
 #include <linux/backing-dev.h>
 #include <linux/hdreg.h>
@@ -161,16 +162,6 @@ void dm_interface_exit(void);
 /*
  * sysfs interface
  */
-struct dm_kobject_holder {
-	struct kobject kobj;
-	struct completion completion;
-};
-
-static inline struct completion *dm_get_completion_from_kobject(struct kobject *kobj)
-{
-	return &container_of(kobj, struct dm_kobject_holder, kobj)->completion;
-}
-
 int dm_sysfs_init(struct mapped_device *md);
 void dm_sysfs_exit(struct mapped_device *md);
 struct kobject *dm_kobject(struct mapped_device *md);
@@ -212,8 +203,6 @@ int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
 void dm_internal_suspend(struct mapped_device *md);
 void dm_internal_resume(struct mapped_device *md);
 
-bool dm_use_blk_mq(struct mapped_device *md);
-
 int dm_io_init(void);
 void dm_io_exit(void);
 
@@ -228,18 +217,8 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned t
 void dm_free_md_mempools(struct dm_md_mempools *pools);
 
 /*
- * Helpers that are used by DM core
+ * Various helpers
  */
 unsigned dm_get_reserved_bio_based_ios(void);
-unsigned dm_get_reserved_rq_based_ios(void);
-
-static inline bool dm_message_test_buffer_overflow(char *result, unsigned maxlen)
-{
-	return !maxlen || strlen(result) + 1 >= maxlen;
-}
-
-ssize_t dm_attr_rq_based_seq_io_merge_deadline_show(struct mapped_device *md, char *buf);
-ssize_t dm_attr_rq_based_seq_io_merge_deadline_store(struct mapped_device *md,
-						     const char *buf, size_t count);
 
 #endif
-- 
cgit v1.2.3-70-g09d2


From 76e33fe4e2c4363c2b9f627472bd43dc235c3406 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Thu, 19 May 2016 16:15:14 -0400
Subject: dm mpath: reinstate bio-based support

Add "multipath-bio" target that offers a bio-based multipath target as
an alternative to the request-based "multipath" target -- but in a
following commit "multipath-bio" will immediately be replaced by a new
"queue_mode" feature for the "multipath" target which will allow
bio-based mode to be selected.

When DM multipath was originally converted from bio-based to
request-based the motivation for the change was better dynamic load
balancing (by leveraging block core's request-based IO schedulers, for
merging and sorting, _before_ DM multipath would make the decision on
where to steer the IO -- based on path load and/or availability).

More background is available in this "Request-based Device-mapper
multipath and Dynamic load balancing" paper:
https://www.kernel.org/doc/ols/2007/ols2007v2-pages-235-244.pdf

But we've now come full circle where significantly faster storage
devices no longer need IOs to be made larger to drive optimal IO
performance.  And even if they do, there have been changes to the block
and filesystem layers that help ensure upper layers are constructing
larger IOs.  In addition, SCSI's differentiated IO errors will propagate
through to bio-based IO completion hooks -- so that eliminates another
historic justification for request-based DM multipath.  Lastly, the block
layer's immutable biovec changes have made bio cloning cheaper than it
has ever been; whereas request cloning is still relatively expensive
(both on a CPU usage and memory footprint level).

As such, bio-based DM multipath offers the promise of a more efficient
IO path for high IOPs devices that are, or will be, emerging.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-mpath.c | 296 ++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 274 insertions(+), 22 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index e1c07d1ec80b..f5921661bd99 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -8,6 +8,7 @@
 #include <linux/device-mapper.h>
 
 #include "dm-rq.h"
+#include "dm-bio-record.h"
 #include "dm-path-selector.h"
 #include "dm-uevent.h"
 
@@ -97,14 +98,22 @@ struct multipath {
 
 	struct mutex work_mutex;
 	struct work_struct trigger_event;
+
+	struct work_struct process_queued_bios;
+	struct bio_list queued_bios;
 };
 
 /*
- * Context information attached to each bio we process.
+ * Context information attached to each io we process.
  */
 struct dm_mpath_io {
 	struct pgpath *pgpath;
 	size_t nr_bytes;
+
+	/*
+	 * FIXME: make request-based code _not_ include this member.
+	 */
+	struct dm_bio_details bio_details;
 };
 
 typedef int (*action_fn) (struct pgpath *pgpath);
@@ -114,6 +123,7 @@ static struct kmem_cache *_mpio_cache;
 static struct workqueue_struct *kmultipathd, *kmpath_handlerd;
 static void trigger_event(struct work_struct *work);
 static void activate_path(struct work_struct *work);
+static void process_queued_bios(struct work_struct *work);
 
 /*-----------------------------------------------
  * Multipath state flags.
@@ -126,6 +136,7 @@ static void activate_path(struct work_struct *work);
 #define MPATHF_PG_INIT_DISABLED 4		/* pg_init is not currently allowed */
 #define MPATHF_PG_INIT_REQUIRED 5		/* pg_init needs calling? */
 #define MPATHF_PG_INIT_DELAY_RETRY 6		/* Delay pg_init retry? */
+#define MPATHF_BIO_BASED 7			/* Device is bio-based? */
 
 /*-----------------------------------------------
  * Allocation routines
@@ -185,7 +196,8 @@ static void free_priority_group(struct priority_group *pg,
 	kfree(pg);
 }
 
-static struct multipath *alloc_multipath(struct dm_target *ti, bool use_blk_mq)
+static struct multipath *alloc_multipath(struct dm_target *ti, bool use_blk_mq,
+					 bool bio_based)
 {
 	struct multipath *m;
 
@@ -203,7 +215,7 @@ static struct multipath *alloc_multipath(struct dm_target *ti, bool use_blk_mq)
 		mutex_init(&m->work_mutex);
 
 		m->mpio_pool = NULL;
-		if (!use_blk_mq) {
+		if (!use_blk_mq && !bio_based) {
 			unsigned min_ios = dm_get_reserved_rq_based_ios();
 
 			m->mpio_pool = mempool_create_slab_pool(min_ios, _mpio_cache);
@@ -213,6 +225,16 @@ static struct multipath *alloc_multipath(struct dm_target *ti, bool use_blk_mq)
 			}
 		}
 
+		if (bio_based) {
+			INIT_WORK(&m->process_queued_bios, process_queued_bios);
+			set_bit(MPATHF_BIO_BASED, &m->flags);
+			/*
+			 * bio-based doesn't support any direct scsi_dh management;
+			 * it just discovers if a scsi_dh is attached.
+			 */
+			set_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags);
+		}
+
 		m->ti = ti;
 		ti->private = m;
 	}
@@ -272,6 +294,21 @@ static void clear_request_fn_mpio(struct multipath *m, union map_info *info)
 	}
 }
 
+static struct dm_mpath_io *get_mpio_from_bio(struct bio *bio)
+{
+	return dm_per_bio_data(bio, sizeof(struct dm_mpath_io));
+}
+
+static struct dm_mpath_io *set_mpio_bio(struct multipath *m, struct bio *bio)
+{
+	struct dm_mpath_io *mpio = get_mpio_from_bio(bio);
+
+	memset(mpio, 0, sizeof(*mpio));
+	dm_bio_record(&mpio->bio_details, bio);
+
+	return mpio;
+}
+
 /*-----------------------------------------------
  * Path selection
  *-----------------------------------------------*/
@@ -431,16 +468,26 @@ failed:
  * and multipath_resume() calls and we have no need to check
  * for the DMF_NOFLUSH_SUSPENDING flag.
  */
-static int must_push_back(struct multipath *m)
+static bool __must_push_back(struct multipath *m)
+{
+	return ((test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) !=
+		 test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags)) &&
+		dm_noflush_suspending(m->ti));
+}
+
+static bool must_push_back_rq(struct multipath *m)
 {
 	return (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) ||
-		((test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) !=
-		  test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags)) &&
-		 dm_noflush_suspending(m->ti)));
+		__must_push_back(m));
+}
+
+static bool must_push_back_bio(struct multipath *m)
+{
+	return __must_push_back(m);
 }
 
 /*
- * Map cloned requests
+ * Map cloned requests (request-based multipath)
  */
 static int __multipath_map(struct dm_target *ti, struct request *clone,
 			   union map_info *map_context,
@@ -459,7 +506,7 @@ static int __multipath_map(struct dm_target *ti, struct request *clone,
 		pgpath = choose_pgpath(m, nr_bytes);
 
 	if (!pgpath) {
-		if (!must_push_back(m))
+		if (!must_push_back_rq(m))
 			r = -EIO;	/* Failed */
 		return r;
 	} else if (test_bit(MPATHF_QUEUE_IO, &m->flags) ||
@@ -529,6 +576,106 @@ static void multipath_release_clone(struct request *clone)
 	blk_mq_free_request(clone);
 }
 
+/*
+ * Map cloned bios (bio-based multipath)
+ */
+static int __multipath_map_bio(struct multipath *m, struct bio *bio, struct dm_mpath_io *mpio)
+{
+	size_t nr_bytes = bio->bi_iter.bi_size;
+	struct pgpath *pgpath;
+	unsigned long flags;
+	bool queue_io;
+
+	/* Do we need to select a new pgpath? */
+	pgpath = lockless_dereference(m->current_pgpath);
+	queue_io = test_bit(MPATHF_QUEUE_IO, &m->flags);
+	if (!pgpath || !queue_io)
+		pgpath = choose_pgpath(m, nr_bytes);
+
+	if ((pgpath && queue_io) ||
+	    (!pgpath && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))) {
+		/* Queue for the daemon to resubmit */
+		spin_lock_irqsave(&m->lock, flags);
+		bio_list_add(&m->queued_bios, bio);
+		spin_unlock_irqrestore(&m->lock, flags);
+		/* PG_INIT_REQUIRED cannot be set without QUEUE_IO */
+		if (queue_io || test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags))
+			pg_init_all_paths(m);
+		else if (!queue_io)
+			queue_work(kmultipathd, &m->process_queued_bios);
+		return DM_MAPIO_SUBMITTED;
+	}
+
+	if (!pgpath) {
+		if (!must_push_back_bio(m))
+			return -EIO;
+		return DM_MAPIO_REQUEUE;
+	}
+
+	mpio->pgpath = pgpath;
+	mpio->nr_bytes = nr_bytes;
+
+	bio->bi_error = 0;
+	bio->bi_bdev = pgpath->path.dev->bdev;
+	bio->bi_rw |= REQ_FAILFAST_TRANSPORT;
+
+	if (pgpath->pg->ps.type->start_io)
+		pgpath->pg->ps.type->start_io(&pgpath->pg->ps,
+					      &pgpath->path,
+					      nr_bytes);
+	return DM_MAPIO_REMAPPED;
+}
+
+static int multipath_map_bio(struct dm_target *ti, struct bio *bio)
+{
+	struct multipath *m = ti->private;
+	struct dm_mpath_io *mpio = set_mpio_bio(m, bio);
+
+	return __multipath_map_bio(m, bio, mpio);
+}
+
+static void process_queued_bios_list(struct multipath *m)
+{
+	if (test_bit(MPATHF_BIO_BASED, &m->flags))
+		queue_work(kmultipathd, &m->process_queued_bios);
+}
+
+static void process_queued_bios(struct work_struct *work)
+{
+	int r;
+	unsigned long flags;
+	struct bio *bio;
+	struct bio_list bios;
+	struct blk_plug plug;
+	struct multipath *m =
+		container_of(work, struct multipath, process_queued_bios);
+
+	bio_list_init(&bios);
+
+	spin_lock_irqsave(&m->lock, flags);
+
+	if (bio_list_empty(&m->queued_bios)) {
+		spin_unlock_irqrestore(&m->lock, flags);
+		return;
+	}
+
+	bio_list_merge(&bios, &m->queued_bios);
+	bio_list_init(&m->queued_bios);
+
+	spin_unlock_irqrestore(&m->lock, flags);
+
+	blk_start_plug(&plug);
+	while ((bio = bio_list_pop(&bios))) {
+		r = __multipath_map_bio(m, bio, get_mpio_from_bio(bio));
+		if (r < 0 || r == DM_MAPIO_REQUEUE) {
+			bio->bi_error = r;
+			bio_endio(bio);
+		} else if (r == DM_MAPIO_REMAPPED)
+			generic_make_request(bio);
+	}
+	blk_finish_plug(&plug);
+}
+
 /*
  * If we run out of usable paths, should we queue I/O or error it?
  */
@@ -557,8 +704,10 @@ static int queue_if_no_path(struct multipath *m, bool queue_if_no_path,
 
 	spin_unlock_irqrestore(&m->lock, flags);
 
-	if (!queue_if_no_path)
+	if (!queue_if_no_path) {
 		dm_table_run_md_queue_async(m->ti->table);
+		process_queued_bios_list(m);
+	}
 
 	return 0;
 }
@@ -798,6 +947,12 @@ static int parse_hw_handler(struct dm_arg_set *as, struct multipath *m)
 	if (!hw_argc)
 		return 0;
 
+	if (test_bit(MPATHF_BIO_BASED, &m->flags)) {
+		dm_consume_args(as, hw_argc);
+		DMERR("bio-based multipath doesn't allow hardware handler args");
+		return 0;
+	}
+
 	m->hw_handler_name = kstrdup(dm_shift_arg(as), GFP_KERNEL);
 
 	if (hw_argc > 1) {
@@ -880,8 +1035,8 @@ static int parse_features(struct dm_arg_set *as, struct multipath *m)
 	return r;
 }
 
-static int multipath_ctr(struct dm_target *ti, unsigned int argc,
-			 char **argv)
+static int __multipath_ctr(struct dm_target *ti, unsigned int argc,
+			   char **argv, bool bio_based)
 {
 	/* target arguments */
 	static struct dm_arg _args[] = {
@@ -899,7 +1054,7 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc,
 	as.argc = argc;
 	as.argv = argv;
 
-	m = alloc_multipath(ti, use_blk_mq);
+	m = alloc_multipath(ti, use_blk_mq, bio_based);
 	if (!m) {
 		ti->error = "can't allocate multipath";
 		return -EINVAL;
@@ -958,7 +1113,7 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc,
 	ti->num_flush_bios = 1;
 	ti->num_discard_bios = 1;
 	ti->num_write_same_bios = 1;
-	if (use_blk_mq)
+	if (use_blk_mq || bio_based)
 		ti->per_io_data_size = sizeof(struct dm_mpath_io);
 
 	return 0;
@@ -968,6 +1123,16 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc,
 	return r;
 }
 
+static int multipath_ctr(struct dm_target *ti, unsigned argc, char **argv)
+{
+	return __multipath_ctr(ti, argc, argv, false);
+}
+
+static int multipath_bio_ctr(struct dm_target *ti, unsigned argc, char **argv)
+{
+	return __multipath_ctr(ti, argc, argv, true);
+}
+
 static void multipath_wait_for_pg_init_completion(struct multipath *m)
 {
 	DECLARE_WAITQUEUE(wait, current);
@@ -1083,8 +1248,10 @@ static int reinstate_path(struct pgpath *pgpath)
 
 out:
 	spin_unlock_irqrestore(&m->lock, flags);
-	if (run_queue)
+	if (run_queue) {
 		dm_table_run_md_queue_async(m->ti->table);
+		process_queued_bios_list(m);
+	}
 
 	return r;
 }
@@ -1281,6 +1448,8 @@ static void pg_init_done(void *data, int errors)
 	}
 	clear_bit(MPATHF_QUEUE_IO, &m->flags);
 
+	process_queued_bios_list(m);
+
 	/*
 	 * Wake up any thread waiting to suspend.
 	 */
@@ -1347,7 +1516,7 @@ static int do_end_io(struct multipath *m, struct request *clone,
 
 	if (!atomic_read(&m->nr_valid_paths)) {
 		if (!test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
-			if (!must_push_back(m))
+			if (!must_push_back_rq(m))
 				r = -EIO;
 		} else {
 			if (error == -EBADE)
@@ -1381,6 +1550,64 @@ static int multipath_end_io(struct dm_target *ti, struct request *clone,
 	return r;
 }
 
+static int do_end_io_bio(struct multipath *m, struct bio *clone,
+			 int error, struct dm_mpath_io *mpio)
+{
+	unsigned long flags;
+
+	if (!error)
+		return 0;	/* I/O complete */
+
+	if (noretry_error(error))
+		return error;
+
+	if (mpio->pgpath)
+		fail_path(mpio->pgpath);
+
+	if (!atomic_read(&m->nr_valid_paths)) {
+		if (!test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
+			if (!must_push_back_bio(m))
+				return -EIO;
+			return DM_ENDIO_REQUEUE;
+		} else {
+			if (error == -EBADE)
+				return error;
+		}
+	}
+
+	/* Queue for the daemon to resubmit */
+	dm_bio_restore(&mpio->bio_details, clone);
+
+	spin_lock_irqsave(&m->lock, flags);
+	bio_list_add(&m->queued_bios, clone);
+	spin_unlock_irqrestore(&m->lock, flags);
+	if (!test_bit(MPATHF_QUEUE_IO, &m->flags))
+		queue_work(kmultipathd, &m->process_queued_bios);
+
+	return DM_ENDIO_INCOMPLETE;
+}
+
+static int multipath_end_io_bio(struct dm_target *ti, struct bio *clone, int error)
+{
+	struct multipath *m = ti->private;
+	struct dm_mpath_io *mpio = get_mpio_from_bio(clone);
+	struct pgpath *pgpath;
+	struct path_selector *ps;
+	int r;
+
+	BUG_ON(!mpio);
+
+	r = do_end_io_bio(m, clone, error, mpio);
+	pgpath = mpio->pgpath;
+	if (pgpath) {
+		ps = &pgpath->pg->ps;
+		if (ps->type->end_io)
+			ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes);
+	}
+
+	return r;
+}
+
 /*
  * Suspend can't complete until all the I/O is processed so if
  * the last path fails we must error any remaining I/O.
@@ -1642,6 +1869,7 @@ static int multipath_prepare_ioctl(struct dm_target *ti,
 		if (test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags))
 			pg_init_all_paths(m);
 		dm_table_run_md_queue_async(m->ti->table);
+		process_queued_bios_list(m);
 	}
 
 	/*
@@ -1767,22 +1995,47 @@ static struct target_type multipath_target = {
 	.busy = multipath_busy,
 };
 
+static struct target_type multipath_bio_target = {
+	.name = "multipath-bio",
+	.version = {1, 0, 0},
+	.module = THIS_MODULE,
+	.ctr = multipath_bio_ctr,
+	.dtr = multipath_dtr,
+	.map = multipath_map_bio,
+	.end_io = multipath_end_io_bio,
+	.presuspend = multipath_presuspend,
+	.postsuspend = multipath_postsuspend,
+	.resume = multipath_resume,
+	.status = multipath_status,
+	.message = multipath_message,
+	.prepare_ioctl = multipath_prepare_ioctl,
+	.iterate_devices = multipath_iterate_devices,
+	.busy = multipath_busy,
+};
+
 static int __init dm_multipath_init(void)
 {
 	int r;
 
-	/* allocate a slab for the dm_ios */
+	/* allocate a slab for the dm_mpath_ios */
 	_mpio_cache = KMEM_CACHE(dm_mpath_io, 0);
 	if (!_mpio_cache)
 		return -ENOMEM;
 
 	r = dm_register_target(&multipath_target);
 	if (r < 0) {
-		DMERR("register failed %d", r);
+		DMERR("request-based register failed %d", r);
 		r = -EINVAL;
 		goto bad_register_target;
 	}
 
+	r = dm_register_target(&multipath_bio_target);
+	if (r < 0) {
+		DMERR("bio-based register failed %d", r);
+		r = -EINVAL;
+		goto bad_register_bio_based_target;
+	}
+
 	kmultipathd = alloc_workqueue("kmpathd", WQ_MEM_RECLAIM, 0);
 	if (!kmultipathd) {
 		DMERR("failed to create workqueue kmpathd");
@@ -1804,15 +2057,13 @@ static int __init dm_multipath_init(void)
 		goto bad_alloc_kmpath_handlerd;
 	}
 
-	DMINFO("version %u.%u.%u loaded",
-	       multipath_target.version[0], multipath_target.version[1],
-	       multipath_target.version[2]);
-
 	return 0;
 
 bad_alloc_kmpath_handlerd:
 	destroy_workqueue(kmultipathd);
 bad_alloc_kmultipathd:
+	dm_unregister_target(&multipath_bio_target);
+bad_register_bio_based_target:
 	dm_unregister_target(&multipath_target);
 bad_register_target:
 	kmem_cache_destroy(_mpio_cache);
@@ -1826,6 +2077,7 @@ static void __exit dm_multipath_exit(void)
 	destroy_workqueue(kmultipathd);
 
 	dm_unregister_target(&multipath_target);
+	dm_unregister_target(&multipath_bio_target);
 	kmem_cache_destroy(_mpio_cache);
 }
 
-- 
cgit v1.2.3-70-g09d2


From bf661be1fcf9b1da8abc81a56ff41ce5964ce896 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Tue, 24 May 2016 15:48:08 -0400
Subject: dm mpath: remove bio-based bloat from struct dm_mpath_io

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-mpath.c | 43 +++++++++++++++++++++++++++++++------------
 1 file changed, 31 insertions(+), 12 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index f5921661bd99..2d10ff780d84 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -109,11 +109,6 @@ struct multipath {
 struct dm_mpath_io {
 	struct pgpath *pgpath;
 	size_t nr_bytes;
-
-	/*
-	 * FIXME: make request-based code _not_ include this member.
-	 */
-	struct dm_bio_details bio_details;
 };
 
 typedef int (*action_fn) (struct pgpath *pgpath);
@@ -294,19 +289,39 @@ static void clear_request_fn_mpio(struct multipath *m, union map_info *info)
 	}
 }
 
+static size_t multipath_per_bio_data_size(void)
+{
+	return sizeof(struct dm_mpath_io) + sizeof(struct dm_bio_details);
+}
+
 static struct dm_mpath_io *get_mpio_from_bio(struct bio *bio)
 {
-	return dm_per_bio_data(bio, sizeof(struct dm_mpath_io));
+	return dm_per_bio_data(bio, multipath_per_bio_data_size());
 }
 
-static struct dm_mpath_io *set_mpio_bio(struct multipath *m, struct bio *bio)
+static struct dm_bio_details *get_bio_details_from_bio(struct bio *bio)
 {
+	/* dm_bio_details is immediately after the dm_mpath_io in bio's per-bio-data */
 	struct dm_mpath_io *mpio = get_mpio_from_bio(bio);
+	void *bio_details = mpio + 1;
+
+	return bio_details;
+}
+
+static void multipath_init_per_bio_data(struct bio *bio, struct dm_mpath_io **mpio_p,
+					struct dm_bio_details **bio_details_p)
+{
+	struct dm_mpath_io *mpio = get_mpio_from_bio(bio);
+	struct dm_bio_details *bio_details = get_bio_details_from_bio(bio);
 
 	memset(mpio, 0, sizeof(*mpio));
-	dm_bio_record(&mpio->bio_details, bio);
+	memset(bio_details, 0, sizeof(*bio_details));
+	dm_bio_record(bio_details, bio);
 
-	return mpio;
+	if (mpio_p)
+		*mpio_p = mpio;
+	if (bio_details_p)
+		*bio_details_p = bio_details;
 }
 
 /*-----------------------------------------------
@@ -629,7 +644,9 @@ static int __multipath_map_bio(struct multipath *m, struct bio *bio, struct dm_m
 static int multipath_map_bio(struct dm_target *ti, struct bio *bio)
 {
 	struct multipath *m = ti->private;
-	struct dm_mpath_io *mpio = set_mpio_bio(m, bio);
+	struct dm_mpath_io *mpio = NULL;
+
+	multipath_init_per_bio_data(bio, &mpio, NULL);
 
 	return __multipath_map_bio(m, bio, mpio);
 }
@@ -1113,7 +1130,9 @@ static int __multipath_ctr(struct dm_target *ti, unsigned int argc,
 	ti->num_flush_bios = 1;
 	ti->num_discard_bios = 1;
 	ti->num_write_same_bios = 1;
-	if (use_blk_mq || bio_based)
+	if (bio_based)
+		ti->per_io_data_size = multipath_per_bio_data_size();
+	else if (use_blk_mq)
 		ti->per_io_data_size = sizeof(struct dm_mpath_io);
 
 	return 0;
@@ -1576,7 +1595,7 @@ static int do_end_io_bio(struct multipath *m, struct bio *clone,
 	}
 
 	/* Queue for the daemon to resubmit */
-	dm_bio_restore(&mpio->bio_details, clone);
+	dm_bio_restore(get_bio_details_from_bio(clone), clone);
 
 	spin_lock_irqsave(&m->lock, flags);
 	bio_list_add(&m->queued_bios, clone);
-- 
cgit v1.2.3-70-g09d2


From e83068a5faafb8ca65d3b58bd1e1e3959ce1ddce Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Tue, 24 May 2016 21:16:51 -0400
Subject: dm mpath: add optional "queue_mode" feature

Allow a user to specify an optional feature 'queue_mode <mode>' where
<mode> may be "bio", "rq" or "mq" -- which corresponds to bio-based,
request_fn rq-based, and blk-mq rq-based respectively.

If the queue_mode feature isn't specified the default for the
"multipath" target is still "rq" but if dm_mod.use_blk_mq is set to Y
it'll default to mode "mq".

This new queue_mode feature introduces the ability for each multipath
device to have its own queue_mode (whereas before this feature all
multipath devices effectively had to have the same queue_mode).

This commit also goes a long way to eliminate the awkward (ab)use of
DM_TYPE_*, the associated filter_md_type() and other relatively fragile
and difficult to maintain code.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-mpath.c         | 149 +++++++++++++++++++++++-------------------
 drivers/md/dm-rq.c            |  19 ++++--
 drivers/md/dm-rq.h            |   2 +-
 drivers/md/dm-table.c         |  67 ++++++++++++-------
 drivers/md/dm.c               |  15 +----
 drivers/md/dm.h               |  10 +--
 include/linux/device-mapper.h |  16 +++++
 7 files changed, 159 insertions(+), 119 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 2d10ff780d84..7eac080fcb18 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -90,6 +90,8 @@ struct multipath {
 	atomic_t pg_init_in_progress;	/* Only one pg_init allowed at once */
 	atomic_t pg_init_count;		/* Number of times pg_init called */
 
+	unsigned queue_mode;
+
 	/*
 	 * We must use a mempool of dm_mpath_io structs so that we
 	 * can resubmit bios on error.
@@ -131,7 +133,6 @@ static void process_queued_bios(struct work_struct *work);
 #define MPATHF_PG_INIT_DISABLED 4		/* pg_init is not currently allowed */
 #define MPATHF_PG_INIT_REQUIRED 5		/* pg_init needs calling? */
 #define MPATHF_PG_INIT_DELAY_RETRY 6		/* Delay pg_init retry? */
-#define MPATHF_BIO_BASED 7			/* Device is bio-based? */
 
 /*-----------------------------------------------
  * Allocation routines
@@ -191,8 +192,7 @@ static void free_priority_group(struct priority_group *pg,
 	kfree(pg);
 }
 
-static struct multipath *alloc_multipath(struct dm_target *ti, bool use_blk_mq,
-					 bool bio_based)
+static struct multipath *alloc_multipath(struct dm_target *ti)
 {
 	struct multipath *m;
 
@@ -210,25 +210,7 @@ static struct multipath *alloc_multipath(struct dm_target *ti, bool use_blk_mq,
 		mutex_init(&m->work_mutex);
 
 		m->mpio_pool = NULL;
-		if (!use_blk_mq && !bio_based) {
-			unsigned min_ios = dm_get_reserved_rq_based_ios();
-
-			m->mpio_pool = mempool_create_slab_pool(min_ios, _mpio_cache);
-			if (!m->mpio_pool) {
-				kfree(m);
-				return NULL;
-			}
-		}
-
-		if (bio_based) {
-			INIT_WORK(&m->process_queued_bios, process_queued_bios);
-			set_bit(MPATHF_BIO_BASED, &m->flags);
-			/*
-			 * bio-based doesn't support any direct scsi_dh management;
-			 * it just discovers if a scsi_dh is attached.
-			 */
-			set_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags);
-		}
+		m->queue_mode = DM_TYPE_NONE;
 
 		m->ti = ti;
 		ti->private = m;
@@ -237,6 +219,39 @@ static struct multipath *alloc_multipath(struct dm_target *ti, bool use_blk_mq,
 	return m;
 }
 
+static int alloc_multipath_stage2(struct dm_target *ti, struct multipath *m)
+{
+	if (m->queue_mode == DM_TYPE_NONE) {
+		/*
+		 * Default to request-based.
+		 */
+		if (dm_use_blk_mq(dm_table_get_md(ti->table)))
+			m->queue_mode = DM_TYPE_MQ_REQUEST_BASED;
+		else
+			m->queue_mode = DM_TYPE_REQUEST_BASED;
+	}
+
+	if (m->queue_mode == DM_TYPE_REQUEST_BASED) {
+		unsigned min_ios = dm_get_reserved_rq_based_ios();
+
+		m->mpio_pool = mempool_create_slab_pool(min_ios, _mpio_cache);
+		if (!m->mpio_pool)
+			return -ENOMEM;
+	}
+	else if (m->queue_mode == DM_TYPE_BIO_BASED) {
+		INIT_WORK(&m->process_queued_bios, process_queued_bios);
+		/*
+		 * bio-based doesn't support any direct scsi_dh management;
+		 * it just discovers if a scsi_dh is attached.
+		 */
+		set_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags);
+	}
+
+	dm_table_set_type(ti->table, m->queue_mode);
+
+	return 0;
+}
+
 static void free_multipath(struct multipath *m)
 {
 	struct priority_group *pg, *tmp;
@@ -653,7 +668,7 @@ static int multipath_map_bio(struct dm_target *ti, struct bio *bio)
 
 static void process_queued_bios_list(struct multipath *m)
 {
-	if (test_bit(MPATHF_BIO_BASED, &m->flags))
+	if (m->queue_mode == DM_TYPE_BIO_BASED)
 		queue_work(kmultipathd, &m->process_queued_bios);
 }
 
@@ -964,7 +979,7 @@ static int parse_hw_handler(struct dm_arg_set *as, struct multipath *m)
 	if (!hw_argc)
 		return 0;
 
-	if (test_bit(MPATHF_BIO_BASED, &m->flags)) {
+	if (m->queue_mode == DM_TYPE_BIO_BASED) {
 		dm_consume_args(as, hw_argc);
 		DMERR("bio-based multipath doesn't allow hardware handler args");
 		return 0;
@@ -1005,7 +1020,7 @@ static int parse_features(struct dm_arg_set *as, struct multipath *m)
 	const char *arg_name;
 
 	static struct dm_arg _args[] = {
-		{0, 6, "invalid number of feature args"},
+		{0, 8, "invalid number of feature args"},
 		{1, 50, "pg_init_retries must be between 1 and 50"},
 		{0, 60000, "pg_init_delay_msecs must be between 0 and 60000"},
 	};
@@ -1045,6 +1060,24 @@ static int parse_features(struct dm_arg_set *as, struct multipath *m)
 			continue;
 		}
 
+		if (!strcasecmp(arg_name, "queue_mode") &&
+		    (argc >= 1)) {
+			const char *queue_mode_name = dm_shift_arg(as);
+
+			if (!strcasecmp(queue_mode_name, "bio"))
+				m->queue_mode = DM_TYPE_BIO_BASED;
+			else if (!strcasecmp(queue_mode_name, "rq"))
+				m->queue_mode = DM_TYPE_REQUEST_BASED;
+			else if (!strcasecmp(queue_mode_name, "mq"))
+				m->queue_mode = DM_TYPE_MQ_REQUEST_BASED;
+			else {
+				ti->error = "Unknown 'queue_mode' requested";
+				r = -EINVAL;
+			}
+			argc--;
+			continue;
+		}
+
 		ti->error = "Unrecognised multipath feature request";
 		r = -EINVAL;
 	} while (argc && !r);
@@ -1052,8 +1085,7 @@ static int parse_features(struct dm_arg_set *as, struct multipath *m)
 	return r;
 }
 
-static int __multipath_ctr(struct dm_target *ti, unsigned int argc,
-			   char **argv, bool bio_based)
+static int multipath_ctr(struct dm_target *ti, unsigned argc, char **argv)
 {
 	/* target arguments */
 	static struct dm_arg _args[] = {
@@ -1066,12 +1098,11 @@ static int __multipath_ctr(struct dm_target *ti, unsigned int argc,
 	struct dm_arg_set as;
 	unsigned pg_count = 0;
 	unsigned next_pg_num;
-	bool use_blk_mq = dm_use_blk_mq(dm_table_get_md(ti->table));
 
 	as.argc = argc;
 	as.argv = argv;
 
-	m = alloc_multipath(ti, use_blk_mq, bio_based);
+	m = alloc_multipath(ti);
 	if (!m) {
 		ti->error = "can't allocate multipath";
 		return -EINVAL;
@@ -1081,6 +1112,10 @@ static int __multipath_ctr(struct dm_target *ti, unsigned int argc,
 	if (r)
 		goto bad;
 
+	r = alloc_multipath_stage2(ti, m);
+	if (r)
+		goto bad;
+
 	r = parse_hw_handler(&as, m);
 	if (r)
 		goto bad;
@@ -1130,9 +1165,9 @@ static int __multipath_ctr(struct dm_target *ti, unsigned int argc,
 	ti->num_flush_bios = 1;
 	ti->num_discard_bios = 1;
 	ti->num_write_same_bios = 1;
-	if (bio_based)
+	if (m->queue_mode == DM_TYPE_BIO_BASED)
 		ti->per_io_data_size = multipath_per_bio_data_size();
-	else if (use_blk_mq)
+	else if (m->queue_mode == DM_TYPE_MQ_REQUEST_BASED)
 		ti->per_io_data_size = sizeof(struct dm_mpath_io);
 
 	return 0;
@@ -1142,16 +1177,6 @@ static int __multipath_ctr(struct dm_target *ti, unsigned int argc,
 	return r;
 }
 
-static int multipath_ctr(struct dm_target *ti, unsigned argc, char **argv)
-{
-	return __multipath_ctr(ti, argc, argv, false);
-}
-
-static int multipath_bio_ctr(struct dm_target *ti, unsigned argc, char **argv)
-{
-	return __multipath_ctr(ti, argc, argv, true);
-}
-
 static void multipath_wait_for_pg_init_completion(struct multipath *m)
 {
 	DECLARE_WAITQUEUE(wait, current);
@@ -1700,7 +1725,9 @@ static void multipath_status(struct dm_target *ti, status_type_t type,
 		DMEMIT("%u ", test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) +
 			      (m->pg_init_retries > 0) * 2 +
 			      (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) * 2 +
-			      test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags));
+			      test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags) +
+			      (m->queue_mode != DM_TYPE_REQUEST_BASED) * 2);
+
 		if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))
 			DMEMIT("queue_if_no_path ");
 		if (m->pg_init_retries)
@@ -1709,6 +1736,16 @@ static void multipath_status(struct dm_target *ti, status_type_t type,
 			DMEMIT("pg_init_delay_msecs %u ", m->pg_init_delay_msecs);
 		if (test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags))
 			DMEMIT("retain_attached_hw_handler ");
+		if (m->queue_mode != DM_TYPE_REQUEST_BASED) {
+			switch(m->queue_mode) {
+			case DM_TYPE_BIO_BASED:
+				DMEMIT("queue_mode bio ");
+				break;
+			case DM_TYPE_MQ_REQUEST_BASED:
+				DMEMIT("queue_mode mq ");
+				break;
+			}
+		}
 	}
 
 	if (!m->hw_handler_name || type == STATUSTYPE_INFO)
@@ -1995,7 +2032,7 @@ static int multipath_busy(struct dm_target *ti)
  *---------------------------------------------------------------*/
 static struct target_type multipath_target = {
 	.name = "multipath",
-	.version = {1, 11, 0},
+	.version = {1, 12, 0},
 	.features = DM_TARGET_SINGLETON | DM_TARGET_IMMUTABLE,
 	.module = THIS_MODULE,
 	.ctr = multipath_ctr,
@@ -2004,22 +2041,6 @@ static struct target_type multipath_target = {
 	.clone_and_map_rq = multipath_clone_and_map,
 	.release_clone_rq = multipath_release_clone,
 	.rq_end_io = multipath_end_io,
-	.presuspend = multipath_presuspend,
-	.postsuspend = multipath_postsuspend,
-	.resume = multipath_resume,
-	.status = multipath_status,
-	.message = multipath_message,
-	.prepare_ioctl = multipath_prepare_ioctl,
-	.iterate_devices = multipath_iterate_devices,
-	.busy = multipath_busy,
-};
-
-static struct target_type multipath_bio_target = {
-	.name = "multipath-bio",
-	.version = {1, 0, 0},
-	.module = THIS_MODULE,
-	.ctr = multipath_bio_ctr,
-	.dtr = multipath_dtr,
 	.map = multipath_map_bio,
 	.end_io = multipath_end_io_bio,
 	.presuspend = multipath_presuspend,
@@ -2048,13 +2069,6 @@ static int __init dm_multipath_init(void)
 		goto bad_register_target;
 	}
 
-	r = dm_register_target(&multipath_bio_target);
-	if (r < 0) {
-		DMERR("bio-based register failed %d", r);
-		r = -EINVAL;
-		goto bad_register_bio_based_target;
-	}
-
 	kmultipathd = alloc_workqueue("kmpathd", WQ_MEM_RECLAIM, 0);
 	if (!kmultipathd) {
 		DMERR("failed to create workqueue kmpathd");
@@ -2081,8 +2095,6 @@ static int __init dm_multipath_init(void)
 bad_alloc_kmpath_handlerd:
 	destroy_workqueue(kmultipathd);
 bad_alloc_kmultipathd:
-	dm_unregister_target(&multipath_bio_target);
-bad_register_bio_based_target:
 	dm_unregister_target(&multipath_target);
 bad_register_target:
 	kmem_cache_destroy(_mpio_cache);
@@ -2096,7 +2108,6 @@ static void __exit dm_multipath_exit(void)
 	destroy_workqueue(kmultipathd);
 
 	dm_unregister_target(&multipath_target);
-	dm_unregister_target(&multipath_bio_target);
 	kmem_cache_destroy(_mpio_cache);
 }
 
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index 787c81b16a26..266f7b674108 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -230,7 +230,14 @@ static void free_rq_clone(struct request *clone)
 
 	blk_rq_unprep_clone(clone);
 
-	if (md->type == DM_TYPE_MQ_REQUEST_BASED)
+	/*
+	 * It is possible for a clone_old_rq() allocated clone to
+	 * get passed in -- it may not yet have a request_queue.
+	 * This is known to occur if the error target replaces
+	 * a multipath target that has a request_fn queue stacked
+	 * on blk-mq queue(s).
+	 */
+	if (clone->q && clone->q->mq_ops)
 		/* stacked on blk-mq queue(s) */
 		tio->ti->type->release_clone_rq(clone);
 	else if (!md->queue->mq_ops)
@@ -561,7 +568,7 @@ static struct dm_rq_target_io *dm_old_prep_tio(struct request *rq,
 	 * Must clone a request if this .request_fn DM device
 	 * is stacked on .request_fn device(s).
 	 */
-	if (!dm_table_mq_request_based(table)) {
+	if (!dm_table_all_blk_mq_devices(table)) {
 		if (!clone_old_rq(rq, md, tio, gfp_mask)) {
 			dm_put_live_table(md, srcu_idx);
 			free_old_rq_tio(tio);
@@ -711,7 +718,7 @@ ssize_t dm_attr_rq_based_seq_io_merge_deadline_store(struct mapped_device *md,
 {
 	unsigned deadline;
 
-	if (!dm_request_based(md) || md->use_blk_mq)
+	if (dm_get_md_type(md) != DM_TYPE_REQUEST_BASED)
 		return count;
 
 	if (kstrtouint(buf, 10, &deadline))
@@ -886,12 +893,13 @@ static struct blk_mq_ops dm_mq_ops = {
 	.init_request = dm_mq_init_request,
 };
 
-int dm_mq_init_request_queue(struct mapped_device *md, struct dm_target *immutable_tgt)
+int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t)
 {
 	struct request_queue *q;
+	struct dm_target *immutable_tgt;
 	int err;
 
-	if (dm_get_md_type(md) == DM_TYPE_REQUEST_BASED) {
+	if (!dm_table_all_blk_mq_devices(t)) {
 		DMERR("request-based dm-mq may only be stacked on blk-mq device(s)");
 		return -EINVAL;
 	}
@@ -908,6 +916,7 @@ int dm_mq_init_request_queue(struct mapped_device *md, struct dm_target *immutab
 	md->tag_set->driver_data = md;
 
 	md->tag_set->cmd_size = sizeof(struct dm_rq_target_io);
+	immutable_tgt = dm_table_get_immutable_target(t);
 	if (immutable_tgt && immutable_tgt->per_io_data_size) {
 		/* any target-specific per-io data is immediately after the tio */
 		md->tag_set->cmd_size += immutable_tgt->per_io_data_size;
diff --git a/drivers/md/dm-rq.h b/drivers/md/dm-rq.h
index 1559f6486024..9e6f0a3773d4 100644
--- a/drivers/md/dm-rq.h
+++ b/drivers/md/dm-rq.h
@@ -49,7 +49,7 @@ bool dm_use_blk_mq_default(void);
 bool dm_use_blk_mq(struct mapped_device *md);
 
 int dm_old_init_request_queue(struct mapped_device *md);
-int dm_mq_init_request_queue(struct mapped_device *md, struct dm_target *immutable_tgt);
+int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t);
 void dm_mq_cleanup_mapped_device(struct mapped_device *md);
 
 void dm_start_queue(struct request_queue *q);
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index a682d51111dd..88f01744ac16 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -43,8 +43,10 @@ struct dm_table {
 	struct dm_target *targets;
 
 	struct target_type *immutable_target_type;
-	unsigned integrity_supported:1;
-	unsigned singleton:1;
+
+	bool integrity_supported:1;
+	bool singleton:1;
+	bool all_blk_mq:1;
 
 	/*
 	 * Indicates the rw permissions for the new logical
@@ -206,6 +208,7 @@ int dm_table_create(struct dm_table **result, fmode_t mode,
 		return -ENOMEM;
 	}
 
+	t->type = DM_TYPE_NONE;
 	t->mode = mode;
 	t->md = md;
 	*result = t;
@@ -703,7 +706,7 @@ int dm_table_add_target(struct dm_table *t, const char *type,
 			      dm_device_name(t->md), type);
 			return -EINVAL;
 		}
-		t->singleton = 1;
+		t->singleton = true;
 	}
 
 	if (dm_target_always_writeable(tgt->type) && !(t->mode & FMODE_WRITE)) {
@@ -830,16 +833,29 @@ static bool __table_type_request_based(unsigned table_type)
 		table_type == DM_TYPE_MQ_REQUEST_BASED);
 }
 
-static int dm_table_set_type(struct dm_table *t)
+void dm_table_set_type(struct dm_table *t, unsigned type)
+{
+	t->type = type;
+}
+EXPORT_SYMBOL_GPL(dm_table_set_type);
+
+static int dm_table_determine_type(struct dm_table *t)
 {
 	unsigned i;
 	unsigned bio_based = 0, request_based = 0, hybrid = 0;
-	bool use_blk_mq = false;
+	bool verify_blk_mq = false;
 	struct dm_target *tgt;
 	struct dm_dev_internal *dd;
-	struct list_head *devices;
+	struct list_head *devices = dm_table_get_devices(t);
 	unsigned live_md_type = dm_get_md_type(t->md);
 
+	if (t->type != DM_TYPE_NONE) {
+		/* target already set the table's type */
+		if (t->type == DM_TYPE_BIO_BASED)
+			return 0;
+		goto verify_rq_based;
+	}
+
 	for (i = 0; i < t->num_targets; i++) {
 		tgt = t->targets + i;
 		if (dm_target_hybrid(tgt))
@@ -876,6 +892,19 @@ static int dm_table_set_type(struct dm_table *t)
 
 	BUG_ON(!request_based); /* No targets in this table */
 
+	if (list_empty(devices) && __table_type_request_based(live_md_type)) {
+		/* inherit live MD type */
+		t->type = live_md_type;
+		return 0;
+	}
+
+	/*
+	 * The only way to establish DM_TYPE_MQ_REQUEST_BASED is by
+	 * having a compatible target use dm_table_set_type.
+	 */
+	t->type = DM_TYPE_REQUEST_BASED;
+
+verify_rq_based:
 	/*
 	 * Request-based dm supports only tables that have a single target now.
 	 * To support multiple targets, request splitting support is needed,
@@ -888,7 +917,6 @@ static int dm_table_set_type(struct dm_table *t)
 	}
 
 	/* Non-request-stackable devices can't be used for request-based dm */
-	devices = dm_table_get_devices(t);
 	list_for_each_entry(dd, devices, list) {
 		struct request_queue *q = bdev_get_queue(dd->dm_dev->bdev);
 
@@ -899,10 +927,10 @@ static int dm_table_set_type(struct dm_table *t)
 		}
 
 		if (q->mq_ops)
-			use_blk_mq = true;
+			verify_blk_mq = true;
 	}
 
-	if (use_blk_mq) {
+	if (verify_blk_mq) {
 		/* verify _all_ devices in the table are blk-mq devices */
 		list_for_each_entry(dd, devices, list)
 			if (!bdev_get_queue(dd->dm_dev->bdev)->mq_ops) {
@@ -910,14 +938,9 @@ static int dm_table_set_type(struct dm_table *t)
 				      " are blk-mq request-stackable");
 				return -EINVAL;
 			}
-		t->type = DM_TYPE_MQ_REQUEST_BASED;
 
-	} else if (list_empty(devices) && __table_type_request_based(live_md_type)) {
-		/* inherit live MD type */
-		t->type = live_md_type;
-
-	} else
-		t->type = DM_TYPE_REQUEST_BASED;
+		t->all_blk_mq = true;
+	}
 
 	return 0;
 }
@@ -961,9 +984,9 @@ bool dm_table_request_based(struct dm_table *t)
 	return __table_type_request_based(dm_table_get_type(t));
 }
 
-bool dm_table_mq_request_based(struct dm_table *t)
+bool dm_table_all_blk_mq_devices(struct dm_table *t)
 {
-	return dm_table_get_type(t) == DM_TYPE_MQ_REQUEST_BASED;
+	return t->all_blk_mq;
 }
 
 static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *md)
@@ -1106,7 +1129,7 @@ static int dm_table_register_integrity(struct dm_table *t)
 		return 0;
 
 	if (!integrity_profile_exists(dm_disk(md))) {
-		t->integrity_supported = 1;
+		t->integrity_supported = true;
 		/*
 		 * Register integrity profile during table load; we can do
 		 * this because the final profile must match during resume.
@@ -1129,7 +1152,7 @@ static int dm_table_register_integrity(struct dm_table *t)
 	}
 
 	/* Preserve existing integrity profile */
-	t->integrity_supported = 1;
+	t->integrity_supported = true;
 	return 0;
 }
 
@@ -1141,9 +1164,9 @@ int dm_table_complete(struct dm_table *t)
 {
 	int r;
 
-	r = dm_table_set_type(t);
+	r = dm_table_determine_type(t);
 	if (r) {
-		DMERR("unable to set table type");
+		DMERR("unable to determine table type");
 		return r;
 	}
 
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 8f22527134e9..2c907bc10fe9 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1738,23 +1738,14 @@ struct queue_limits *dm_get_queue_limits(struct mapped_device *md)
 }
 EXPORT_SYMBOL_GPL(dm_get_queue_limits);
 
-static unsigned filter_md_type(unsigned type, struct mapped_device *md)
-{
-	if (type == DM_TYPE_BIO_BASED)
-		return type;
-
-	return !md->use_blk_mq ? DM_TYPE_REQUEST_BASED : DM_TYPE_MQ_REQUEST_BASED;
-}
-
 /*
  * Setup the DM device's queue based on md's type
  */
 int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
 {
 	int r;
-	unsigned md_type = filter_md_type(dm_get_md_type(md), md);
 
-	switch (md_type) {
+	switch (dm_get_md_type(md)) {
 	case DM_TYPE_REQUEST_BASED:
 		r = dm_old_init_request_queue(md);
 		if (r) {
@@ -1763,7 +1754,7 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
 		}
 		break;
 	case DM_TYPE_MQ_REQUEST_BASED:
-		r = dm_mq_init_request_queue(md, dm_table_get_immutable_target(t));
+		r = dm_mq_init_request_queue(md, t);
 		if (r) {
 			DMERR("Cannot initialize queue for request-based dm-mq mapped device");
 			return r;
@@ -2472,8 +2463,6 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned t
 	if (!pools)
 		return NULL;
 
-	type = filter_md_type(type, md);
-
 	switch (type) {
 	case DM_TYPE_BIO_BASED:
 		cachep = _io_cache;
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index b611b3064a7c..2e0e4a53a312 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -33,14 +33,6 @@
  */
 #define DM_STATUS_NOFLUSH_FLAG		(1 << 0)
 
-/*
- * Type of table and mapped_device's mempool
- */
-#define DM_TYPE_NONE			0
-#define DM_TYPE_BIO_BASED		1
-#define DM_TYPE_REQUEST_BASED		2
-#define DM_TYPE_MQ_REQUEST_BASED	3
-
 /*
  * List of devices that a metadevice uses and should open/close.
  */
@@ -77,7 +69,7 @@ struct target_type *dm_table_get_immutable_target_type(struct dm_table *t);
 struct dm_target *dm_table_get_immutable_target(struct dm_table *t);
 struct dm_target *dm_table_get_wildcard_target(struct dm_table *t);
 bool dm_table_request_based(struct dm_table *t);
-bool dm_table_mq_request_based(struct dm_table *t);
+bool dm_table_all_blk_mq_devices(struct dm_table *t);
 void dm_table_free_md_mempools(struct dm_table *t);
 struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t);
 
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 0830c9e86f0d..2ce339212b6e 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -19,6 +19,14 @@ struct dm_table;
 struct mapped_device;
 struct bio_vec;
 
+/*
+ * Type of table, mapped_device's mempool and request_queue
+ */
+#define DM_TYPE_NONE			0
+#define DM_TYPE_BIO_BASED		1
+#define DM_TYPE_REQUEST_BASED		2
+#define DM_TYPE_MQ_REQUEST_BASED	3
+
 typedef enum { STATUSTYPE_INFO, STATUSTYPE_TABLE } status_type_t;
 
 union map_info {
@@ -443,6 +451,14 @@ int dm_table_add_target(struct dm_table *t, const char *type,
  */
 void dm_table_add_target_callbacks(struct dm_table *t, struct dm_target_callbacks *cb);
 
+/*
+ * Target can use this to set the table's type.
+ * Can only ever be called from a target's ctr.
+ * Useful for "hybrid" target (supports both bio-based
+ * and request-based).
+ */
+void dm_table_set_type(struct dm_table *t, unsigned type);
+
 /*
  * Finally call this to make the table ready for use.
  */
-- 
cgit v1.2.3-70-g09d2


From 73c6f239a86271c17d77f826a0c657f3d393a51e Mon Sep 17 00:00:00 2001
From: Heinz Mauelshagen <heinzm@redhat.com>
Date: Thu, 19 May 2016 18:49:24 +0200
Subject: dm raid: rename variable 'ret' to 'r' to conform to other dm code

Signed-off-by: Heinz Mauelshagen <heinzm@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-raid.c | 68 +++++++++++++++++++++++++++-------------------------
 1 file changed, 36 insertions(+), 32 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 8cbac62b1602..6982e23681be 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -225,7 +225,7 @@ static int dev_parms(struct raid_set *rs, char **argv)
 	int i;
 	int rebuild = 0;
 	int metadata_available = 0;
-	int ret = 0;
+	int r = 0;
 
 	for (i = 0; i < rs->md.raid_disks; i++, argv += 2) {
 		rs->dev[i].rdev.raid_disk = i;
@@ -241,12 +241,12 @@ static int dev_parms(struct raid_set *rs, char **argv)
 		rs->dev[i].rdev.mddev = &rs->md;
 
 		if (strcmp(argv[0], "-")) {
-			ret = dm_get_device(rs->ti, argv[0],
+			r = dm_get_device(rs->ti, argv[0],
 					    dm_table_get_mode(rs->ti->table),
 					    &rs->dev[i].meta_dev);
 			rs->ti->error = "RAID metadata device lookup failure";
-			if (ret)
-				return ret;
+			if (r)
+				return r;
 
 			rs->dev[i].rdev.sb_page = alloc_page(GFP_KERNEL);
 			if (!rs->dev[i].rdev.sb_page)
@@ -267,12 +267,12 @@ static int dev_parms(struct raid_set *rs, char **argv)
 			continue;
 		}
 
-		ret = dm_get_device(rs->ti, argv[1],
+		r = dm_get_device(rs->ti, argv[1],
 				    dm_table_get_mode(rs->ti->table),
 				    &rs->dev[i].data_dev);
-		if (ret) {
+		if (r) {
 			rs->ti->error = "RAID device lookup failure";
-			return ret;
+			return r;
 		}
 
 		if (rs->dev[i].meta_dev) {
@@ -848,7 +848,7 @@ static void super_sync(struct mddev *mddev, struct md_rdev *rdev)
  */
 static int super_load(struct md_rdev *rdev, struct md_rdev *refdev)
 {
-	int ret;
+	int r;
 	struct dm_raid_superblock *sb;
 	struct dm_raid_superblock *refsb;
 	uint64_t events_sb, events_refsb;
@@ -860,9 +860,9 @@ static int super_load(struct md_rdev *rdev, struct md_rdev *refdev)
 		return -EINVAL;
 	}
 
-	ret = read_disk_sb(rdev, rdev->sb_size);
-	if (ret)
-		return ret;
+	r = read_disk_sb(rdev, rdev->sb_size);
+	if (r)
+		return r;
 
 	sb = page_address(rdev->sb_page);
 
@@ -1072,7 +1072,7 @@ static int super_validate(struct raid_set *rs, struct md_rdev *rdev)
  */
 static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
 {
-	int ret;
+	int r;
 	struct raid_dev *dev;
 	struct md_rdev *rdev, *tmp, *freshest;
 	struct mddev *mddev = &rs->md;
@@ -1097,9 +1097,9 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
 		if (!rdev->meta_bdev)
 			continue;
 
-		ret = super_load(rdev, freshest);
+		r = super_load(rdev, freshest);
 
-		switch (ret) {
+		switch (r) {
 		case 1:
 			freshest = rdev;
 			break;
@@ -1207,17 +1207,21 @@ static void configure_discard_support(struct dm_target *ti, struct raid_set *rs)
 }
 
 /*
- * Construct a RAID4/5/6 mapping:
+ * Construct a RAID0/1/10/4/5/6 mapping:
  * Args:
- *	<raid_type> <#raid_params> <raid_params>		\
- *	<#raid_devs> { <meta_dev1> <dev1> .. <meta_devN> <devN> }
+ *      <raid_type> <#raid_params> <raid_params>{0,}    \
+ *      <#raid_devs> [<meta_dev1> <dev1>]{1,}
  *
  * <raid_params> varies by <raid_type>.  See 'parse_raid_params' for
  * details on possible <raid_params>.
+ *
+ * Userspace is free to initialize the metadata devices, hence the superblocks to
+ * enforce recreation based on the passed in table parameters.
+ *
  */
 static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 {
-	int ret;
+	int r;
 	struct raid_type *rt;
 	unsigned long num_raid_params, num_raid_devs;
 	struct raid_set *rs = NULL;
@@ -1267,19 +1271,19 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	if (IS_ERR(rs))
 		return PTR_ERR(rs);
 
-	ret = parse_raid_params(rs, argv, (unsigned)num_raid_params);
-	if (ret)
+	r = parse_raid_params(rs, argv, (unsigned)num_raid_params);
+	if (r)
 		goto bad;
 
 	argv += num_raid_params + 1;
 
-	ret = dev_parms(rs, argv);
-	if (ret)
+	r = dev_parms(rs, argv);
+	if (r)
 		goto bad;
 
 	rs->md.sync_super = super_sync;
-	ret = analyse_superblocks(ti, rs);
-	if (ret)
+	r = analyse_superblocks(ti, rs);
+	if (r)
 		goto bad;
 
 	INIT_WORK(&rs->md.event_work, do_table_event);
@@ -1293,18 +1297,18 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 
 	/* Has to be held on running the array */
 	mddev_lock_nointr(&rs->md);
-	ret = md_run(&rs->md);
+	r = md_run(&rs->md);
 	rs->md.in_sync = 0; /* Assume already marked dirty */
 	mddev_unlock(&rs->md);
 
-	if (ret) {
+	if (r) {
 		ti->error = "Fail to run raid array";
 		goto bad;
 	}
 
 	if (ti->len != rs->md.array_sectors) {
 		ti->error = "Array size does not match requested target length";
-		ret = -EINVAL;
+		r = -EINVAL;
 		goto size_mismatch;
 	}
 	rs->callbacks.congested_fn = raid_is_congested;
@@ -1318,7 +1322,7 @@ size_mismatch:
 bad:
 	context_free(rs);
 
-	return ret;
+	return r;
 }
 
 static void raid_dtr(struct dm_target *ti)
@@ -1603,17 +1607,17 @@ static int raid_iterate_devices(struct dm_target *ti,
 {
 	struct raid_set *rs = ti->private;
 	unsigned i;
-	int ret = 0;
+	int r = 0;
 
-	for (i = 0; !ret && i < rs->md.raid_disks; i++)
+	for (i = 0; !r && i < rs->md.raid_disks; i++)
 		if (rs->dev[i].data_dev)
-			ret = fn(ti,
+			r = fn(ti,
 				 rs->dev[i].data_dev,
 				 0, /* No offset on data devs */
 				 rs->md.dev_sectors,
 				 data);
 
-	return ret;
+	return r;
 }
 
 static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits)
-- 
cgit v1.2.3-70-g09d2


From 92c83d79b07ec1c53e0c74b8a7988799e00856db Mon Sep 17 00:00:00 2001
From: Heinz Mauelshagen <heinzm@redhat.com>
Date: Thu, 19 May 2016 18:49:25 +0200
Subject: dm raid: use dm_arg_set API in constructor

- use dm_arg_set API in ctr and its callees parse_raid_params() and dev_parms()

- introduce _in_range() function to check that a value is within an inclusive
  [min, max] range; this is to support more callers parsing parameters etc.
  in the future

- correct comment on MAX_RAID_DEVICES

Signed-off-by: Heinz Mauelshagen <heinzm@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-raid.c | 145 +++++++++++++++++++++++++++++----------------------
 1 file changed, 84 insertions(+), 61 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 6982e23681be..01aa511ebe44 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -17,7 +17,7 @@
 #include <linux/device-mapper.h>
 
 #define DM_MSG_PREFIX "raid"
-#define	MAX_RAID_DEVICES	253 /* raid4/5/6 limit */
+#define	MAX_RAID_DEVICES	253 /* md-raid kernel limit */
 
 static bool devices_handle_discard_safely = false;
 
@@ -95,6 +95,12 @@ static struct raid_type {
 	{"raid6_nc", "RAID6 (N continue)",		2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE}
 };
 
+/* True, if @v is in inclusive range [@min, @max] */
+static bool _in_range(long v, long min, long max)
+{
+	return v >= min && v <= max;
+}
+
 static char *raid10_md_layout_to_format(int layout)
 {
 	/*
@@ -135,7 +141,7 @@ static int raid10_format_to_md_layout(char *format, unsigned copies)
 	return (f << 8) | n;
 }
 
-static struct raid_type *get_raid_type(char *name)
+static struct raid_type *get_raid_type(const char *name)
 {
 	int i;
 
@@ -220,14 +226,20 @@ static void context_free(struct raid_set *rs)
  * This code parses those words.  If there is a failure,
  * the caller must use context_free to unwind the operations.
  */
-static int dev_parms(struct raid_set *rs, char **argv)
+static int parse_dev_parms(struct raid_set *rs, struct dm_arg_set *as)
 {
 	int i;
 	int rebuild = 0;
 	int metadata_available = 0;
 	int r = 0;
+	const char *arg;
 
-	for (i = 0; i < rs->md.raid_disks; i++, argv += 2) {
+	/* Put off the number of raid devices argument to get to dev pairs */
+	arg = dm_shift_arg(as);
+	if (!arg)
+		return -EINVAL;
+
+	for (i = 0; i < rs->md.raid_disks; i++) {
 		rs->dev[i].rdev.raid_disk = i;
 
 		rs->dev[i].meta_dev = NULL;
@@ -240,8 +252,12 @@ static int dev_parms(struct raid_set *rs, char **argv)
 		rs->dev[i].rdev.data_offset = 0;
 		rs->dev[i].rdev.mddev = &rs->md;
 
-		if (strcmp(argv[0], "-")) {
-			r = dm_get_device(rs->ti, argv[0],
+		arg = dm_shift_arg(as);
+		if (!arg)
+			return -EINVAL;
+
+		if (strcmp(arg, "-")) {
+			r = dm_get_device(rs->ti, arg,
 					    dm_table_get_mode(rs->ti->table),
 					    &rs->dev[i].meta_dev);
 			rs->ti->error = "RAID metadata device lookup failure";
@@ -253,7 +269,11 @@ static int dev_parms(struct raid_set *rs, char **argv)
 				return -ENOMEM;
 		}
 
-		if (!strcmp(argv[1], "-")) {
+		arg = dm_shift_arg(as);
+		if (!arg)
+			return -EINVAL;
+
+		if (!strcmp(arg, "-")) {
 			if (!test_bit(In_sync, &rs->dev[i].rdev.flags) &&
 			    (!rs->dev[i].rdev.recovery_offset)) {
 				rs->ti->error = "Drive designated for rebuild not specified";
@@ -267,7 +287,7 @@ static int dev_parms(struct raid_set *rs, char **argv)
 			continue;
 		}
 
-		r = dm_get_device(rs->ti, argv[1],
+		r = dm_get_device(rs->ti, arg,
 				    dm_table_get_mode(rs->ti->table),
 				    &rs->dev[i].data_dev);
 		if (r) {
@@ -492,25 +512,30 @@ too_many:
  *    [raid10_copies <# copies>]        Number of copies.  (Default: 2)
  *    [raid10_format <near|far|offset>] Layout algorithm.  (Default: near)
  */
-static int parse_raid_params(struct raid_set *rs, char **argv,
+static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 			     unsigned num_raid_params)
 {
 	char *raid10_format = "near";
 	unsigned raid10_copies = 2;
 	unsigned i;
-	unsigned long value, region_size = 0;
+	unsigned value, region_size = 0;
 	sector_t sectors_per_dev = rs->ti->len;
 	sector_t max_io_len;
-	char *key;
+	const char *arg, *key;
+
+	arg = dm_shift_arg(as);
+	num_raid_params--; /* Account for chunk_size argument */
+
+	if (kstrtouint(arg, 10, &value) < 0) {
+		rs->ti->error = "Bad numerical argument given for chunk_size";
+		return -EINVAL;
+	}
 
 	/*
 	 * First, parse the in-order required arguments
 	 * "chunk_size" is the only argument of this type.
 	 */
-	if ((kstrtoul(argv[0], 10, &value) < 0)) {
-		rs->ti->error = "Bad chunk size";
-		return -EINVAL;
-	} else if (rs->raid_type->level == 1) {
+	if (rs->raid_type->level == 1) {
 		if (value)
 			DMERR("Ignoring chunk size parameter for RAID 1");
 		value = 0;
@@ -523,8 +548,6 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 	}
 
 	rs->md.new_chunk_sectors = rs->md.chunk_sectors = value;
-	argv++;
-	num_raid_params--;
 
 	/*
 	 * We set each individual device as In_sync with a completed
@@ -552,12 +575,18 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 	 * Second, parse the unordered optional arguments
 	 */
 	for (i = 0; i < num_raid_params; i++) {
-		if (!strcasecmp(argv[i], "nosync")) {
+		arg = dm_shift_arg(as);
+		if (!arg) {
+			rs->ti->error = "Not enough raid parameters given";
+			return -EINVAL;
+		}
+
+		if (!strcasecmp(arg, "nosync")) {
 			rs->md.recovery_cp = MaxSector;
 			rs->ctr_flags |= CTR_FLAG_NOSYNC;
 			continue;
 		}
-		if (!strcasecmp(argv[i], "sync")) {
+		if (!strcasecmp(arg, "sync")) {
 			rs->md.recovery_cp = 0;
 			rs->ctr_flags |= CTR_FLAG_SYNC;
 			continue;
@@ -569,7 +598,9 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 			return -EINVAL;
 		}
 
-		key = argv[i++];
+		key = arg;
+		arg = dm_shift_arg(as);
+		i++; /* Account for the argument pairs */
 
 		/* Parameters that take a string value are checked here. */
 		if (!strcasecmp(key, "raid10_format")) {
@@ -577,18 +608,18 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 				rs->ti->error = "'raid10_format' is an invalid parameter for this RAID type";
 				return -EINVAL;
 			}
-			if (strcmp("near", argv[i]) &&
-			    strcmp("far", argv[i]) &&
-			    strcmp("offset", argv[i])) {
+			if (strcmp("near", arg) &&
+			    strcmp("far", arg) &&
+			    strcmp("offset", arg)) {
 				rs->ti->error = "Invalid 'raid10_format' value given";
 				return -EINVAL;
 			}
-			raid10_format = argv[i];
+			raid10_format = (char *) arg;
 			rs->ctr_flags |= CTR_FLAG_RAID10_FORMAT;
 			continue;
 		}
 
-		if (kstrtoul(argv[i], 10, &value) < 0) {
+		if (kstrtouint(arg, 10, &value) < 0) {
 			rs->ti->error = "Bad numerical argument given in raid params";
 			return -EINVAL;
 		}
@@ -1223,61 +1254,53 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 {
 	int r;
 	struct raid_type *rt;
-	unsigned long num_raid_params, num_raid_devs;
+	unsigned num_raid_params, num_raid_devs;
 	struct raid_set *rs = NULL;
-
-	/* Must have at least <raid_type> <#raid_params> */
-	if (argc < 2) {
-		ti->error = "Too few arguments";
+	const char *arg;
+	struct dm_arg_set as = { argc, argv }, as_nrd;
+	struct dm_arg _args[] = {
+		{ 0, as.argc, "Cannot understand number of raid parameters" },
+		{ 1, 254, "Cannot understand number of raid devices parameters" }
+	};
+
+	/* Must have <raid_type> */
+	arg = dm_shift_arg(&as);
+	if (!arg) {
+		ti->error = "No arguments";
 		return -EINVAL;
 	}
 
-	/* raid type */
-	rt = get_raid_type(argv[0]);
+	rt = get_raid_type(arg);
 	if (!rt) {
 		ti->error = "Unrecognised raid_type";
 		return -EINVAL;
 	}
-	argc--;
-	argv++;
-
-	/* number of RAID parameters */
-	if (kstrtoul(argv[0], 10, &num_raid_params) < 0) {
-		ti->error = "Cannot understand number of RAID parameters";
-		return -EINVAL;
-	}
-	argc--;
-	argv++;
 
-	/* Skip over RAID params for now and find out # of devices */
-	if (num_raid_params >= argc) {
-		ti->error = "Arguments do not agree with counts given";
-		return -EINVAL;
-	}
+	/* Must have <#raid_params> */
+	if (dm_read_arg_group(_args, &as, &num_raid_params, &ti->error))
+                return -EINVAL;
 
-	if ((kstrtoul(argv[num_raid_params], 10, &num_raid_devs) < 0) ||
-	    (num_raid_devs > MAX_RAID_DEVICES)) {
-		ti->error = "Cannot understand number of raid devices";
-		return -EINVAL;
-	}
+	/* number of raid device tupples <meta_dev data_dev> */
+	as_nrd = as;
+	dm_consume_args(&as_nrd, num_raid_params);
+	_args[1].max = (as_nrd.argc - 1) / 2;
+	if (dm_read_arg(_args + 1, &as_nrd, &num_raid_devs, &ti->error))
+                return -EINVAL;
 
-	argc -= num_raid_params + 1; /* +1: we already have num_raid_devs */
-	if (argc != (num_raid_devs * 2)) {
-		ti->error = "Supplied RAID devices does not match the count given";
-		return -EINVAL;
+	if (!_in_range(num_raid_devs, 1, MAX_RAID_DEVICES)) {
+		ti->error = "Invalid number of supplied raid devices";
+                return -EINVAL;
 	}
 
-	rs = context_alloc(ti, rt, (unsigned)num_raid_devs);
+	rs = context_alloc(ti, rt, num_raid_devs);
 	if (IS_ERR(rs))
 		return PTR_ERR(rs);
 
-	r = parse_raid_params(rs, argv, (unsigned)num_raid_params);
+	r = parse_raid_params(rs, &as, num_raid_params);
 	if (r)
 		goto bad;
 
-	argv += num_raid_params + 1;
-
-	r = dev_parms(rs, argv);
+	r = parse_dev_parms(rs, &as);
 	if (r)
 		goto bad;
 
-- 
cgit v1.2.3-70-g09d2


From 702108d194e3649f69afcd2661282a0157c71e54 Mon Sep 17 00:00:00 2001
From: Heinz Mauelshagen <heinzm@redhat.com>
Date: Thu, 19 May 2016 18:49:26 +0200
Subject: dm raid: cleanup / provide infrastructure

Provide the necessary infrastructure to handle ctr flags and their names,
and clean up the setting of ti->error:

 - comment constructor flags

 - introduce constructor flag manipulation

 - introduce ti_error_*() functions to simplify
   setting the error message (use in other targets?)

 - introduce array to hold ctr flag <-> flag name mapping

 - introduce functions that look up an argument name by flag in that array

 - use those functions throughout the ctr call path

Signed-off-by: Heinz Mauelshagen <heinzm@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-raid.c | 424 +++++++++++++++++++++++++++------------------------
 1 file changed, 228 insertions(+), 196 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 01aa511ebe44..ab7aa7d83364 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -1,6 +1,6 @@
 /*
  * Copyright (C) 2010-2011 Neil Brown
- * Copyright (C) 2010-2015 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2010-2016 Red Hat, Inc. All rights reserved.
  *
  * This file is released under the GPL.
  */
@@ -47,18 +47,22 @@ struct raid_dev {
 
 /*
  * Flags for rs->ctr_flags field.
+ *
+ * 1 = no flag value
+ * 2 = flag with value
  */
-#define CTR_FLAG_SYNC              0x1
-#define CTR_FLAG_NOSYNC            0x2
-#define CTR_FLAG_REBUILD           0x4
-#define CTR_FLAG_DAEMON_SLEEP      0x8
-#define CTR_FLAG_MIN_RECOVERY_RATE 0x10
-#define CTR_FLAG_MAX_RECOVERY_RATE 0x20
-#define CTR_FLAG_MAX_WRITE_BEHIND  0x40
-#define CTR_FLAG_STRIPE_CACHE      0x80
-#define CTR_FLAG_REGION_SIZE       0x100
-#define CTR_FLAG_RAID10_COPIES     0x200
-#define CTR_FLAG_RAID10_FORMAT     0x400
+#define CTR_FLAG_SYNC              0x1   /* 1 */ /* Not with raid0! */
+#define CTR_FLAG_NOSYNC            0x2   /* 1 */ /* Not with raid0! */
+#define CTR_FLAG_REBUILD           0x4   /* 2 */ /* Not with raid0! */
+#define CTR_FLAG_DAEMON_SLEEP      0x8   /* 2 */ /* Not with raid0! */
+#define CTR_FLAG_MIN_RECOVERY_RATE 0x10  /* 2 */ /* Not with raid0! */
+#define CTR_FLAG_MAX_RECOVERY_RATE 0x20  /* 2 */ /* Not with raid0! */
+#define CTR_FLAG_MAX_WRITE_BEHIND  0x40  /* 2 */ /* Only with raid1! */
+#define CTR_FLAG_WRITE_MOSTLY      0x80  /* 2 */ /* Only with raid1! */
+#define CTR_FLAG_STRIPE_CACHE      0x100 /* 2 */ /* Only with raid4/5/6! */
+#define CTR_FLAG_REGION_SIZE       0x200 /* 2 */ /* Not with raid0! */
+#define CTR_FLAG_RAID10_COPIES     0x400 /* 2 */ /* Only with raid10 */
+#define CTR_FLAG_RAID10_FORMAT     0x800 /* 2 */ /* Only with raid10 */
 
 struct raid_set {
 	struct dm_target *ti;
@@ -101,6 +105,83 @@ static bool _in_range(long v, long min, long max)
 	return v >= min && v <= max;
 }
 
+/* ctr flag bit manipulation... */
+/* Set single @flag in @flags */
+static void _set_flag(uint32_t flag, uint32_t *flags)
+{
+	WARN_ON_ONCE(hweight32(flag) != 1);
+	*flags |= flag;
+}
+
+/* Test single @flag in @flags */
+static bool _test_flag(uint32_t flag, uint32_t flags)
+{
+	WARN_ON_ONCE(hweight32(flag) != 1);
+	return (flag & flags) ? true : false;
+}
+
+/* Return true if single @flag is set in @*flags, else set it and return false */
+static bool _test_and_set_flag(uint32_t flag, uint32_t *flags)
+{
+	if (_test_flag(flag, *flags))
+		return true;
+
+	_set_flag(flag, flags);
+	return false;
+}
+/* ...ctr and runtime flag bit manipulation */
+
+/* All table line arguments are defined here */
+static struct arg_name_flag {
+	const uint32_t flag;
+	const char *name;
+} _arg_name_flags[] = {
+	{ CTR_FLAG_SYNC, "sync"},
+	{ CTR_FLAG_NOSYNC, "nosync"},
+	{ CTR_FLAG_REBUILD, "rebuild"},
+	{ CTR_FLAG_DAEMON_SLEEP, "daemon_sleep"},
+	{ CTR_FLAG_MIN_RECOVERY_RATE, "min_recovery_rate"},
+	{ CTR_FLAG_MAX_RECOVERY_RATE, "max_recovery_rate"},
+	{ CTR_FLAG_MAX_WRITE_BEHIND, "max_write_behind"},
+	{ CTR_FLAG_WRITE_MOSTLY, "write_mostly"},
+	{ CTR_FLAG_STRIPE_CACHE, "stripe_cache"},
+	{ CTR_FLAG_REGION_SIZE, "region_size"},
+	{ CTR_FLAG_RAID10_COPIES, "raid10_copies"},
+	{ CTR_FLAG_RAID10_FORMAT, "raid10_format"},
+};
+
+/* Return argument name string for given @flag */
+static const char *_argname_by_flag(const uint32_t flag)
+{
+	if (hweight32(flag) == 1) {
+		struct arg_name_flag *anf = _arg_name_flags + ARRAY_SIZE(_arg_name_flags);
+
+		while (anf-- > _arg_name_flags)
+			if (_test_flag(flag, anf->flag))
+				return anf->name;
+
+	} else
+		DMERR("%s called with more than one flag!", __func__);
+
+	return NULL;
+}
+
+/*
+ * Convenience functions to set ti->error to @errmsg and
+ * return @r in order to shorten code in a lot of places
+ */
+static int ti_error_ret(struct dm_target *ti, const char *errmsg, int r)
+{
+	ti->error = (char *) errmsg;
+	return r;
+}
+
+static int ti_error_einval(struct dm_target *ti, const char *errmsg)
+{
+	return ti_error_ret(ti, errmsg, -EINVAL);
+}
+/* END: convenience functions to set ti->error to @errmsg... */
+
 static char *raid10_md_layout_to_format(int layout)
 {
 	/*
@@ -157,16 +238,12 @@ static struct raid_set *context_alloc(struct dm_target *ti, struct raid_type *ra
 	unsigned i;
 	struct raid_set *rs;
 
-	if (raid_devs <= raid_type->parity_devs) {
-		ti->error = "Insufficient number of devices";
-		return ERR_PTR(-EINVAL);
-	}
+	if (raid_devs <= raid_type->parity_devs)
+		return ERR_PTR(ti_error_einval(ti, "Insufficient number of devices"));
 
 	rs = kzalloc(sizeof(*rs) + raid_devs * sizeof(rs->dev[0]), GFP_KERNEL);
-	if (!rs) {
-		ti->error = "Cannot allocate raid context";
-		return ERR_PTR(-ENOMEM);
-	}
+	if (!rs)
+		return ERR_PTR(ti_error_ret(ti, "Cannot allocate raid context", -ENOMEM));
 
 	mddev_init(&rs->md);
 
@@ -226,7 +303,7 @@ static void context_free(struct raid_set *rs)
  * This code parses those words.  If there is a failure,
  * the caller must use context_free to unwind the operations.
  */
-static int parse_dev_parms(struct raid_set *rs, struct dm_arg_set *as)
+static int parse_dev_params(struct raid_set *rs, struct dm_arg_set *as)
 {
 	int i;
 	int rebuild = 0;
@@ -260,13 +337,12 @@ static int parse_dev_parms(struct raid_set *rs, struct dm_arg_set *as)
 			r = dm_get_device(rs->ti, arg,
 					    dm_table_get_mode(rs->ti->table),
 					    &rs->dev[i].meta_dev);
-			rs->ti->error = "RAID metadata device lookup failure";
 			if (r)
-				return r;
+				return ti_error_ret(rs->ti, "RAID metadata device lookup failure", r);
 
 			rs->dev[i].rdev.sb_page = alloc_page(GFP_KERNEL);
 			if (!rs->dev[i].rdev.sb_page)
-				return -ENOMEM;
+				return ti_error_ret(rs->ti, "Failed to allocate superblock page", -ENOMEM);
 		}
 
 		arg = dm_shift_arg(as);
@@ -275,14 +351,11 @@ static int parse_dev_parms(struct raid_set *rs, struct dm_arg_set *as)
 
 		if (!strcmp(arg, "-")) {
 			if (!test_bit(In_sync, &rs->dev[i].rdev.flags) &&
-			    (!rs->dev[i].rdev.recovery_offset)) {
-				rs->ti->error = "Drive designated for rebuild not specified";
-				return -EINVAL;
-			}
+			    (!rs->dev[i].rdev.recovery_offset))
+				return ti_error_einval(rs->ti, "Drive designated for rebuild not specified");
 
-			rs->ti->error = "No data device supplied with metadata device";
 			if (rs->dev[i].meta_dev)
-				return -EINVAL;
+				return ti_error_einval(rs->ti, "No data device supplied with metadata device");
 
 			continue;
 		}
@@ -290,10 +363,8 @@ static int parse_dev_parms(struct raid_set *rs, struct dm_arg_set *as)
 		r = dm_get_device(rs->ti, arg,
 				    dm_table_get_mode(rs->ti->table),
 				    &rs->dev[i].data_dev);
-		if (r) {
-			rs->ti->error = "RAID device lookup failure";
-			return r;
-		}
+		if (r)
+			return ti_error_ret(rs->ti, "RAID device lookup failure", r);
 
 		if (rs->dev[i].meta_dev) {
 			metadata_available = 1;
@@ -322,8 +393,7 @@ static int parse_dev_parms(struct raid_set *rs, struct dm_arg_set *as)
 		 * User could specify 'nosync' option if desperate.
 		 */
 		DMERR("Unable to rebuild drive while array is not in-sync");
-		rs->ti->error = "RAID device lookup failure";
-		return -EINVAL;
+		return ti_error_einval(rs->ti, "Unable to rebuild drive while array is not in-sync");
 	}
 
 	return 0;
@@ -360,27 +430,20 @@ static int validate_region_size(struct raid_set *rs, unsigned long region_size)
 		/*
 		 * Validate user-supplied value.
 		 */
-		if (region_size > rs->ti->len) {
-			rs->ti->error = "Supplied region size is too large";
-			return -EINVAL;
-		}
+		if (region_size > rs->ti->len)
+			return ti_error_einval(rs->ti, "Supplied region size is too large");
 
 		if (region_size < min_region_size) {
 			DMERR("Supplied region_size (%lu sectors) below minimum (%lu)",
 			      region_size, min_region_size);
-			rs->ti->error = "Supplied region size is too small";
-			return -EINVAL;
+			return ti_error_einval(rs->ti, "Supplied region size is too small");
 		}
 
-		if (!is_power_of_2(region_size)) {
-			rs->ti->error = "Region size is not a power of 2";
-			return -EINVAL;
-		}
+		if (!is_power_of_2(region_size))
+			return ti_error_einval(rs->ti, "Region size is not a power of 2");
 
-		if (region_size < rs->md.chunk_sectors) {
-			rs->ti->error = "Region size is smaller than the chunk size";
-			return -EINVAL;
-		}
+		if (region_size < rs->md.chunk_sectors)
+			return ti_error_einval(rs->ti, "Region size is smaller than the chunk size");
 	}
 
 	/*
@@ -522,14 +585,13 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 	sector_t sectors_per_dev = rs->ti->len;
 	sector_t max_io_len;
 	const char *arg, *key;
+	struct raid_dev *rd;
 
 	arg = dm_shift_arg(as);
 	num_raid_params--; /* Account for chunk_size argument */
 
-	if (kstrtouint(arg, 10, &value) < 0) {
-		rs->ti->error = "Bad numerical argument given for chunk_size";
-		return -EINVAL;
-	}
+	if (kstrtouint(arg, 10, &value) < 0)
+		return ti_error_einval(rs->ti, "Bad numerical argument given for chunk_size");
 
 	/*
 	 * First, parse the in-order required arguments
@@ -539,13 +601,10 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 		if (value)
 			DMERR("Ignoring chunk size parameter for RAID 1");
 		value = 0;
-	} else if (!is_power_of_2(value)) {
-		rs->ti->error = "Chunk size must be a power of 2";
-		return -EINVAL;
-	} else if (value < 8) {
-		rs->ti->error = "Chunk size value is too small";
-		return -EINVAL;
-	}
+	} else if (!is_power_of_2(value))
+		return ti_error_einval(rs->ti, "Chunk size must be a power of 2");
+	else if (value < 8)
+		return ti_error_einval(rs->ti, "Chunk size value is too small");
 
 	rs->md.new_chunk_sectors = rs->md.chunk_sectors = value;
 
@@ -576,144 +635,134 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 	 */
 	for (i = 0; i < num_raid_params; i++) {
 		arg = dm_shift_arg(as);
-		if (!arg) {
-			rs->ti->error = "Not enough raid parameters given";
-			return -EINVAL;
-		}
+		if (!arg)
+			return ti_error_einval(rs->ti, "Not enough raid parameters given");
 
 		if (!strcasecmp(arg, "nosync")) {
 			rs->md.recovery_cp = MaxSector;
-			rs->ctr_flags |= CTR_FLAG_NOSYNC;
+			_set_flag(CTR_FLAG_NOSYNC, &rs->ctr_flags);
 			continue;
 		}
 		if (!strcasecmp(arg, "sync")) {
 			rs->md.recovery_cp = 0;
-			rs->ctr_flags |= CTR_FLAG_SYNC;
+			_set_flag(CTR_FLAG_SYNC, &rs->ctr_flags);
 			continue;
 		}
 
-		/* The rest of the optional arguments come in key/value pairs */
-		if ((i + 1) >= num_raid_params) {
-			rs->ti->error = "Wrong number of raid parameters given";
-			return -EINVAL;
-		}
-
 		key = arg;
 		arg = dm_shift_arg(as);
 		i++; /* Account for the argument pairs */
+		if (!arg)
+			return ti_error_einval(rs->ti, "Wrong number of raid parameters given");
 
-		/* Parameters that take a string value are checked here. */
-		if (!strcasecmp(key, "raid10_format")) {
-			if (rs->raid_type->level != 10) {
-				rs->ti->error = "'raid10_format' is an invalid parameter for this RAID type";
-				return -EINVAL;
-			}
+		/*
+		 * Parameters that take a string value are checked here.
+		 */
+
+		if (!strcasecmp(key, _argname_by_flag(CTR_FLAG_RAID10_FORMAT))) {
+			if (_test_and_set_flag(CTR_FLAG_RAID10_FORMAT, &rs->ctr_flags))
+				return ti_error_einval(rs->ti, "Only one raid10_format argument pair allowed");
+			if (rs->raid_type->level != 10)
+				return ti_error_einval(rs->ti, "'raid10_format' is an invalid parameter for this RAID type");
 			if (strcmp("near", arg) &&
 			    strcmp("far", arg) &&
-			    strcmp("offset", arg)) {
-				rs->ti->error = "Invalid 'raid10_format' value given";
-				return -EINVAL;
-			}
+			    strcmp("offset", arg))
+				return ti_error_einval(rs->ti, "Invalid 'raid10_format' value given");
+
 			raid10_format = (char *) arg;
-			rs->ctr_flags |= CTR_FLAG_RAID10_FORMAT;
 			continue;
 		}
 
-		if (kstrtouint(arg, 10, &value) < 0) {
-			rs->ti->error = "Bad numerical argument given in raid params";
-			return -EINVAL;
-		}
+		if (kstrtouint(arg, 10, &value) < 0)
+			return ti_error_einval(rs->ti, "Bad numerical argument given in raid params");
+
+		if (!strcasecmp(key, _argname_by_flag(CTR_FLAG_REBUILD))) {
+			/*
+			 * "rebuild" is being passed in by userspace to provide
+			 * indexes of replaced devices and to set up additional
+			 * devices on raid level takeover.
+ 			 */
+			if (!_in_range(value, 0, rs->md.raid_disks - 1))
+				return ti_error_einval(rs->ti, "Invalid rebuild index given");
+
+			rd = rs->dev + value;
+			clear_bit(In_sync, &rd->rdev.flags);
+			clear_bit(Faulty, &rd->rdev.flags);
+			rd->rdev.recovery_offset = 0;
+			_set_flag(CTR_FLAG_REBUILD, &rs->ctr_flags);
+		} else if (!strcasecmp(key, _argname_by_flag(CTR_FLAG_WRITE_MOSTLY))) {
+			if (rs->raid_type->level != 1)
+				return ti_error_einval(rs->ti, "write_mostly option is only valid for RAID1");
+
+			if (!_in_range(value, 0, rs->md.raid_disks - 1))
+				return ti_error_einval(rs->ti, "Invalid write_mostly index given");
 
-		/* Parameters that take a numeric value are checked here */
-		if (!strcasecmp(key, "rebuild")) {
-			if (value >= rs->md.raid_disks) {
-				rs->ti->error = "Invalid rebuild index given";
-				return -EINVAL;
-			}
-			clear_bit(In_sync, &rs->dev[value].rdev.flags);
-			rs->dev[value].rdev.recovery_offset = 0;
-			rs->ctr_flags |= CTR_FLAG_REBUILD;
-		} else if (!strcasecmp(key, "write_mostly")) {
-			if (rs->raid_type->level != 1) {
-				rs->ti->error = "write_mostly option is only valid for RAID1";
-				return -EINVAL;
-			}
-			if (value >= rs->md.raid_disks) {
-				rs->ti->error = "Invalid write_mostly drive index given";
-				return -EINVAL;
-			}
 			set_bit(WriteMostly, &rs->dev[value].rdev.flags);
-		} else if (!strcasecmp(key, "max_write_behind")) {
-			if (rs->raid_type->level != 1) {
-				rs->ti->error = "max_write_behind option is only valid for RAID1";
-				return -EINVAL;
-			}
-			rs->ctr_flags |= CTR_FLAG_MAX_WRITE_BEHIND;
+			_set_flag(CTR_FLAG_WRITE_MOSTLY, &rs->ctr_flags);
+		} else if (!strcasecmp(key, _argname_by_flag(CTR_FLAG_MAX_WRITE_BEHIND))) {
+			if (rs->raid_type->level != 1)
+				return ti_error_einval(rs->ti, "max_write_behind option is only valid for RAID1");
+
+			if (_test_and_set_flag(CTR_FLAG_MAX_WRITE_BEHIND, &rs->ctr_flags))
+				return ti_error_einval(rs->ti, "Only one max_write_behind argument pair allowed");
 
 			/*
 			 * In device-mapper, we specify things in sectors, but
 			 * MD records this value in kB
 			 */
 			value /= 2;
-			if (value > COUNTER_MAX) {
-				rs->ti->error = "Max write-behind limit out of range";
-				return -EINVAL;
-			}
+			if (value > COUNTER_MAX)
+				return ti_error_einval(rs->ti, "Max write-behind limit out of range");
+
 			rs->md.bitmap_info.max_write_behind = value;
-		} else if (!strcasecmp(key, "daemon_sleep")) {
-			rs->ctr_flags |= CTR_FLAG_DAEMON_SLEEP;
-			if (!value || (value > MAX_SCHEDULE_TIMEOUT)) {
-				rs->ti->error = "daemon sleep period out of range";
-				return -EINVAL;
-			}
+		} else if (!strcasecmp(key, _argname_by_flag(CTR_FLAG_DAEMON_SLEEP))) {
+			if (_test_and_set_flag(CTR_FLAG_DAEMON_SLEEP, &rs->ctr_flags))
+				return ti_error_einval(rs->ti, "Only one daemon_sleep argument pair allowed");
+			if (!value || (value > MAX_SCHEDULE_TIMEOUT))
+				return ti_error_einval(rs->ti, "daemon sleep period out of range");
 			rs->md.bitmap_info.daemon_sleep = value;
-		} else if (!strcasecmp(key, "stripe_cache")) {
-			rs->ctr_flags |= CTR_FLAG_STRIPE_CACHE;
-
+		} else if (!strcasecmp(key, _argname_by_flag(CTR_FLAG_STRIPE_CACHE))) {
+			if (_test_and_set_flag(CTR_FLAG_STRIPE_CACHE, &rs->ctr_flags))
+				return ti_error_einval(rs->ti, "Only one stripe_cache argument pair allowed");
 			/*
 			 * In device-mapper, we specify things in sectors, but
 			 * MD records this value in kB
 			 */
 			value /= 2;
 
-			if ((rs->raid_type->level != 5) &&
-			    (rs->raid_type->level != 6)) {
-				rs->ti->error = "Inappropriate argument: stripe_cache";
-				return -EINVAL;
-			}
-			if (raid5_set_cache_size(&rs->md, (int)value)) {
-				rs->ti->error = "Bad stripe_cache size";
-				return -EINVAL;
-			}
-		} else if (!strcasecmp(key, "min_recovery_rate")) {
-			rs->ctr_flags |= CTR_FLAG_MIN_RECOVERY_RATE;
-			if (value > INT_MAX) {
-				rs->ti->error = "min_recovery_rate out of range";
-				return -EINVAL;
-			}
+			if (!_in_range(rs->raid_type->level, 4, 6))
+				return ti_error_einval(rs->ti, "Inappropriate argument: stripe_cache");
+			if (raid5_set_cache_size(&rs->md, (int)value))
+				return ti_error_einval(rs->ti, "Bad stripe_cache size");
+
+		} else if (!strcasecmp(key, _argname_by_flag(CTR_FLAG_MIN_RECOVERY_RATE))) {
+			if (_test_and_set_flag(CTR_FLAG_MIN_RECOVERY_RATE, &rs->ctr_flags))
+				return ti_error_einval(rs->ti, "Only one min_recovery_rate argument pair allowed");
+			if (value > INT_MAX)
+				return ti_error_einval(rs->ti, "min_recovery_rate out of range");
 			rs->md.sync_speed_min = (int)value;
-		} else if (!strcasecmp(key, "max_recovery_rate")) {
-			rs->ctr_flags |= CTR_FLAG_MAX_RECOVERY_RATE;
-			if (value > INT_MAX) {
-				rs->ti->error = "max_recovery_rate out of range";
-				return -EINVAL;
-			}
+		} else if (!strcasecmp(key, _argname_by_flag(CTR_FLAG_MAX_RECOVERY_RATE))) {
+			if (_test_and_set_flag(CTR_FLAG_MAX_RECOVERY_RATE, &rs->ctr_flags))
+				return ti_error_einval(rs->ti, "Only one max_recovery_rate argument pair allowed");
+			if (value > INT_MAX)
+				return ti_error_einval(rs->ti, "max_recovery_rate out of range");
 			rs->md.sync_speed_max = (int)value;
-		} else if (!strcasecmp(key, "region_size")) {
-			rs->ctr_flags |= CTR_FLAG_REGION_SIZE;
+		} else if (!strcasecmp(key, _argname_by_flag(CTR_FLAG_REGION_SIZE))) {
+			if (_test_and_set_flag(CTR_FLAG_REGION_SIZE, &rs->ctr_flags))
+				return ti_error_einval(rs->ti, "Only one region_size argument pair allowed");
+
 			region_size = value;
-		} else if (!strcasecmp(key, "raid10_copies") &&
-			   (rs->raid_type->level == 10)) {
-			if ((value < 2) || (value > 0xFF)) {
-				rs->ti->error = "Bad value for 'raid10_copies'";
-				return -EINVAL;
-			}
-			rs->ctr_flags |= CTR_FLAG_RAID10_COPIES;
+		} else if (!strcasecmp(key, _argname_by_flag(CTR_FLAG_RAID10_COPIES))) {
+			if (_test_and_set_flag(CTR_FLAG_RAID10_COPIES, &rs->ctr_flags))
+				return ti_error_einval(rs->ti, "Only one raid10_copies argument pair allowed");
+
+			if (!_in_range(value, 2, rs->md.raid_disks))
+				return ti_error_einval(rs->ti, "Bad value for 'raid10_copies'");
+
 			raid10_copies = value;
 		} else {
 			DMERR("Unable to parse RAID parameter: %s", key);
-			rs->ti->error = "Unable to parse RAID parameters";
-			return -EINVAL;
+			return ti_error_einval(rs->ti, "Unable to parse RAID parameters");
 		}
 	}
 
@@ -729,19 +778,15 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 		return -EINVAL;
 
 	if (rs->raid_type->level == 10) {
-		if (raid10_copies > rs->md.raid_disks) {
-			rs->ti->error = "Not enough devices to satisfy specification";
-			return -EINVAL;
-		}
+		if (raid10_copies > rs->md.raid_disks)
+			return ti_error_einval(rs->ti, "Not enough devices to satisfy specification");
 
 		/*
 		 * If the format is not "near", we only support
 		 * two copies at the moment.
 		 */
-		if (strcmp("near", raid10_format) && (raid10_copies > 2)) {
-			rs->ti->error = "Too many copies for given RAID10 format.";
-			return -EINVAL;
-		}
+		if (strcmp("near", raid10_format) && (raid10_copies > 2))
+			return ti_error_einval(rs->ti, "Too many copies for given RAID10 format.");
 
 		/* (Len * #mirrors) / #devices */
 		sectors_per_dev = rs->ti->len * raid10_copies;
@@ -752,10 +797,9 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 		rs->md.new_layout = rs->md.layout;
 	} else if ((!rs->raid_type->level || rs->raid_type->level > 1) &&
 		   sector_div(sectors_per_dev,
-			      (rs->md.raid_disks - rs->raid_type->parity_devs))) {
-		rs->ti->error = "Target length not divisible by number of data devices";
-		return -EINVAL;
-	}
+			      (rs->md.raid_disks - rs->raid_type->parity_devs)))
+		return ti_error_einval(rs->ti, "Target length not divisible by number of data devices");
+
 	rs->md.dev_sectors = sectors_per_dev;
 
 	/* Assume there are no metadata devices until the drives are parsed */
@@ -1035,11 +1079,9 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev)
 		if (!test_bit(FirstUse, &r->flags) && (r->raid_disk >= 0)) {
 			role = le32_to_cpu(sb2->array_position);
 			if (role != r->raid_disk) {
-				if (rs->raid_type->level != 1) {
-					rs->ti->error = "Cannot change device "
-						"positions in RAID array";
-					return -EINVAL;
-				}
+				if (rs->raid_type->level != 1)
+					return ti_error_einval(rs->ti, "Cannot change device "
+								       "positions in RAID array");
 				DMINFO("RAID1 device #%d now at position #%d",
 				       role, r->raid_disk);
 			}
@@ -1170,18 +1212,15 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
 	if (!freshest)
 		return 0;
 
-	if (validate_raid_redundancy(rs)) {
-		rs->ti->error = "Insufficient redundancy to activate array";
-		return -EINVAL;
-	}
+	if (validate_raid_redundancy(rs))
+		return ti_error_einval(rs->ti, "Insufficient redundancy to activate array");
 
 	/*
 	 * Validation of the freshest device provides the source of
 	 * validation for the remaining devices.
 	 */
-	ti->error = "Unable to assemble array: Invalid superblocks";
 	if (super_validate(rs, freshest))
-		return -EINVAL;
+		return ti_error_einval(rs->ti, "Unable to assemble array: Invalid superblocks");
 
 	rdev_for_each(rdev, mddev)
 		if ((rdev != freshest) && super_validate(rs, rdev))
@@ -1265,16 +1304,12 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 
 	/* Must have <raid_type> */
 	arg = dm_shift_arg(&as);
-	if (!arg) {
-		ti->error = "No arguments";
-		return -EINVAL;
-	}
+	if (!arg)
+		return ti_error_einval(ti, "No arguments");
 
 	rt = get_raid_type(arg);
-	if (!rt) {
-		ti->error = "Unrecognised raid_type";
-		return -EINVAL;
-	}
+	if (!rt)
+		return ti_error_einval(ti, "Unrecognised raid_type");
 
 	/* Must have <#raid_params> */
 	if (dm_read_arg_group(_args, &as, &num_raid_params, &ti->error))
@@ -1287,10 +1322,8 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	if (dm_read_arg(_args + 1, &as_nrd, &num_raid_devs, &ti->error))
                 return -EINVAL;
 
-	if (!_in_range(num_raid_devs, 1, MAX_RAID_DEVICES)) {
-		ti->error = "Invalid number of supplied raid devices";
-                return -EINVAL;
-	}
+	if (!_in_range(num_raid_devs, 1, MAX_RAID_DEVICES))
+		return ti_error_einval(ti, "Invalid number of supplied raid devices");
 
 	rs = context_alloc(ti, rt, num_raid_devs);
 	if (IS_ERR(rs))
@@ -1300,7 +1333,7 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	if (r)
 		goto bad;
 
-	r = parse_dev_parms(rs, &as);
+	r = parse_dev_params(rs, &as);
 	if (r)
 		goto bad;
 
@@ -1330,8 +1363,7 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	}
 
 	if (ti->len != rs->md.array_sectors) {
-		ti->error = "Array size does not match requested target length";
-		r = -EINVAL;
+		r = ti_error_einval(ti, "Array size does not match requested target length");
 		goto size_mismatch;
 	}
 	rs->callbacks.congested_fn = raid_is_congested;
@@ -1751,7 +1783,7 @@ static void raid_resume(struct dm_target *ti)
 
 static struct target_type raid_target = {
 	.name = "raid",
-	.version = {1, 8, 0},
+	.version = {1, 8, 1},
 	.module = THIS_MODULE,
 	.ctr = raid_ctr,
 	.dtr = raid_dtr,
-- 
cgit v1.2.3-70-g09d2


From f090279eaff814a550b35bb51aac6b8541bddf97 Mon Sep 17 00:00:00 2001
From: Heinz Mauelshagen <heinzm@redhat.com>
Date: Thu, 19 May 2016 18:49:27 +0200
Subject: dm raid: check constructor arguments for invalid raid level/argument
 combinations

Reject invalid flag combinations to avoid potential data corruption or
failing raid set construction:

 - add definitions for constructor flag combinations and invalid flags
   per level

 - add bool test functions for the various raid types
   (also will be used by future reshaping enhancements)

 - introduce rs_check_for_invalid_flags() and _invalid_flags()
   to perform the validity checks

Signed-off-by: Heinz Mauelshagen <heinzm@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-raid.c | 131 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 130 insertions(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index ab7aa7d83364..ebb64eb66def 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -64,6 +64,61 @@ struct raid_dev {
 #define CTR_FLAG_RAID10_COPIES     0x400 /* 2 */ /* Only with raid10 */
 #define CTR_FLAG_RAID10_FORMAT     0x800 /* 2 */ /* Only with raid10 */
 
+/*
+ * Definitions of various constructor flags to
+ * be used in checks of valid / invalid flags
+ * per raid level.
+ */
+/* Define all sync flags ('sync' and 'nosync') */
+#define	CTR_FLAGS_ANY_SYNC		(CTR_FLAG_SYNC | CTR_FLAG_NOSYNC)
+
+/* Define flags for options without argument (e.g. 'nosync') */
+#define	CTR_FLAG_OPTIONS_NO_ARGS	CTR_FLAGS_ANY_SYNC
+
+/* Define flags for options with one argument (e.g. 'delta_disks +2') */
+#define CTR_FLAG_OPTIONS_ONE_ARG (CTR_FLAG_REBUILD | \
+				  CTR_FLAG_WRITE_MOSTLY | \
+				  CTR_FLAG_DAEMON_SLEEP | \
+				  CTR_FLAG_MIN_RECOVERY_RATE | \
+				  CTR_FLAG_MAX_RECOVERY_RATE | \
+				  CTR_FLAG_MAX_WRITE_BEHIND | \
+				  CTR_FLAG_STRIPE_CACHE | \
+				  CTR_FLAG_REGION_SIZE | \
+				  CTR_FLAG_RAID10_COPIES | \
+				  CTR_FLAG_RAID10_FORMAT)
+
+/* All ctr optional arguments */
+#define ALL_CTR_FLAGS		(CTR_FLAG_OPTIONS_NO_ARGS | \
+				 CTR_FLAG_OPTIONS_ONE_ARG)
+
+/* Invalid options definitions per raid level... */
+
+/* "raid0" does not accept any options */
+#define RAID0_INVALID_FLAGS ALL_CTR_FLAGS
+
+/* "raid1" does not accept stripe cache or any raid10 options */
+#define RAID1_INVALID_FLAGS	(CTR_FLAG_STRIPE_CACHE | \
+				 CTR_FLAG_RAID10_COPIES | \
+				 CTR_FLAG_RAID10_FORMAT)
+
+/* "raid10" does not accept any raid1 or stripe cache options */
+#define RAID10_INVALID_FLAGS	(CTR_FLAG_WRITE_MOSTLY | \
+				 CTR_FLAG_MAX_WRITE_BEHIND | \
+				 CTR_FLAG_STRIPE_CACHE)
+/*
+ * "raid4/5/6" do not accept any raid1 or raid10 specific options
+ *
+ * "raid6" does not accept "nosync", because it is not guaranteed
+ * that both parity and q-syndrome are being written properly with
+ * any writes
+ */
+#define RAID45_INVALID_FLAGS	(CTR_FLAG_WRITE_MOSTLY | \
+				 CTR_FLAG_MAX_WRITE_BEHIND | \
+				 CTR_FLAG_RAID10_FORMAT | \
+				 CTR_FLAG_RAID10_COPIES)
+#define RAID6_INVALID_FLAGS	(CTR_FLAG_NOSYNC | RAID45_INVALID_FLAGS)
+/* ...invalid options definitions per raid level */
+
 struct raid_set {
 	struct dm_target *ti;
 
@@ -166,6 +221,41 @@ static const char *_argname_by_flag(const uint32_t flag)
 	return NULL;
 }
 
+/*
+ * bool helpers to test for various raid levels of a raid type
+ */
+
+/* Return true, if raid type in @rt is raid0 */
+static bool rt_is_raid0(struct raid_type *rt)
+{
+	return !rt->level;
+}
+
+/* Return true, if raid type in @rt is raid1 */
+static bool rt_is_raid1(struct raid_type *rt)
+{
+	return rt->level == 1;
+}
+
+/* Return true, if raid type in @rt is raid10 */
+static bool rt_is_raid10(struct raid_type *rt)
+{
+	return rt->level == 10;
+}
+
+/* Return true, if raid type in @rt is raid4/5 */
+static bool rt_is_raid45(struct raid_type *rt)
+{
+	return _in_range(rt->level, 4, 5);
+}
+
+/* Return true, if raid type in @rt is raid6 */
+static bool rt_is_raid6(struct raid_type *rt)
+{
+	return rt->level == 6;
+}
+/* END: raid level bools */
+
 /*
  * Convenience functions to set ti->error to @errmsg and
  * return @r in order to shorten code in a lot of places
@@ -182,6 +272,44 @@ static int ti_error_einval(struct dm_target *ti, const char *errmsg)
 }
 /* END: convenience functions to set ti->error to @errmsg... */
 
+/* Return invalid ctr flags for the raid level of @rs */
+static uint32_t _invalid_flags(struct raid_set *rs)
+{
+	if (rt_is_raid0(rs->raid_type))
+		return RAID0_INVALID_FLAGS;
+	else if (rt_is_raid1(rs->raid_type))
+		return RAID1_INVALID_FLAGS;
+	else if (rt_is_raid10(rs->raid_type))
+		return RAID10_INVALID_FLAGS;
+	else if (rt_is_raid45(rs->raid_type))
+		return RAID45_INVALID_FLAGS;
+	else if (rt_is_raid6(rs->raid_type))
+		return RAID6_INVALID_FLAGS;
+
+	return ~0;
+}
+
+/*
+ * Check @rs for ctr flags that are invalid for its raid level
+ *
+ * Has to be called after parsing of the ctr flags!
+ */
+static int rs_check_for_invalid_flags(struct raid_set *rs)
+{
+	unsigned int ctr_flags = rs->ctr_flags, flag = 0;
+	const uint32_t invalid_flags = _invalid_flags(rs);
+
+	while ((ctr_flags &= ~flag)) {
+		flag = 1 << __ffs(ctr_flags);
+
+		if (_test_flag(flag, rs->ctr_flags) &&
+		    _test_flag(flag, invalid_flags))
+			return ti_error_einval(rs->ti, "Invalid flag combined");
+	}
+
+	return 0;
+}
+
 static char *raid10_md_layout_to_format(int layout)
 {
 	/*
@@ -806,7 +934,8 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 	rs->md.persistent = 0;
 	rs->md.external = 1;
 
-	return 0;
+	/* Check, if any invalid ctr arguments have been passed in for the raid level */
+	return rs_check_for_invalid_flags(rs);
 }
 
 static void do_table_event(struct work_struct *ws)
-- 
cgit v1.2.3-70-g09d2


From ad51d7f1d1731f0fd62690edda706288bc965abb Mon Sep 17 00:00:00 2001
From: Heinz Mauelshagen <heinzm@redhat.com>
Date: Thu, 19 May 2016 18:49:28 +0200
Subject: dm raid: more use of flag testing wrappers

 - add _test_flags() function

 - use it to simplify rs_check_for_invalid_flags()

 - use _test_flag() throughout

Signed-off-by: Heinz Mauelshagen <heinzm@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-raid.c | 48 +++++++++++++++++++++++-------------------------
 1 file changed, 23 insertions(+), 25 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index ebb64eb66def..668398dfba32 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -175,6 +175,12 @@ static bool _test_flag(uint32_t flag, uint32_t flags)
 	return (flag & flags) ? true : false;
 }
 
+/* Test multiple @flags in @all_flags */
+static bool _test_flags(uint32_t flags, uint32_t all_flags)
+{
+	return (flags & all_flags) ? true : false;
+}
+
 /* Return true if single @flag is set in @*flags, else set it and return false */
 static bool _test_and_set_flag(uint32_t flag, uint32_t *flags)
 {
@@ -296,16 +302,8 @@ static uint32_t _invalid_flags(struct raid_set *rs)
  */
 static int rs_check_for_invalid_flags(struct raid_set *rs)
 {
-	unsigned int ctr_flags = rs->ctr_flags, flag = 0;
-	const uint32_t invalid_flags = _invalid_flags(rs);
-
-	while ((ctr_flags &= ~flag)) {
-		flag = 1 << __ffs(ctr_flags);
-
-		if (_test_flag(flag, rs->ctr_flags) &&
-		    _test_flag(flag, invalid_flags))
-			return ti_error_einval(rs->ti, "Invalid flag combined");
-	}
+	if (_test_flags(rs->ctr_flags, _invalid_flags(rs)))
+		return ti_error_einval(rs->ti, "Invalid flag combined");
 
 	return 0;
 }
@@ -1150,7 +1148,7 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev)
 		return -EINVAL;
 	}
 
-	if (!(rs->ctr_flags & (CTR_FLAG_SYNC | CTR_FLAG_NOSYNC)))
+	if (!(_test_flags(CTR_FLAGS_ANY_SYNC, rs->ctr_flags)))
 		mddev->recovery_cp = le64_to_cpu(sb->array_resync_offset);
 
 	/*
@@ -1293,7 +1291,7 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
 		 */
 		rdev->sectors = to_sector(i_size_read(rdev->bdev->bd_inode));
 
-		if (rs->ctr_flags & CTR_FLAG_SYNC)
+		if (_test_flag(CTR_FLAG_SYNC, rs->ctr_flags))
 			continue;
 
 		if (!rdev->meta_bdev)
@@ -1650,7 +1648,7 @@ static void raid_status(struct dm_target *ti, status_type_t type,
 	case STATUSTYPE_TABLE:
 		/* The string you would use to construct this array */
 		for (i = 0; i < rs->md.raid_disks; i++) {
-			if ((rs->ctr_flags & CTR_FLAG_REBUILD) &&
+			if (_test_flag(CTR_FLAG_REBUILD, rs->ctr_flags) &&
 			    rs->dev[i].data_dev &&
 			    !test_bit(In_sync, &rs->dev[i].rdev.flags))
 				raid_param_cnt += 2; /* for rebuilds */
@@ -1666,26 +1664,26 @@ static void raid_status(struct dm_target *ti, status_type_t type,
 		DMEMIT("%s %u %u", rs->raid_type->name,
 		       raid_param_cnt, rs->md.chunk_sectors);
 
-		if ((rs->ctr_flags & CTR_FLAG_SYNC) &&
-		    (rs->md.recovery_cp == MaxSector))
+		if (_test_flag(CTR_FLAG_SYNC, rs->ctr_flags) &&
+		    rs->md.recovery_cp == MaxSector)
 			DMEMIT(" sync");
-		if (rs->ctr_flags & CTR_FLAG_NOSYNC)
+		if (_test_flag(CTR_FLAG_NOSYNC, rs->ctr_flags))
 			DMEMIT(" nosync");
 
 		for (i = 0; i < rs->md.raid_disks; i++)
-			if ((rs->ctr_flags & CTR_FLAG_REBUILD) &&
+			if (_test_flag(CTR_FLAG_REBUILD, rs->ctr_flags) &&
 			    rs->dev[i].data_dev &&
 			    !test_bit(In_sync, &rs->dev[i].rdev.flags))
 				DMEMIT(" rebuild %u", i);
 
-		if (rs->ctr_flags & CTR_FLAG_DAEMON_SLEEP)
+		if (_test_flag(CTR_FLAG_DAEMON_SLEEP, rs->ctr_flags))
 			DMEMIT(" daemon_sleep %lu",
 			       rs->md.bitmap_info.daemon_sleep);
 
-		if (rs->ctr_flags & CTR_FLAG_MIN_RECOVERY_RATE)
+		if (_test_flag(CTR_FLAG_MIN_RECOVERY_RATE, rs->ctr_flags))
 			DMEMIT(" min_recovery_rate %d", rs->md.sync_speed_min);
 
-		if (rs->ctr_flags & CTR_FLAG_MAX_RECOVERY_RATE)
+		if (_test_flag(CTR_FLAG_MAX_RECOVERY_RATE, rs->ctr_flags))
 			DMEMIT(" max_recovery_rate %d", rs->md.sync_speed_max);
 
 		for (i = 0; i < rs->md.raid_disks; i++)
@@ -1693,11 +1691,11 @@ static void raid_status(struct dm_target *ti, status_type_t type,
 			    test_bit(WriteMostly, &rs->dev[i].rdev.flags))
 				DMEMIT(" write_mostly %u", i);
 
-		if (rs->ctr_flags & CTR_FLAG_MAX_WRITE_BEHIND)
+		if (_test_flag(CTR_FLAG_MAX_WRITE_BEHIND, rs->ctr_flags))
 			DMEMIT(" max_write_behind %lu",
 			       rs->md.bitmap_info.max_write_behind);
 
-		if (rs->ctr_flags & CTR_FLAG_STRIPE_CACHE) {
+		if (_test_flag(CTR_FLAG_STRIPE_CACHE, rs->ctr_flags)) {
 			struct r5conf *conf = rs->md.private;
 
 			/* convert from kiB to sectors */
@@ -1705,15 +1703,15 @@ static void raid_status(struct dm_target *ti, status_type_t type,
 			       conf ? conf->max_nr_stripes * 2 : 0);
 		}
 
-		if (rs->ctr_flags & CTR_FLAG_REGION_SIZE)
+		if (_test_flag(CTR_FLAG_REGION_SIZE, rs->ctr_flags))
 			DMEMIT(" region_size %lu",
 			       rs->md.bitmap_info.chunksize >> 9);
 
-		if (rs->ctr_flags & CTR_FLAG_RAID10_COPIES)
+		if (_test_flag(CTR_FLAG_RAID10_COPIES, rs->ctr_flags))
 			DMEMIT(" raid10_copies %u",
 			       raid10_md_layout_to_copies(rs->md.layout));
 
-		if (rs->ctr_flags & CTR_FLAG_RAID10_FORMAT)
+		if (_test_flag(CTR_FLAG_RAID10_FORMAT, rs->ctr_flags))
 			DMEMIT(" raid10_format %s",
 			       raid10_md_layout_to_format(rs->md.layout));
 
-- 
cgit v1.2.3-70-g09d2


From 676fa5ad6e96e5704b0f2d5bb56ea115c807eef4 Mon Sep 17 00:00:00 2001
From: Heinz Mauelshagen <heinzm@redhat.com>
Date: Thu, 19 May 2016 18:49:29 +0200
Subject: dm raid: use rt_is_raid*() in all appropriate checks

Make use of the raid type rt_is_*() bool functions for simplification and
consistency reasons.

Signed-off-by: Heinz Mauelshagen <heinzm@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-raid.c | 30 ++++++++++++++++++------------
 1 file changed, 18 insertions(+), 12 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 668398dfba32..719612440dfc 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -260,6 +260,12 @@ static bool rt_is_raid6(struct raid_type *rt)
 {
 	return rt->level == 6;
 }
+
+/* Return true, if raid type in @rt is raid4/5/6 */
+static bool rt_is_raid456(struct raid_type *rt)
+{
+	return _in_range(rt->level, 4, 6);
+}
 /* END: raid level bools */
 
 /*
@@ -723,7 +729,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 	 * First, parse the in-order required arguments
 	 * "chunk_size" is the only argument of this type.
 	 */
-	if (rs->raid_type->level == 1) {
+	if (rt_is_raid1(rs->raid_type)) {
 		if (value)
 			DMERR("Ignoring chunk size parameter for RAID 1");
 		value = 0;
@@ -788,7 +794,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 		if (!strcasecmp(key, _argname_by_flag(CTR_FLAG_RAID10_FORMAT))) {
 			if (_test_and_set_flag(CTR_FLAG_RAID10_FORMAT, &rs->ctr_flags))
 				return ti_error_einval(rs->ti, "Only one raid10_format argument pair allowed");
-			if (rs->raid_type->level != 10)
+			if (!rt_is_raid10(rs->raid_type))
 				return ti_error_einval(rs->ti, "'raid10_format' is an invalid parameter for this RAID type");
 			if (strcmp("near", arg) &&
 			    strcmp("far", arg) &&
@@ -817,7 +823,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 			rd->rdev.recovery_offset = 0;
 			_set_flag(CTR_FLAG_REBUILD, &rs->ctr_flags);
 		} else if (!strcasecmp(key, _argname_by_flag(CTR_FLAG_WRITE_MOSTLY))) {
-			if (rs->raid_type->level != 1)
+			if (!rt_is_raid1(rs->raid_type))
 				return ti_error_einval(rs->ti, "write_mostly option is only valid for RAID1");
 
 			if (!_in_range(value, 0, rs->md.raid_disks - 1))
@@ -826,7 +832,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 			set_bit(WriteMostly, &rs->dev[value].rdev.flags);
 			_set_flag(CTR_FLAG_WRITE_MOSTLY, &rs->ctr_flags);
 		} else if (!strcasecmp(key, _argname_by_flag(CTR_FLAG_MAX_WRITE_BEHIND))) {
-			if (rs->raid_type->level != 1)
+			if (!rt_is_raid1(rs->raid_type))
 				return ti_error_einval(rs->ti, "max_write_behind option is only valid for RAID1");
 
 			if (_test_and_set_flag(CTR_FLAG_MAX_WRITE_BEHIND, &rs->ctr_flags))
@@ -856,7 +862,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 			 */
 			value /= 2;
 
-			if (!_in_range(rs->raid_type->level, 4, 6))
+			if (!rt_is_raid456(rs->raid_type))
 				return ti_error_einval(rs->ti, "Inappropriate argument: stripe_cache");
 			if (raid5_set_cache_size(&rs->md, (int)value))
 				return ti_error_einval(rs->ti, "Bad stripe_cache size");
@@ -903,7 +909,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 	if (dm_set_target_max_io_len(rs->ti, max_io_len))
 		return -EINVAL;
 
-	if (rs->raid_type->level == 10) {
+	if (rt_is_raid10(rs->raid_type)) {
 		if (raid10_copies > rs->md.raid_disks)
 			return ti_error_einval(rs->ti, "Not enough devices to satisfy specification");
 
@@ -921,7 +927,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 		rs->md.layout = raid10_format_to_md_layout(raid10_format,
 							   raid10_copies);
 		rs->md.new_layout = rs->md.layout;
-	} else if ((!rs->raid_type->level || rs->raid_type->level > 1) &&
+	} else if (!rt_is_raid1(rs->raid_type) &&
 		   sector_div(sectors_per_dev,
 			      (rs->md.raid_disks - rs->raid_type->parity_devs)))
 		return ti_error_einval(rs->ti, "Target length not divisible by number of data devices");
@@ -1142,7 +1148,7 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev)
 	}
 
 	/* We can only change the number of devices in RAID1 right now */
-	if ((rs->raid_type->level != 1) &&
+	if (!rt_is_raid1(rs->raid_type) &&
 	    (le32_to_cpu(sb->num_devices) != mddev->raid_disks)) {
 		DMERR("Reshaping arrays not yet supported. (device count change)");
 		return -EINVAL;
@@ -1206,7 +1212,7 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev)
 		if (!test_bit(FirstUse, &r->flags) && (r->raid_disk >= 0)) {
 			role = le32_to_cpu(sb2->array_position);
 			if (role != r->raid_disk) {
-				if (rs->raid_type->level != 1)
+				if (!rt_is_raid1(rs->raid_type))
 					return ti_error_einval(rs->ti, "Cannot change device "
 								       "positions in RAID array");
 				DMINFO("RAID1 device #%d now at position #%d",
@@ -1243,7 +1249,7 @@ static int super_validate(struct raid_set *rs, struct md_rdev *rdev)
 	}
 
 	/* Enable bitmap creation for RAID levels != 0 */
-	mddev->bitmap_info.offset = (rs->raid_type->level) ? to_sector(4096) : 0;
+	mddev->bitmap_info.offset = rt_is_raid0(rs->raid_type) ? 0 : to_sector(4096);
 	rdev->mddev->bitmap_info.default_offset = mddev->bitmap_info.offset;
 
 	if (!test_bit(FirstUse, &rdev->flags)) {
@@ -1564,7 +1570,7 @@ static void raid_status(struct dm_target *ti, status_type_t type,
 	case STATUSTYPE_INFO:
 		DMEMIT("%s %d ", rs->raid_type->name, rs->md.raid_disks);
 
-		if (rs->raid_type->level) {
+		if (!rt_is_raid0(rs->raid_type)) {
 			if (test_bit(MD_RECOVERY_RUNNING, &rs->md.recovery))
 				sync = rs->md.curr_resync_completed;
 			else
@@ -1887,7 +1893,7 @@ static void raid_resume(struct dm_target *ti)
 {
 	struct raid_set *rs = ti->private;
 
-	if (rs->raid_type->level) {
+	if (!rt_is_raid0(rs->raid_type)) {
 		set_bit(MD_CHANGE_DEVS, &rs->md.flags);
 
 		if (!rs->bitmap_loaded) {
-- 
cgit v1.2.3-70-g09d2


From 33e53f06850f44ec9722e08a993ecf8816e447a5 Mon Sep 17 00:00:00 2001
From: Heinz Mauelshagen <heinzm@redhat.com>
Date: Thu, 19 May 2016 18:49:30 +0200
Subject: dm raid: introduce extended superblock and new raid types to support
 takeover/reshaping

Add new members to the dm-raid superblock and new raid types to support
takeover/reshape.

Add all necessary members needed to support takeover and reshape in one
go -- aiming to limit the amount of changes to the superblock layout.

This is a larger patch due to the new superblock members, their related
flags, validation of both and involved API additions/changes:

 - add additional members to keep track of:
   - state about forward/backward reshaping
   - reshape position
   - new level, layout, stripe size and delta disks
   - data offset to current and new data for out-of-place reshapes
   - failed devices bitfield extensions to keep track of max raid devices

 - adjust super_validate() to cope with new superblock members

 - adjust super_init_validation() to cope with new superblock members

 - add definitions for ctr flags supporting delta disks etc.

 - add new raid types (raid6_n_6 etc.)

 - add new raid10 supporting function API (_is_raid10_*())

 - adjust to changed raid10 supporting function API

Signed-off-by: Heinz Mauelshagen <heinzm@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-raid.c | 604 +++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 471 insertions(+), 133 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 719612440dfc..c98c34c4d284 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -63,6 +63,10 @@ struct raid_dev {
 #define CTR_FLAG_REGION_SIZE       0x200 /* 2 */ /* Not with raid0! */
 #define CTR_FLAG_RAID10_COPIES     0x400 /* 2 */ /* Only with raid10 */
 #define CTR_FLAG_RAID10_FORMAT     0x800 /* 2 */ /* Only with raid10 */
+/* New for v1.8.0 */
+#define CTR_FLAG_DELTA_DISKS          0x1000 /* 2 */ /* Only with reshapable raid4/5/6/10! */
+#define CTR_FLAG_DATA_OFFSET          0x2000 /* 2 */ /* Only with reshapable raid4/5/6/10! */
+#define CTR_FLAG_RAID10_USE_NEAR_SETS 0x4000 /* 2 */ /* Only with raid10! */
 
 /*
  * Definitions of various constructor flags to
@@ -73,7 +77,8 @@ struct raid_dev {
 #define	CTR_FLAGS_ANY_SYNC		(CTR_FLAG_SYNC | CTR_FLAG_NOSYNC)
 
 /* Define flags for options without argument (e.g. 'nosync') */
-#define	CTR_FLAG_OPTIONS_NO_ARGS	CTR_FLAGS_ANY_SYNC
+#define	CTR_FLAG_OPTIONS_NO_ARGS	(CTR_FLAGS_ANY_SYNC | \
+					 CTR_FLAG_RAID10_USE_NEAR_SETS)
 
 /* Define flags for options with one argument (e.g. 'delta_disks +2') */
 #define CTR_FLAG_OPTIONS_ONE_ARG (CTR_FLAG_REBUILD | \
@@ -85,7 +90,9 @@ struct raid_dev {
 				  CTR_FLAG_STRIPE_CACHE | \
 				  CTR_FLAG_REGION_SIZE | \
 				  CTR_FLAG_RAID10_COPIES | \
-				  CTR_FLAG_RAID10_FORMAT)
+				  CTR_FLAG_RAID10_FORMAT | \
+				  CTR_FLAG_DELTA_DISKS | \
+				  CTR_FLAG_DATA_OFFSET)
 
 /* All ctr optional arguments */
 #define ALL_CTR_FLAGS		(CTR_FLAG_OPTIONS_NO_ARGS | \
@@ -99,7 +106,9 @@ struct raid_dev {
 /* "raid1" does not accept stripe cache or any raid10 options */
 #define RAID1_INVALID_FLAGS	(CTR_FLAG_STRIPE_CACHE | \
 				 CTR_FLAG_RAID10_COPIES | \
-				 CTR_FLAG_RAID10_FORMAT)
+				 CTR_FLAG_RAID10_FORMAT | \
+				 CTR_FLAG_DELTA_DISKS | \
+				 CTR_FLAG_DATA_OFFSET)
 
 /* "raid10" does not accept any raid1 or stripe cache options */
 #define RAID10_INVALID_FLAGS	(CTR_FLAG_WRITE_MOSTLY | \
@@ -115,16 +124,24 @@ struct raid_dev {
 #define RAID45_INVALID_FLAGS	(CTR_FLAG_WRITE_MOSTLY | \
 				 CTR_FLAG_MAX_WRITE_BEHIND | \
 				 CTR_FLAG_RAID10_FORMAT | \
-				 CTR_FLAG_RAID10_COPIES)
+				 CTR_FLAG_RAID10_COPIES | \
+				 CTR_FLAG_RAID10_USE_NEAR_SETS)
 #define RAID6_INVALID_FLAGS	(CTR_FLAG_NOSYNC | RAID45_INVALID_FLAGS)
 /* ...invalid options definitions per raid level */
 
+/* Array elements of 64 bit needed for rebuild/write_mostly bits */
+#define DISKS_ARRAY_ELEMS ((MAX_RAID_DEVICES + (sizeof(uint64_t) * 8 - 1)) / sizeof(uint64_t) / 8)
+
 struct raid_set {
 	struct dm_target *ti;
 
 	uint32_t bitmap_loaded;
 	uint32_t ctr_flags;
 
+	int raid_disks;
+	int delta_disks;
+	int raid10_copies;
+
 	struct mddev md;
 	struct raid_type *raid_type;
 	struct dm_target_callbacks callbacks;
@@ -132,6 +149,12 @@ struct raid_set {
 	struct raid_dev dev[0];
 };
 
+/* raid10 algorithms (i.e. formats) */
+#define	ALGORITHM_RAID10_DEFAULT	0
+#define	ALGORITHM_RAID10_NEAR		1
+#define	ALGORITHM_RAID10_OFFSET		2
+#define	ALGORITHM_RAID10_FAR		3
+
 /* Supported raid types and properties. */
 static struct raid_type {
 	const char *name;		/* RAID algorithm. */
@@ -141,17 +164,26 @@ static struct raid_type {
 	const unsigned level;		/* RAID level. */
 	const unsigned algorithm;	/* RAID algorithm. */
 } raid_types[] = {
-	{"raid0",    "RAID0 (striping)",                0, 2, 0, 0 /* NONE */},
-	{"raid1",    "RAID1 (mirroring)",               0, 2, 1, 0 /* NONE */},
-	{"raid10",   "RAID10 (striped mirrors)",        0, 2, 10, UINT_MAX /* Varies */},
-	{"raid4",    "RAID4 (dedicated parity disk)",	1, 2, 5, ALGORITHM_PARITY_0},
-	{"raid5_la", "RAID5 (left asymmetric)",		1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC},
-	{"raid5_ra", "RAID5 (right asymmetric)",	1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC},
-	{"raid5_ls", "RAID5 (left symmetric)",		1, 2, 5, ALGORITHM_LEFT_SYMMETRIC},
-	{"raid5_rs", "RAID5 (right symmetric)",		1, 2, 5, ALGORITHM_RIGHT_SYMMETRIC},
-	{"raid6_zr", "RAID6 (zero restart)",		2, 4, 6, ALGORITHM_ROTATING_ZERO_RESTART},
-	{"raid6_nr", "RAID6 (N restart)",		2, 4, 6, ALGORITHM_ROTATING_N_RESTART},
-	{"raid6_nc", "RAID6 (N continue)",		2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE}
+	{"raid0",         "raid0 (striping)",			    0, 2, 0,  0 /* NONE */},
+	{"raid1",         "raid1 (mirroring)",			    0, 2, 1,  0 /* NONE */},
+	{"raid10_far",    "raid10 far (striped mirrors)",	    0, 2, 10, ALGORITHM_RAID10_FAR},
+	{"raid10_offset", "raid10 offset (striped mirrors)",	    0, 2, 10, ALGORITHM_RAID10_OFFSET},
+	{"raid10_near",   "raid10 near (striped mirrors)",	    0, 2, 10, ALGORITHM_RAID10_NEAR},
+	{"raid10",        "raid10 (striped mirrors)",		    0, 2, 10, ALGORITHM_RAID10_DEFAULT},
+	{"raid4",         "raid4 (dedicated last parity disk)",	    1, 2, 4,  ALGORITHM_PARITY_N}, /* raid4 layout = raid5_n */
+	{"raid5_n",       "raid5 (dedicated last parity disk)",	    1, 2, 5,  ALGORITHM_PARITY_N},
+	{"raid5_ls",      "raid5 (left symmetric)",		    1, 2, 5,  ALGORITHM_LEFT_SYMMETRIC},
+	{"raid5_rs",      "raid5 (right symmetric)",		    1, 2, 5,  ALGORITHM_RIGHT_SYMMETRIC},
+	{"raid5_la",      "raid5 (left asymmetric)",		    1, 2, 5,  ALGORITHM_LEFT_ASYMMETRIC},
+	{"raid5_ra",      "raid5 (right asymmetric)",		    1, 2, 5,  ALGORITHM_RIGHT_ASYMMETRIC},
+	{"raid6_zr",      "raid6 (zero restart)",		    2, 4, 6,  ALGORITHM_ROTATING_ZERO_RESTART},
+	{"raid6_nr",      "raid6 (N restart)",			    2, 4, 6,  ALGORITHM_ROTATING_N_RESTART},
+	{"raid6_nc",      "raid6 (N continue)",			    2, 4, 6,  ALGORITHM_ROTATING_N_CONTINUE},
+	{"raid6_n_6",     "raid6 (dedicated parity/Q n/6)",	    2, 4, 6,  ALGORITHM_PARITY_N_6},
+	{"raid6_ls_6",    "raid6 (left symmetric dedicated Q 6)",   2, 4, 6,  ALGORITHM_LEFT_SYMMETRIC_6},
+	{"raid6_rs_6",    "raid6 (right symmetric dedicated Q 6)",  2, 4, 6,  ALGORITHM_RIGHT_SYMMETRIC_6},
+	{"raid6_la_6",    "raid6 (left asymmetric dedicated Q 6)",  2, 4, 6,  ALGORITHM_LEFT_ASYMMETRIC_6},
+	{"raid6_ra_6",    "raid6 (right asymmetric dedicated Q 6)", 2, 4, 6,  ALGORITHM_RIGHT_ASYMMETRIC_6}
 };
 
 /* True, if @v is in inclusive range [@min, @max] */
@@ -227,6 +259,23 @@ static const char *_argname_by_flag(const uint32_t flag)
 	return NULL;
 }
 
+/*
+ * bool helpers to test for various raid levels of a raid set,
+ * i.e. its level as reported by the superblock rather than
+ * the requested raid_type passed to the constructor.
+ */
+/* Return true, if raid set in @rs is raid0 */
+static bool rs_is_raid0(struct raid_set *rs)
+{
+	return !rs->md.level;
+}
+
+/* Return true, if raid set in @rs is raid10 */
+static bool rs_is_raid10(struct raid_set *rs)
+{
+	return rs->md.level == 10;
+}
+
 /*
  * bool helpers to test for various raid levels of a raid type
  */
@@ -314,57 +363,184 @@ static int rs_check_for_invalid_flags(struct raid_set *rs)
 	return 0;
 }
 
-static char *raid10_md_layout_to_format(int layout)
+
+/* MD raid10 bit definitions and helpers */
+#define RAID10_OFFSET			(1 << 16) /* stripes with data copies area adjacent on devices */
+#define RAID10_BROCKEN_USE_FAR_SETS	(1 << 17) /* Broken in raid10.c: use sets instead of whole stripe rotation */
+#define RAID10_USE_FAR_SETS		(1 << 18) /* Use sets instead of whole stripe rotation */
+#define RAID10_FAR_COPIES_SHIFT		8	  /* raid10 # far copies shift (2nd byte of layout) */
+
+/* Return md raid10 near copies for @layout */
+static unsigned int _raid10_near_copies(int layout)
+{
+	return layout & 0xFF;
+}
+
+/* Return md raid10 far copies for @layout */
+static unsigned int _raid10_far_copies(int layout)
+{
+	return _raid10_near_copies(layout >> RAID10_FAR_COPIES_SHIFT);
+}
+
+/* Return true if md raid10 offset for @layout */
+static unsigned int _is_raid10_offset(int layout)
+{
+	return layout & RAID10_OFFSET;
+}
+
+/* Return true if md raid10 near for @layout */
+static unsigned int _is_raid10_near(int layout)
+{
+	return !_is_raid10_offset(layout) && _raid10_near_copies(layout) > 1;
+}
+
+/* Return true if md raid10 far for @layout */
+static unsigned int _is_raid10_far(int layout)
+{
+	return !_is_raid10_offset(layout) && _raid10_far_copies(layout) > 1;
+}
+
+/* Return md raid10 layout string for @layout */
+static const char *raid10_md_layout_to_format(int layout)
 {
 	/*
-	 * Bit 16 and 17 stand for "offset" and "use_far_sets"
+	 * Bit 16 stands for "offset"
+	 * (i.e. adjacent stripes hold copies)
+	 *
 	 * Refer to MD's raid10.c for details
 	 */
-	if ((layout & 0x10000) && (layout & 0x20000))
+	if (_is_raid10_offset(layout))
 		return "offset";
 
-	if ((layout & 0xFF) > 1)
+	if (_raid10_near_copies(layout) > 1)
 		return "near";
 
+	WARN_ON(_raid10_far_copies(layout) < 2);
+
 	return "far";
 }
 
-static unsigned raid10_md_layout_to_copies(int layout)
+/* Return md raid10 algorithm for @name */
+static const int raid10_name_to_format(const char *name)
+{
+	if (!strcasecmp(name, "near"))
+		return ALGORITHM_RAID10_NEAR;
+	else if (!strcasecmp(name, "offset"))
+		return ALGORITHM_RAID10_OFFSET;
+	else if (!strcasecmp(name, "far"))
+		return ALGORITHM_RAID10_FAR;
+
+	return -EINVAL;
+}
+
+
+/* Return md raid10 copies for @layout */
+static unsigned int raid10_md_layout_to_copies(int layout)
 {
-	if ((layout & 0xFF) > 1)
-		return layout & 0xFF;
-	return (layout >> 8) & 0xFF;
+	return _raid10_near_copies(layout) > 1 ?
+	       _raid10_near_copies(layout) : _raid10_far_copies(layout);
 }
 
-static int raid10_format_to_md_layout(char *format, unsigned copies)
+/* Return md raid10 layout for @algorithm and @copies, or -EINVAL if @algorithm is unknown */
+static int raid10_format_to_md_layout(struct raid_set *rs,
+				      unsigned int algorithm,
+				      unsigned int copies)
 {
-	unsigned n = 1, f = 1;
+	unsigned int n = 1, f = 1, r = 0;
 
-	if (!strcasecmp("near", format))
+	/*
+	 * MD resilience flaw:
+	 *
+	 * enabling use_far_sets for far/offset formats causes copies
+	 * to be colocated on the same devs together with their origins!
+	 *
+	 * -> disable it for now in the definition above
+	 */
+	if (algorithm == ALGORITHM_RAID10_DEFAULT ||
+	    algorithm == ALGORITHM_RAID10_NEAR)
 		n = copies;
-	else
+
+	else if (algorithm == ALGORITHM_RAID10_OFFSET) {
+		f = copies;
+		r = RAID10_OFFSET;
+		if (!_test_flag(CTR_FLAG_RAID10_USE_NEAR_SETS, rs->ctr_flags))
+			r |= RAID10_USE_FAR_SETS;
+
+	} else if (algorithm == ALGORITHM_RAID10_FAR) {
 		f = copies;
+		r = !RAID10_OFFSET;
+		if (!_test_flag(CTR_FLAG_RAID10_USE_NEAR_SETS, rs->ctr_flags))
+			r |= RAID10_USE_FAR_SETS;
 
-	if (!strcasecmp("offset", format))
-		return 0x30000 | (f << 8) | n;
+	} else
+		return -EINVAL;
+
+	return r | (f << RAID10_FAR_COPIES_SHIFT) | n;
+}
+/* END: MD raid10 bit definitions and helpers */
 
-	if (!strcasecmp("far", format))
-		return 0x20000 | (f << 8) | n;
+/* Check for any of the raid10 algorithms */
+static int _got_raid10(struct raid_type *rtp, const int layout)
+{
+	if (rtp->level == 10) {
+		switch (rtp->algorithm) {
+		case ALGORITHM_RAID10_DEFAULT:
+		case ALGORITHM_RAID10_NEAR:
+			return _is_raid10_near(layout);
+		case ALGORITHM_RAID10_OFFSET:
+			return _is_raid10_offset(layout);
+		case ALGORITHM_RAID10_FAR:
+			return _is_raid10_far(layout);
+		default:
+			break;
+		}
+	}
 
-	return (f << 8) | n;
+	return 0;
 }
 
+/* Return raid_type for @name */
 static struct raid_type *get_raid_type(const char *name)
 {
-	int i;
+	struct raid_type *rtp = raid_types + ARRAY_SIZE(raid_types);
+
+	while (rtp-- > raid_types)
+		if (!strcasecmp(rtp->name, name))
+			return rtp;
+
+	return NULL;
+}
 
-	for (i = 0; i < ARRAY_SIZE(raid_types); i++)
-		if (!strcmp(raid_types[i].name, name))
-			return &raid_types[i];
+/* Return raid_type derived from @level and @layout */
+static struct raid_type *get_raid_type_by_ll(const int level, const int layout)
+{
+	struct raid_type *rtp = raid_types + ARRAY_SIZE(raid_types);
+
+	while (rtp-- > raid_types) {
+		/* RAID10 special checks based on @layout flags/properties */
+		if (rtp->level == level &&
+		    (_got_raid10(rtp, layout) || rtp->algorithm == layout))
+			return rtp;
+	}
 
 	return NULL;
 }
 
+/*
+ * Set the mddev properties in @rs to the new
+ * ones requested by the ctr
+ */
+static void rs_set_new(struct raid_set *rs)
+{
+	struct mddev *mddev = &rs->md;
+
+	mddev->level = mddev->new_level;
+	mddev->layout = mddev->new_layout;
+	mddev->chunk_sectors = mddev->new_chunk_sectors;
+	mddev->delta_disks = 0;
+}
+
+
 static struct raid_set *context_alloc(struct dm_target *ti, struct raid_type *raid_type, unsigned raid_devs)
 {
 	unsigned i;
@@ -379,6 +555,9 @@ static struct raid_set *context_alloc(struct dm_target *ti, struct raid_type *ra
 
 	mddev_init(&rs->md);
 
+	rs->raid_disks = raid_devs;
+	rs->delta_disks = 0;
+
 	rs->ti = ti;
 	rs->raid_type = raid_type;
 	rs->md.raid_disks = raid_devs;
@@ -710,7 +889,7 @@ too_many:
 static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 			     unsigned num_raid_params)
 {
-	char *raid10_format = "near";
+	int raid10_format = ALGORITHM_RAID10_DEFAULT;
 	unsigned raid10_copies = 2;
 	unsigned i;
 	unsigned value, region_size = 0;
@@ -718,6 +897,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 	sector_t max_io_len;
 	const char *arg, *key;
 	struct raid_dev *rd;
+	struct raid_type *rt = rs->raid_type;
 
 	arg = dm_shift_arg(as);
 	num_raid_params--; /* Account for chunk_size argument */
@@ -729,7 +909,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 	 * First, parse the in-order required arguments
 	 * "chunk_size" is the only argument of this type.
 	 */
-	if (rt_is_raid1(rs->raid_type)) {
+	if (rt_is_raid1(rt)) {
 		if (value)
 			DMERR("Ignoring chunk size parameter for RAID 1");
 		value = 0;
@@ -794,14 +974,11 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 		if (!strcasecmp(key, _argname_by_flag(CTR_FLAG_RAID10_FORMAT))) {
 			if (_test_and_set_flag(CTR_FLAG_RAID10_FORMAT, &rs->ctr_flags))
 				return ti_error_einval(rs->ti, "Only one raid10_format argument pair allowed");
-			if (!rt_is_raid10(rs->raid_type))
+			if (!rt_is_raid10(rt))
 				return ti_error_einval(rs->ti, "'raid10_format' is an invalid parameter for this RAID type");
-			if (strcmp("near", arg) &&
-			    strcmp("far", arg) &&
-			    strcmp("offset", arg))
-				return ti_error_einval(rs->ti, "Invalid 'raid10_format' value given");
-
-			raid10_format = (char *) arg;
+			raid10_format = raid10_name_to_format(arg);
+			if (raid10_format < 0)
+				return ti_error_ret(rs->ti, "Invalid 'raid10_format' value given", raid10_format);
 			continue;
 		}
 
@@ -823,7 +1000,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 			rd->rdev.recovery_offset = 0;
 			_set_flag(CTR_FLAG_REBUILD, &rs->ctr_flags);
 		} else if (!strcasecmp(key, _argname_by_flag(CTR_FLAG_WRITE_MOSTLY))) {
-			if (!rt_is_raid1(rs->raid_type))
+			if (!rt_is_raid1(rt))
 				return ti_error_einval(rs->ti, "write_mostly option is only valid for RAID1");
 
 			if (!_in_range(value, 0, rs->md.raid_disks - 1))
@@ -832,7 +1009,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 			set_bit(WriteMostly, &rs->dev[value].rdev.flags);
 			_set_flag(CTR_FLAG_WRITE_MOSTLY, &rs->ctr_flags);
 		} else if (!strcasecmp(key, _argname_by_flag(CTR_FLAG_MAX_WRITE_BEHIND))) {
-			if (!rt_is_raid1(rs->raid_type))
+			if (!rt_is_raid1(rt))
 				return ti_error_einval(rs->ti, "max_write_behind option is only valid for RAID1");
 
 			if (_test_and_set_flag(CTR_FLAG_MAX_WRITE_BEHIND, &rs->ctr_flags))
@@ -862,7 +1039,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 			 */
 			value /= 2;
 
-			if (!rt_is_raid456(rs->raid_type))
+			if (!rt_is_raid456(rt))
 				return ti_error_einval(rs->ti, "Inappropriate argument: stripe_cache");
 			if (raid5_set_cache_size(&rs->md, (int)value))
 				return ti_error_einval(rs->ti, "Bad stripe_cache size");
@@ -909,29 +1086,35 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 	if (dm_set_target_max_io_len(rs->ti, max_io_len))
 		return -EINVAL;
 
-	if (rt_is_raid10(rs->raid_type)) {
+	if (rt_is_raid10(rt)) {
 		if (raid10_copies > rs->md.raid_disks)
 			return ti_error_einval(rs->ti, "Not enough devices to satisfy specification");
 
-		/*
-		 * If the format is not "near", we only support
-		 * two copies at the moment.
-		 */
-		if (strcmp("near", raid10_format) && (raid10_copies > 2))
-			return ti_error_einval(rs->ti, "Too many copies for given RAID10 format.");
+		rs->md.new_layout = raid10_format_to_md_layout(rs, raid10_format, raid10_copies);
+		if (rs->md.new_layout < 0)
+			return ti_error_ret(rs->ti, "Error getting raid10 format", rs->md.new_layout);
+
+		rt = get_raid_type_by_ll(10, rs->md.new_layout);
+		if (!rt)
+			return ti_error_einval(rs->ti, "Failed to recognize new raid10 layout");
+
+		if ((rt->algorithm == ALGORITHM_RAID10_DEFAULT ||
+		     rt->algorithm == ALGORITHM_RAID10_NEAR) &&
+		    _test_flag(CTR_FLAG_RAID10_USE_NEAR_SETS, rs->ctr_flags))
+			return ti_error_einval(rs->ti, "RAID10 format \"near\" and \"raid10_use_near_sets\" are incompatible");
 
 		/* (Len * #mirrors) / #devices */
 		sectors_per_dev = rs->ti->len * raid10_copies;
 		sector_div(sectors_per_dev, rs->md.raid_disks);
 
-		rs->md.layout = raid10_format_to_md_layout(raid10_format,
-							   raid10_copies);
+		rs->md.layout = raid10_format_to_md_layout(rs, raid10_format, raid10_copies);
 		rs->md.new_layout = rs->md.layout;
-	} else if (!rt_is_raid1(rs->raid_type) &&
+	} else if (!rt_is_raid1(rt) &&
 		   sector_div(sectors_per_dev,
-			      (rs->md.raid_disks - rs->raid_type->parity_devs)))
+			      (rs->md.raid_disks - rt->parity_devs)))
 		return ti_error_einval(rs->ti, "Target length not divisible by number of data devices");
 
+	rs->raid10_copies = raid10_copies;
 	rs->md.dev_sectors = sectors_per_dev;
 
 	/* Assume there are no metadata devices until the drives are parsed */
@@ -956,6 +1139,13 @@ static int raid_is_congested(struct dm_target_callbacks *cb, int bits)
 	return mddev_congested(&rs->md, bits);
 }
 
+/*  Features */
+#define	FEATURE_FLAG_SUPPORTS_RESHAPE	0x1
+
+/* State flags for sb->flags */
+#define	SB_FLAG_RESHAPE_ACTIVE		0x1
+#define	SB_FLAG_RESHAPE_BACKWARDS	0x2
+
 /*
  * This structure is never routinely used by userspace, unlike md superblocks.
  * Devices with this superblock should only ever be accessed via device-mapper.
@@ -963,13 +1153,14 @@ static int raid_is_congested(struct dm_target_callbacks *cb, int bits)
 #define DM_RAID_MAGIC 0x64526D44
 struct dm_raid_superblock {
 	__le32 magic;		/* "DmRd" */
-	__le32 features;	/* Used to indicate possible future changes */
+	__le32 compat_features;	/* Used to indicate compatible features (like 1.8.0 ondisk metadata extension) */
 
-	__le32 num_devices;	/* Number of devices in this array. (Max 64) */
-	__le32 array_position;	/* The position of this drive in the array */
+	__le32 num_devices;	/* Number of devices in this raid set. (Max 64) */
+	__le32 array_position;	/* The position of this drive in the raid set */
 
 	__le64 events;		/* Incremented by md when superblock updated */
-	__le64 failed_devices;	/* Bit field of devices to indicate failures */
+	__le64 failed_devices;	/* Pre 1.8.0 part of bit field of devices to */
+				/* indicate failures (see extension below) */
 
 	/*
 	 * This offset tracks the progress of the repair or replacement of
@@ -978,19 +1169,62 @@ struct dm_raid_superblock {
 	__le64 disk_recovery_offset;
 
 	/*
-	 * This offset tracks the progress of the initial array
+	 * This offset tracks the progress of the initial raid set
 	 * synchronisation/parity calculation.
 	 */
 	__le64 array_resync_offset;
 
 	/*
-	 * RAID characteristics
+	 * raid characteristics
 	 */
 	__le32 level;
 	__le32 layout;
 	__le32 stripe_sectors;
 
-	/* Remainder of a logical block is zero-filled when writing (see super_sync()). */
+	/********************************************************************
+	 * BELOW FOLLOW V1.8.0 EXTENSIONS TO THE PRISTINE SUPERBLOCK FORMAT!!!
+	 *
+	 * FEATURE_FLAG_SUPPORTS_RESHAPE in the compat_features member indicates that those exist
+	 */
+
+	__le32 flags; /* Flags defining array states for reshaping */
+
+	/*
+	 * This offset tracks the progress of a raid
+	 * set reshape in order to be able to restart it
+	 */
+	__le64 reshape_position;
+
+	/*
+	 * These define the properties of the array in case of an interrupted reshape
+	 */
+	__le32 new_level;
+	__le32 new_layout;
+	__le32 new_stripe_sectors;
+	__le32 delta_disks;
+
+	__le64 array_sectors; /* Array size in sectors */
+
+	/*
+	 * Sector offsets to data on devices (reshaping).
+	 * Needed to support out of place reshaping, thus
+	 * not writing over any stripes whilst converting
+	 * them from old to new layout
+	 */
+	__le64 data_offset;
+	__le64 new_data_offset;
+
+	__le64 sectors; /* Used device size in sectors */
+
+	/*
+	 * Additional bit field of devices indicating failures to support
+	 * up to 256 devices with the 1.8.0 on-disk metadata format
+	 */
+	__le64 extended_failed_devices[DISKS_ARRAY_ELEMS - 1];
+
+	__le32 incompat_features;	/* Used to indicate any incompatible features */
+
+	/* Always set rest up to logical block size to 0 when writing (see get_metadata_device() below). */
 } __packed;
 
 static int read_disk_sb(struct md_rdev *rdev, int size)
@@ -1012,6 +1246,19 @@ static int read_disk_sb(struct md_rdev *rdev, int size)
 	return 0;
 }
 
+static void sb_retrieve_failed_devices(struct dm_raid_superblock *sb, uint64_t *failed_devices)
+{
+	failed_devices[0] = le64_to_cpu(sb->failed_devices);
+	memset(failed_devices + 1, 0, sizeof(sb->extended_failed_devices));
+
+	if (_test_flag(FEATURE_FLAG_SUPPORTS_RESHAPE, le32_to_cpu(sb->compat_features))) {
+		int i = ARRAY_SIZE(sb->extended_failed_devices);
+
+		while (i--)
+			failed_devices[i+1] = le64_to_cpu(sb->extended_failed_devices[i]);
+	}
+}
+
 static void super_sync(struct mddev *mddev, struct md_rdev *rdev)
 {
 	int i;
@@ -1030,7 +1277,7 @@ static void super_sync(struct mddev *mddev, struct md_rdev *rdev)
 	memset(sb + 1, 0, rdev->sb_size - sizeof(*sb));
 
 	sb->magic = cpu_to_le32(DM_RAID_MAGIC);
-	sb->features = cpu_to_le32(0);	/* No features yet */
+	sb->compat_features = cpu_to_le32(0);	/* No features yet */
 
 	sb->num_devices = cpu_to_le32(mddev->raid_disks);
 	sb->array_position = cpu_to_le32(rdev->raid_disk);
@@ -1103,119 +1350,196 @@ static int super_load(struct md_rdev *rdev, struct md_rdev *refdev)
 	return (events_sb > events_refsb) ? 1 : 0;
 }
 
-static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev)
+static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev)
 {
 	int role;
-	struct raid_set *rs = container_of(mddev, struct raid_set, md);
+	unsigned int d;
+	struct mddev *mddev = &rs->md;
 	uint64_t events_sb;
-	uint64_t failed_devices;
+	uint64_t failed_devices[DISKS_ARRAY_ELEMS];
 	struct dm_raid_superblock *sb;
-	uint32_t new_devs = 0;
-	uint32_t rebuilds = 0;
+	uint32_t new_devs = 0, rebuild_and_new = 0, rebuilds = 0;
 	struct md_rdev *r;
 	struct dm_raid_superblock *sb2;
 
 	sb = page_address(rdev->sb_page);
 	events_sb = le64_to_cpu(sb->events);
-	failed_devices = le64_to_cpu(sb->failed_devices);
 
 	/*
 	 * Initialise to 1 if this is a new superblock.
 	 */
 	mddev->events = events_sb ? : 1;
 
+	mddev->reshape_position = MaxSector;
+
 	/*
-	 * Reshaping is not currently allowed
+	 * Reshaping is supported, e.g. reshape_position is valid
+	 * in superblock and superblock content is authoritative.
 	 */
-	if (le32_to_cpu(sb->level) != mddev->level) {
-		DMERR("Reshaping arrays not yet supported. (RAID level change)");
-		return -EINVAL;
-	}
-	if (le32_to_cpu(sb->layout) != mddev->layout) {
-		DMERR("Reshaping arrays not yet supported. (RAID layout change)");
-		DMERR("  0x%X vs 0x%X", le32_to_cpu(sb->layout), mddev->layout);
-		DMERR("  Old layout: %s w/ %d copies",
-		      raid10_md_layout_to_format(le32_to_cpu(sb->layout)),
-		      raid10_md_layout_to_copies(le32_to_cpu(sb->layout)));
-		DMERR("  New layout: %s w/ %d copies",
-		      raid10_md_layout_to_format(mddev->layout),
-		      raid10_md_layout_to_copies(mddev->layout));
-		return -EINVAL;
-	}
-	if (le32_to_cpu(sb->stripe_sectors) != mddev->chunk_sectors) {
-		DMERR("Reshaping arrays not yet supported. (stripe sectors change)");
-		return -EINVAL;
-	}
+	if (_test_flag(FEATURE_FLAG_SUPPORTS_RESHAPE, le32_to_cpu(sb->compat_features))) {
+		/* Superblock is authoritative wrt given raid set layout! */
+		mddev->raid_disks = le32_to_cpu(sb->num_devices);
+		mddev->level = le32_to_cpu(sb->level);
+		mddev->layout = le32_to_cpu(sb->layout);
+		mddev->chunk_sectors = le32_to_cpu(sb->stripe_sectors);
+		mddev->new_level = le32_to_cpu(sb->new_level);
+		mddev->new_layout = le32_to_cpu(sb->new_layout);
+		mddev->new_chunk_sectors = le32_to_cpu(sb->new_stripe_sectors);
+		mddev->delta_disks = le32_to_cpu(sb->delta_disks);
+		mddev->array_sectors = le64_to_cpu(sb->array_sectors);
+
+		/* raid was reshaping and got interrupted */
+		if (_test_flag(SB_FLAG_RESHAPE_ACTIVE, le32_to_cpu(sb->flags))) {
+			if (_test_flag(CTR_FLAG_DELTA_DISKS, rs->ctr_flags)) {
+				DMERR("Reshape requested but raid set is still reshaping");
+				return -EINVAL;
+			}
 
-	/* We can only change the number of devices in RAID1 right now */
-	if (!rt_is_raid1(rs->raid_type) &&
-	    (le32_to_cpu(sb->num_devices) != mddev->raid_disks)) {
-		DMERR("Reshaping arrays not yet supported. (device count change)");
-		return -EINVAL;
+			if (mddev->delta_disks < 0 ||
+			    (!mddev->delta_disks && _test_flag(SB_FLAG_RESHAPE_BACKWARDS, le32_to_cpu(sb->flags))))
+				mddev->reshape_backwards = 1;
+			else
+				mddev->reshape_backwards = 0;
+
+			mddev->reshape_position = le64_to_cpu(sb->reshape_position);
+			rs->raid_type = get_raid_type_by_ll(mddev->level, mddev->layout);
+		}
+
+	} else {
+		/*
+		 * Reshaping is not allowed, because we don't have the appropriate metadata
+		 */
+		if (le32_to_cpu(sb->level) != mddev->level) {
+			DMERR("Reshaping/takeover raid sets not yet supported. (raid level/stripes/size change)");
+			return -EINVAL;
+		}
+		if (le32_to_cpu(sb->layout) != mddev->layout) {
+			DMERR("Reshaping raid sets not yet supported. (raid layout change)");
+			DMERR("  0x%X vs 0x%X", le32_to_cpu(sb->layout), mddev->layout);
+			DMERR("  Old layout: %s w/ %d copies",
+			      raid10_md_layout_to_format(le32_to_cpu(sb->layout)),
+			      raid10_md_layout_to_copies(le32_to_cpu(sb->layout)));
+			DMERR("  New layout: %s w/ %d copies",
+			      raid10_md_layout_to_format(mddev->layout),
+			      raid10_md_layout_to_copies(mddev->layout));
+			return -EINVAL;
+		}
+		if (le32_to_cpu(sb->stripe_sectors) != mddev->chunk_sectors) {
+			DMERR("Reshaping raid sets not yet supported. (stripe sectors change)");
+			return -EINVAL;
+		}
+
+		/* We can only change the number of devices in raid1 with old (i.e. pre 1.0.7) metadata */
+		if (!rt_is_raid1(rs->raid_type) &&
+		    (le32_to_cpu(sb->num_devices) != mddev->raid_disks)) {
+			DMERR("Reshaping raid sets not yet supported. (device count change from %u to %u)",
+			      sb->num_devices, mddev->raid_disks);
+			return -EINVAL;
+		}
+
+		/* Table line is checked vs. authoritative superblock */
+		rs_set_new(rs);
 	}
 
-	if (!(_test_flags(CTR_FLAGS_ANY_SYNC, rs->ctr_flags)))
+	if (!_test_flag(CTR_FLAG_NOSYNC, rs->ctr_flags))
 		mddev->recovery_cp = le64_to_cpu(sb->array_resync_offset);
 
 	/*
 	 * During load, we set FirstUse if a new superblock was written.
 	 * There are two reasons we might not have a superblock:
-	 * 1) The array is brand new - in which case, all of the
+	 * 1) The raid set is brand new - in which case, all of the
 	 *    devices must have their In_sync bit set.  Also,
 	 *    recovery_cp must be 0, unless forced.
-	 * 2) This is a new device being added to an old array
+	 * 2) This is a new device being added to an old raid set
 	 *    and the new device needs to be rebuilt - in which
 	 *    case the In_sync bit will /not/ be set and
 	 *    recovery_cp must be MaxSector.
 	 */
+	d = 0;
 	rdev_for_each(r, mddev) {
+		if (test_bit(FirstUse, &r->flags))
+			new_devs++;
+
 		if (!test_bit(In_sync, &r->flags)) {
-			DMINFO("Device %d specified for rebuild: "
-			       "Clearing superblock", r->raid_disk);
+			DMINFO("Device %d specified for rebuild; clearing superblock",
+				r->raid_disk);
 			rebuilds++;
-		} else if (test_bit(FirstUse, &r->flags))
-			new_devs++;
+
+			if (test_bit(FirstUse, &r->flags))
+				rebuild_and_new++;
+		}
+
+		d++;
 	}
 
-	if (!rebuilds) {
-		if (new_devs == mddev->raid_disks) {
-			DMINFO("Superblocks created for new array");
+	if (new_devs == rs->raid_disks || !rebuilds) {
+		/* Replace a broken device */
+		if (new_devs == 1 && !rs->delta_disks)
+			;
+		if (new_devs == rs->raid_disks) {
+			DMINFO("Superblocks created for new raid set");
 			set_bit(MD_ARRAY_FIRST_USE, &mddev->flags);
-		} else if (new_devs) {
-			DMERR("New device injected "
-			      "into existing array without 'rebuild' "
-			      "parameter specified");
+			mddev->recovery_cp = 0;
+		} else if (new_devs && new_devs != rs->raid_disks && !rebuilds) {
+			DMERR("New device injected into existing raid set without "
+			      "'delta_disks' or 'rebuild' parameter specified");
 			return -EINVAL;
 		}
-	} else if (new_devs) {
-		DMERR("'rebuild' devices cannot be "
-		      "injected into an array with other first-time devices");
-		return -EINVAL;
-	} else if (mddev->recovery_cp != MaxSector) {
-		DMERR("'rebuild' specified while array is not in-sync");
+	} else if (new_devs && new_devs != rebuilds) {
+		DMERR("%u 'rebuild' devices cannot be injected into"
+		      " a raid set with %u other first-time devices",
+		      rebuilds, new_devs);
 		return -EINVAL;
+	} else if (rebuilds) {
+		if (rebuild_and_new && rebuilds != rebuild_and_new) {
+			DMERR("new device%s provided without 'rebuild'",
+			      new_devs > 1 ? "s" : "");
+			return -EINVAL;
+		} else if (mddev->recovery_cp != MaxSector) {
+			DMERR("'rebuild' specified while raid set is not in-sync (recovery_cp=%llu)",
+			      (unsigned long long) mddev->recovery_cp);
+			return -EINVAL;
+		} else if (mddev->reshape_position != MaxSector) {
+			DMERR("'rebuild' specified while raid set is being reshaped");
+			return -EINVAL;
+		}
 	}
 
 	/*
 	 * Now we set the Faulty bit for those devices that are
 	 * recorded in the superblock as failed.
 	 */
+	sb_retrieve_failed_devices(sb, failed_devices);
 	rdev_for_each(r, mddev) {
 		if (!r->sb_page)
 			continue;
 		sb2 = page_address(r->sb_page);
 		sb2->failed_devices = 0;
+		memset(sb2->extended_failed_devices, 0, sizeof(sb2->extended_failed_devices));
 
 		/*
 		 * Check for any device re-ordering.
 		 */
 		if (!test_bit(FirstUse, &r->flags) && (r->raid_disk >= 0)) {
 			role = le32_to_cpu(sb2->array_position);
+			if (role < 0)
+				continue;
+
 			if (role != r->raid_disk) {
-				if (!rt_is_raid1(rs->raid_type))
-					return ti_error_einval(rs->ti, "Cannot change device "
-								       "positions in RAID array");
-				DMINFO("RAID1 device #%d now at position #%d",
+				if (_is_raid10_near(mddev->layout)) {
+					if (mddev->raid_disks % _raid10_near_copies(mddev->layout) ||
+					    rs->raid_disks % rs->raid10_copies)
+						return ti_error_einval(rs->ti, "Cannot change raid10 near "
+									       "set to odd # of devices!");
+
+					sb2->array_position = cpu_to_le32(r->raid_disk);
+
+				} else if (!(rs_is_raid10(rs) && rt_is_raid0(rs->raid_type)) &&
+				    !(rs_is_raid0(rs) && rt_is_raid10(rs->raid_type)) &&
+				    !rt_is_raid1(rs->raid_type))
+					return ti_error_einval(rs->ti, "Cannot change device positions in raid set");
+
+				DMINFO("raid device #%d now at position #%d",
 				       role, r->raid_disk);
 			}
 
@@ -1223,7 +1547,7 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev)
 			 * Partial recovery is performed on
 			 * returning failed devices.
 			 */
-			if (failed_devices & (1 << role))
+			if (test_bit(role, (void *) failed_devices))
 				set_bit(Faulty, &r->flags);
 		}
 	}
@@ -1234,16 +1558,21 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev)
 static int super_validate(struct raid_set *rs, struct md_rdev *rdev)
 {
 	struct mddev *mddev = &rs->md;
-	struct dm_raid_superblock *sb = page_address(rdev->sb_page);
+	struct dm_raid_superblock *sb;
+
+	if (!rdev->sb_page)
+		return 0;
+
+	sb = page_address(rdev->sb_page);
 
 	/*
 	 * If mddev->events is not set, we know we have not yet initialized
 	 * the array.
 	 */
-	if (!mddev->events && super_init_validation(mddev, rdev))
+	if (!mddev->events && super_init_validation(rs, rdev))
 		return -EINVAL;
 
-	if (le32_to_cpu(sb->features)) {
+	if (sb->compat_features || sb->incompat_features) {
 		rs->ti->error = "Unable to assemble array: No feature flags supported yet";
 		return -EINVAL;
 	}
@@ -1252,23 +1581,32 @@ static int super_validate(struct raid_set *rs, struct md_rdev *rdev)
 	mddev->bitmap_info.offset = rt_is_raid0(rs->raid_type) ? 0 : to_sector(4096);
 	rdev->mddev->bitmap_info.default_offset = mddev->bitmap_info.offset;
 
-	if (!test_bit(FirstUse, &rdev->flags)) {
+	if (!test_and_clear_bit(FirstUse, &rdev->flags)) {
+		/* Retrieve device size stored in superblock to be prepared for shrink */
+		rdev->sectors = le64_to_cpu(sb->sectors);
 		rdev->recovery_offset = le64_to_cpu(sb->disk_recovery_offset);
-		if (rdev->recovery_offset != MaxSector)
-			clear_bit(In_sync, &rdev->flags);
+		if (rdev->recovery_offset == MaxSector)
+			set_bit(In_sync, &rdev->flags);
+		/*
+		 * If no reshape in progress -> we're recovering single
+		 * disk(s) and have to set the device(s) to out-of-sync
+		 */
+		else if (rs->md.reshape_position == MaxSector)
+			clear_bit(In_sync, &rdev->flags); /* Mandatory for recovery */
 	}
 
 	/*
 	 * If a device comes back, set it as not In_sync and no longer faulty.
 	 */
-	if (test_bit(Faulty, &rdev->flags)) {
-		clear_bit(Faulty, &rdev->flags);
+	if (test_and_clear_bit(Faulty, &rdev->flags)) {
+		rdev->recovery_offset = 0;
 		clear_bit(In_sync, &rdev->flags);
 		rdev->saved_raid_disk = rdev->raid_disk;
-		rdev->recovery_offset = 0;
 	}
 
-	clear_bit(FirstUse, &rdev->flags);
+	/* Reshape support -> restore repective data offsets */
+	rdev->data_offset = le64_to_cpu(sb->data_offset);
+	rdev->new_data_offset = le64_to_cpu(sb->new_data_offset);
 
 	return 0;
 }
-- 
cgit v1.2.3-70-g09d2


From 4763e543a679053be345e1129bcb5df78f849294 Mon Sep 17 00:00:00 2001
From: Heinz Mauelshagen <heinzm@redhat.com>
Date: Thu, 19 May 2016 18:49:31 +0200
Subject: dm raid: add new reshaping/raid10 format table line options to
 parameter parser

Support the following arguments in the ctr parameter parser:

 - add 'delta_disks', 'data_offset' taking int and sector respectively

 - 'raid10_use_near_sets' bool argument to optionally select
   near sets with supporting raid10 mappings

Signed-off-by: Heinz Mauelshagen <heinzm@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-raid.c | 48 +++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 39 insertions(+), 9 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index c98c34c4d284..bc5a53dfaa07 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -140,6 +140,7 @@ struct raid_set {
 
 	int raid_disks;
 	int delta_disks;
+	int data_offset;
 	int raid10_copies;
 
 	struct mddev md;
@@ -241,6 +242,9 @@ static struct arg_name_flag {
 	{ CTR_FLAG_REGION_SIZE, "region_size"},
 	{ CTR_FLAG_RAID10_COPIES, "raid10_copies"},
 	{ CTR_FLAG_RAID10_FORMAT, "raid10_format"},
+	{ CTR_FLAG_DATA_OFFSET, "data_offset"},
+	{ CTR_FLAG_DELTA_DISKS, "delta_disks"},
+	{ CTR_FLAG_RAID10_USE_NEAR_SETS, "raid10_use_near_sets"},
 };
 
 /* Return argument name string for given @flag */
@@ -946,22 +950,28 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 	 * Second, parse the unordered optional arguments
 	 */
 	for (i = 0; i < num_raid_params; i++) {
-		arg = dm_shift_arg(as);
-		if (!arg)
+		key = dm_shift_arg(as);
+		if (!key)
 			return ti_error_einval(rs->ti, "Not enough raid parameters given");
 
-		if (!strcasecmp(arg, "nosync")) {
+		if (!strcasecmp(key, _argname_by_flag(CTR_FLAG_NOSYNC))) {
+			if (_test_and_set_flag(CTR_FLAG_NOSYNC, &rs->ctr_flags))
+				return ti_error_einval(rs->ti, "Only one 'nosync' argument allowed");
 			rs->md.recovery_cp = MaxSector;
-			_set_flag(CTR_FLAG_NOSYNC, &rs->ctr_flags);
 			continue;
 		}
-		if (!strcasecmp(arg, "sync")) {
+		if (!strcasecmp(key, _argname_by_flag(CTR_FLAG_SYNC))) {
+			if (_test_and_set_flag(CTR_FLAG_SYNC, &rs->ctr_flags))
+				return ti_error_einval(rs->ti, "Only one 'sync' argument allowed");
 			rs->md.recovery_cp = 0;
-			_set_flag(CTR_FLAG_SYNC, &rs->ctr_flags);
+			continue;
+		}
+		if (!strcasecmp(key, _argname_by_flag(CTR_FLAG_RAID10_USE_NEAR_SETS))) {
+			if (_test_and_set_flag(CTR_FLAG_RAID10_USE_NEAR_SETS, &rs->ctr_flags))
+				return ti_error_einval(rs->ti, "Only one 'raid10_use_new_sets' argument allowed");
 			continue;
 		}
 
-		key = arg;
 		arg = dm_shift_arg(as);
 		i++; /* Account for the argument pairs */
 		if (!arg)
@@ -973,7 +983,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 
 		if (!strcasecmp(key, _argname_by_flag(CTR_FLAG_RAID10_FORMAT))) {
 			if (_test_and_set_flag(CTR_FLAG_RAID10_FORMAT, &rs->ctr_flags))
-				return ti_error_einval(rs->ti, "Only one raid10_format argument pair allowed");
+				return ti_error_einval(rs->ti, "Only one 'raid10_format' argument pair allowed");
 			if (!rt_is_raid10(rt))
 				return ti_error_einval(rs->ti, "'raid10_format' is an invalid parameter for this RAID type");
 			raid10_format = raid10_name_to_format(arg);
@@ -1030,6 +1040,26 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 			if (!value || (value > MAX_SCHEDULE_TIMEOUT))
 				return ti_error_einval(rs->ti, "daemon sleep period out of range");
 			rs->md.bitmap_info.daemon_sleep = value;
+		} else if (!strcasecmp(key, _argname_by_flag(CTR_FLAG_DATA_OFFSET))) {
+			/* Userspace passes new data_offset after having extended the the data image LV */
+			if (_test_and_set_flag(CTR_FLAG_DATA_OFFSET, &rs->ctr_flags))
+				return ti_error_einval(rs->ti, "Only one data_offset argument pair allowed");
+
+			/* Ensure sensible data offset */
+			if (value < 0)
+				return ti_error_einval(rs->ti, "Bogus data_offset value");
+
+			rs->data_offset = value;
+		} else if (!strcasecmp(key, _argname_by_flag(CTR_FLAG_DELTA_DISKS))) {
+			/* Define the +/-# of disks to add to/remove from the given raid set */
+			if (_test_and_set_flag(CTR_FLAG_DELTA_DISKS, &rs->ctr_flags))
+				return ti_error_einval(rs->ti, "Only one delta_disks argument pair allowed");
+
+			/* Ensure MAX_RAID_DEVICES and raid type minimal_devs! */
+			if (!_in_range(abs(value), 1, MAX_RAID_DEVICES - rt->minimal_devs))
+				return ti_error_einval(rs->ti, "Too many delta_disk requested");
+
+			rs->delta_disks = value;
 		} else if (!strcasecmp(key, _argname_by_flag(CTR_FLAG_STRIPE_CACHE))) {
 			if (_test_and_set_flag(CTR_FLAG_STRIPE_CACHE, &rs->ctr_flags))
 				return ti_error_einval(rs->ti, "Only one stripe_cache argument pair allowed");
@@ -1101,7 +1131,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 		if ((rt->algorithm == ALGORITHM_RAID10_DEFAULT ||
 		     rt->algorithm == ALGORITHM_RAID10_NEAR) &&
 		    _test_flag(CTR_FLAG_RAID10_USE_NEAR_SETS, rs->ctr_flags))
-			return ti_error_einval(rs->ti, "RAID10 format \"near\" and \"raid10_use_near_sets\" are incompatible");
+			return ti_error_einval(rs->ti, "RAID10 format 'near' and 'raid10_use_near_sets' are incompatible");
 
 		/* (Len * #mirrors) / #devices */
 		sectors_per_dev = rs->ti->len * raid10_copies;
-- 
cgit v1.2.3-70-g09d2


From 7b34df74d27b2a2350426daec80cda5d3c873622 Mon Sep 17 00:00:00 2001
From: Heinz Mauelshagen <heinzm@redhat.com>
Date: Thu, 19 May 2016 18:49:32 +0200
Subject: dm raid: enhance super_sync() to support new superblock members

Add transferring the new takeover/reshape related superblock
members introduced to the super_sync() function:

 - add/move supporting functions

 - add failed devices bitfield transfer functions to retrieve the
   bitfield from superblock format or update it in the superblock

 - add code to transfer all new members

Signed-off-by: Heinz Mauelshagen <heinzm@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-raid.c | 75 +++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 65 insertions(+), 10 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index bc5a53dfaa07..28420337f880 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -214,6 +214,12 @@ static bool _test_flags(uint32_t flags, uint32_t all_flags)
 	return (flags & all_flags) ? true : false;
 }
 
+/* Clear (multiple) @flags in @all_flags */
+static void _clear_flags(uint32_t flags, uint32_t *all_flags)
+{
+	*all_flags &= ~flags;
+}
+
 /* Return true if single @flag is set in @*flags, else set it and return false */
 static bool _test_and_set_flag(uint32_t flag, uint32_t *flags)
 {
@@ -1289,31 +1295,54 @@ static void sb_retrieve_failed_devices(struct dm_raid_superblock *sb, uint64_t *
 	}
 }
 
+static void sb_update_failed_devices(struct dm_raid_superblock *sb, uint64_t *failed_devices)
+{
+	int i = ARRAY_SIZE(sb->extended_failed_devices);
+
+	sb->failed_devices = cpu_to_le64(failed_devices[0]);
+	while (i--)
+		sb->extended_failed_devices[i] = cpu_to_le64(failed_devices[i+1]);
+}
+
+/*
+ * Synchronize the superblock members with the raid set properties
+ *
+ * All superblock data is little endian.
+ */
 static void super_sync(struct mddev *mddev, struct md_rdev *rdev)
 {
-	int i;
-	uint64_t failed_devices;
+	bool update_failed_devices = false;
+	unsigned int i;
+	uint64_t failed_devices[DISKS_ARRAY_ELEMS];
 	struct dm_raid_superblock *sb;
 	struct raid_set *rs = container_of(mddev, struct raid_set, md);
 
+	/* No metadata device, no superblock */
+	if (!rdev->meta_bdev)
+		return;
+
+	BUG_ON(!rdev->sb_page);
+
 	sb = page_address(rdev->sb_page);
-	failed_devices = le64_to_cpu(sb->failed_devices);
 
-	for (i = 0; i < mddev->raid_disks; i++)
-		if (!rs->dev[i].data_dev ||
-		    test_bit(Faulty, &(rs->dev[i].rdev.flags)))
-			failed_devices |= (1ULL << i);
+	sb_retrieve_failed_devices(sb, failed_devices);
 
-	memset(sb + 1, 0, rdev->sb_size - sizeof(*sb));
+	for (i = 0; i < rs->raid_disks; i++)
+		if (!rs->dev[i].data_dev || test_bit(Faulty, &rs->dev[i].rdev.flags)) {
+			update_failed_devices = true;
+			set_bit(i, (void *) failed_devices);
+		}
+
+	if (update_failed_devices)
+		sb_update_failed_devices(sb, failed_devices);
 
 	sb->magic = cpu_to_le32(DM_RAID_MAGIC);
-	sb->compat_features = cpu_to_le32(0);	/* No features yet */
+	sb->compat_features = cpu_to_le32(0); /* Don't set reshape flag yet */
 
 	sb->num_devices = cpu_to_le32(mddev->raid_disks);
 	sb->array_position = cpu_to_le32(rdev->raid_disk);
 
 	sb->events = cpu_to_le64(mddev->events);
-	sb->failed_devices = cpu_to_le64(failed_devices);
 
 	sb->disk_recovery_offset = cpu_to_le64(rdev->recovery_offset);
 	sb->array_resync_offset = cpu_to_le64(mddev->recovery_cp);
@@ -1321,6 +1350,32 @@ static void super_sync(struct mddev *mddev, struct md_rdev *rdev)
 	sb->level = cpu_to_le32(mddev->level);
 	sb->layout = cpu_to_le32(mddev->layout);
 	sb->stripe_sectors = cpu_to_le32(mddev->chunk_sectors);
+
+	sb->new_level = cpu_to_le32(mddev->new_level);
+	sb->new_layout = cpu_to_le32(mddev->new_layout);
+	sb->new_stripe_sectors = cpu_to_le32(mddev->new_chunk_sectors);
+
+	sb->delta_disks = cpu_to_le32(mddev->delta_disks);
+
+	smp_rmb(); /* Make sure we access most recent reshape position */
+	sb->reshape_position = cpu_to_le64(mddev->reshape_position);
+	if (le64_to_cpu(sb->reshape_position) != MaxSector) {
+		/* Flag ongoing reshape */
+		sb->flags |= cpu_to_le32(SB_FLAG_RESHAPE_ACTIVE);
+
+		if (mddev->delta_disks < 0 || mddev->reshape_backwards)
+			sb->flags |= cpu_to_le32(SB_FLAG_RESHAPE_BACKWARDS);
+	} else
+		/* Flag no reshape */
+		_clear_flags(cpu_to_le32(SB_FLAG_RESHAPE_ACTIVE|SB_FLAG_RESHAPE_BACKWARDS), &sb->flags);
+
+	sb->array_sectors = cpu_to_le64(mddev->array_sectors);
+	sb->data_offset = cpu_to_le64(rdev->data_offset);
+	sb->new_data_offset = cpu_to_le64(rdev->new_data_offset);
+	sb->sectors = cpu_to_le64(rdev->sectors);
+
+	/* Zero out the rest of the payload after the size of the superblock */
+	memset(sb + 1, 0, rdev->sb_size - sizeof(*sb));
 }
 
 /*
-- 
cgit v1.2.3-70-g09d2


From ecbfb9f118bce49f571675929160e4ecef91cc8a Mon Sep 17 00:00:00 2001
From: Heinz Mauelshagen <heinzm@redhat.com>
Date: Thu, 19 May 2016 18:49:33 +0200
Subject: dm raid: add raid level takeover support

Add raid level takeover support allowing arbitrary takeovers between
raid levels supported by md personalities (i.e. raid0, raid1/10 and
raid4/5/6):

 - add rs_config_{backup|restore} functions to allow for temporarily
   storing ctr requested layout changes and restoring them for the takeover
   conversion decision after the superblocks got loaded and analyzed

 - add members to store layout to 'struct raid_set' (not mandatory
   for takeover but needed for reshape in later patch)

 - add rebuild_disks bitfield to 'struct raid_set' and set bits in ctr
   to use in setting up takeover (base to address a 'rebuild' related
   raid_status() table line bug and needed as well for reshape in future
   patch)

 - add runtime flags and respective manipulation functions to be able to
   control e.g. writing of superblocks to the preresume function on
   takeover and (later) reshape

 - add functions to detect takeover, check it's valid (mandatory here to
   avoid failing on md_run()), setup for it and use in the ctr; those
   will be likely moved out once reshaping gets added to simplify the
   ctr

 - start raid set readonly in ctr and switch to readwrite, optionally
   updating superblocks, in preresume in order to allow suspend to
   quiesce any active table before (which involves superblock updates);
   this ensures the proper sequence of writing the current and any new
   takeover(/reshape) metadata

Signed-off-by: Heinz Mauelshagen <heinzm@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-raid.c | 455 +++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 426 insertions(+), 29 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 28420337f880..89b677c7cfe6 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -98,6 +98,13 @@ struct raid_dev {
 #define ALL_CTR_FLAGS		(CTR_FLAG_OPTIONS_NO_ARGS | \
 				 CTR_FLAG_OPTIONS_ONE_ARG)
 
+/*
+ * All flags which cause a recovery unfreeze once they got stored in the raid metadata
+ */
+#define	ALL_FREEZE_FLAGS (ALL_CTR_FLAGS & ~(CTR_FLAG_REGION_SIZE | CTR_FLAGS_ANY_SYNC | \
+					    CTR_FLAG_RAID10_FORMAT | CTR_FLAG_RAID10_COPIES | \
+					    CTR_FLAG_RAID10_USE_NEAR_SETS))
+
 /* Invalid options definitions per raid level... */
 
 /* "raid0" does not accept any options */
@@ -129,14 +136,39 @@ struct raid_dev {
 #define RAID6_INVALID_FLAGS	(CTR_FLAG_NOSYNC | RAID45_INVALID_FLAGS)
 /* ...invalid options definitions per raid level */
 
+/*
+ * Flags for rs->runtime_flags field
+ * (RT_FLAG prefix meaning "runtime flag")
+ *
+ * These are all internal and used to define runtime state,
+ * e.g. to prevent another resume from preresume processing
+ * the raid set all over again.
+ */
+#define RT_FLAG_RS_PRERESUMED		0x1
+#define RT_FLAG_RS_RESUMED		0x2
+#define RT_FLAG_RS_BITMAP_LOADED	0x4
+#define RT_FLAG_UPDATE_SBS		0x8
+
 /* Array elements of 64 bit needed for rebuild/write_mostly bits */
 #define DISKS_ARRAY_ELEMS ((MAX_RAID_DEVICES + (sizeof(uint64_t) * 8 - 1)) / sizeof(uint64_t) / 8)
 
+/*
+ * raid set level, layout and chunk sectors backup/restore
+ */
+struct rs_layout {
+	int new_level;
+	int new_layout;
+	int new_chunk_sectors;
+};
+
 struct raid_set {
 	struct dm_target *ti;
 
 	uint32_t bitmap_loaded;
 	uint32_t ctr_flags;
+	uint32_t runtime_flags;
+
+	uint64_t rebuild_disks[DISKS_ARRAY_ELEMS];
 
 	int raid_disks;
 	int delta_disks;
@@ -146,10 +178,41 @@ struct raid_set {
 	struct mddev md;
 	struct raid_type *raid_type;
 	struct dm_target_callbacks callbacks;
+	struct rs_layout rs_layout;
 
 	struct raid_dev dev[0];
 };
 
+/* Backup/restore raid set configuration helpers */
+static void _rs_config_backup(struct raid_set *rs, struct rs_layout *l)
+{
+	struct mddev *mddev = &rs->md;
+
+	l->new_level = mddev->new_level;
+	l->new_layout = mddev->new_layout;
+	l->new_chunk_sectors = mddev->new_chunk_sectors;
+}
+
+static void rs_config_backup(struct raid_set *rs)
+{
+	return _rs_config_backup(rs, &rs->rs_layout);
+}
+
+static void _rs_config_restore(struct raid_set *rs, struct rs_layout *l)
+{
+	struct mddev *mddev = &rs->md;
+
+	mddev->new_level = l->new_level;
+	mddev->new_layout = l->new_layout;
+	mddev->new_chunk_sectors = l->new_chunk_sectors;
+}
+
+static void rs_config_restore(struct raid_set *rs)
+{
+	return _rs_config_restore(rs, &rs->rs_layout);
+}
+/* END: backup/restore raid set configuration helpers */
+
 /* raid10 algorithms (i.e. formats) */
 #define	ALGORITHM_RAID10_DEFAULT	0
 #define	ALGORITHM_RAID10_NEAR		1
@@ -201,6 +264,13 @@ static void _set_flag(uint32_t flag, uint32_t *flags)
 	*flags |= flag;
 }
 
+/* Clear single @flag in @flags */
+static void _clear_flag(uint32_t flag, uint32_t *flags)
+{
+	WARN_ON_ONCE(hweight32(flag) != 1);
+	*flags &= ~flag;
+}
+
 /* Test single @flag in @flags */
 static bool _test_flag(uint32_t flag, uint32_t flags)
 {
@@ -229,6 +299,17 @@ static bool _test_and_set_flag(uint32_t flag, uint32_t *flags)
 	_set_flag(flag, flags);
 	return false;
 }
+
+/* Return true if single @flag is set in @*flags and clear it, else return false */
+static bool _test_and_clear_flag(uint32_t flag, uint32_t *flags)
+{
+	if (_test_flag(flag, *flags)) {
+		_clear_flag(flag, flags);
+		return true;
+	}
+
+	return false;
+}
 /* ...ctr and runtime flag bit manipulation */
 
 /* All table line arguments are defined here */
@@ -576,7 +657,7 @@ static struct raid_set *context_alloc(struct dm_target *ti, struct raid_type *ra
 	rs->md.layout = raid_type->algorithm;
 	rs->md.new_layout = rs->md.layout;
 	rs->md.delta_disks = 0;
-	rs->md.recovery_cp = 0;
+	rs->md.recovery_cp = rs_is_raid0(rs) ? MaxSector : 0;
 
 	for (i = 0; i < raid_devs; i++)
 		md_rdev_init(&rs->dev[i].rdev);
@@ -1007,9 +1088,12 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 			 * indexes of replaced devices and to set up additional
 			 * devices on raid level takeover.
  			 */
-			if (!_in_range(value, 0, rs->md.raid_disks - 1))
+			if (!_in_range(value, 0, rs->raid_disks - 1))
 				return ti_error_einval(rs->ti, "Invalid rebuild index given");
 
+			if (test_and_set_bit(value, (void *) rs->rebuild_disks))
+				return ti_error_einval(rs->ti, "rebuild for this index already given");
+
 			rd = rs->dev + value;
 			clear_bit(In_sync, &rd->rdev.flags);
 			clear_bit(Faulty, &rd->rdev.flags);
@@ -1175,8 +1259,166 @@ static int raid_is_congested(struct dm_target_callbacks *cb, int bits)
 	return mddev_congested(&rs->md, bits);
 }
 
+/*
+ * Make sure a valid takover (level switch) is being requested on @rs
+ *
+ * Conversions of raid sets from one MD personality to another
+ * have to conform to restrictions which are enforced here.
+ *
+ * Degration is already checked for in rs_check_conversion() below.
+ */
+static int rs_check_takeover(struct raid_set *rs)
+{
+	struct mddev *mddev = &rs->md;
+	unsigned int near_copies;
+
+	switch (mddev->level) {
+	case 0:
+		/* raid0 -> raid1/5 with one disk */
+		if ((mddev->new_level == 1 || mddev->new_level == 5) &&
+		    mddev->raid_disks == 1)
+			return 0;
+
+		/* raid0 -> raid10 */
+		if (mddev->new_level == 10 &&
+		    !(rs->raid_disks % 2))
+			return 0;
+
+		/* raid0 with multiple disks -> raid4/5/6 */
+		if (_in_range(mddev->new_level, 4, 6) &&
+		    mddev->new_layout == ALGORITHM_PARITY_N &&
+		    mddev->raid_disks > 1)
+			return 0;
+
+		break;
+
+	case 10:
+		/* Can't takeover raid10_offset! */
+		if (_is_raid10_offset(mddev->layout))
+			break;
+
+		near_copies = _raid10_near_copies(mddev->layout);
+
+		/* raid10* -> raid0 */
+		if (mddev->new_level == 0) {
+			/* Can takeover raid10_near with raid disks divisable by data copies! */
+			if (near_copies > 1 &&
+			    !(mddev->raid_disks % near_copies)) {
+				mddev->raid_disks /= near_copies;
+				mddev->delta_disks = mddev->raid_disks;
+				return 0;
+			}
+
+			/* Can takeover raid10_far */
+			if (near_copies == 1 &&
+			   _raid10_far_copies(mddev->layout) > 1)
+				return 0;
+
+			break;
+		}
+
+		/* raid10_{near,far} -> raid1 */
+		if (mddev->new_level == 1 &&
+		    max(near_copies, _raid10_far_copies(mddev->layout)) == mddev->raid_disks)
+			return 0;
+
+		/* raid10_{near,far} with 2 disks -> raid4/5 */
+		if (_in_range(mddev->new_level, 4, 5) &&
+		    mddev->raid_disks == 2)
+			return 0;
+		break;
+
+	case 1:
+		/* raid1 with 2 disks -> raid4/5 */
+		if (_in_range(mddev->new_level, 4, 5) &&
+		    mddev->raid_disks == 2) {
+			mddev->degraded = 1;
+			return 0;
+		}
+
+		/* raid1 -> raid0 */
+		if (mddev->new_level == 0 &&
+		    mddev->raid_disks == 1)
+			return 0;
+
+		/* raid1 -> raid10 */
+		if (mddev->new_level == 10)
+			return 0;
+
+		break;
+
+	case 4:
+		/* raid4 -> raid0 */
+		if (mddev->new_level == 0)
+			return 0;
+
+		/* raid4 -> raid1/5 with 2 disks */
+		if ((mddev->new_level == 1 || mddev->new_level == 5) &&
+		    mddev->raid_disks == 2)
+			return 0;
+
+		/* raid4 -> raid5/6 with parity N */
+		if (_in_range(mddev->new_level, 5, 6) &&
+		    mddev->layout == ALGORITHM_PARITY_N)
+			return 0;
+		break;
+
+	case 5:
+		/* raid5 with parity N -> raid0 */
+		if (mddev->new_level == 0 &&
+		    mddev->layout == ALGORITHM_PARITY_N)
+			return 0;
+
+		/* raid5 with parity N -> raid4 */
+		if (mddev->new_level == 4 &&
+		    mddev->layout == ALGORITHM_PARITY_N)
+			return 0;
+
+		/* raid5 with 2 disks -> raid1/4/10 */
+		if ((mddev->new_level == 1 || mddev->new_level == 4 || mddev->new_level == 10) &&
+		    mddev->raid_disks == 2)
+			return 0;
+
+		/* raid5 with parity N -> raid6 with parity N */
+		if (mddev->new_level == 6 &&
+		    ((mddev->layout == ALGORITHM_PARITY_N && mddev->new_layout == ALGORITHM_PARITY_N) ||
+		      _in_range(mddev->new_layout, ALGORITHM_LEFT_ASYMMETRIC_6, ALGORITHM_RIGHT_SYMMETRIC_6)))
+			return 0;
+		break;
+
+	case 6:
+		/* raid6 with parity N -> raid0 */
+		if (mddev->new_level == 0 &&
+		    mddev->layout == ALGORITHM_PARITY_N)
+			return 0;
+
+		/* raid6 with parity N -> raid4 */
+		if (mddev->new_level == 4 &&
+		    mddev->layout == ALGORITHM_PARITY_N)
+			return 0;
+
+		/* raid6_*_n with parity N -> raid5_* */
+		if (mddev->new_level == 5 &&
+		    ((mddev->layout == ALGORITHM_PARITY_N && mddev->new_layout == ALGORITHM_PARITY_N) ||
+		     _in_range(mddev->new_layout, ALGORITHM_LEFT_ASYMMETRIC, ALGORITHM_RIGHT_SYMMETRIC)))
+			return 0;
+
+	default:
+		break;
+	}
+
+	return ti_error_einval(rs->ti, "takeover not possible");
+}
+
+/* True if @rs requested to be taken over */
+static bool rs_takeover_requested(struct raid_set *rs)
+{
+	return rs->md.new_level != rs->md.level;
+}
+
 /*  Features */
-#define	FEATURE_FLAG_SUPPORTS_RESHAPE	0x1
+#define	FEATURE_FLAG_SUPPORTS_V180	0x1 /* Supports v1.8.0 extended superblock */
+#define	FEATURE_FLAG_SUPPORTS_RESHAPE	0x2 /* Supports v1.8.0 reshaping functionality */
 
 /* State flags for sb->flags */
 #define	SB_FLAG_RESHAPE_ACTIVE		0x1
@@ -1220,7 +1462,7 @@ struct dm_raid_superblock {
 	/********************************************************************
 	 * BELOW FOLLOW V1.8.0 EXTENSIONS TO THE PRISTINE SUPERBLOCK FORMAT!!!
 	 *
-	 * FEATURE_FLAG_SUPPORTS_RESHAPE in the features member indicates that those exist
+	 * FEATURE_FLAG_SUPPORTS_V180 in the features member indicates that those exist
 	 */
 
 	__le32 flags; /* Flags defining array states for reshaping */
@@ -1287,7 +1529,7 @@ static void sb_retrieve_failed_devices(struct dm_raid_superblock *sb, uint64_t *
 	failed_devices[0] = le64_to_cpu(sb->failed_devices);
 	memset(failed_devices + 1, 0, sizeof(sb->extended_failed_devices));
 
-	if (_test_flag(FEATURE_FLAG_SUPPORTS_RESHAPE, le32_to_cpu(sb->compat_features))) {
+	if (_test_flag(FEATURE_FLAG_SUPPORTS_V180, le32_to_cpu(sb->compat_features))) {
 		int i = ARRAY_SIZE(sb->extended_failed_devices);
 
 		while (i--)
@@ -1337,7 +1579,7 @@ static void super_sync(struct mddev *mddev, struct md_rdev *rdev)
 		sb_update_failed_devices(sb, failed_devices);
 
 	sb->magic = cpu_to_le32(DM_RAID_MAGIC);
-	sb->compat_features = cpu_to_le32(0); /* Don't set reshape flag yet */
+	sb->compat_features = cpu_to_le32(FEATURE_FLAG_SUPPORTS_V180); /* Don't set reshape flag yet */
 
 	sb->num_devices = cpu_to_le32(mddev->raid_disks);
 	sb->array_position = cpu_to_le32(rdev->raid_disk);
@@ -1416,6 +1658,7 @@ static int super_load(struct md_rdev *rdev, struct md_rdev *refdev)
 		super_sync(rdev->mddev, rdev);
 
 		set_bit(FirstUse, &rdev->flags);
+		sb->compat_features = cpu_to_le32(FEATURE_FLAG_SUPPORTS_V180); /* Don't set reshape flag yet */
 
 		/* Force writing of superblocks to disk */
 		set_bit(MD_CHANGE_DEVS, &rdev->mddev->flags);
@@ -1461,7 +1704,7 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev)
 	 * Reshaping is supported, e.g. reshape_position is valid
 	 * in superblock and superblock content is authoritative.
 	 */
-	if (_test_flag(FEATURE_FLAG_SUPPORTS_RESHAPE, le32_to_cpu(sb->compat_features))) {
+	if (_test_flag(FEATURE_FLAG_SUPPORTS_V180, le32_to_cpu(sb->compat_features))) {
 		/* Superblock is authoritative wrt given raid set layout! */
 		mddev->raid_disks = le32_to_cpu(sb->num_devices);
 		mddev->level = le32_to_cpu(sb->level);
@@ -1564,6 +1807,7 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev)
 		if (new_devs == rs->raid_disks) {
 			DMINFO("Superblocks created for new raid set");
 			set_bit(MD_ARRAY_FIRST_USE, &mddev->flags);
+			_set_flag(RT_FLAG_UPDATE_SBS, &rs->runtime_flags);
 			mddev->recovery_cp = 0;
 		} else if (new_devs && new_devs != rs->raid_disks && !rebuilds) {
 			DMERR("New device injected into existing raid set without "
@@ -1657,8 +1901,9 @@ static int super_validate(struct raid_set *rs, struct md_rdev *rdev)
 	if (!mddev->events && super_init_validation(rs, rdev))
 		return -EINVAL;
 
-	if (sb->compat_features || sb->incompat_features) {
-		rs->ti->error = "Unable to assemble array: No feature flags supported yet";
+	if (le32_to_cpu(sb->compat_features) != FEATURE_FLAG_SUPPORTS_V180 ||
+	    sb->incompat_features) {
+		rs->ti->error = "Unable to assemble array: No incompatible feature flags supported yet";
 		return -EINVAL;
 	}
 
@@ -1718,8 +1963,6 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
 		 * that the "sync" directive is disallowed during the
 		 * reshape.
 		 */
-		rdev->sectors = to_sector(i_size_read(rdev->bdev->bd_inode));
-
 		if (_test_flag(CTR_FLAG_SYNC, rs->ctr_flags))
 			continue;
 
@@ -1785,14 +2028,77 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
 	return 0;
 }
 
+/* Userpace reordered disks -> adjust raid_disk indexes in @rs */
+static void _reorder_raid_disk_indexes(struct raid_set *rs)
+{
+	int i = 0;
+	struct md_rdev *rdev;
+
+	rdev_for_each(rdev, &rs->md) {
+		rdev->raid_disk = i++;
+		rdev->saved_raid_disk = rdev->new_raid_disk = -1;
+	}
+}
+
+/*
+ * Setup @rs for takeover by a different raid level
+ */
+static int rs_setup_takeover(struct raid_set *rs)
+{
+	struct mddev *mddev = &rs->md;
+	struct md_rdev *rdev;
+	unsigned int d = mddev->raid_disks = rs->raid_disks;
+	sector_t new_data_offset = rs->dev[0].rdev.data_offset ? 0 : rs->data_offset;
+
+	if (rt_is_raid10(rs->raid_type)) {
+		if (mddev->level == 0) {
+			/* Userpace reordered disks -> adjust raid_disk indexes */
+			_reorder_raid_disk_indexes(rs);
+
+			/* raid0 -> raid10_far layout */
+			mddev->layout = raid10_format_to_md_layout(rs, ALGORITHM_RAID10_FAR,
+								   rs->raid10_copies);
+		} else if (mddev->level == 1)
+			/* raid1 -> raid10_near layout */
+			mddev->layout = raid10_format_to_md_layout(rs, ALGORITHM_RAID10_NEAR,
+								   rs->raid_disks);
+		 else
+			return -EINVAL;
+
+	}
+
+	clear_bit(MD_ARRAY_FIRST_USE, &mddev->flags);
+	mddev->recovery_cp = MaxSector;
+
+	while (d--) {
+		rdev = &rs->dev[d].rdev;
+
+		if (test_bit(d, (void *) rs->rebuild_disks)) {
+			clear_bit(In_sync, &rdev->flags);
+			clear_bit(Faulty, &rdev->flags);
+			mddev->recovery_cp = rdev->recovery_offset = 0;
+			/* Bitmap has to be created when we do an "up" takeover */
+			set_bit(MD_ARRAY_FIRST_USE, &mddev->flags);
+		}
+
+		rdev->new_data_offset = new_data_offset;
+	}
+
+	rs_set_new(rs);
+	set_bit(MD_CHANGE_DEVS, &mddev->flags);
+
+	return 0;
+}
+
 /*
  * Enable/disable discard support on RAID set depending on
  * RAID level and discard properties of underlying RAID members.
  */
-static void configure_discard_support(struct dm_target *ti, struct raid_set *rs)
+static void configure_discard_support(struct raid_set *rs)
 {
 	int i;
 	bool raid456;
+	struct dm_target *ti = rs->ti;
 
 	/* Assume discards not supported until after checks below. */
 	ti->discards_supported = false;
@@ -1894,6 +2200,14 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 		goto bad;
 
 	rs->md.sync_super = super_sync;
+
+	/*
+	 * Backup any new raid set level, layout, ...
+	 * requested to be able to compare to superblock
+	 * members for conversion decisions.
+	 */
+	rs_config_backup(rs);
+
 	r = analyse_superblocks(ti, rs);
 	if (r)
 		goto bad;
@@ -1902,10 +2216,29 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	ti->private = rs;
 	ti->num_flush_bios = 1;
 
+	/* Restore any requested new layout for conversion decision */
+	rs_config_restore(rs);
+
 	/*
-	 * Disable/enable discard support on RAID set.
+	 * If a takeover is needed, just set the level to
+	 * the new requested one and allow the raid set to run.
 	 */
-	configure_discard_support(ti, rs);
+	if (rs_takeover_requested(rs)) {
+		r = rs_check_takeover(rs);
+		if (r)
+			return r;
+
+		r = rs_setup_takeover(rs);
+		if (r)
+			return r;
+
+		_set_flag(RT_FLAG_UPDATE_SBS, &rs->runtime_flags);
+	}
+
+	/* Start raid set read-only and assumed clean to change in raid_resume() */
+	rs->md.ro = 1;
+	rs->md.in_sync = 1;
+	set_bit(MD_RECOVERY_FROZEN, &rs->md.recovery);
 
 	/* Has to be held on running the array */
 	mddev_lock_nointr(&rs->md);
@@ -2312,29 +2645,92 @@ static void attempt_restore_of_faulty_devices(struct raid_set *rs)
 	}
 }
 
+/* Load the dirty region bitmap */
+static int _bitmap_load(struct raid_set *rs)
+{
+	int r = 0;
+
+	/* Try loading the bitmap unless "raid0", which does not have one */
+	if (!rs_is_raid0(rs) &&
+	    !_test_and_set_flag(RT_FLAG_RS_BITMAP_LOADED, &rs->runtime_flags)) {
+		r = bitmap_load(&rs->md);
+		if (r)
+			DMERR("Failed to load bitmap");
+	}
+
+	return r;
+}
+
+static int raid_preresume(struct dm_target *ti)
+{
+	struct raid_set *rs = ti->private;
+	struct mddev *mddev = &rs->md;
+
+	/* This is a resume after a suspend of the set -> it's already started */
+	if (_test_and_set_flag(RT_FLAG_RS_PRERESUMED, &rs->runtime_flags))
+		return 0;
+
+	/*
+	 * The superblocks need to be updated on disk if the
+	 * array is new or _bitmap_load will overwrite them
+	 * in core with old data.
+	 *
+	 * In case the array got modified (takeover/reshape/resize)
+	 * or the data offsets on the component devices changed, they
+	 * have to be updated as well.
+	 *
+	 * Have to switch to readwrite and back in order to
+	 * allow for the superblock updates.
+	 */
+	if (_test_and_clear_flag(RT_FLAG_UPDATE_SBS, &rs->runtime_flags)) {
+		set_bit(MD_CHANGE_DEVS, &mddev->flags);
+		mddev->ro = 0;
+		md_update_sb(mddev, 1);
+		mddev->ro = 1;
+	}
+
+	/*
+	 * Disable/enable discard support on raid set after any
+	 * conversion, because devices can have been added
+	 */
+	configure_discard_support(rs);
+
+	/* Load the bitmap from disk unless raid0 */
+	return _bitmap_load(rs);
+}
+
 static void raid_resume(struct dm_target *ti)
 {
 	struct raid_set *rs = ti->private;
+	struct mddev *mddev = &rs->md;
 
-	if (!rt_is_raid0(rs->raid_type)) {
-		set_bit(MD_CHANGE_DEVS, &rs->md.flags);
+	if (_test_and_set_flag(RT_FLAG_RS_RESUMED, &rs->runtime_flags)) {
+		/*
+		 * A secondary resume while the device is active.
+		 * Take this opportunity to check whether any failed
+		 * devices are reachable again.
+		 */
+		attempt_restore_of_faulty_devices(rs);
 
-		if (!rs->bitmap_loaded) {
-			bitmap_load(&rs->md);
-			rs->bitmap_loaded = 1;
-		} else {
-			/*
-			 * A secondary resume while the device is active.
-			 * Take this opportunity to check whether any failed
-			 * devices are reachable again.
-			 */
-			attempt_restore_of_faulty_devices(rs);
-		}
+	} else {
+		mddev->in_sync = 0;
 
-		clear_bit(MD_RECOVERY_FROZEN, &rs->md.recovery);
+		/*
+		 * If any of the constructor flags got passed in
+		 * but "region_size" (gets always passed in for
+		 * mappings with bitmap), we expect userspace to
+		 * reset them and reload the mapping anyway.
+		 *
+		 * -> don't unfreeze resynchronization until imminant
+		 *    reload of the table w/o theses flags
+		 */
+		if (!_test_flags(ALL_FREEZE_FLAGS, rs->ctr_flags))
+			clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
 	}
 
-	mddev_resume(&rs->md);
+	mddev->ro = 0;
+	if (mddev->suspended)
+		mddev_resume(mddev);
 }
 
 static struct target_type raid_target = {
@@ -2350,6 +2746,7 @@ static struct target_type raid_target = {
 	.io_hints = raid_io_hints,
 	.presuspend = raid_presuspend,
 	.postsuspend = raid_postsuspend,
+	.preresume = raid_preresume,
 	.resume = raid_resume,
 };
 
-- 
cgit v1.2.3-70-g09d2


From 3a1c1ef2fd62087c3d6521de217ddb9360776658 Mon Sep 17 00:00:00 2001
From: Heinz Mauelshagen <heinzm@redhat.com>
Date: Thu, 19 May 2016 18:49:34 +0200
Subject: dm raid: enhance status interface and fixup takeover/raid0

The target's status interface has to provide the new 'data_offset' value
to allow userspace to retrieve the kernel's offset to the data on each
raid device of a raid set.  This is the basis for out-of-place reshaping,
which is required to avoid overwriting any data during reshaping (e.g.
change raid6_zr -> raid6_nc):

 - add rs_set_cur() to be able to start up existing array in case of no
   takeover; use in ctr on takeover check

 - enhance raid_status()

 - add supporting functions to get resync/reshape progress and raid
   device status chars

 - fix up the rebuild table line output race, which failed to emit
   'rebuild N' for fully synced/rebuilt devices because it relied on
   the transient 'In_sync' raid device flag

 - add new status line output for 'data_offset', which'll later be used
   for out-of-place reshaping

 - fixup takeover not working for all levels

 - fixup raid0 message interface oops caused by missing checks
   for the md threads, which don't exist in case of raid0

 - remove ALL_FREEZE_FLAGS not needed for takeover

 - adjust comments

Signed-off-by: Heinz Mauelshagen <heinzm@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-raid.c | 434 ++++++++++++++++++++++++++++++---------------------
 1 file changed, 253 insertions(+), 181 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 89b677c7cfe6..3165c4fd86bf 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -98,13 +98,6 @@ struct raid_dev {
 #define ALL_CTR_FLAGS		(CTR_FLAG_OPTIONS_NO_ARGS | \
 				 CTR_FLAG_OPTIONS_ONE_ARG)
 
-/*
- * All flags which cause a recovery unfreeze once they got stored in the raid metadata
- */
-#define	ALL_FREEZE_FLAGS (ALL_CTR_FLAGS & ~(CTR_FLAG_REGION_SIZE | CTR_FLAGS_ANY_SYNC | \
-					    CTR_FLAG_RAID10_FORMAT | CTR_FLAG_RAID10_COPIES | \
-					    CTR_FLAG_RAID10_USE_NEAR_SETS))
-
 /* Invalid options definitions per raid level... */
 
 /* "raid0" does not accept any options */
@@ -617,6 +610,19 @@ static struct raid_type *get_raid_type_by_ll(const int level, const int layout)
 	return NULL;
 }
 
+/*
+ * Set the mddev properties in @rs to the current
+ * ones retrieved from the freshest superblock
+ */
+static void rs_set_cur(struct raid_set *rs)
+{
+	struct mddev *mddev = &rs->md;
+
+	mddev->new_level = mddev->level;
+	mddev->new_layout = mddev->layout;
+	mddev->new_chunk_sectors = mddev->chunk_sectors;
+}
+
 /*
  * Set the mddev properties in @rs to the new
  * ones requested by the ctr
@@ -628,6 +634,7 @@ static void rs_set_new(struct raid_set *rs)
 	mddev->level = mddev->new_level;
 	mddev->layout = mddev->new_layout;
 	mddev->chunk_sectors = mddev->new_chunk_sectors;
+	mddev->raid_disks = rs->raid_disks;
 	mddev->delta_disks = 0;
 }
 
@@ -773,7 +780,7 @@ static int parse_dev_params(struct raid_set *rs, struct dm_arg_set *as)
 			rs->dev[i].rdev.meta_bdev = rs->dev[i].meta_dev->bdev;
 		}
 		rs->dev[i].rdev.bdev = rs->dev[i].data_dev->bdev;
-		list_add(&rs->dev[i].rdev.same_set, &rs->md.disks);
+		list_add_tail(&rs->dev[i].rdev.same_set, &rs->md.disks);
 		if (!test_bit(In_sync, &rs->dev[i].rdev.flags))
 			rebuild++;
 	}
@@ -1245,6 +1252,12 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 	return rs_check_for_invalid_flags(rs);
 }
 
+/* Return # of data stripes as kept in mddev as of @rs (i.e. as of superblock) */
+static unsigned int mddev_data_stripes(struct raid_set *rs)
+{
+	return rs->md.raid_disks - rs->raid_type->parity_devs;
+}
+
 static void do_table_event(struct work_struct *ws)
 {
 	struct raid_set *rs = container_of(ws, struct raid_set, md.event_work);
@@ -1735,7 +1748,7 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev)
 
 	} else {
 		/*
-		 * Reshaping is not allowed, because we don't have the appropriate metadata
+		 * No takeover/reshaping, because we don't have the extended v1.8.0 metadata
 		 */
 		if (le32_to_cpu(sb->level) != mddev->level) {
 			DMERR("Reshaping/takeover raid sets not yet supported. (raid level/stripes/size change)");
@@ -1889,7 +1902,7 @@ static int super_validate(struct raid_set *rs, struct md_rdev *rdev)
 	struct mddev *mddev = &rs->md;
 	struct dm_raid_superblock *sb;
 
-	if (!rdev->sb_page)
+	if (rs_is_raid0(rs) || !rdev->sb_page)
 		return 0;
 
 	sb = page_address(rdev->sb_page);
@@ -2084,9 +2097,6 @@ static int rs_setup_takeover(struct raid_set *rs)
 		rdev->new_data_offset = new_data_offset;
 	}
 
-	rs_set_new(rs);
-	set_bit(MD_CHANGE_DEVS, &mddev->flags);
-
 	return 0;
 }
 
@@ -2232,8 +2242,11 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 		if (r)
 			return r;
 
+		/* Tell preresume to update superblocks with new layout */
 		_set_flag(RT_FLAG_UPDATE_SBS, &rs->runtime_flags);
-	}
+		rs_set_new(rs);
+	} else
+		rs_set_cur(rs);
 
 	/* Start raid set read-only and assumed clean to change in raid_resume() */
 	rs->md.ro = 1;
@@ -2288,6 +2301,7 @@ static int raid_map(struct dm_target *ti, struct bio *bio)
 	return DM_MAPIO_SUBMITTED;
 }
 
+/* Return string describing the current sync action of @mddev */
 static const char *decipher_sync_action(struct mddev *mddev)
 {
 	if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
@@ -2313,181 +2327,251 @@ static const char *decipher_sync_action(struct mddev *mddev)
 	return "idle";
 }
 
-static void raid_status(struct dm_target *ti, status_type_t type,
-			unsigned status_flags, char *result, unsigned maxlen)
+/*
+ * Return status string @rdev
+ *
+ * Status characters:
+ *
+ *  'D' = Dead/Failed device
+ *  'a' = Alive but not in-sync
+ *  'A' = Alive and in-sync
+ */
+static const char *_raid_dev_status(struct md_rdev *rdev, bool array_in_sync)
 {
-	struct raid_set *rs = ti->private;
-	unsigned raid_param_cnt = 1; /* at least 1 for chunksize */
-	unsigned sz = 0;
-	int i, array_in_sync = 0;
-	sector_t sync;
+	if (test_bit(Faulty, &rdev->flags))
+		return "D";
+	else if (!array_in_sync || !test_bit(In_sync, &rdev->flags))
+		return "a";
+	else
+		return "A";
+}
 
-	switch (type) {
-	case STATUSTYPE_INFO:
-		DMEMIT("%s %d ", rs->raid_type->name, rs->md.raid_disks);
+/* Helper to return resync/reshape progress for @rs and @array_in_sync */
+static sector_t rs_get_progress(struct raid_set *rs,
+				sector_t resync_max_sectors, bool *array_in_sync)
+{
+	sector_t r, recovery_cp, curr_resync_completed;
+	struct mddev *mddev = &rs->md;
 
-		if (!rt_is_raid0(rs->raid_type)) {
-			if (test_bit(MD_RECOVERY_RUNNING, &rs->md.recovery))
-				sync = rs->md.curr_resync_completed;
-			else
-				sync = rs->md.recovery_cp;
-
-			if (sync >= rs->md.resync_max_sectors) {
-				/*
-				 * Sync complete.
-				 */
-				array_in_sync = 1;
-				sync = rs->md.resync_max_sectors;
-			} else if (test_bit(MD_RECOVERY_REQUESTED, &rs->md.recovery)) {
-				/*
-				 * If "check" or "repair" is occurring, the array has
-				 * undergone and initial sync and the health characters
-				 * should not be 'a' anymore.
-				 */
-				array_in_sync = 1;
+	curr_resync_completed = mddev->curr_resync_completed ?: mddev->recovery_cp;
+	recovery_cp = mddev->recovery_cp;
+	*array_in_sync = false;
+
+	if (rs_is_raid0(rs)) {
+		r = resync_max_sectors;
+		*array_in_sync = true;
+
+	} else {
+		r = mddev->reshape_position;
+
+		/* Reshape is relative to the array size */
+		if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ||
+		    r != MaxSector) {
+			if (r == MaxSector) {
+				*array_in_sync = true;
+				r = resync_max_sectors;
 			} else {
-				/*
-				 * The array may be doing an initial sync, or it may
-				 * be rebuilding individual components.  If all the
-				 * devices are In_sync, then it is the array that is
-				 * being initialized.
-				 */
-				for (i = 0; i < rs->md.raid_disks; i++)
-					if (!test_bit(In_sync, &rs->dev[i].rdev.flags))
-						array_in_sync = 1;
+				/* Got to reverse on backward reshape */
+				if (mddev->reshape_backwards)
+					r = mddev->array_sectors - r;
+
+				/* Devide by # of data stripes */
+				sector_div(r, mddev_data_stripes(rs));
 			}
+
+		/* Sync is relative to the component device size */
+		} else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
+			r = curr_resync_completed;
+		else
+			r = recovery_cp;
+
+		if (r == MaxSector) {
+			/*
+			 * Sync complete.
+			 */
+			*array_in_sync = true;
+			r = resync_max_sectors;
+		} else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
+			/*
+			 * If "check" or "repair" is occurring, the raid set has
+			 * undergone an initial sync and the health characters
+			 * should not be 'a' anymore.
+			 */
+			*array_in_sync = true;
 		} else {
-			/* RAID0 */
-			array_in_sync = 1;
-			sync = rs->md.resync_max_sectors;
-		}
+			struct md_rdev *rdev;
 
-		/*
-		 * Status characters:
-		 *  'D' = Dead/Failed device
-		 *  'a' = Alive but not in-sync
-		 *  'A' = Alive and in-sync
-		 */
-		for (i = 0; i < rs->md.raid_disks; i++) {
-			if (test_bit(Faulty, &rs->dev[i].rdev.flags))
-				DMEMIT("D");
-			else if (!array_in_sync ||
-				 !test_bit(In_sync, &rs->dev[i].rdev.flags))
-				DMEMIT("a");
-			else
-				DMEMIT("A");
+			/*
+			 * The raid set may be doing an initial sync, or it may
+			 * be rebuilding individual components.  If all the
+			 * devices are In_sync, then it is the raid set that is
+			 * being initialized.
+			 */
+			rdev_for_each(rdev, mddev)
+				if (!test_bit(In_sync, &rdev->flags))
+					*array_in_sync = true;
+#if 0
+			r = 0; /* HM FIXME: TESTME: https://bugzilla.redhat.com/show_bug.cgi?id=1210637 ? */
+#endif
 		}
+	}
+
+	return r;
+}
+
+/* Helper to return @dev name or "-" if !@dev */
+static const char *_get_dev_name(struct dm_dev *dev)
+{
+	return dev ? dev->name : "-";
+}
+
+static void raid_status(struct dm_target *ti, status_type_t type,
+			unsigned int status_flags, char *result, unsigned int maxlen)
+{
+	struct raid_set *rs = ti->private;
+	struct mddev *mddev = &rs->md;
+	struct r5conf *conf = mddev->private;
+	int max_nr_stripes = conf ? conf->max_nr_stripes : 0;
+	bool array_in_sync;
+	unsigned int raid_param_cnt = 1; /* at least 1 for chunksize */
+	unsigned int sz = 0;
+	unsigned int write_mostly_params = 0;
+	sector_t progress, resync_max_sectors, resync_mismatches;
+	const char *sync_action;
+	struct raid_type *rt;
+	struct md_rdev *rdev;
+
+	switch (type) {
+	case STATUSTYPE_INFO:
+		/* *Should* always succeed */
+		rt = get_raid_type_by_ll(mddev->new_level, mddev->new_layout);
+		if (!rt)
+			return;
+
+		DMEMIT("%s %d ", rt ? rt->name : "unknown", mddev->raid_disks);
+
+		/* Access most recent mddev properties for status output */
+		smp_rmb();
+		/* Get sensible max sectors even if raid set not yet started */
+		resync_max_sectors = _test_flag(RT_FLAG_RS_PRERESUMED, rs->runtime_flags) ?
+				      mddev->resync_max_sectors : mddev->dev_sectors;
+		progress = rs_get_progress(rs, resync_max_sectors, &array_in_sync);
+		resync_mismatches = (mddev->last_sync_action && !strcasecmp(mddev->last_sync_action, "check")) ?
+				    (unsigned int) atomic64_read(&mddev->resync_mismatches) : 0;
+		sync_action = decipher_sync_action(&rs->md);
+
+		/* HM FIXME: do we want another state char for raid0? It shows 'D' or 'A' now */
+		rdev_for_each(rdev, mddev)
+			DMEMIT(_raid_dev_status(rdev, array_in_sync));
 
 		/*
-		 * In-sync ratio:
+		 * In-sync/Reshape ratio:
 		 *  The in-sync ratio shows the progress of:
-		 *   - Initializing the array
-		 *   - Rebuilding a subset of devices of the array
+		 *   - Initializing the raid set
+		 *   - Rebuilding a subset of devices of the raid set
 		 *  The user can distinguish between the two by referring
 		 *  to the status characters.
+		 *
+		 *  The reshape ratio shows the progress of
+		 *  changing the raid layout or the number of
+		 *  disks of a raid set
 		 */
-		DMEMIT(" %llu/%llu",
-		       (unsigned long long) sync,
-		       (unsigned long long) rs->md.resync_max_sectors);
+		DMEMIT(" %llu/%llu", (unsigned long long) progress,
+				     (unsigned long long) resync_max_sectors);
 
 		/*
+		 * v1.5.0+:
+		 *
 		 * Sync action:
-		 *   See Documentation/device-mapper/dm-raid.c for
+		 *   See Documentation/device-mapper/dm-raid.txt for
 		 *   information on each of these states.
 		 */
-		DMEMIT(" %s", decipher_sync_action(&rs->md));
+		DMEMIT(" %s", sync_action);
 
 		/*
+		 * v1.5.0+:
+		 *
 		 * resync_mismatches/mismatch_cnt
 		 *   This field shows the number of discrepancies found when
-		 *   performing a "check" of the array.
+		 *   performing a "check" of the raid set.
 		 */
-		DMEMIT(" %llu",
-		       (strcmp(rs->md.last_sync_action, "check")) ? 0 :
-		       (unsigned long long)
-		       atomic64_read(&rs->md.resync_mismatches));
-		break;
-	case STATUSTYPE_TABLE:
-		/* The string you would use to construct this array */
-		for (i = 0; i < rs->md.raid_disks; i++) {
-			if (_test_flag(CTR_FLAG_REBUILD, rs->ctr_flags) &&
-			    rs->dev[i].data_dev &&
-			    !test_bit(In_sync, &rs->dev[i].rdev.flags))
-				raid_param_cnt += 2; /* for rebuilds */
-			if (rs->dev[i].data_dev &&
-			    test_bit(WriteMostly, &rs->dev[i].rdev.flags))
-				raid_param_cnt += 2;
-		}
-
-		raid_param_cnt += (hweight32(rs->ctr_flags & ~CTR_FLAG_REBUILD) * 2);
-		if (rs->ctr_flags & (CTR_FLAG_SYNC | CTR_FLAG_NOSYNC))
-			raid_param_cnt--;
+		DMEMIT(" %llu", (unsigned long long) resync_mismatches);
 
-		DMEMIT("%s %u %u", rs->raid_type->name,
-		       raid_param_cnt, rs->md.chunk_sectors);
+		/*
+		 * v1.8.0+:
+		 *
+		 * data_offset (needed for out of space reshaping)
+		 *   This field shows the data offset into the data
+		 *   image LV where the first stripes data starts.
+		 *
+		 * We keep data_offset equal on all raid disks of the set,
+		 * so retrieving it from the first raid disk is sufficient.
+		 */
+		DMEMIT(" %llu", (unsigned long long) rs->dev[0].rdev.data_offset);
+		break;
 
-		if (_test_flag(CTR_FLAG_SYNC, rs->ctr_flags) &&
-		    rs->md.recovery_cp == MaxSector)
-			DMEMIT(" sync");
+	case STATUSTYPE_TABLE:
+		/* Report the table line string you would use to construct this raid set */
+
+		/* Calculate raid parameter count */
+		rdev_for_each(rdev, mddev)
+			if (test_bit(WriteMostly, &rdev->flags))
+				write_mostly_params += 2;
+		raid_param_cnt += memweight(rs->rebuild_disks,
+					    DISKS_ARRAY_ELEMS * sizeof(*rs->rebuild_disks)) * 2 +
+				  write_mostly_params +
+				  hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_NO_ARGS) +
+				  hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_ONE_ARG) * 2;
+		/* Emit table line */
+		DMEMIT("%s %u %u", rs->raid_type->name, raid_param_cnt, mddev->new_chunk_sectors);
+		if (_test_flag(CTR_FLAG_RAID10_FORMAT, rs->ctr_flags))
+			DMEMIT(" %s %s", _argname_by_flag(CTR_FLAG_RAID10_FORMAT),
+					 raid10_md_layout_to_format(mddev->layout));
+		if (_test_flag(CTR_FLAG_RAID10_COPIES, rs->ctr_flags))
+			DMEMIT(" %s %d", _argname_by_flag(CTR_FLAG_RAID10_COPIES),
+					 raid10_md_layout_to_copies(mddev->layout));
 		if (_test_flag(CTR_FLAG_NOSYNC, rs->ctr_flags))
-			DMEMIT(" nosync");
-
-		for (i = 0; i < rs->md.raid_disks; i++)
-			if (_test_flag(CTR_FLAG_REBUILD, rs->ctr_flags) &&
-			    rs->dev[i].data_dev &&
-			    !test_bit(In_sync, &rs->dev[i].rdev.flags))
-				DMEMIT(" rebuild %u", i);
-
+			DMEMIT(" %s", _argname_by_flag(CTR_FLAG_NOSYNC));
+		if (_test_flag(CTR_FLAG_SYNC, rs->ctr_flags))
+			DMEMIT(" %s", _argname_by_flag(CTR_FLAG_SYNC));
+		if (_test_flag(CTR_FLAG_REGION_SIZE, rs->ctr_flags))
+			DMEMIT(" %s %llu", _argname_by_flag(CTR_FLAG_REGION_SIZE),
+					   (unsigned long long) to_sector(mddev->bitmap_info.chunksize));
+		if (_test_flag(CTR_FLAG_DATA_OFFSET, rs->ctr_flags))
+			DMEMIT(" %s %llu", _argname_by_flag(CTR_FLAG_DATA_OFFSET),
+					   (unsigned long long) rs->data_offset);
 		if (_test_flag(CTR_FLAG_DAEMON_SLEEP, rs->ctr_flags))
-			DMEMIT(" daemon_sleep %lu",
-			       rs->md.bitmap_info.daemon_sleep);
-
-		if (_test_flag(CTR_FLAG_MIN_RECOVERY_RATE, rs->ctr_flags))
-			DMEMIT(" min_recovery_rate %d", rs->md.sync_speed_min);
-
-		if (_test_flag(CTR_FLAG_MAX_RECOVERY_RATE, rs->ctr_flags))
-			DMEMIT(" max_recovery_rate %d", rs->md.sync_speed_max);
-
-		for (i = 0; i < rs->md.raid_disks; i++)
-			if (rs->dev[i].data_dev &&
-			    test_bit(WriteMostly, &rs->dev[i].rdev.flags))
-				DMEMIT(" write_mostly %u", i);
-
+			DMEMIT(" %s %lu", _argname_by_flag(CTR_FLAG_DAEMON_SLEEP),
+					  mddev->bitmap_info.daemon_sleep);
+		if (_test_flag(CTR_FLAG_DELTA_DISKS, rs->ctr_flags))
+			DMEMIT(" %s %d", _argname_by_flag(CTR_FLAG_DELTA_DISKS),
+					 mddev->delta_disks);
+		if (_test_flag(CTR_FLAG_STRIPE_CACHE, rs->ctr_flags))
+			DMEMIT(" %s %d", _argname_by_flag(CTR_FLAG_STRIPE_CACHE),
+					 max_nr_stripes);
+		rdev_for_each(rdev, mddev)
+			if (test_bit(rdev->raid_disk, (void *) rs->rebuild_disks))
+				DMEMIT(" %s %u", _argname_by_flag(CTR_FLAG_REBUILD),
+						 rdev->raid_disk);
+		rdev_for_each(rdev, mddev)
+			if (test_bit(WriteMostly, &rdev->flags))
+				DMEMIT(" %s %d", _argname_by_flag(CTR_FLAG_WRITE_MOSTLY),
+						 rdev->raid_disk);
 		if (_test_flag(CTR_FLAG_MAX_WRITE_BEHIND, rs->ctr_flags))
-			DMEMIT(" max_write_behind %lu",
-			       rs->md.bitmap_info.max_write_behind);
-
-		if (_test_flag(CTR_FLAG_STRIPE_CACHE, rs->ctr_flags)) {
-			struct r5conf *conf = rs->md.private;
-
-			/* convert from kiB to sectors */
-			DMEMIT(" stripe_cache %d",
-			       conf ? conf->max_nr_stripes * 2 : 0);
-		}
-
-		if (_test_flag(CTR_FLAG_REGION_SIZE, rs->ctr_flags))
-			DMEMIT(" region_size %lu",
-			       rs->md.bitmap_info.chunksize >> 9);
-
-		if (_test_flag(CTR_FLAG_RAID10_COPIES, rs->ctr_flags))
-			DMEMIT(" raid10_copies %u",
-			       raid10_md_layout_to_copies(rs->md.layout));
-
-		if (_test_flag(CTR_FLAG_RAID10_FORMAT, rs->ctr_flags))
-			DMEMIT(" raid10_format %s",
-			       raid10_md_layout_to_format(rs->md.layout));
-
-		DMEMIT(" %d", rs->md.raid_disks);
-		for (i = 0; i < rs->md.raid_disks; i++) {
-			if (rs->dev[i].meta_dev)
-				DMEMIT(" %s", rs->dev[i].meta_dev->name);
-			else
-				DMEMIT(" -");
-
-			if (rs->dev[i].data_dev)
-				DMEMIT(" %s", rs->dev[i].data_dev->name);
-			else
-				DMEMIT(" -");
+			DMEMIT(" %s %lu", _argname_by_flag(CTR_FLAG_MAX_WRITE_BEHIND),
+					  mddev->bitmap_info.max_write_behind);
+		if (_test_flag(CTR_FLAG_MAX_RECOVERY_RATE, rs->ctr_flags))
+			DMEMIT(" %s %d", _argname_by_flag(CTR_FLAG_MAX_RECOVERY_RATE),
+					 mddev->sync_speed_max);
+		if (_test_flag(CTR_FLAG_MIN_RECOVERY_RATE, rs->ctr_flags))
+			DMEMIT(" %s %d", _argname_by_flag(CTR_FLAG_MIN_RECOVERY_RATE),
+					 mddev->sync_speed_min);
+		DMEMIT(" %d", rs->raid_disks);
+		rdev_for_each(rdev, mddev) {
+			struct raid_dev *rd = container_of(rdev, struct raid_dev, rdev);
+
+			DMEMIT(" %s %s", _get_dev_name(rd->meta_dev),
+					 _get_dev_name(rd->data_dev));
 		}
 	}
 }
@@ -2519,11 +2603,10 @@ static int raid_message(struct dm_target *ti, unsigned argc, char **argv)
 		   test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
 		return -EBUSY;
 	else if (!strcasecmp(argv[0], "resync"))
-		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
-	else if (!strcasecmp(argv[0], "recover")) {
+		; /* MD_RECOVERY_NEEDED set below */
+	else if (!strcasecmp(argv[0], "recover"))
 		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
-		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
-	} else {
+	else {
 		if (!strcasecmp(argv[0], "check"))
 			set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
 		else if (!!strcasecmp(argv[0], "repair"))
@@ -2536,11 +2619,11 @@ static int raid_message(struct dm_target *ti, unsigned argc, char **argv)
 		 * canceling read-auto mode
 		 */
 		mddev->ro = 0;
-		if (!mddev->suspended)
+		if (!mddev->suspended && mddev->sync_thread)
 			md_wakeup_thread(mddev->sync_thread);
 	}
 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
-	if (!mddev->suspended)
+	if (!mddev->suspended && mddev->thread)
 		md_wakeup_thread(mddev->thread);
 
 	return 0;
@@ -2711,24 +2794,12 @@ static void raid_resume(struct dm_target *ti)
 		 * devices are reachable again.
 		 */
 		attempt_restore_of_faulty_devices(rs);
-
-	} else {
-		mddev->in_sync = 0;
-
-		/*
-		 * If any of the constructor flags got passed in
-		 * but "region_size" (gets always passed in for
-		 * mappings with bitmap), we expect userspace to
-		 * reset them and reload the mapping anyway.
-		 *
-		 * -> don't unfreeze resynchronization until imminant
-		 *    reload of the table w/o theses flags
-		 */
-		if (!_test_flags(ALL_FREEZE_FLAGS, rs->ctr_flags))
-			clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
 	}
 
 	mddev->ro = 0;
+	mddev->in_sync = 0;
+	clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+
 	if (mddev->suspended)
 		mddev_resume(mddev);
 }
@@ -2778,4 +2849,5 @@ MODULE_ALIAS("dm-raid4");
 MODULE_ALIAS("dm-raid5");
 MODULE_ALIAS("dm-raid6");
 MODULE_AUTHOR("Neil Brown <dm-devel@redhat.com>");
+MODULE_AUTHOR("Heinz Mauelshagen <dm-devel@redhat.com>");
 MODULE_LICENSE("GPL");
-- 
cgit v1.2.3-70-g09d2


From 43157840fddb01653b2446e7ee51e910a9fc584e Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Mon, 30 May 2016 13:03:37 -0400
Subject: dm raid: tabify appropriate whitespace

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-raid.c | 124 +++++++++++++++++++++++++--------------------------
 1 file changed, 62 insertions(+), 62 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 3165c4fd86bf..50d2901fd9f4 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -25,12 +25,12 @@ static bool devices_handle_discard_safely = false;
  * The following flags are used by dm-raid.c to set up the array state.
  * They must be cleared before md_run is called.
  */
-#define FirstUse 10             /* rdev flag */
+#define FirstUse 10		/* rdev flag */
 
 struct raid_dev {
 	/*
 	 * Two DM devices, one to hold metadata and one to hold the
-	 * actual data/parity.  The reason for this is to not confuse
+	 * actual data/parity.	The reason for this is to not confuse
 	 * ti->len and give more flexibility in altering size and
 	 * characteristics.
 	 *
@@ -51,21 +51,21 @@ struct raid_dev {
  * 1 = no flag value
  * 2 = flag with value
  */
-#define CTR_FLAG_SYNC              0x1   /* 1 */ /* Not with raid0! */
-#define CTR_FLAG_NOSYNC            0x2   /* 1 */ /* Not with raid0! */
-#define CTR_FLAG_REBUILD           0x4   /* 2 */ /* Not with raid0! */
-#define CTR_FLAG_DAEMON_SLEEP      0x8   /* 2 */ /* Not with raid0! */
-#define CTR_FLAG_MIN_RECOVERY_RATE 0x10  /* 2 */ /* Not with raid0! */
-#define CTR_FLAG_MAX_RECOVERY_RATE 0x20  /* 2 */ /* Not with raid0! */
-#define CTR_FLAG_MAX_WRITE_BEHIND  0x40  /* 2 */ /* Only with raid1! */
-#define CTR_FLAG_WRITE_MOSTLY      0x80  /* 2 */ /* Only with raid1! */
-#define CTR_FLAG_STRIPE_CACHE      0x100 /* 2 */ /* Only with raid4/5/6! */
-#define CTR_FLAG_REGION_SIZE       0x200 /* 2 */ /* Not with raid0! */
-#define CTR_FLAG_RAID10_COPIES     0x400 /* 2 */ /* Only with raid10 */
-#define CTR_FLAG_RAID10_FORMAT     0x800 /* 2 */ /* Only with raid10 */
+#define CTR_FLAG_SYNC		   0x1	 /* 1 */ /* Not with raid0! */
+#define CTR_FLAG_NOSYNC		   0x2	 /* 1 */ /* Not with raid0! */
+#define CTR_FLAG_REBUILD	   0x4	 /* 2 */ /* Not with raid0! */
+#define CTR_FLAG_DAEMON_SLEEP	   0x8	 /* 2 */ /* Not with raid0! */
+#define CTR_FLAG_MIN_RECOVERY_RATE 0x10	 /* 2 */ /* Not with raid0! */
+#define CTR_FLAG_MAX_RECOVERY_RATE 0x20	 /* 2 */ /* Not with raid0! */
+#define CTR_FLAG_MAX_WRITE_BEHIND  0x40	 /* 2 */ /* Only with raid1! */
+#define CTR_FLAG_WRITE_MOSTLY	   0x80	 /* 2 */ /* Only with raid1! */
+#define CTR_FLAG_STRIPE_CACHE	   0x100 /* 2 */ /* Only with raid4/5/6! */
+#define CTR_FLAG_REGION_SIZE	   0x200 /* 2 */ /* Not with raid0! */
+#define CTR_FLAG_RAID10_COPIES	   0x400 /* 2 */ /* Only with raid10 */
+#define CTR_FLAG_RAID10_FORMAT	   0x800 /* 2 */ /* Only with raid10 */
 /* New for v1.8.0 */
-#define CTR_FLAG_DELTA_DISKS          0x1000 /* 2 */ /* Only with reshapable raid4/5/6/10! */
-#define CTR_FLAG_DATA_OFFSET          0x2000 /* 2 */ /* Only with reshapable raid4/5/6/10! */
+#define CTR_FLAG_DELTA_DISKS	      0x1000 /* 2 */ /* Only with reshapable raid4/5/6/10! */
+#define CTR_FLAG_DATA_OFFSET	      0x2000 /* 2 */ /* Only with reshapable raid4/5/6/10! */
 #define CTR_FLAG_RAID10_USE_NEAR_SETS 0x4000 /* 2 */ /* Only with raid10! */
 
 /*
@@ -221,26 +221,26 @@ static struct raid_type {
 	const unsigned level;		/* RAID level. */
 	const unsigned algorithm;	/* RAID algorithm. */
 } raid_types[] = {
-	{"raid0",         "raid0 (striping)",			    0, 2, 0,  0 /* NONE */},
-	{"raid1",         "raid1 (mirroring)",			    0, 2, 1,  0 /* NONE */},
-	{"raid10_far",    "raid10 far (striped mirrors)",	    0, 2, 10, ALGORITHM_RAID10_FAR},
+	{"raid0",	  "raid0 (striping)",			    0, 2, 0,  0 /* NONE */},
+	{"raid1",	  "raid1 (mirroring)",			    0, 2, 1,  0 /* NONE */},
+	{"raid10_far",	  "raid10 far (striped mirrors)",	    0, 2, 10, ALGORITHM_RAID10_FAR},
 	{"raid10_offset", "raid10 offset (striped mirrors)",	    0, 2, 10, ALGORITHM_RAID10_OFFSET},
-	{"raid10_near",   "raid10 near (striped mirrors)",	    0, 2, 10, ALGORITHM_RAID10_NEAR},
-	{"raid10",        "raid10 (striped mirrors)",		    0, 2, 10, ALGORITHM_RAID10_DEFAULT},
-	{"raid4",         "raid4 (dedicated last parity disk)",	    1, 2, 4,  ALGORITHM_PARITY_N}, /* raid4 layout = raid5_n */
-	{"raid5_n",       "raid5 (dedicated last parity disk)",	    1, 2, 5,  ALGORITHM_PARITY_N},
-	{"raid5_ls",      "raid5 (left symmetric)",		    1, 2, 5,  ALGORITHM_LEFT_SYMMETRIC},
-	{"raid5_rs",      "raid5 (right symmetric)",		    1, 2, 5,  ALGORITHM_RIGHT_SYMMETRIC},
-	{"raid5_la",      "raid5 (left asymmetric)",		    1, 2, 5,  ALGORITHM_LEFT_ASYMMETRIC},
-	{"raid5_ra",      "raid5 (right asymmetric)",		    1, 2, 5,  ALGORITHM_RIGHT_ASYMMETRIC},
-	{"raid6_zr",      "raid6 (zero restart)",		    2, 4, 6,  ALGORITHM_ROTATING_ZERO_RESTART},
-	{"raid6_nr",      "raid6 (N restart)",			    2, 4, 6,  ALGORITHM_ROTATING_N_RESTART},
-	{"raid6_nc",      "raid6 (N continue)",			    2, 4, 6,  ALGORITHM_ROTATING_N_CONTINUE},
-	{"raid6_n_6",     "raid6 (dedicated parity/Q n/6)",	    2, 4, 6,  ALGORITHM_PARITY_N_6},
-	{"raid6_ls_6",    "raid6 (left symmetric dedicated Q 6)",   2, 4, 6,  ALGORITHM_LEFT_SYMMETRIC_6},
-	{"raid6_rs_6",    "raid6 (right symmetric dedicated Q 6)",  2, 4, 6,  ALGORITHM_RIGHT_SYMMETRIC_6},
-	{"raid6_la_6",    "raid6 (left asymmetric dedicated Q 6)",  2, 4, 6,  ALGORITHM_LEFT_ASYMMETRIC_6},
-	{"raid6_ra_6",    "raid6 (right asymmetric dedicated Q 6)", 2, 4, 6,  ALGORITHM_RIGHT_ASYMMETRIC_6}
+	{"raid10_near",	  "raid10 near (striped mirrors)",	    0, 2, 10, ALGORITHM_RAID10_NEAR},
+	{"raid10",	  "raid10 (striped mirrors)",		    0, 2, 10, ALGORITHM_RAID10_DEFAULT},
+	{"raid4",	  "raid4 (dedicated last parity disk)",	    1, 2, 4,  ALGORITHM_PARITY_N}, /* raid4 layout = raid5_n */
+	{"raid5_n",	  "raid5 (dedicated last parity disk)",	    1, 2, 5,  ALGORITHM_PARITY_N},
+	{"raid5_ls",	  "raid5 (left symmetric)",		    1, 2, 5,  ALGORITHM_LEFT_SYMMETRIC},
+	{"raid5_rs",	  "raid5 (right symmetric)",		    1, 2, 5,  ALGORITHM_RIGHT_SYMMETRIC},
+	{"raid5_la",	  "raid5 (left asymmetric)",		    1, 2, 5,  ALGORITHM_LEFT_ASYMMETRIC},
+	{"raid5_ra",	  "raid5 (right asymmetric)",		    1, 2, 5,  ALGORITHM_RIGHT_ASYMMETRIC},
+	{"raid6_zr",	  "raid6 (zero restart)",		    2, 4, 6,  ALGORITHM_ROTATING_ZERO_RESTART},
+	{"raid6_nr",	  "raid6 (N restart)",			    2, 4, 6,  ALGORITHM_ROTATING_N_RESTART},
+	{"raid6_nc",	  "raid6 (N continue)",			    2, 4, 6,  ALGORITHM_ROTATING_N_CONTINUE},
+	{"raid6_n_6",	  "raid6 (dedicated parity/Q n/6)",	    2, 4, 6,  ALGORITHM_PARITY_N_6},
+	{"raid6_ls_6",	  "raid6 (left symmetric dedicated Q 6)",   2, 4, 6,  ALGORITHM_LEFT_SYMMETRIC_6},
+	{"raid6_rs_6",	  "raid6 (right symmetric dedicated Q 6)",  2, 4, 6,  ALGORITHM_RIGHT_SYMMETRIC_6},
+	{"raid6_la_6",	  "raid6 (left asymmetric dedicated Q 6)",  2, 4, 6,  ALGORITHM_LEFT_ASYMMETRIC_6},
+	{"raid6_ra_6",	  "raid6 (right asymmetric dedicated Q 6)", 2, 4, 6,  ALGORITHM_RIGHT_ASYMMETRIC_6}
 };
 
 /* True, if @v is in inclusive range [@min, @max] */
@@ -824,7 +824,7 @@ static int validate_region_size(struct raid_set *rs, unsigned long region_size)
 
 	if (!region_size) {
 		/*
-		 * Choose a reasonable default.  All figures in sectors.
+		 * Choose a reasonable default.	 All figures in sectors.
 		 */
 		if (min_region_size > (1 << 13)) {
 			/* If not a power of 2, make it the next power of 2 */
@@ -909,9 +909,9 @@ static int validate_raid_redundancy(struct raid_set *rs)
 		 * simple case where the number of devices is a multiple of the
 		 * number of copies, we must also handle cases where the number
 		 * of devices is not a multiple of the number of copies.
-		 * E.g.    dev1 dev2 dev3 dev4 dev5
-		 *          A    A    B    B    C
-		 *          C    D    D    E    E
+		 * E.g.	   dev1 dev2 dev3 dev4 dev5
+		 *	    A	 A    B	   B	C
+		 *	    C	 D    D	   E	E
 		 */
 		if (!strcmp("near", raid10_md_layout_to_format(rs->md.layout))) {
 			for (i = 0; i < rs->md.raid_disks * copies; i++) {
@@ -934,7 +934,7 @@ static int validate_raid_redundancy(struct raid_set *rs)
 		 * use the 'use_far_sets' variant.)
 		 *
 		 * This check is somewhat complicated by the need to account
-		 * for arrays that are not a multiple of (far) copies.  This
+		 * for arrays that are not a multiple of (far) copies.	This
 		 * results in the need to treat the last (potentially larger)
 		 * set differently.
 		 */
@@ -967,21 +967,21 @@ too_many:
  *
  * Argument definitions
  *    <chunk_size>			The number of sectors per disk that
- *                                      will form the "stripe"
+ *					will form the "stripe"
  *    [[no]sync]			Force or prevent recovery of the
- *                                      entire array
+ *					entire array
  *    [rebuild <idx>]			Rebuild the drive indicated by the index
  *    [daemon_sleep <ms>]		Time between bitmap daemon work to
- *                                      clear bits
+ *					clear bits
  *    [min_recovery_rate <kB/sec/disk>]	Throttle RAID initialization
  *    [max_recovery_rate <kB/sec/disk>]	Throttle RAID initialization
  *    [write_mostly <idx>]		Indicate a write mostly drive via index
  *    [max_write_behind <sectors>]	See '-write-behind=' (man mdadm)
  *    [stripe_cache <sectors>]		Stripe cache size for higher RAIDs
- *    [region_size <sectors>]           Defines granularity of bitmap
+ *    [region_size <sectors>]		Defines granularity of bitmap
  *
  * RAID10-only options:
- *    [raid10_copies <# copies>]        Number of copies.  (Default: 2)
+ *    [raid10_copies <# copies>]	Number of copies.  (Default: 2)
  *    [raid10_format <near|far|offset>] Layout algorithm.  (Default: near)
  */
 static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
@@ -1024,13 +1024,13 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 	 * replacement then one of the following cases applies:
 	 *
 	 *   1) User specifies 'rebuild'.
-	 *      - Device is reset when param is read.
+	 *	- Device is reset when param is read.
 	 *   2) A new device is supplied.
-	 *      - No matching superblock found, resets device.
+	 *	- No matching superblock found, resets device.
 	 *   3) Device failure was transient and returns on reload.
-	 *      - Failure noticed, resets device for bitmap replay.
+	 *	- Failure noticed, resets device for bitmap replay.
 	 *   4) Device hadn't completed recovery after previous failure.
-	 *      - Superblock is read and overrides recovery_offset.
+	 *	- Superblock is read and overrides recovery_offset.
 	 *
 	 * What is found in the superblocks of the devices is always
 	 * authoritative, unless 'rebuild' or '[no]sync' was specified.
@@ -1094,7 +1094,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 			 * "rebuild" is being passed in by userspace to provide
 			 * indexes of replaced devices and to set up additional
 			 * devices on raid level takeover.
- 			 */
+			 */
 			if (!_in_range(value, 0, rs->raid_disks - 1))
 				return ti_error_einval(rs->ti, "Invalid rebuild index given");
 
@@ -1756,11 +1756,11 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev)
 		}
 		if (le32_to_cpu(sb->layout) != mddev->layout) {
 			DMERR("Reshaping raid sets not yet supported. (raid layout change)");
-			DMERR("  0x%X vs 0x%X", le32_to_cpu(sb->layout), mddev->layout);
-			DMERR("  Old layout: %s w/ %d copies",
+			DMERR("	 0x%X vs 0x%X", le32_to_cpu(sb->layout), mddev->layout);
+			DMERR("	 Old layout: %s w/ %d copies",
 			      raid10_md_layout_to_format(le32_to_cpu(sb->layout)),
 			      raid10_md_layout_to_copies(le32_to_cpu(sb->layout)));
-			DMERR("  New layout: %s w/ %d copies",
+			DMERR("	 New layout: %s w/ %d copies",
 			      raid10_md_layout_to_format(mddev->layout),
 			      raid10_md_layout_to_copies(mddev->layout));
 			return -EINVAL;
@@ -1789,7 +1789,7 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev)
 	 * During load, we set FirstUse if a new superblock was written.
 	 * There are two reasons we might not have a superblock:
 	 * 1) The raid set is brand new - in which case, all of the
-	 *    devices must have their In_sync bit set.  Also,
+	 *    devices must have their In_sync bit set.	Also,
 	 *    recovery_cp must be 0, unless forced.
 	 * 2) This is a new device being added to an old raid set
 	 *    and the new device needs to be rebuilt - in which
@@ -1969,7 +1969,7 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
 		/*
 		 * Skipping super_load due to CTR_FLAG_SYNC will cause
 		 * the array to undergo initialization again as
-		 * though it were new.  This is the intended effect
+		 * though it were new.	This is the intended effect
 		 * of the "sync" directive.
 		 *
 		 * When reshaping capability is added, we must ensure
@@ -2151,10 +2151,10 @@ static void configure_discard_support(struct raid_set *rs)
 /*
  * Construct a RAID0/1/10/4/5/6 mapping:
  * Args:
- *      <raid_type> <#raid_params> <raid_params>{0,}    \
- *      <#raid_devs> [<meta_dev1> <dev1>]{1,}
+ *	<raid_type> <#raid_params> <raid_params>{0,}	\
+ *	<#raid_devs> [<meta_dev1> <dev1>]{1,}
  *
- * <raid_params> varies by <raid_type>.  See 'parse_raid_params' for
+ * <raid_params> varies by <raid_type>.	 See 'parse_raid_params' for
  * details on possible <raid_params>.
  *
  * Userspace is free to initialize the metadata devices, hence the superblocks to
@@ -2185,14 +2185,14 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 
 	/* Must have <#raid_params> */
 	if (dm_read_arg_group(_args, &as, &num_raid_params, &ti->error))
-                return -EINVAL;
+		return -EINVAL;
 
 	/* number of raid device tupples <meta_dev data_dev> */
 	as_nrd = as;
 	dm_consume_args(&as_nrd, num_raid_params);
 	_args[1].max = (as_nrd.argc - 1) / 2;
 	if (dm_read_arg(_args + 1, &as_nrd, &num_raid_devs, &ti->error))
-                return -EINVAL;
+		return -EINVAL;
 
 	if (!_in_range(num_raid_devs, 1, MAX_RAID_DEVICES))
 		return ti_error_einval(rs->ti, "Invalid number of supplied raid devices");
@@ -2403,7 +2403,7 @@ static sector_t rs_get_progress(struct raid_set *rs,
 
 			/*
 			 * The raid set may be doing an initial sync, or it may
-			 * be rebuilding individual components.  If all the
+			 * be rebuilding individual components.	 If all the
 			 * devices are In_sync, then it is the raid set that is
 			 * being initialized.
 			 */
@@ -2692,7 +2692,7 @@ static void attempt_restore_of_faulty_devices(struct raid_set *rs)
 			 * Faulty bit may be set, but sometimes the array can
 			 * be suspended before the personalities can respond
 			 * by removing the device from the array (i.e. calling
-			 * 'hot_remove_disk').  If they haven't yet removed
+			 * 'hot_remove_disk').	If they haven't yet removed
 			 * the failed device, its 'raid_disk' number will be
 			 * '>= 0' - meaning we must call this function
 			 * ourselves.
-- 
cgit v1.2.3-70-g09d2


From bd83a4c4f838d0115a5754a80e1bd1fdae82ab6f Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Tue, 31 May 2016 14:26:52 -0400
Subject: dm raid: remove ti_error_* wrappers

These ti_error_* wrappers added very little.  No other DM target has
ever gone to such lengths to wrap setting ti->error.

Also fixes some NULL dereferences via rs->ti->error.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-raid.c | 401 ++++++++++++++++++++++++++++++++-------------------
 1 file changed, 249 insertions(+), 152 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 50d2901fd9f4..06a4d170e724 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -401,22 +401,6 @@ static bool rt_is_raid456(struct raid_type *rt)
 }
 /* END: raid level bools */
 
-/*
- * Convenience functions to set ti->error to @errmsg and
- * return @r in order to shorten code in a lot of places
- */
-static int ti_error_ret(struct dm_target *ti, const char *errmsg, int r)
-{
-	ti->error = (char *) errmsg;
-	return r;
-}
-
-static int ti_error_einval(struct dm_target *ti, const char *errmsg)
-{
-	return ti_error_ret(ti, errmsg, -EINVAL);
-}
-/* END: convenience functions to set ti->error to @errmsg... */
-
 /* Return invalid ctr flags for the raid level of @rs */
 static uint32_t _invalid_flags(struct raid_set *rs)
 {
@@ -441,8 +425,10 @@ static uint32_t _invalid_flags(struct raid_set *rs)
  */
 static int rs_check_for_invalid_flags(struct raid_set *rs)
 {
-	if (_test_flags(rs->ctr_flags, _invalid_flags(rs)))
-		return ti_error_einval(rs->ti, "Invalid flag combined");
+	if (_test_flags(rs->ctr_flags, _invalid_flags(rs))) {
+		rs->ti->error = "Invalid flag combined";
+		return -EINVAL;
+	}
 
 	return 0;
 }
@@ -644,12 +630,16 @@ static struct raid_set *context_alloc(struct dm_target *ti, struct raid_type *ra
 	unsigned i;
 	struct raid_set *rs;
 
-	if (raid_devs <= raid_type->parity_devs)
-		return ERR_PTR(ti_error_einval(ti, "Insufficient number of devices"));
+	if (raid_devs <= raid_type->parity_devs) {
+		ti->error = "Insufficient number of devices";
+		return ERR_PTR(-EINVAL);
+	}
 
 	rs = kzalloc(sizeof(*rs) + raid_devs * sizeof(rs->dev[0]), GFP_KERNEL);
-	if (!rs)
-		return ERR_PTR(ti_error_ret(ti, "Cannot allocate raid context", -ENOMEM));
+	if (!rs) {
+		ti->error = "Cannot allocate raid context";
+		return ERR_PTR(-ENOMEM);
+	}
 
 	mddev_init(&rs->md);
 
@@ -743,15 +733,18 @@ static int parse_dev_params(struct raid_set *rs, struct dm_arg_set *as)
 			return -EINVAL;
 
 		if (strcmp(arg, "-")) {
-			r = dm_get_device(rs->ti, arg,
-					    dm_table_get_mode(rs->ti->table),
-					    &rs->dev[i].meta_dev);
-			if (r)
-				return ti_error_ret(rs->ti, "RAID metadata device lookup failure", r);
+			r = dm_get_device(rs->ti, arg, dm_table_get_mode(rs->ti->table),
+					  &rs->dev[i].meta_dev);
+			if (r) {
+				rs->ti->error = "RAID metadata device lookup failure";
+				return r;
+			}
 
 			rs->dev[i].rdev.sb_page = alloc_page(GFP_KERNEL);
-			if (!rs->dev[i].rdev.sb_page)
-				return ti_error_ret(rs->ti, "Failed to allocate superblock page", -ENOMEM);
+			if (!rs->dev[i].rdev.sb_page) {
+				rs->ti->error = "Failed to allocate superblock page";
+				return -ENOMEM;
+			}
 		}
 
 		arg = dm_shift_arg(as);
@@ -760,20 +753,25 @@ static int parse_dev_params(struct raid_set *rs, struct dm_arg_set *as)
 
 		if (!strcmp(arg, "-")) {
 			if (!test_bit(In_sync, &rs->dev[i].rdev.flags) &&
-			    (!rs->dev[i].rdev.recovery_offset))
-				return ti_error_einval(rs->ti, "Drive designated for rebuild not specified");
+			    (!rs->dev[i].rdev.recovery_offset)) {
+				rs->ti->error = "Drive designated for rebuild not specified";
+				return -EINVAL;
+			}
 
-			if (rs->dev[i].meta_dev)
-				return ti_error_einval(rs->ti, "No data device supplied with metadata device");
+			if (rs->dev[i].meta_dev) {
+				rs->ti->error = "No data device supplied with metadata device";
+				return -EINVAL;
+			}
 
 			continue;
 		}
 
-		r = dm_get_device(rs->ti, arg,
-				    dm_table_get_mode(rs->ti->table),
-				    &rs->dev[i].data_dev);
-		if (r)
-			return ti_error_ret(rs->ti, "RAID device lookup failure", r);
+		r = dm_get_device(rs->ti, arg, dm_table_get_mode(rs->ti->table),
+				  &rs->dev[i].data_dev);
+		if (r) {
+			rs->ti->error = "RAID device lookup failure";
+			return r;
+		}
 
 		if (rs->dev[i].meta_dev) {
 			metadata_available = 1;
@@ -801,8 +799,8 @@ static int parse_dev_params(struct raid_set *rs, struct dm_arg_set *as)
 		 *
 		 * User could specify 'nosync' option if desperate.
 		 */
-		DMERR("Unable to rebuild drive while array is not in-sync");
-		return ti_error_einval(rs->ti, "Unable to rebuild drive while array is not in-sync");
+		rs->ti->error = "Unable to rebuild drive while array is not in-sync";
+		return -EINVAL;
 	}
 
 	return 0;
@@ -839,20 +837,27 @@ static int validate_region_size(struct raid_set *rs, unsigned long region_size)
 		/*
 		 * Validate user-supplied value.
 		 */
-		if (region_size > rs->ti->len)
-			return ti_error_einval(rs->ti, "Supplied region size is too large");
+		if (region_size > rs->ti->len) {
+			rs->ti->error = "Supplied region size is too large";
+			return -EINVAL;
+		}
 
 		if (region_size < min_region_size) {
 			DMERR("Supplied region_size (%lu sectors) below minimum (%lu)",
 			      region_size, min_region_size);
-			return ti_error_einval(rs->ti, "Supplied region size is too small");
+			rs->ti->error = "Supplied region size is too small";
+			return -EINVAL;
 		}
 
-		if (!is_power_of_2(region_size))
-			return ti_error_einval(rs->ti, "Region size is not a power of 2");
+		if (!is_power_of_2(region_size)) {
+			rs->ti->error = "Region size is not a power of 2";
+			return -EINVAL;
+		}
 
-		if (region_size < rs->md.chunk_sectors)
-			return ti_error_einval(rs->ti, "Region size is smaller than the chunk size");
+		if (region_size < rs->md.chunk_sectors) {
+			rs->ti->error = "Region size is smaller than the chunk size";
+			return -EINVAL;
+		}
 	}
 
 	/*
@@ -1000,8 +1005,10 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 	arg = dm_shift_arg(as);
 	num_raid_params--; /* Account for chunk_size argument */
 
-	if (kstrtouint(arg, 10, &value) < 0)
-		return ti_error_einval(rs->ti, "Bad numerical argument given for chunk_size");
+	if (kstrtouint(arg, 10, &value) < 0) {
+		rs->ti->error = "Bad numerical argument given for chunk_size";
+		return -EINVAL;
+	}
 
 	/*
 	 * First, parse the in-order required arguments
@@ -1011,10 +1018,13 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 		if (value)
 			DMERR("Ignoring chunk size parameter for RAID 1");
 		value = 0;
-	} else if (!is_power_of_2(value))
-		return ti_error_einval(rs->ti, "Chunk size must be a power of 2");
-	else if (value < 8)
-		return ti_error_einval(rs->ti, "Chunk size value is too small");
+	} else if (!is_power_of_2(value)) {
+		rs->ti->error = "Chunk size must be a power of 2";
+		return -EINVAL;
+	} else if (value < 8) {
+		rs->ti->error = "Chunk size value is too small";
+		return -EINVAL;
+	}
 
 	rs->md.new_chunk_sectors = rs->md.chunk_sectors = value;
 
@@ -1045,49 +1055,67 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 	 */
 	for (i = 0; i < num_raid_params; i++) {
 		key = dm_shift_arg(as);
-		if (!key)
-			return ti_error_einval(rs->ti, "Not enough raid parameters given");
+		if (!key) {
+			rs->ti->error = "Not enough raid parameters given";
+			return -EINVAL;
+		}
 
 		if (!strcasecmp(key, _argname_by_flag(CTR_FLAG_NOSYNC))) {
-			if (_test_and_set_flag(CTR_FLAG_NOSYNC, &rs->ctr_flags))
-				return ti_error_einval(rs->ti, "Only one 'nosync' argument allowed");
+			if (_test_and_set_flag(CTR_FLAG_NOSYNC, &rs->ctr_flags)) {
+				rs->ti->error = "Only one 'nosync' argument allowed";
+				return -EINVAL;
+			}
 			rs->md.recovery_cp = MaxSector;
 			continue;
 		}
 		if (!strcasecmp(key, _argname_by_flag(CTR_FLAG_SYNC))) {
-			if (_test_and_set_flag(CTR_FLAG_SYNC, &rs->ctr_flags))
-				return ti_error_einval(rs->ti, "Only one 'sync' argument allowed");
+			if (_test_and_set_flag(CTR_FLAG_SYNC, &rs->ctr_flags)) {
+				rs->ti->error = "Only one 'sync' argument allowed";
+				return -EINVAL;
+			}
 			rs->md.recovery_cp = 0;
 			continue;
 		}
 		if (!strcasecmp(key, _argname_by_flag(CTR_FLAG_RAID10_USE_NEAR_SETS))) {
-			if (_test_and_set_flag(CTR_FLAG_RAID10_USE_NEAR_SETS, &rs->ctr_flags))
-				return ti_error_einval(rs->ti, "Only one 'raid10_use_new_sets' argument allowed");
+			if (_test_and_set_flag(CTR_FLAG_RAID10_USE_NEAR_SETS, &rs->ctr_flags)) {
+				rs->ti->error = "Only one 'raid10_use_new_sets' argument allowed";
+				return -EINVAL;
+			}
 			continue;
 		}
 
 		arg = dm_shift_arg(as);
 		i++; /* Account for the argument pairs */
-		if (!arg)
-			return ti_error_einval(rs->ti, "Wrong number of raid parameters given");
+		if (!arg) {
+			rs->ti->error = "Wrong number of raid parameters given";
+			return -EINVAL;
+		}
 
 		/*
 		 * Parameters that take a string value are checked here.
 		 */
 
 		if (!strcasecmp(key, _argname_by_flag(CTR_FLAG_RAID10_FORMAT))) {
-			if (_test_and_set_flag(CTR_FLAG_RAID10_FORMAT, &rs->ctr_flags))
-				return ti_error_einval(rs->ti, "Only one 'raid10_format' argument pair allowed");
-			if (!rt_is_raid10(rt))
-				return ti_error_einval(rs->ti, "'raid10_format' is an invalid parameter for this RAID type");
+			if (_test_and_set_flag(CTR_FLAG_RAID10_FORMAT, &rs->ctr_flags)) {
+				rs->ti->error = "Only one 'raid10_format' argument pair allowed";
+				return -EINVAL;
+			}
+			if (!rt_is_raid10(rt)) {
+				rs->ti->error = "'raid10_format' is an invalid parameter for this RAID type";
+				return -EINVAL;
+			}
 			raid10_format = raid10_name_to_format(arg);
-			if (raid10_format < 0)
-				return ti_error_ret(rs->ti, "Invalid 'raid10_format' value given", raid10_format);
+			if (raid10_format < 0) {
+				rs->ti->error = "Invalid 'raid10_format' value given";
+				return raid10_format;
+			}
 			continue;
 		}
 
-		if (kstrtouint(arg, 10, &value) < 0)
-			return ti_error_einval(rs->ti, "Bad numerical argument given in raid params");
+		if (kstrtouint(arg, 10, &value) < 0) {
+			rs->ti->error = "Bad numerical argument given in raid params";
+			return -EINVAL;
+		}
 
 		if (!strcasecmp(key, _argname_by_flag(CTR_FLAG_REBUILD))) {
 			/*
@@ -1095,11 +1123,15 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 			 * indexes of replaced devices and to set up additional
 			 * devices on raid level takeover.
 			 */
-			if (!_in_range(value, 0, rs->raid_disks - 1))
-				return ti_error_einval(rs->ti, "Invalid rebuild index given");
+			if (!_in_range(value, 0, rs->raid_disks - 1)) {
+				rs->ti->error = "Invalid rebuild index given";
+				return -EINVAL;
+			}
 
-			if (test_and_set_bit(value, (void *) rs->rebuild_disks))
-				return ti_error_einval(rs->ti, "rebuild for this index already given");
+			if (test_and_set_bit(value, (void *) rs->rebuild_disks)) {
+				rs->ti->error = "rebuild for this index already given";
+				return -EINVAL;
+			}
 
 			rd = rs->dev + value;
 			clear_bit(In_sync, &rd->rdev.flags);
@@ -1107,98 +1139,139 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 			rd->rdev.recovery_offset = 0;
 			_set_flag(CTR_FLAG_REBUILD, &rs->ctr_flags);
 		} else if (!strcasecmp(key, _argname_by_flag(CTR_FLAG_WRITE_MOSTLY))) {
-			if (!rt_is_raid1(rt))
-				return ti_error_einval(rs->ti, "write_mostly option is only valid for RAID1");
+			if (!rt_is_raid1(rt)) {
+				rs->ti->error = "write_mostly option is only valid for RAID1";
+				return -EINVAL;
+			}
 
-			if (!_in_range(value, 0, rs->md.raid_disks - 1))
-				return ti_error_einval(rs->ti, "Invalid write_mostly index given");
+			if (!_in_range(value, 0, rs->md.raid_disks - 1)) {
+				rs->ti->error = "Invalid write_mostly index given";
+				return -EINVAL;
+			}
 
 			set_bit(WriteMostly, &rs->dev[value].rdev.flags);
 			_set_flag(CTR_FLAG_WRITE_MOSTLY, &rs->ctr_flags);
 		} else if (!strcasecmp(key, _argname_by_flag(CTR_FLAG_MAX_WRITE_BEHIND))) {
-			if (!rt_is_raid1(rt))
-				return ti_error_einval(rs->ti, "max_write_behind option is only valid for RAID1");
+			if (!rt_is_raid1(rt)) {
+				rs->ti->error = "max_write_behind option is only valid for RAID1";
+				return -EINVAL;
+			}
 
-			if (_test_and_set_flag(CTR_FLAG_MAX_WRITE_BEHIND, &rs->ctr_flags))
-				return ti_error_einval(rs->ti, "Only one max_write_behind argument pair allowed");
+			if (_test_and_set_flag(CTR_FLAG_MAX_WRITE_BEHIND, &rs->ctr_flags)) {
+				rs->ti->error = "Only one max_write_behind argument pair allowed";
+				return -EINVAL;
+			}
 
 			/*
 			 * In device-mapper, we specify things in sectors, but
 			 * MD records this value in kB
 			 */
 			value /= 2;
-			if (value > COUNTER_MAX)
-				return ti_error_einval(rs->ti, "Max write-behind limit out of range");
+			if (value > COUNTER_MAX) {
+				rs->ti->error = "Max write-behind limit out of range";
+				return -EINVAL;
+			}
 
 			rs->md.bitmap_info.max_write_behind = value;
 		} else if (!strcasecmp(key, _argname_by_flag(CTR_FLAG_DAEMON_SLEEP))) {
-			if (_test_and_set_flag(CTR_FLAG_DAEMON_SLEEP, &rs->ctr_flags))
-				return ti_error_einval(rs->ti, "Only one daemon_sleep argument pair allowed");
-			if (!value || (value > MAX_SCHEDULE_TIMEOUT))
-				return ti_error_einval(rs->ti, "daemon sleep period out of range");
+			if (_test_and_set_flag(CTR_FLAG_DAEMON_SLEEP, &rs->ctr_flags)) {
+				rs->ti->error = "Only one daemon_sleep argument pair allowed";
+				return -EINVAL;
+			}
+			if (!value || (value > MAX_SCHEDULE_TIMEOUT)) {
+				rs->ti->error = "daemon sleep period out of range";
+				return -EINVAL;
+			}
 			rs->md.bitmap_info.daemon_sleep = value;
 		} else if (!strcasecmp(key, _argname_by_flag(CTR_FLAG_DATA_OFFSET))) {
 			/* Userspace passes new data_offset after having extended the the data image LV */
-			if (_test_and_set_flag(CTR_FLAG_DATA_OFFSET, &rs->ctr_flags))
-				return ti_error_einval(rs->ti, "Only one data_offset argument pair allowed");
-
+			if (_test_and_set_flag(CTR_FLAG_DATA_OFFSET, &rs->ctr_flags)) {
+				rs->ti->error = "Only one data_offset argument pair allowed";
+				return -EINVAL;
+			}
 			/* Ensure sensible data offset */
-			if (value < 0)
-				return ti_error_einval(rs->ti, "Bogus data_offset value");
-
+			if (value < 0) {
+				rs->ti->error = "Bogus data_offset value";
+				return -EINVAL;
+			}
 			rs->data_offset = value;
 		} else if (!strcasecmp(key, _argname_by_flag(CTR_FLAG_DELTA_DISKS))) {
 			/* Define the +/-# of disks to add to/remove from the given raid set */
-			if (_test_and_set_flag(CTR_FLAG_DELTA_DISKS, &rs->ctr_flags))
-				return ti_error_einval(rs->ti, "Only one delta_disks argument pair allowed");
-
+			if (_test_and_set_flag(CTR_FLAG_DELTA_DISKS, &rs->ctr_flags)) {
+				rs->ti->error = "Only one delta_disks argument pair allowed";
+				return -EINVAL;
+			}
 			/* Ensure MAX_RAID_DEVICES and raid type minimal_devs! */
-			if (!_in_range(abs(value), 1, MAX_RAID_DEVICES - rt->minimal_devs))
-				return ti_error_einval(rs->ti, "Too many delta_disk requested");
+			if (!_in_range(abs(value), 1, MAX_RAID_DEVICES - rt->minimal_devs)) {
+				rs->ti->error = "Too many delta_disk requested";
+				return -EINVAL;
+			}
 
 			rs->delta_disks = value;
 		} else if (!strcasecmp(key, _argname_by_flag(CTR_FLAG_STRIPE_CACHE))) {
-			if (_test_and_set_flag(CTR_FLAG_STRIPE_CACHE, &rs->ctr_flags))
-				return ti_error_einval(rs->ti, "Only one stripe_cache argument pair allowed");
+			if (_test_and_set_flag(CTR_FLAG_STRIPE_CACHE, &rs->ctr_flags)) {
+				rs->ti->error = "Only one stripe_cache argument pair allowed";
+				return -EINVAL;
+			}
+
 			/*
 			 * In device-mapper, we specify things in sectors, but
 			 * MD records this value in kB
 			 */
 			value /= 2;
 
-			if (!rt_is_raid456(rt))
-				return ti_error_einval(rs->ti, "Inappropriate argument: stripe_cache");
-			if (raid5_set_cache_size(&rs->md, (int)value))
-				return ti_error_einval(rs->ti, "Bad stripe_cache size");
+			if (!rt_is_raid456(rt)) {
+				rs->ti->error = "Inappropriate argument: stripe_cache";
+				return -EINVAL;
+			}
+			if (raid5_set_cache_size(&rs->md, (int)value)) {
+				rs->ti->error = "Bad stripe_cache size";
+				return -EINVAL;
+			}
 
 		} else if (!strcasecmp(key, _argname_by_flag(CTR_FLAG_MIN_RECOVERY_RATE))) {
-			if (_test_and_set_flag(CTR_FLAG_MIN_RECOVERY_RATE, &rs->ctr_flags))
-				return ti_error_einval(rs->ti, "Only one min_recovery_rate argument pair allowed");
-			if (value > INT_MAX)
-				return ti_error_einval(rs->ti, "min_recovery_rate out of range");
+			if (_test_and_set_flag(CTR_FLAG_MIN_RECOVERY_RATE, &rs->ctr_flags)) {
+				rs->ti->error = "Only one min_recovery_rate argument pair allowed";
+				return -EINVAL;
+			}
+			if (value > INT_MAX) {
+				rs->ti->error = "min_recovery_rate out of range";
+				return -EINVAL;
+			}
 			rs->md.sync_speed_min = (int)value;
 		} else if (!strcasecmp(key, _argname_by_flag(CTR_FLAG_MAX_RECOVERY_RATE))) {
-			if (_test_and_set_flag(CTR_FLAG_MIN_RECOVERY_RATE, &rs->ctr_flags))
-				return ti_error_einval(rs->ti, "Only one max_recovery_rate argument pair allowed");
-			if (value > INT_MAX)
-				return ti_error_einval(rs->ti, "max_recovery_rate out of range");
+			if (_test_and_set_flag(CTR_FLAG_MIN_RECOVERY_RATE, &rs->ctr_flags)) {
+				rs->ti->error = "Only one max_recovery_rate argument pair allowed";
+				return -EINVAL;
+			}
+			if (value > INT_MAX) {
+				rs->ti->error = "max_recovery_rate out of range";
+				return -EINVAL;
+			}
 			rs->md.sync_speed_max = (int)value;
 		} else if (!strcasecmp(key, _argname_by_flag(CTR_FLAG_REGION_SIZE))) {
-			if (_test_and_set_flag(CTR_FLAG_REGION_SIZE, &rs->ctr_flags))
-				return ti_error_einval(rs->ti, "Only one region_size argument pair allowed");
+			if (_test_and_set_flag(CTR_FLAG_REGION_SIZE, &rs->ctr_flags)) {
+				rs->ti->error = "Only one region_size argument pair allowed";
+				return -EINVAL;
+			}
 
 			region_size = value;
 		} else if (!strcasecmp(key, _argname_by_flag(CTR_FLAG_RAID10_COPIES))) {
-			if (_test_and_set_flag(CTR_FLAG_RAID10_COPIES, &rs->ctr_flags))
-				return ti_error_einval(rs->ti, "Only one raid10_copies argument pair allowed");
+			if (_test_and_set_flag(CTR_FLAG_RAID10_COPIES, &rs->ctr_flags)) {
+				rs->ti->error = "Only one raid10_copies argument pair allowed";
+				return -EINVAL;
+			}
 
-			if (!_in_range(value, 2, rs->md.raid_disks))
-				return ti_error_einval(rs->ti, "Bad value for 'raid10_copies'");
+			if (!_in_range(value, 2, rs->md.raid_disks)) {
+				rs->ti->error = "Bad value for 'raid10_copies'";
+				return -EINVAL;
+			}
 
 			raid10_copies = value;
 		} else {
 			DMERR("Unable to parse RAID parameter: %s", key);
-			return ti_error_einval(rs->ti, "Unable to parse RAID parameters");
+			rs->ti->error = "Unable to parse RAID parameter";
+			return -EINVAL;
 		}
 	}
 
@@ -1214,21 +1287,29 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 		return -EINVAL;
 
 	if (rt_is_raid10(rt)) {
-		if (raid10_copies > rs->md.raid_disks)
-			return ti_error_einval(rs->ti, "Not enough devices to satisfy specification");
+		if (raid10_copies > rs->md.raid_disks) {
+			rs->ti->error = "Not enough devices to satisfy specification";
+			return -EINVAL;
+		}
 
 		rs->md.new_layout = raid10_format_to_md_layout(rs, raid10_format, raid10_copies);
-		if (rs->md.new_layout < 0)
-			return ti_error_ret(rs->ti, "Error getting raid10 format", rs->md.new_layout);
+		if (rs->md.new_layout < 0) {
+			rs->ti->error = "Error getting raid10 format";
+			return rs->md.new_layout;
+		}
 
 		rt = get_raid_type_by_ll(10, rs->md.new_layout);
-		if (!rt)
-			return ti_error_einval(rs->ti, "Failed to recognize new raid10 layout");
+		if (!rt) {
+			rs->ti->error = "Failed to recognize new raid10 layout";
+			return -EINVAL;
+		}
 
 		if ((rt->algorithm == ALGORITHM_RAID10_DEFAULT ||
 		     rt->algorithm == ALGORITHM_RAID10_NEAR) &&
-		    _test_flag(CTR_FLAG_RAID10_USE_NEAR_SETS, rs->ctr_flags))
-			return ti_error_einval(rs->ti, "RAID10 format 'near' and 'raid10_use_near_sets' are incompatible");
+		    _test_flag(CTR_FLAG_RAID10_USE_NEAR_SETS, rs->ctr_flags)) {
+			rs->ti->error = "RAID10 format 'near' and 'raid10_use_near_sets' are incompatible";
+			return -EINVAL;
+		}
 
 		/* (Len * #mirrors) / #devices */
 		sectors_per_dev = rs->ti->len * raid10_copies;
@@ -1237,9 +1318,10 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 		rs->md.layout = raid10_format_to_md_layout(rs, raid10_format, raid10_copies);
 		rs->md.new_layout = rs->md.layout;
 	} else if (!rt_is_raid1(rt) &&
-		   sector_div(sectors_per_dev,
-			      (rs->md.raid_disks - rt->parity_devs)))
-		return ti_error_einval(rs->ti, "Target length not divisible by number of data devices");
+		   sector_div(sectors_per_dev, (rs->md.raid_disks - rt->parity_devs))) {
+		rs->ti->error = "Target length not divisible by number of data devices";
+		return -EINVAL;
+	}
 
 	rs->raid10_copies = raid10_copies;
 	rs->md.dev_sectors = sectors_per_dev;
@@ -1420,7 +1502,8 @@ static int rs_check_takeover(struct raid_set *rs)
 		break;
 	}
 
-	return ti_error_einval(rs->ti, "takeover not possible");
+	rs->ti->error = "takeover not possible";
+	return -EINVAL;
 }
 
 /* True if @rs requested to be taken over */
@@ -1870,19 +1953,22 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev)
 			if (role != r->raid_disk) {
 				if (_is_raid10_near(mddev->layout)) {
 					if (mddev->raid_disks % _raid10_near_copies(mddev->layout) ||
-					    rs->raid_disks % rs->raid10_copies)
-						return ti_error_einval(rs->ti, "Cannot change raid10 near "
-									       "set to odd # of devices!");
+					    rs->raid_disks % rs->raid10_copies) {
+						rs->ti->error =
+							"Cannot change raid10 near set to odd # of devices!";
+						return -EINVAL;
+					}
 
 					sb2->array_position = cpu_to_le32(r->raid_disk);
 
 				} else if (!(rs_is_raid10(rs) && rt_is_raid0(rs->raid_type)) &&
-				    !(rs_is_raid0(rs) && rt_is_raid10(rs->raid_type)) &&
-				    !rt_is_raid1(rs->raid_type))
-					return ti_error_einval(rs->ti, "Cannot change device positions in raid set");
+					   !(rs_is_raid0(rs) && rt_is_raid10(rs->raid_type)) &&
+					   !rt_is_raid1(rs->raid_type)) {
+					rs->ti->error = "Cannot change device positions in raid set";
+					return -EINVAL;
+				}
 
-				DMINFO("raid device #%d now at position #%d",
-				       role, r->raid_disk);
+				DMINFO("raid device #%d now at position #%d", role, r->raid_disk);
 			}
 
 			/*
@@ -2024,15 +2110,19 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
 	if (!freshest)
 		return 0;
 
-	if (validate_raid_redundancy(rs))
-		return ti_error_einval(rs->ti, "Insufficient redundancy to activate array");
+	if (validate_raid_redundancy(rs)) {
+		rs->ti->error = "Insufficient redundancy to activate array";
+		return -EINVAL;
+	}
 
 	/*
 	 * Validation of the freshest device provides the source of
 	 * validation for the remaining devices.
 	 */
-	if (super_validate(rs, freshest))
-		return ti_error_einval(rs->ti, "Unable to assemble array: Invalid superblocks");
+	if (super_validate(rs, freshest)) {
+		rs->ti->error = "Unable to assemble array: Invalid superblocks";
+		return -EINVAL;
+	}
 
 	rdev_for_each(rdev, mddev)
 		if ((rdev != freshest) && super_validate(rs, rdev))
@@ -2176,12 +2266,16 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 
 	/* Must have <raid_type> */
 	arg = dm_shift_arg(&as);
-	if (!arg)
-		return ti_error_einval(rs->ti, "No arguments");
+	if (!arg) {
+		ti->error = "No arguments";
+		return -EINVAL;
+	}
 
 	rt = get_raid_type(arg);
-	if (!rt)
-		return ti_error_einval(rs->ti, "Unrecognised raid_type");
+	if (!rt) {
+		ti->error = "Unrecognised raid_type";
+		return -EINVAL;
+	}
 
 	/* Must have <#raid_params> */
 	if (dm_read_arg_group(_args, &as, &num_raid_params, &ti->error))
@@ -2194,8 +2288,10 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	if (dm_read_arg(_args + 1, &as_nrd, &num_raid_devs, &ti->error))
 		return -EINVAL;
 
-	if (!_in_range(num_raid_devs, 1, MAX_RAID_DEVICES))
-		return ti_error_einval(rs->ti, "Invalid number of supplied raid devices");
+	if (!_in_range(num_raid_devs, 1, MAX_RAID_DEVICES)) {
+		ti->error = "Invalid number of supplied raid devices";
+		return -EINVAL;
+	}
 
 	rs = context_alloc(ti, rt, num_raid_devs);
 	if (IS_ERR(rs))
@@ -2265,7 +2361,8 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	}
 
 	if (ti->len != rs->md.array_sectors) {
-		r = ti_error_einval(ti, "Array size does not match requested target length");
+		ti->error = "Array size does not match requested target length";
+		r = -EINVAL;
 		goto size_mismatch;
 	}
 	rs->callbacks.congested_fn = raid_is_congested;
-- 
cgit v1.2.3-70-g09d2


From 9b6e54232992a2e39790d93df4581a2dcb8a5429 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Thu, 2 Jun 2016 11:48:09 -0400
Subject: dm raid: bump to v1.9.0 and make the extended SB feature flag reflect
 it

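No idea what Heinz was doing with the versioning, but upstream commit
4c9971ca6a ("dm raid: make sure no feature flags are set in metadata")
bumped to 1.8.0 already.

Bump the target version from 1.8.1 to 1.9.0, rename
FEATURE_FLAG_SUPPORTS_V180 to FEATURE_FLAG_SUPPORTS_V190, drop
FEATURE_FLAG_SUPPORTS_RESHAPE, and report unknown compatible feature
flags separately from unsupported incompatible feature flags.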

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-raid.c | 37 ++++++++++++++++++++-----------------
 1 file changed, 20 insertions(+), 17 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 06a4d170e724..88738281284a 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -63,7 +63,7 @@ struct raid_dev {
 #define CTR_FLAG_REGION_SIZE	   0x200 /* 2 */ /* Not with raid0! */
 #define CTR_FLAG_RAID10_COPIES	   0x400 /* 2 */ /* Only with raid10 */
 #define CTR_FLAG_RAID10_FORMAT	   0x800 /* 2 */ /* Only with raid10 */
-/* New for v1.8.0 */
+/* New for v1.9.0 */
 #define CTR_FLAG_DELTA_DISKS	      0x1000 /* 2 */ /* Only with reshapable raid4/5/6/10! */
 #define CTR_FLAG_DATA_OFFSET	      0x2000 /* 2 */ /* Only with reshapable raid4/5/6/10! */
 #define CTR_FLAG_RAID10_USE_NEAR_SETS 0x4000 /* 2 */ /* Only with raid10! */
@@ -1513,8 +1513,7 @@ static bool rs_takeover_requested(struct raid_set *rs)
 }
 
 /*  Features */
-#define	FEATURE_FLAG_SUPPORTS_V180	0x1 /* Supports v1.8.0 extended superblock */
-#define	FEATURE_FLAG_SUPPORTS_RESHAPE	0x2 /* Supports v1.8.0 reshaping functionality */
+#define	FEATURE_FLAG_SUPPORTS_V190	0x1 /* Supports extended superblock */
 
 /* State flags for sb->flags */
 #define	SB_FLAG_RESHAPE_ACTIVE		0x1
@@ -1527,13 +1526,13 @@ static bool rs_takeover_requested(struct raid_set *rs)
 #define DM_RAID_MAGIC 0x64526D44
 struct dm_raid_superblock {
 	__le32 magic;		/* "DmRd" */
-	__le32 compat_features;	/* Used to indicate compatible features (like 1.8.0 ondisk metadata extension) */
+	__le32 compat_features;	/* Used to indicate compatible features (like 1.9.0 ondisk metadata extension) */
 
 	__le32 num_devices;	/* Number of devices in this raid set. (Max 64) */
 	__le32 array_position;	/* The position of this drive in the raid set */
 
 	__le64 events;		/* Incremented by md when superblock updated */
-	__le64 failed_devices;	/* Pre 1.8.0 part of bit field of devices to */
+	__le64 failed_devices;	/* Pre 1.9.0 part of bit field of devices to */
 				/* indicate failures (see extension below) */
 
 	/*
@@ -1556,9 +1555,9 @@ struct dm_raid_superblock {
 	__le32 stripe_sectors;
 
 	/********************************************************************
-	 * BELOW FOLLOW V1.8.0 EXTENSIONS TO THE PRISTINE SUPERBLOCK FORMAT!!!
+	 * BELOW FOLLOW V1.9.0 EXTENSIONS TO THE PRISTINE SUPERBLOCK FORMAT!!!
 	 *
-	 * FEATURE_FLAG_SUPPORTS_V180 in the features member indicates that those exist
+	 * FEATURE_FLAG_SUPPORTS_V190 in the features member indicates that those exist
 	 */
 
 	__le32 flags; /* Flags defining array states for reshaping */
@@ -1592,7 +1591,7 @@ struct dm_raid_superblock {
 
 	/*
 	 * Additonal Bit field of devices indicating failures to support
-	 * up to 256 devices with the 1.8.0 on-disk metadata format
+	 * up to 256 devices with the 1.9.0 on-disk metadata format
 	 */
 	__le64 extended_failed_devices[DISKS_ARRAY_ELEMS - 1];
 
@@ -1625,7 +1624,7 @@ static void sb_retrieve_failed_devices(struct dm_raid_superblock *sb, uint64_t *
 	failed_devices[0] = le64_to_cpu(sb->failed_devices);
 	memset(failed_devices + 1, 0, sizeof(sb->extended_failed_devices));
 
-	if (_test_flag(FEATURE_FLAG_SUPPORTS_V180, le32_to_cpu(sb->compat_features))) {
+	if (_test_flag(FEATURE_FLAG_SUPPORTS_V190, le32_to_cpu(sb->compat_features))) {
 		int i = ARRAY_SIZE(sb->extended_failed_devices);
 
 		while (i--)
@@ -1675,7 +1674,7 @@ static void super_sync(struct mddev *mddev, struct md_rdev *rdev)
 		sb_update_failed_devices(sb, failed_devices);
 
 	sb->magic = cpu_to_le32(DM_RAID_MAGIC);
-	sb->compat_features = cpu_to_le32(FEATURE_FLAG_SUPPORTS_V180); /* Don't set reshape flag yet */
+	sb->compat_features = cpu_to_le32(FEATURE_FLAG_SUPPORTS_V190);
 
 	sb->num_devices = cpu_to_le32(mddev->raid_disks);
 	sb->array_position = cpu_to_le32(rdev->raid_disk);
@@ -1754,7 +1753,7 @@ static int super_load(struct md_rdev *rdev, struct md_rdev *refdev)
 		super_sync(rdev->mddev, rdev);
 
 		set_bit(FirstUse, &rdev->flags);
-		sb->compat_features = cpu_to_le32(FEATURE_FLAG_SUPPORTS_V180); /* Don't set reshape flag yet */
+		sb->compat_features = cpu_to_le32(FEATURE_FLAG_SUPPORTS_V190);
 
 		/* Force writing of superblocks to disk */
 		set_bit(MD_CHANGE_DEVS, &rdev->mddev->flags);
@@ -1800,7 +1799,7 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev)
 	 * Reshaping is supported, e.g. reshape_position is valid
 	 * in superblock and superblock content is authoritative.
 	 */
-	if (_test_flag(FEATURE_FLAG_SUPPORTS_V180, le32_to_cpu(sb->compat_features))) {
+	if (_test_flag(FEATURE_FLAG_SUPPORTS_V190, le32_to_cpu(sb->compat_features))) {
 		/* Superblock is authoritative wrt given raid set layout! */
 		mddev->raid_disks = le32_to_cpu(sb->num_devices);
 		mddev->level = le32_to_cpu(sb->level);
@@ -1831,7 +1830,7 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev)
 
 	} else {
 		/*
-		 * No takeover/reshaping, because we don't have the extended v1.8.0 metadata
+		 * No takeover/reshaping, because we don't have the extended v1.9.0 metadata
 		 */
 		if (le32_to_cpu(sb->level) != mddev->level) {
 			DMERR("Reshaping/takeover raid sets not yet supported. (raid level/stripes/size change)");
@@ -2000,8 +1999,12 @@ static int super_validate(struct raid_set *rs, struct md_rdev *rdev)
 	if (!mddev->events && super_init_validation(rs, rdev))
 		return -EINVAL;
 
-	if (le32_to_cpu(sb->compat_features) != FEATURE_FLAG_SUPPORTS_V180 ||
-	    sb->incompat_features) {
+	if (le32_to_cpu(sb->compat_features) != FEATURE_FLAG_SUPPORTS_V190) {
+		rs->ti->error = "Unable to assemble array: Unknown flag(s) in compatible feature flags";
+		return -EINVAL;
+	}
+
+	if (sb->incompat_features) {
 		rs->ti->error = "Unable to assemble array: No incompatible feature flags supported yet";
 		return -EINVAL;
 	}
@@ -2595,7 +2598,7 @@ static void raid_status(struct dm_target *ti, status_type_t type,
 		DMEMIT(" %llu", (unsigned long long) resync_mismatches);
 
 		/*
-		 * v1.8.0+:
+		 * v1.9.0+:
 		 *
 		 * data_offset (needed for out of space reshaping)
 		 *   This field shows the data offset into the data
@@ -2903,7 +2906,7 @@ static void raid_resume(struct dm_target *ti)
 
 static struct target_type raid_target = {
 	.name = "raid",
-	.version = {1, 8, 1},
+	.version = {1, 9, 0},
 	.module = THIS_MODULE,
 	.ctr = raid_ctr,
 	.dtr = raid_dtr,
-- 
cgit v1.2.3-70-g09d2


From 3fa6cf38211619f2a1fc9e54b5a19befb464f79a Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Thu, 2 Jun 2016 11:58:51 -0400
Subject: dm raid: rename _argname_by_flag to dm_raid_arg_name_by_flag

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-raid.c | 60 ++++++++++++++++++++++++++--------------------------
 1 file changed, 30 insertions(+), 30 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 88738281284a..7876f9529f82 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -328,7 +328,7 @@ static struct arg_name_flag {
 };
 
 /* Return argument name string for given @flag */
-static const char *_argname_by_flag(const uint32_t flag)
+static const char *dm_raid_arg_name_by_flag(const uint32_t flag)
 {
 	if (hweight32(flag) == 1) {
 		struct arg_name_flag *anf = _arg_name_flags + ARRAY_SIZE(_arg_name_flags);
@@ -1060,7 +1060,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 			return -EINVAL;
 		}
 
-		if (!strcasecmp(key, _argname_by_flag(CTR_FLAG_NOSYNC))) {
+		if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_NOSYNC))) {
 			if (_test_and_set_flag(CTR_FLAG_NOSYNC, &rs->ctr_flags)) {
 				rs->ti->error = "Only one 'nosync' argument allowed";
 				return -EINVAL;
@@ -1068,7 +1068,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 			rs->md.recovery_cp = MaxSector;
 			continue;
 		}
-		if (!strcasecmp(key, _argname_by_flag(CTR_FLAG_SYNC))) {
+		if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_SYNC))) {
 			if (_test_and_set_flag(CTR_FLAG_SYNC, &rs->ctr_flags)) {
 				rs->ti->error = "Only one 'sync' argument allowed";
 				return -EINVAL;
@@ -1076,7 +1076,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 			rs->md.recovery_cp = 0;
 			continue;
 		}
-		if (!strcasecmp(key, _argname_by_flag(CTR_FLAG_RAID10_USE_NEAR_SETS))) {
+		if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_USE_NEAR_SETS))) {
 			if (_test_and_set_flag(CTR_FLAG_RAID10_USE_NEAR_SETS, &rs->ctr_flags)) {
 				rs->ti->error = "Only one 'raid10_use_new_sets' argument allowed";
 				return -EINVAL;
@@ -1095,7 +1095,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 		 * Parameters that take a string value are checked here.
 		 */
 
-		if (!strcasecmp(key, _argname_by_flag(CTR_FLAG_RAID10_FORMAT))) {
+		if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_FORMAT))) {
 			if (_test_and_set_flag(CTR_FLAG_RAID10_FORMAT, &rs->ctr_flags)) {
 				rs->ti->error = "Only one 'raid10_format' argument pair allowed";
 				return -EINVAL;
@@ -1117,7 +1117,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 			return -EINVAL;
 		}
 
-		if (!strcasecmp(key, _argname_by_flag(CTR_FLAG_REBUILD))) {
+		if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_REBUILD))) {
 			/*
 			 * "rebuild" is being passed in by userspace to provide
 			 * indexes of replaced devices and to set up additional
@@ -1138,7 +1138,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 			clear_bit(Faulty, &rd->rdev.flags);
 			rd->rdev.recovery_offset = 0;
 			_set_flag(CTR_FLAG_REBUILD, &rs->ctr_flags);
-		} else if (!strcasecmp(key, _argname_by_flag(CTR_FLAG_WRITE_MOSTLY))) {
+		} else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_WRITE_MOSTLY))) {
 			if (!rt_is_raid1(rt)) {
 				rs->ti->error = "write_mostly option is only valid for RAID1";
 				return -EINVAL;
@@ -1151,7 +1151,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 
 			set_bit(WriteMostly, &rs->dev[value].rdev.flags);
 			_set_flag(CTR_FLAG_WRITE_MOSTLY, &rs->ctr_flags);
-		} else if (!strcasecmp(key, _argname_by_flag(CTR_FLAG_MAX_WRITE_BEHIND))) {
+		} else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_MAX_WRITE_BEHIND))) {
 			if (!rt_is_raid1(rt)) {
 				rs->ti->error = "max_write_behind option is only valid for RAID1";
 				return -EINVAL;
@@ -1173,7 +1173,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 			}
 
 			rs->md.bitmap_info.max_write_behind = value;
-		} else if (!strcasecmp(key, _argname_by_flag(CTR_FLAG_DAEMON_SLEEP))) {
+		} else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_DAEMON_SLEEP))) {
 			if (_test_and_set_flag(CTR_FLAG_DAEMON_SLEEP, &rs->ctr_flags)) {
 				rs->ti->error = "Only one daemon_sleep argument pair allowed";
 				return -EINVAL;
@@ -1183,7 +1183,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 				return -EINVAL;
 			}
 			rs->md.bitmap_info.daemon_sleep = value;
-		} else if (!strcasecmp(key, _argname_by_flag(CTR_FLAG_DATA_OFFSET))) {
+		} else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_DATA_OFFSET))) {
 			/* Userspace passes new data_offset after having extended the the data image LV */
 			if (_test_and_set_flag(CTR_FLAG_DATA_OFFSET, &rs->ctr_flags)) {
 				rs->ti->error = "Only one data_offset argument pair allowed";
@@ -1195,7 +1195,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 				return -EINVAL;
 			}
 			rs->data_offset = value;
-		} else if (!strcasecmp(key, _argname_by_flag(CTR_FLAG_DELTA_DISKS))) {
+		} else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_DELTA_DISKS))) {
 			/* Define the +/-# of disks to add to/remove from the given raid set */
 			if (_test_and_set_flag(CTR_FLAG_DELTA_DISKS, &rs->ctr_flags)) {
 				rs->ti->error = "Only one delta_disks argument pair allowed";
@@ -1208,7 +1208,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 			}
 
 			rs->delta_disks = value;
-		} else if (!strcasecmp(key, _argname_by_flag(CTR_FLAG_STRIPE_CACHE))) {
+		} else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_STRIPE_CACHE))) {
 			if (_test_and_set_flag(CTR_FLAG_STRIPE_CACHE, &rs->ctr_flags)) {
 				rs->ti->error = "Only one stripe_cache argument pair allowed";
 				return -EINVAL;
@@ -1229,7 +1229,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 				return -EINVAL;
 			}
 
-		} else if (!strcasecmp(key, _argname_by_flag(CTR_FLAG_MIN_RECOVERY_RATE))) {
+		} else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_MIN_RECOVERY_RATE))) {
 			if (_test_and_set_flag(CTR_FLAG_MIN_RECOVERY_RATE, &rs->ctr_flags)) {
 				rs->ti->error = "Only one min_recovery_rate argument pair allowed";
 				return -EINVAL;
@@ -1239,7 +1239,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 				return -EINVAL;
 			}
 			rs->md.sync_speed_min = (int)value;
-		} else if (!strcasecmp(key, _argname_by_flag(CTR_FLAG_MAX_RECOVERY_RATE))) {
+		} else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_MAX_RECOVERY_RATE))) {
 			if (_test_and_set_flag(CTR_FLAG_MIN_RECOVERY_RATE, &rs->ctr_flags)) {
 				rs->ti->error = "Only one max_recovery_rate argument pair allowed";
 				return -EINVAL;
@@ -1249,14 +1249,14 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 				return -EINVAL;
 			}
 			rs->md.sync_speed_max = (int)value;
-		} else if (!strcasecmp(key, _argname_by_flag(CTR_FLAG_REGION_SIZE))) {
+		} else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_REGION_SIZE))) {
 			if (_test_and_set_flag(CTR_FLAG_REGION_SIZE, &rs->ctr_flags)) {
 				rs->ti->error = "Only one region_size argument pair allowed";
 				return -EINVAL;
 			}
 
 			region_size = value;
-		} else if (!strcasecmp(key, _argname_by_flag(CTR_FLAG_RAID10_COPIES))) {
+		} else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_COPIES))) {
 			if (_test_and_set_flag(CTR_FLAG_RAID10_COPIES, &rs->ctr_flags)) {
 				rs->ti->error = "Only one raid10_copies argument pair allowed";
 				return -EINVAL;
@@ -2625,46 +2625,46 @@ static void raid_status(struct dm_target *ti, status_type_t type,
 		/* Emit table line */
 		DMEMIT("%s %u %u", rs->raid_type->name, raid_param_cnt, mddev->new_chunk_sectors);
 		if (_test_flag(CTR_FLAG_RAID10_FORMAT, rs->ctr_flags))
-			DMEMIT(" %s %s", _argname_by_flag(CTR_FLAG_RAID10_FORMAT),
+			DMEMIT(" %s %s", dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_FORMAT),
 					 raid10_md_layout_to_format(mddev->layout));
 		if (_test_flag(CTR_FLAG_RAID10_COPIES, rs->ctr_flags))
-			DMEMIT(" %s %d", _argname_by_flag(CTR_FLAG_RAID10_COPIES),
+			DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_COPIES),
 					 raid10_md_layout_to_copies(mddev->layout));
 		if (_test_flag(CTR_FLAG_NOSYNC, rs->ctr_flags))
-			DMEMIT(" %s", _argname_by_flag(CTR_FLAG_NOSYNC));
+			DMEMIT(" %s", dm_raid_arg_name_by_flag(CTR_FLAG_NOSYNC));
 		if (_test_flag(CTR_FLAG_SYNC, rs->ctr_flags))
-			DMEMIT(" %s", _argname_by_flag(CTR_FLAG_SYNC));
+			DMEMIT(" %s", dm_raid_arg_name_by_flag(CTR_FLAG_SYNC));
 		if (_test_flag(CTR_FLAG_REGION_SIZE, rs->ctr_flags))
-			DMEMIT(" %s %llu", _argname_by_flag(CTR_FLAG_REGION_SIZE),
+			DMEMIT(" %s %llu", dm_raid_arg_name_by_flag(CTR_FLAG_REGION_SIZE),
 					   (unsigned long long) to_sector(mddev->bitmap_info.chunksize));
 		if (_test_flag(CTR_FLAG_DATA_OFFSET, rs->ctr_flags))
-			DMEMIT(" %s %llu", _argname_by_flag(CTR_FLAG_DATA_OFFSET),
+			DMEMIT(" %s %llu", dm_raid_arg_name_by_flag(CTR_FLAG_DATA_OFFSET),
 					   (unsigned long long) rs->data_offset);
 		if (_test_flag(CTR_FLAG_DAEMON_SLEEP, rs->ctr_flags))
-			DMEMIT(" %s %lu", _argname_by_flag(CTR_FLAG_DAEMON_SLEEP),
+			DMEMIT(" %s %lu", dm_raid_arg_name_by_flag(CTR_FLAG_DAEMON_SLEEP),
 					  mddev->bitmap_info.daemon_sleep);
 		if (_test_flag(CTR_FLAG_DELTA_DISKS, rs->ctr_flags))
-			DMEMIT(" %s %d", _argname_by_flag(CTR_FLAG_DELTA_DISKS),
+			DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_DELTA_DISKS),
 					 mddev->delta_disks);
 		if (_test_flag(CTR_FLAG_STRIPE_CACHE, rs->ctr_flags))
-			DMEMIT(" %s %d", _argname_by_flag(CTR_FLAG_STRIPE_CACHE),
+			DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_STRIPE_CACHE),
 					 max_nr_stripes);
 		rdev_for_each(rdev, mddev)
 			if (test_bit(rdev->raid_disk, (void *) rs->rebuild_disks))
-				DMEMIT(" %s %u", _argname_by_flag(CTR_FLAG_REBUILD),
+				DMEMIT(" %s %u", dm_raid_arg_name_by_flag(CTR_FLAG_REBUILD),
 						 rdev->raid_disk);
 		rdev_for_each(rdev, mddev)
 			if (test_bit(WriteMostly, &rdev->flags))
-				DMEMIT(" %s %d", _argname_by_flag(CTR_FLAG_WRITE_MOSTLY),
+				DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_WRITE_MOSTLY),
 						 rdev->raid_disk);
 		if (_test_flag(CTR_FLAG_MAX_WRITE_BEHIND, rs->ctr_flags))
-			DMEMIT(" %s %lu", _argname_by_flag(CTR_FLAG_MAX_WRITE_BEHIND),
+			DMEMIT(" %s %lu", dm_raid_arg_name_by_flag(CTR_FLAG_MAX_WRITE_BEHIND),
 					  mddev->bitmap_info.max_write_behind);
 		if (_test_flag(CTR_FLAG_MAX_RECOVERY_RATE, rs->ctr_flags))
-			DMEMIT(" %s %d", _argname_by_flag(CTR_FLAG_MAX_RECOVERY_RATE),
+			DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_MAX_RECOVERY_RATE),
 					 mddev->sync_speed_max);
 		if (_test_flag(CTR_FLAG_MIN_RECOVERY_RATE, rs->ctr_flags))
-			DMEMIT(" %s %d", _argname_by_flag(CTR_FLAG_MIN_RECOVERY_RATE),
+			DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_MIN_RECOVERY_RATE),
 					 mddev->sync_speed_min);
 		DMEMIT(" %d", rs->raid_disks);
 		rdev_for_each(rdev, mddev) {
-- 
cgit v1.2.3-70-g09d2


From ef9b85a651d4f09c36867c6666ea4086440d89a0 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Thu, 2 Jun 2016 12:02:19 -0400
Subject: dm raid: add missing "dm-raid0" module alias

Also update module description to "raid0/1/10/4/5/6 target"

Reported-by: Alasdair G Kergon <agk@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-raid.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 7876f9529f82..02c07a75e4dc 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -2942,7 +2942,8 @@ module_param(devices_handle_discard_safely, bool, 0644);
 MODULE_PARM_DESC(devices_handle_discard_safely,
 		 "Set to Y if all devices in each array reliably return zeroes on reads from discarded regions");
 
-MODULE_DESCRIPTION(DM_NAME " raid4/5/6 target");
+MODULE_DESCRIPTION(DM_NAME " raid0/1/10/4/5/6 target");
+MODULE_ALIAS("dm-raid0");
 MODULE_ALIAS("dm-raid1");
 MODULE_ALIAS("dm-raid10");
 MODULE_ALIAS("dm-raid4");
-- 
cgit v1.2.3-70-g09d2


From bb91a63fcc58d5a992fe5e92c6ff1e7f4d20664e Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Thu, 2 Jun 2016 12:06:54 -0400
Subject: dm raid: rename _in_range to __within_range

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-raid.c | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 02c07a75e4dc..039db81c9d53 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -244,7 +244,7 @@ static struct raid_type {
 };
 
 /* True, if @v is in inclusive range [@min, @max] */
-static bool _in_range(long v, long min, long max)
+static bool __within_range(long v, long min, long max)
 {
 	return v >= min && v <= max;
 }
@@ -385,7 +385,7 @@ static bool rt_is_raid10(struct raid_type *rt)
 /* Return true, if raid type in @rt is raid4/5 */
 static bool rt_is_raid45(struct raid_type *rt)
 {
-	return _in_range(rt->level, 4, 5);
+	return __within_range(rt->level, 4, 5);
 }
 
 /* Return true, if raid type in @rt is raid6 */
@@ -397,7 +397,7 @@ static bool rt_is_raid6(struct raid_type *rt)
 /* Return true, if raid type in @rt is raid4/5/6 */
 static bool rt_is_raid456(struct raid_type *rt)
 {
-	return _in_range(rt->level, 4, 6);
+	return __within_range(rt->level, 4, 6);
 }
 /* END: raid level bools */
 
@@ -1123,7 +1123,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 			 * indexes of replaced devices and to set up additional
 			 * devices on raid level takeover.
 			 */
-			if (!_in_range(value, 0, rs->raid_disks - 1)) {
+			if (!__within_range(value, 0, rs->raid_disks - 1)) {
 				rs->ti->error = "Invalid rebuild index given";
 				return -EINVAL;
 			}
@@ -1144,7 +1144,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 				return -EINVAL;
 			}
 
-			if (!_in_range(value, 0, rs->md.raid_disks - 1)) {
+			if (!__within_range(value, 0, rs->md.raid_disks - 1)) {
 				rs->ti->error = "Invalid write_mostly index given";
 				return -EINVAL;
 			}
@@ -1202,7 +1202,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 				return -EINVAL;
 			}
 			/* Ensure MAX_RAID_DEVICES and raid type minimal_devs! */
-			if (!_in_range(abs(value), 1, MAX_RAID_DEVICES - rt->minimal_devs)) {
+			if (!__within_range(abs(value), 1, MAX_RAID_DEVICES - rt->minimal_devs)) {
 				rs->ti->error = "Too many delta_disk requested";
 				return -EINVAL;
 			}
@@ -1262,7 +1262,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 				return -EINVAL;
 			}
 
-			if (!_in_range(value, 2, rs->md.raid_disks)) {
+			if (!__within_range(value, 2, rs->md.raid_disks)) {
 				rs->ti->error = "Bad value for 'raid10_copies'";
 				return -EINVAL;
 			}
@@ -1380,7 +1380,7 @@ static int rs_check_takeover(struct raid_set *rs)
 			return 0;
 
 		/* raid0 with multiple disks -> raid4/5/6 */
-		if (_in_range(mddev->new_level, 4, 6) &&
+		if (__within_range(mddev->new_level, 4, 6) &&
 		    mddev->new_layout == ALGORITHM_PARITY_N &&
 		    mddev->raid_disks > 1)
 			return 0;
@@ -1418,14 +1418,14 @@ static int rs_check_takeover(struct raid_set *rs)
 			return 0;
 
 		/* raid10_{near,far} with 2 disks -> raid4/5 */
-		if (_in_range(mddev->new_level, 4, 5) &&
+		if (__within_range(mddev->new_level, 4, 5) &&
 		    mddev->raid_disks == 2)
 			return 0;
 		break;
 
 	case 1:
 		/* raid1 with 2 disks -> raid4/5 */
-		if (_in_range(mddev->new_level, 4, 5) &&
+		if (__within_range(mddev->new_level, 4, 5) &&
 		    mddev->raid_disks == 2) {
 			mddev->degraded = 1;
 			return 0;
@@ -1453,7 +1453,7 @@ static int rs_check_takeover(struct raid_set *rs)
 			return 0;
 
 		/* raid4 -> raid5/6 with parity N */
-		if (_in_range(mddev->new_level, 5, 6) &&
+		if (__within_range(mddev->new_level, 5, 6) &&
 		    mddev->layout == ALGORITHM_PARITY_N)
 			return 0;
 		break;
@@ -1477,7 +1477,7 @@ static int rs_check_takeover(struct raid_set *rs)
 		/* raid5 with parity N -> raid6 with parity N */
 		if (mddev->new_level == 6 &&
 		    ((mddev->layout == ALGORITHM_PARITY_N && mddev->new_layout == ALGORITHM_PARITY_N) ||
-		      _in_range(mddev->new_layout, ALGORITHM_LEFT_ASYMMETRIC_6, ALGORITHM_RIGHT_SYMMETRIC_6)))
+		      __within_range(mddev->new_layout, ALGORITHM_LEFT_ASYMMETRIC_6, ALGORITHM_RIGHT_SYMMETRIC_6)))
 			return 0;
 		break;
 
@@ -1495,7 +1495,7 @@ static int rs_check_takeover(struct raid_set *rs)
 		/* raid6_*_n with parity N -> raid5_* */
 		if (mddev->new_level == 5 &&
 		    ((mddev->layout == ALGORITHM_PARITY_N && mddev->new_layout == ALGORITHM_PARITY_N) ||
-		     _in_range(mddev->new_layout, ALGORITHM_LEFT_ASYMMETRIC, ALGORITHM_RIGHT_SYMMETRIC)))
+		     __within_range(mddev->new_layout, ALGORITHM_LEFT_ASYMMETRIC, ALGORITHM_RIGHT_SYMMETRIC)))
 			return 0;
 
 	default:
@@ -2291,7 +2291,7 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	if (dm_read_arg(_args + 1, &as_nrd, &num_raid_devs, &ti->error))
 		return -EINVAL;
 
-	if (!_in_range(num_raid_devs, 1, MAX_RAID_DEVICES)) {
+	if (!__within_range(num_raid_devs, 1, MAX_RAID_DEVICES)) {
 		ti->error = "Invalid number of supplied raid devices";
 		return -EINVAL;
 	}
-- 
cgit v1.2.3-70-g09d2


From 4286325b4b0dc9d67e829e91c5377e070adaffec Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Thu, 2 Jun 2016 12:27:46 -0400
Subject: dm raid: remove all the bitops wrappers

Removes obfuscation that is of little value.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-raid.c | 214 +++++++++++++++++++++------------------------------
 1 file changed, 89 insertions(+), 125 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 039db81c9d53..32c3bae69aae 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -46,27 +46,46 @@ struct raid_dev {
 };
 
 /*
- * Flags for rs->ctr_flags field.
+ * Bits for establishing rs->ctr_flags
  *
  * 1 = no flag value
  * 2 = flag with value
  */
-#define CTR_FLAG_SYNC		   0x1	 /* 1 */ /* Not with raid0! */
-#define CTR_FLAG_NOSYNC		   0x2	 /* 1 */ /* Not with raid0! */
-#define CTR_FLAG_REBUILD	   0x4	 /* 2 */ /* Not with raid0! */
-#define CTR_FLAG_DAEMON_SLEEP	   0x8	 /* 2 */ /* Not with raid0! */
-#define CTR_FLAG_MIN_RECOVERY_RATE 0x10	 /* 2 */ /* Not with raid0! */
-#define CTR_FLAG_MAX_RECOVERY_RATE 0x20	 /* 2 */ /* Not with raid0! */
-#define CTR_FLAG_MAX_WRITE_BEHIND  0x40	 /* 2 */ /* Only with raid1! */
-#define CTR_FLAG_WRITE_MOSTLY	   0x80	 /* 2 */ /* Only with raid1! */
-#define CTR_FLAG_STRIPE_CACHE	   0x100 /* 2 */ /* Only with raid4/5/6! */
-#define CTR_FLAG_REGION_SIZE	   0x200 /* 2 */ /* Not with raid0! */
-#define CTR_FLAG_RAID10_COPIES	   0x400 /* 2 */ /* Only with raid10 */
-#define CTR_FLAG_RAID10_FORMAT	   0x800 /* 2 */ /* Only with raid10 */
+#define __CTR_FLAG_SYNC			0  /* 1 */ /* Not with raid0! */
+#define __CTR_FLAG_NOSYNC		1  /* 1 */ /* Not with raid0! */
+#define __CTR_FLAG_REBUILD		2  /* 2 */ /* Not with raid0! */
+#define __CTR_FLAG_DAEMON_SLEEP		3  /* 2 */ /* Not with raid0! */
+#define __CTR_FLAG_MIN_RECOVERY_RATE	4  /* 2 */ /* Not with raid0! */
+#define __CTR_FLAG_MAX_RECOVERY_RATE	5  /* 2 */ /* Not with raid0! */
+#define __CTR_FLAG_MAX_WRITE_BEHIND	6  /* 2 */ /* Only with raid1! */
+#define __CTR_FLAG_WRITE_MOSTLY		7  /* 2 */ /* Only with raid1! */
+#define __CTR_FLAG_STRIPE_CACHE		8  /* 2 */ /* Only with raid4/5/6! */
+#define __CTR_FLAG_REGION_SIZE		9  /* 2 */ /* Not with raid0! */
+#define __CTR_FLAG_RAID10_COPIES	10 /* 2 */ /* Only with raid10 */
+#define __CTR_FLAG_RAID10_FORMAT	11 /* 2 */ /* Only with raid10 */
 /* New for v1.9.0 */
-#define CTR_FLAG_DELTA_DISKS	      0x1000 /* 2 */ /* Only with reshapable raid4/5/6/10! */
-#define CTR_FLAG_DATA_OFFSET	      0x2000 /* 2 */ /* Only with reshapable raid4/5/6/10! */
-#define CTR_FLAG_RAID10_USE_NEAR_SETS 0x4000 /* 2 */ /* Only with raid10! */
+#define __CTR_FLAG_DELTA_DISKS		12 /* 2 */ /* Only with reshapable raid4/5/6/10! */
+#define __CTR_FLAG_DATA_OFFSET		13 /* 2 */ /* Only with reshapable raid4/5/6/10! */
+#define __CTR_FLAG_RAID10_USE_NEAR_SETS 14 /* 2 */ /* Only with raid10! */
+
+/*
+ * Flags for rs->ctr_flags field.
+ */
+#define CTR_FLAG_SYNC			(1 << __CTR_FLAG_SYNC)
+#define CTR_FLAG_NOSYNC			(1 << __CTR_FLAG_NOSYNC)
+#define CTR_FLAG_REBUILD		(1 << __CTR_FLAG_REBUILD)
+#define CTR_FLAG_DAEMON_SLEEP		(1 << __CTR_FLAG_DAEMON_SLEEP)
+#define CTR_FLAG_MIN_RECOVERY_RATE	(1 << __CTR_FLAG_MIN_RECOVERY_RATE)
+#define CTR_FLAG_MAX_RECOVERY_RATE	(1 << __CTR_FLAG_MAX_RECOVERY_RATE)
+#define CTR_FLAG_MAX_WRITE_BEHIND	(1 << __CTR_FLAG_MAX_WRITE_BEHIND)
+#define CTR_FLAG_WRITE_MOSTLY		(1 << __CTR_FLAG_WRITE_MOSTLY)
+#define CTR_FLAG_STRIPE_CACHE		(1 << __CTR_FLAG_STRIPE_CACHE)
+#define CTR_FLAG_REGION_SIZE		(1 << __CTR_FLAG_REGION_SIZE)
+#define CTR_FLAG_RAID10_COPIES		(1 << __CTR_FLAG_RAID10_COPIES)
+#define CTR_FLAG_RAID10_FORMAT		(1 << __CTR_FLAG_RAID10_FORMAT)
+#define CTR_FLAG_DELTA_DISKS		(1 << __CTR_FLAG_DELTA_DISKS)
+#define CTR_FLAG_DATA_OFFSET		(1 << __CTR_FLAG_DATA_OFFSET)
+#define CTR_FLAG_RAID10_USE_NEAR_SETS	(1 << __CTR_FLAG_RAID10_USE_NEAR_SETS)
 
 /*
  * Definitions of various constructor flags to
@@ -158,8 +177,8 @@ struct raid_set {
 	struct dm_target *ti;
 
 	uint32_t bitmap_loaded;
-	uint32_t ctr_flags;
-	uint32_t runtime_flags;
+	unsigned long ctr_flags;
+	unsigned long runtime_flags;
 
 	uint64_t rebuild_disks[DISKS_ARRAY_ELEMS];
 
@@ -249,65 +268,9 @@ static bool __within_range(long v, long min, long max)
 	return v >= min && v <= max;
 }
 
-/* ctr flag bit manipulation... */
-/* Set single @flag in @flags */
-static void _set_flag(uint32_t flag, uint32_t *flags)
-{
-	WARN_ON_ONCE(hweight32(flag) != 1);
-	*flags |= flag;
-}
-
-/* Clear single @flag in @flags */
-static void _clear_flag(uint32_t flag, uint32_t *flags)
-{
-	WARN_ON_ONCE(hweight32(flag) != 1);
-	*flags &= ~flag;
-}
-
-/* Test single @flag in @flags */
-static bool _test_flag(uint32_t flag, uint32_t flags)
-{
-	WARN_ON_ONCE(hweight32(flag) != 1);
-	return (flag & flags) ? true : false;
-}
-
-/* Test multiple @flags in @all_flags */
-static bool _test_flags(uint32_t flags, uint32_t all_flags)
-{
-	return (flags & all_flags) ? true : false;
-}
-
-/* Clear (multiple) @flags in @all_flags */
-static void _clear_flags(uint32_t flags, uint32_t *all_flags)
-{
-	*all_flags &= ~flags;
-}
-
-/* Return true if single @flag is set in @*flags, else set it and return false */
-static bool _test_and_set_flag(uint32_t flag, uint32_t *flags)
-{
-	if (_test_flag(flag, *flags))
-		return true;
-
-	_set_flag(flag, flags);
-	return false;
-}
-
-/* Return true if single @flag is set in @*flags and clear it, else return false */
-static bool _test_and_clear_flag(uint32_t flag, uint32_t *flags)
-{
-	if (_test_flag(flag, *flags)) {
-		_clear_flag(flag, flags);
-		return true;
-	}
-
-	return false;
-}
-/* ...ctr and runtime flag bit manipulation */
-
 /* All table line arguments are defined here */
 static struct arg_name_flag {
-	const uint32_t flag;
+	const unsigned long flag;
 	const char *name;
 } _arg_name_flags[] = {
 	{ CTR_FLAG_SYNC, "sync"},
@@ -334,7 +297,7 @@ static const char *dm_raid_arg_name_by_flag(const uint32_t flag)
 		struct arg_name_flag *anf = _arg_name_flags + ARRAY_SIZE(_arg_name_flags);
 
 		while (anf-- > _arg_name_flags)
-			if (_test_flag(flag, anf->flag))
+			if (flag & anf->flag)
 				return anf->name;
 
 	} else
@@ -425,8 +388,8 @@ static uint32_t _invalid_flags(struct raid_set *rs)
  */
 static int rs_check_for_invalid_flags(struct raid_set *rs)
 {
-	if (_test_flags(rs->ctr_flags, _invalid_flags(rs))) {
-		rs->ti->error = "Invalid flag combined";
+	if (rs->ctr_flags & _invalid_flags(rs)) {
+		rs->ti->error = "Invalid flags combination";
 		return -EINVAL;
 	}
 
@@ -533,13 +496,13 @@ static int raid10_format_to_md_layout(struct raid_set *rs,
 	else if (algorithm == ALGORITHM_RAID10_OFFSET) {
 		f = copies;
 		r = RAID10_OFFSET;
-		if (!_test_flag(CTR_FLAG_RAID10_USE_NEAR_SETS, rs->ctr_flags))
+		if (!test_bit(__CTR_FLAG_RAID10_USE_NEAR_SETS, &rs->ctr_flags))
 			r |= RAID10_USE_FAR_SETS;
 
 	} else if (algorithm == ALGORITHM_RAID10_FAR) {
 		f = copies;
 		r = !RAID10_OFFSET;
-		if (!_test_flag(CTR_FLAG_RAID10_USE_NEAR_SETS, rs->ctr_flags))
+		if (!test_bit(__CTR_FLAG_RAID10_USE_NEAR_SETS, &rs->ctr_flags))
 			r |= RAID10_USE_FAR_SETS;
 
 	} else
@@ -1061,7 +1024,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 		}
 
 		if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_NOSYNC))) {
-			if (_test_and_set_flag(CTR_FLAG_NOSYNC, &rs->ctr_flags)) {
+			if (test_and_set_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags)) {
 				rs->ti->error = "Only one 'nosync' argument allowed";
 				return -EINVAL;
 			}
@@ -1069,7 +1032,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 			continue;
 		}
 		if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_SYNC))) {
-			if (_test_and_set_flag(CTR_FLAG_SYNC, &rs->ctr_flags)) {
+			if (test_and_set_bit(__CTR_FLAG_SYNC, &rs->ctr_flags)) {
 				rs->ti->error = "Only one 'sync' argument allowed";
 				return -EINVAL;
 			}
@@ -1077,7 +1040,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 			continue;
 		}
 		if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_USE_NEAR_SETS))) {
-			if (_test_and_set_flag(CTR_FLAG_RAID10_USE_NEAR_SETS, &rs->ctr_flags)) {
+			if (test_and_set_bit(__CTR_FLAG_RAID10_USE_NEAR_SETS, &rs->ctr_flags)) {
 				rs->ti->error = "Only one 'raid10_use_new_sets' argument allowed";
 				return -EINVAL;
 			}
@@ -1096,7 +1059,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 		 */
 
 		if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_FORMAT))) {
-			if (_test_and_set_flag(CTR_FLAG_RAID10_FORMAT, &rs->ctr_flags)) {
+			if (test_and_set_bit(__CTR_FLAG_RAID10_FORMAT, &rs->ctr_flags)) {
 				rs->ti->error = "Only one 'raid10_format' argument pair allowed";
 				return -EINVAL;
 			}
@@ -1137,7 +1100,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 			clear_bit(In_sync, &rd->rdev.flags);
 			clear_bit(Faulty, &rd->rdev.flags);
 			rd->rdev.recovery_offset = 0;
-			_set_flag(CTR_FLAG_REBUILD, &rs->ctr_flags);
+			set_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags);
 		} else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_WRITE_MOSTLY))) {
 			if (!rt_is_raid1(rt)) {
 				rs->ti->error = "write_mostly option is only valid for RAID1";
@@ -1150,14 +1113,14 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 			}
 
 			set_bit(WriteMostly, &rs->dev[value].rdev.flags);
-			_set_flag(CTR_FLAG_WRITE_MOSTLY, &rs->ctr_flags);
+			set_bit(__CTR_FLAG_WRITE_MOSTLY, &rs->ctr_flags);
 		} else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_MAX_WRITE_BEHIND))) {
 			if (!rt_is_raid1(rt)) {
 				rs->ti->error = "max_write_behind option is only valid for RAID1";
 				return -EINVAL;
 			}
 
-			if (_test_and_set_flag(CTR_FLAG_MAX_WRITE_BEHIND, &rs->ctr_flags)) {
+			if (test_and_set_bit(__CTR_FLAG_MAX_WRITE_BEHIND, &rs->ctr_flags)) {
 				rs->ti->error = "Only one max_write_behind argument pair allowed";
 				return -EINVAL;
 			}
@@ -1174,7 +1137,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 
 			rs->md.bitmap_info.max_write_behind = value;
 		} else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_DAEMON_SLEEP))) {
-			if (_test_and_set_flag(CTR_FLAG_DAEMON_SLEEP, &rs->ctr_flags)) {
+			if (test_and_set_bit(__CTR_FLAG_DAEMON_SLEEP, &rs->ctr_flags)) {
 				rs->ti->error = "Only one daemon_sleep argument pair allowed";
 				return -EINVAL;
 			}
@@ -1185,7 +1148,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 			rs->md.bitmap_info.daemon_sleep = value;
 		} else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_DATA_OFFSET))) {
 			/* Userspace passes new data_offset after having extended the the data image LV */
-			if (_test_and_set_flag(CTR_FLAG_DATA_OFFSET, &rs->ctr_flags)) {
+			if (test_and_set_bit(__CTR_FLAG_DATA_OFFSET, &rs->ctr_flags)) {
 				rs->ti->error = "Only one data_offset argument pair allowed";
 				return -EINVAL;
 			}
@@ -1197,7 +1160,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 			rs->data_offset = value;
 		} else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_DELTA_DISKS))) {
 			/* Define the +/-# of disks to add to/remove from the given raid set */
-			if (_test_and_set_flag(CTR_FLAG_DELTA_DISKS, &rs->ctr_flags)) {
+			if (test_and_set_bit(__CTR_FLAG_DELTA_DISKS, &rs->ctr_flags)) {
 				rs->ti->error = "Only one delta_disks argument pair allowed";
 				return -EINVAL;
 			}
@@ -1209,7 +1172,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 
 			rs->delta_disks = value;
 		} else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_STRIPE_CACHE))) {
-			if (_test_and_set_flag(CTR_FLAG_STRIPE_CACHE, &rs->ctr_flags)) {
+			if (test_and_set_bit(__CTR_FLAG_STRIPE_CACHE, &rs->ctr_flags)) {
 				rs->ti->error = "Only one stripe_cache argument pair allowed";
 				return -EINVAL;
 			}
@@ -1230,7 +1193,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 			}
 
 		} else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_MIN_RECOVERY_RATE))) {
-			if (_test_and_set_flag(CTR_FLAG_MIN_RECOVERY_RATE, &rs->ctr_flags)) {
+			if (test_and_set_bit(__CTR_FLAG_MIN_RECOVERY_RATE, &rs->ctr_flags)) {
 				rs->ti->error = "Only one min_recovery_rate argument pair allowed";
 				return -EINVAL;
 			}
@@ -1240,7 +1203,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 			}
 			rs->md.sync_speed_min = (int)value;
 		} else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_MAX_RECOVERY_RATE))) {
-			if (_test_and_set_flag(CTR_FLAG_MIN_RECOVERY_RATE, &rs->ctr_flags)) {
+			if (test_and_set_bit(__CTR_FLAG_MIN_RECOVERY_RATE, &rs->ctr_flags)) {
 				rs->ti->error = "Only one max_recovery_rate argument pair allowed";
 				return -EINVAL;
 			}
@@ -1250,14 +1213,14 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 			}
 			rs->md.sync_speed_max = (int)value;
 		} else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_REGION_SIZE))) {
-			if (_test_and_set_flag(CTR_FLAG_REGION_SIZE, &rs->ctr_flags)) {
+			if (test_and_set_bit(__CTR_FLAG_REGION_SIZE, &rs->ctr_flags)) {
 				rs->ti->error = "Only one region_size argument pair allowed";
 				return -EINVAL;
 			}
 
 			region_size = value;
 		} else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_COPIES))) {
-			if (_test_and_set_flag(CTR_FLAG_RAID10_COPIES, &rs->ctr_flags)) {
+			if (test_and_set_bit(__CTR_FLAG_RAID10_COPIES, &rs->ctr_flags)) {
 				rs->ti->error = "Only one raid10_copies argument pair allowed";
 				return -EINVAL;
 			}
@@ -1306,7 +1269,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 
 		if ((rt->algorithm == ALGORITHM_RAID10_DEFAULT ||
 		     rt->algorithm == ALGORITHM_RAID10_NEAR) &&
-		    _test_flag(CTR_FLAG_RAID10_USE_NEAR_SETS, rs->ctr_flags)) {
+		    test_bit(__CTR_FLAG_RAID10_USE_NEAR_SETS, &rs->ctr_flags)) {
 			rs->ti->error = "RAID10 format 'near' and 'raid10_use_near_sets' are incompatible";
 			return -EINVAL;
 		}
@@ -1624,7 +1587,7 @@ static void sb_retrieve_failed_devices(struct dm_raid_superblock *sb, uint64_t *
 	failed_devices[0] = le64_to_cpu(sb->failed_devices);
 	memset(failed_devices + 1, 0, sizeof(sb->extended_failed_devices));
 
-	if (_test_flag(FEATURE_FLAG_SUPPORTS_V190, le32_to_cpu(sb->compat_features))) {
+	if (le32_to_cpu(sb->compat_features) & FEATURE_FLAG_SUPPORTS_V190) {
 		int i = ARRAY_SIZE(sb->extended_failed_devices);
 
 		while (i--)
@@ -1702,9 +1665,10 @@ static void super_sync(struct mddev *mddev, struct md_rdev *rdev)
 
 		if (mddev->delta_disks < 0 || mddev->reshape_backwards)
 			sb->flags |= cpu_to_le32(SB_FLAG_RESHAPE_BACKWARDS);
-	} else
-		/* Flag no reshape */
-		_clear_flags(cpu_to_le32(SB_FLAG_RESHAPE_ACTIVE|SB_FLAG_RESHAPE_BACKWARDS), &sb->flags);
+	} else {
+		/* Clear reshape flags */
+		sb->flags &= ~(cpu_to_le32(SB_FLAG_RESHAPE_ACTIVE|SB_FLAG_RESHAPE_BACKWARDS));
+	}
 
 	sb->array_sectors = cpu_to_le64(mddev->array_sectors);
 	sb->data_offset = cpu_to_le64(rdev->data_offset);
@@ -1799,7 +1763,7 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev)
 	 * Reshaping is supported, e.g. reshape_position is valid
 	 * in superblock and superblock content is authoritative.
 	 */
-	if (_test_flag(FEATURE_FLAG_SUPPORTS_V190, le32_to_cpu(sb->compat_features))) {
+	if (le32_to_cpu(sb->compat_features) & FEATURE_FLAG_SUPPORTS_V190) {
 		/* Superblock is authoritative wrt given raid set layout! */
 		mddev->raid_disks = le32_to_cpu(sb->num_devices);
 		mddev->level = le32_to_cpu(sb->level);
@@ -1812,14 +1776,14 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev)
 		mddev->array_sectors = le64_to_cpu(sb->array_sectors);
 
 		/* raid was reshaping and got interrupted */
-		if (_test_flag(SB_FLAG_RESHAPE_ACTIVE, le32_to_cpu(sb->flags))) {
-			if (_test_flag(CTR_FLAG_DELTA_DISKS, rs->ctr_flags)) {
+		if (le32_to_cpu(sb->flags) & SB_FLAG_RESHAPE_ACTIVE) {
+			if (test_bit(__CTR_FLAG_DELTA_DISKS, &rs->ctr_flags)) {
 				DMERR("Reshape requested but raid set is still reshaping");
 				return -EINVAL;
 			}
 
 			if (mddev->delta_disks < 0 ||
-			    (!mddev->delta_disks && _test_flag(SB_FLAG_RESHAPE_BACKWARDS, le32_to_cpu(sb->flags))))
+			    (!mddev->delta_disks && (le32_to_cpu(sb->flags) & SB_FLAG_RESHAPE_BACKWARDS)))
 				mddev->reshape_backwards = 1;
 			else
 				mddev->reshape_backwards = 0;
@@ -1864,7 +1828,7 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev)
 		rs_set_new(rs);
 	}
 
-	if (!_test_flag(CTR_FLAG_NOSYNC, rs->ctr_flags))
+	if (!test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags))
 		mddev->recovery_cp = le64_to_cpu(sb->array_resync_offset);
 
 	/*
@@ -1902,7 +1866,7 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev)
 		if (new_devs == rs->raid_disks) {
 			DMINFO("Superblocks created for new raid set");
 			set_bit(MD_ARRAY_FIRST_USE, &mddev->flags);
-			_set_flag(RT_FLAG_UPDATE_SBS, &rs->runtime_flags);
+			set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags);
 			mddev->recovery_cp = 0;
 		} else if (new_devs && new_devs != rs->raid_disks && !rebuilds) {
 			DMERR("New device injected into existing raid set without "
@@ -2065,7 +2029,7 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
 		 * that the "sync" directive is disallowed during the
 		 * reshape.
 		 */
-		if (_test_flag(CTR_FLAG_SYNC, rs->ctr_flags))
+		if (test_bit(__CTR_FLAG_SYNC, &rs->ctr_flags))
 			continue;
 
 		if (!rdev->meta_bdev)
@@ -2342,7 +2306,7 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 			return r;
 
 		/* Tell preresume to update superblocks with new layout */
-		_set_flag(RT_FLAG_UPDATE_SBS, &rs->runtime_flags);
+		set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags);
 		rs_set_new(rs);
 	} else
 		rs_set_cur(rs);
@@ -2553,7 +2517,7 @@ static void raid_status(struct dm_target *ti, status_type_t type,
 		/* Access most recent mddev properties for status output */
 		smp_rmb();
 		/* Get sensible max sectors even if raid set not yet started */
-		resync_max_sectors = _test_flag(RT_FLAG_RS_PRERESUMED, rs->runtime_flags) ?
+		resync_max_sectors = test_bit(RT_FLAG_RS_PRERESUMED, &rs->runtime_flags) ?
 				      mddev->resync_max_sectors : mddev->dev_sectors;
 		progress = rs_get_progress(rs, resync_max_sectors, &array_in_sync);
 		resync_mismatches = (mddev->last_sync_action && !strcasecmp(mddev->last_sync_action, "check")) ?
@@ -2624,29 +2588,29 @@ static void raid_status(struct dm_target *ti, status_type_t type,
 				  hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_ONE_ARG) * 2;
 		/* Emit table line */
 		DMEMIT("%s %u %u", rs->raid_type->name, raid_param_cnt, mddev->new_chunk_sectors);
-		if (_test_flag(CTR_FLAG_RAID10_FORMAT, rs->ctr_flags))
+		if (test_bit(__CTR_FLAG_RAID10_FORMAT, &rs->ctr_flags))
 			DMEMIT(" %s %s", dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_FORMAT),
 					 raid10_md_layout_to_format(mddev->layout));
-		if (_test_flag(CTR_FLAG_RAID10_COPIES, rs->ctr_flags))
+		if (test_bit(__CTR_FLAG_RAID10_COPIES, &rs->ctr_flags))
 			DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_COPIES),
 					 raid10_md_layout_to_copies(mddev->layout));
-		if (_test_flag(CTR_FLAG_NOSYNC, rs->ctr_flags))
+		if (test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags))
 			DMEMIT(" %s", dm_raid_arg_name_by_flag(CTR_FLAG_NOSYNC));
-		if (_test_flag(CTR_FLAG_SYNC, rs->ctr_flags))
+		if (test_bit(__CTR_FLAG_SYNC, &rs->ctr_flags))
 			DMEMIT(" %s", dm_raid_arg_name_by_flag(CTR_FLAG_SYNC));
-		if (_test_flag(CTR_FLAG_REGION_SIZE, rs->ctr_flags))
+		if (test_bit(__CTR_FLAG_REGION_SIZE, &rs->ctr_flags))
 			DMEMIT(" %s %llu", dm_raid_arg_name_by_flag(CTR_FLAG_REGION_SIZE),
 					   (unsigned long long) to_sector(mddev->bitmap_info.chunksize));
-		if (_test_flag(CTR_FLAG_DATA_OFFSET, rs->ctr_flags))
+		if (test_bit(__CTR_FLAG_DATA_OFFSET, &rs->ctr_flags))
 			DMEMIT(" %s %llu", dm_raid_arg_name_by_flag(CTR_FLAG_DATA_OFFSET),
 					   (unsigned long long) rs->data_offset);
-		if (_test_flag(CTR_FLAG_DAEMON_SLEEP, rs->ctr_flags))
+		if (test_bit(__CTR_FLAG_DAEMON_SLEEP, &rs->ctr_flags))
 			DMEMIT(" %s %lu", dm_raid_arg_name_by_flag(CTR_FLAG_DAEMON_SLEEP),
 					  mddev->bitmap_info.daemon_sleep);
-		if (_test_flag(CTR_FLAG_DELTA_DISKS, rs->ctr_flags))
+		if (test_bit(__CTR_FLAG_DELTA_DISKS, &rs->ctr_flags))
 			DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_DELTA_DISKS),
 					 mddev->delta_disks);
-		if (_test_flag(CTR_FLAG_STRIPE_CACHE, rs->ctr_flags))
+		if (test_bit(__CTR_FLAG_STRIPE_CACHE, &rs->ctr_flags))
 			DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_STRIPE_CACHE),
 					 max_nr_stripes);
 		rdev_for_each(rdev, mddev)
@@ -2657,13 +2621,13 @@ static void raid_status(struct dm_target *ti, status_type_t type,
 			if (test_bit(WriteMostly, &rdev->flags))
 				DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_WRITE_MOSTLY),
 						 rdev->raid_disk);
-		if (_test_flag(CTR_FLAG_MAX_WRITE_BEHIND, rs->ctr_flags))
+		if (test_bit(__CTR_FLAG_MAX_WRITE_BEHIND, &rs->ctr_flags))
 			DMEMIT(" %s %lu", dm_raid_arg_name_by_flag(CTR_FLAG_MAX_WRITE_BEHIND),
 					  mddev->bitmap_info.max_write_behind);
-		if (_test_flag(CTR_FLAG_MAX_RECOVERY_RATE, rs->ctr_flags))
+		if (test_bit(__CTR_FLAG_MAX_RECOVERY_RATE, &rs->ctr_flags))
 			DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_MAX_RECOVERY_RATE),
 					 mddev->sync_speed_max);
-		if (_test_flag(CTR_FLAG_MIN_RECOVERY_RATE, rs->ctr_flags))
+		if (test_bit(__CTR_FLAG_MIN_RECOVERY_RATE, &rs->ctr_flags))
 			DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_MIN_RECOVERY_RATE),
 					 mddev->sync_speed_min);
 		DMEMIT(" %d", rs->raid_disks);
@@ -2835,7 +2799,7 @@ static int _bitmap_load(struct raid_set *rs)
 
 	/* Try loading the bitmap unless "raid0", which does not have one */
 	if (!rs_is_raid0(rs) &&
-	    !_test_and_set_flag(RT_FLAG_RS_BITMAP_LOADED, &rs->runtime_flags)) {
+	    !test_and_set_bit(RT_FLAG_RS_BITMAP_LOADED, &rs->runtime_flags)) {
 		r = bitmap_load(&rs->md);
 		if (r)
 			DMERR("Failed to load bitmap");
@@ -2850,7 +2814,7 @@ static int raid_preresume(struct dm_target *ti)
 	struct mddev *mddev = &rs->md;
 
 	/* This is a resume after a suspend of the set -> it's already started */
-	if (_test_and_set_flag(RT_FLAG_RS_PRERESUMED, &rs->runtime_flags))
+	if (test_and_set_bit(RT_FLAG_RS_PRERESUMED, &rs->runtime_flags))
 		return 0;
 
 	/*
@@ -2865,7 +2829,7 @@ static int raid_preresume(struct dm_target *ti)
 	 * Have to switch to readwrite and back in order to
 	 * allow for the superblock updates.
 	 */
-	if (_test_and_clear_flag(RT_FLAG_UPDATE_SBS, &rs->runtime_flags)) {
+	if (test_and_clear_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags)) {
 		set_bit(MD_CHANGE_DEVS, &mddev->flags);
 		mddev->ro = 0;
 		md_update_sb(mddev, 1);
@@ -2887,7 +2851,7 @@ static void raid_resume(struct dm_target *ti)
 	struct raid_set *rs = ti->private;
 	struct mddev *mddev = &rs->md;
 
-	if (_test_and_set_flag(RT_FLAG_RS_RESUMED, &rs->runtime_flags)) {
+	if (test_and_set_bit(RT_FLAG_RS_RESUMED, &rs->runtime_flags)) {
 		/*
 		 * A secondary resume while the device is active.
 		 * Take this opportunity to check whether any failed
-- 
cgit v1.2.3-70-g09d2


From bfcee0e312f9d11c5d009be213ee46a9fb765f38 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Thu, 2 Jun 2016 15:08:09 -0400
Subject: dm raid: rename functions that alloc and free struct raid_set

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-raid.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 32c3bae69aae..8fa9f3e90784 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -587,8 +587,8 @@ static void rs_set_new(struct raid_set *rs)
 	mddev->delta_disks = 0;
 }
 
-
-static struct raid_set *context_alloc(struct dm_target *ti, struct raid_type *raid_type, unsigned raid_devs)
+static struct raid_set *raid_set_alloc(struct dm_target *ti, struct raid_type *raid_type,
+				       unsigned raid_devs)
 {
 	unsigned i;
 	struct raid_set *rs;
@@ -634,7 +634,7 @@ static struct raid_set *context_alloc(struct dm_target *ti, struct raid_type *ra
 	return rs;
 }
 
-static void context_free(struct raid_set *rs)
+static void raid_set_free(struct raid_set *rs)
 {
 	int i;
 
@@ -663,7 +663,7 @@ static void context_free(struct raid_set *rs)
  *    <meta_dev> -
  *
  * This code parses those words.  If there is a failure,
- * the caller must use context_free to unwind the operations.
+ * the caller must use raid_set_free() to unwind the operations.
  */
 static int parse_dev_params(struct raid_set *rs, struct dm_arg_set *as)
 {
@@ -2260,7 +2260,7 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 		return -EINVAL;
 	}
 
-	rs = context_alloc(ti, rt, num_raid_devs);
+	rs = raid_set_alloc(ti, rt, num_raid_devs);
 	if (IS_ERR(rs))
 		return PTR_ERR(rs);
 
@@ -2341,7 +2341,7 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 size_mismatch:
 	md_stop(&rs->md);
 bad:
-	context_free(rs);
+	raid_set_free(rs);
 
 	return r;
 }
@@ -2352,7 +2352,7 @@ static void raid_dtr(struct dm_target *ti)
 
 	list_del_init(&rs->callbacks.list);
 	md_stop(&rs->md);
-	context_free(rs);
+	raid_set_free(rs);
 }
 
 static int raid_map(struct dm_target *ti, struct bio *bio)
-- 
cgit v1.2.3-70-g09d2


From e6ca5e1a0323a34a24999243a00374a8c2cbafe1 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Thu, 2 Jun 2016 15:27:22 -0400
Subject: dm raid: various code cleanups

Renamed functions and variables with leading single underscore to have a
double underscore.  Renamed some functions to have better names.  Folded
functions that were split out without reason.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-raid.c | 99 +++++++++++++++++++++++-----------------------------
 1 file changed, 43 insertions(+), 56 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 8fa9f3e90784..f78a5e9d25c8 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -195,36 +195,26 @@ struct raid_set {
 	struct raid_dev dev[0];
 };
 
-/* Backup/restore raid set configuration helpers */
-static void _rs_config_backup(struct raid_set *rs, struct rs_layout *l)
+static void rs_config_backup(struct raid_set *rs)
 {
 	struct mddev *mddev = &rs->md;
+	struct rs_layout *l = &rs->rs_layout;
 
 	l->new_level = mddev->new_level;
 	l->new_layout = mddev->new_layout;
 	l->new_chunk_sectors = mddev->new_chunk_sectors;
 }
 
-static void rs_config_backup(struct raid_set *rs)
-{
-	return _rs_config_backup(rs, &rs->rs_layout);
-}
-
-static void _rs_config_restore(struct raid_set *rs, struct rs_layout *l)
+static void rs_config_restore(struct raid_set *rs)
 {
 	struct mddev *mddev = &rs->md;
+	struct rs_layout *l = &rs->rs_layout;
 
 	mddev->new_level = l->new_level;
 	mddev->new_layout = l->new_layout;
 	mddev->new_chunk_sectors = l->new_chunk_sectors;
 }
 
-static void rs_config_restore(struct raid_set *rs)
-{
-	return _rs_config_restore(rs, &rs->rs_layout);
-}
-/* END: backup/restore raid set configuration helpers */
-
 /* raid10 algorithms (i.e. formats) */
 #define	ALGORITHM_RAID10_DEFAULT	0
 #define	ALGORITHM_RAID10_NEAR		1
@@ -272,7 +262,7 @@ static bool __within_range(long v, long min, long max)
 static struct arg_name_flag {
 	const unsigned long flag;
 	const char *name;
-} _arg_name_flags[] = {
+} __arg_name_flags[] = {
 	{ CTR_FLAG_SYNC, "sync"},
 	{ CTR_FLAG_NOSYNC, "nosync"},
 	{ CTR_FLAG_REBUILD, "rebuild"},
@@ -294,9 +284,9 @@ static struct arg_name_flag {
 static const char *dm_raid_arg_name_by_flag(const uint32_t flag)
 {
 	if (hweight32(flag) == 1) {
-		struct arg_name_flag *anf = _arg_name_flags + ARRAY_SIZE(_arg_name_flags);
+		struct arg_name_flag *anf = __arg_name_flags + ARRAY_SIZE(__arg_name_flags);
 
-		while (anf-- > _arg_name_flags)
+		while (anf-- > __arg_name_flags)
 			if (flag & anf->flag)
 				return anf->name;
 
@@ -365,7 +355,7 @@ static bool rt_is_raid456(struct raid_type *rt)
 /* END: raid level bools */
 
 /* Return invalid ctr flags for the raid level of @rs */
-static uint32_t _invalid_flags(struct raid_set *rs)
+static uint32_t __invalid_flags(struct raid_set *rs)
 {
 	if (rt_is_raid0(rs->raid_type))
 		return RAID0_INVALID_FLAGS;
@@ -388,7 +378,7 @@ static uint32_t _invalid_flags(struct raid_set *rs)
  */
 static int rs_check_for_invalid_flags(struct raid_set *rs)
 {
-	if (rs->ctr_flags & _invalid_flags(rs)) {
+	if (rs->ctr_flags & __invalid_flags(rs)) {
 		rs->ti->error = "Invalid flags combination";
 		return -EINVAL;
 	}
@@ -396,7 +386,6 @@ static int rs_check_for_invalid_flags(struct raid_set *rs)
 	return 0;
 }
 
-
 /* MD raid10 bit definitions and helpers */
 #define RAID10_OFFSET			(1 << 16) /* stripes with data copies area adjacent on devices */
 #define RAID10_BROCKEN_USE_FAR_SETS	(1 << 17) /* Broken in raid10.c: use sets instead of whole stripe rotation */
@@ -404,33 +393,33 @@ static int rs_check_for_invalid_flags(struct raid_set *rs)
 #define RAID10_FAR_COPIES_SHIFT		8	  /* raid10 # far copies shift (2nd byte of layout) */
 
 /* Return md raid10 near copies for @layout */
-static unsigned int _raid10_near_copies(int layout)
+static unsigned int __raid10_near_copies(int layout)
 {
 	return layout & 0xFF;
 }
 
 /* Return md raid10 far copies for @layout */
-static unsigned int _raid10_far_copies(int layout)
+static unsigned int __raid10_far_copies(int layout)
 {
-	return _raid10_near_copies(layout >> RAID10_FAR_COPIES_SHIFT);
+	return __raid10_near_copies(layout >> RAID10_FAR_COPIES_SHIFT);
 }
 
 /* Return true if md raid10 offset for @layout */
-static unsigned int _is_raid10_offset(int layout)
+static unsigned int __is_raid10_offset(int layout)
 {
 	return layout & RAID10_OFFSET;
 }
 
 /* Return true if md raid10 near for @layout */
-static unsigned int _is_raid10_near(int layout)
+static unsigned int __is_raid10_near(int layout)
 {
-	return !_is_raid10_offset(layout) && _raid10_near_copies(layout) > 1;
+	return !__is_raid10_offset(layout) && __raid10_near_copies(layout) > 1;
 }
 
 /* Return true if md raid10 far for @layout */
-static unsigned int _is_raid10_far(int layout)
+static unsigned int __is_raid10_far(int layout)
 {
-	return !_is_raid10_offset(layout) && _raid10_far_copies(layout) > 1;
+	return !__is_raid10_offset(layout) && __raid10_far_copies(layout) > 1;
 }
 
 /* Return md raid10 layout string for @layout */
@@ -442,13 +431,13 @@ static const char *raid10_md_layout_to_format(int layout)
 	 *
 	 * Refer to MD's raid10.c for details
 	 */
-	if (_is_raid10_offset(layout))
+	if (__is_raid10_offset(layout))
 		return "offset";
 
-	if (_raid10_near_copies(layout) > 1)
+	if (__raid10_near_copies(layout) > 1)
 		return "near";
 
-	WARN_ON(_raid10_far_copies(layout) < 2);
+	WARN_ON(__raid10_far_copies(layout) < 2);
 
 	return "far";
 }
@@ -466,12 +455,11 @@ static const int raid10_name_to_format(const char *name)
 	return -EINVAL;
 }
 
-
 /* Return md raid10 copies for @layout */
 static unsigned int raid10_md_layout_to_copies(int layout)
 {
-	return _raid10_near_copies(layout) > 1 ?
-	       _raid10_near_copies(layout) : _raid10_far_copies(layout);
+	return __raid10_near_copies(layout) > 1 ?
+		__raid10_near_copies(layout) : __raid10_far_copies(layout);
 }
 
 /* Return md raid10 format id for @format string */
@@ -513,17 +501,17 @@ static int raid10_format_to_md_layout(struct raid_set *rs,
 /* END: MD raid10 bit definitions and helpers */
 
 /* Check for any of the raid10 algorithms */
-static int _got_raid10(struct raid_type *rtp, const int layout)
+static int __got_raid10(struct raid_type *rtp, const int layout)
 {
 	if (rtp->level == 10) {
 		switch (rtp->algorithm) {
 		case ALGORITHM_RAID10_DEFAULT:
 		case ALGORITHM_RAID10_NEAR:
-			return _is_raid10_near(layout);
+			return __is_raid10_near(layout);
 		case ALGORITHM_RAID10_OFFSET:
-			return _is_raid10_offset(layout);
+			return __is_raid10_offset(layout);
 		case ALGORITHM_RAID10_FAR:
-			return _is_raid10_far(layout);
+			return __is_raid10_far(layout);
 		default:
 			break;
 		}
@@ -552,7 +540,7 @@ static struct raid_type *get_raid_type_by_ll(const int level, const int layout)
 	while (rtp-- > raid_types) {
 		/* RAID10 special checks based on @layout flags/properties */
 		if (rtp->level == level &&
-		    (_got_raid10(rtp, layout) || rtp->algorithm == layout))
+		    (__got_raid10(rtp, layout) || rtp->algorithm == layout))
 			return rtp;
 	}
 
@@ -1352,10 +1340,10 @@ static int rs_check_takeover(struct raid_set *rs)
 
 	case 10:
 		/* Can't takeover raid10_offset! */
-		if (_is_raid10_offset(mddev->layout))
+		if (__is_raid10_offset(mddev->layout))
 			break;
 
-		near_copies = _raid10_near_copies(mddev->layout);
+		near_copies = __raid10_near_copies(mddev->layout);
 
 		/* raid10* -> raid0 */
 		if (mddev->new_level == 0) {
@@ -1369,7 +1357,7 @@ static int rs_check_takeover(struct raid_set *rs)
 
 			/* Can takeover raid10_far */
 			if (near_copies == 1 &&
-			   _raid10_far_copies(mddev->layout) > 1)
+			    __raid10_far_copies(mddev->layout) > 1)
 				return 0;
 
 			break;
@@ -1377,7 +1365,7 @@ static int rs_check_takeover(struct raid_set *rs)
 
 		/* raid10_{near,far} -> raid1 */
 		if (mddev->new_level == 1 &&
-		    max(near_copies, _raid10_far_copies(mddev->layout)) == mddev->raid_disks)
+		    max(near_copies, __raid10_far_copies(mddev->layout)) == mddev->raid_disks)
 			return 0;
 
 		/* raid10_{near,far} with 2 disks -> raid4/5 */
@@ -1914,8 +1902,8 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev)
 				continue;
 
 			if (role != r->raid_disk) {
-				if (_is_raid10_near(mddev->layout)) {
-					if (mddev->raid_disks % _raid10_near_copies(mddev->layout) ||
+				if (__is_raid10_near(mddev->layout)) {
+					if (mddev->raid_disks % __raid10_near_copies(mddev->layout) ||
 					    rs->raid_disks % rs->raid10_copies) {
 						rs->ti->error =
 							"Cannot change raid10 near set to odd # of devices!";
@@ -2099,7 +2087,7 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
 }
 
 /* Userpace reordered disks -> adjust raid_disk indexes in @rs */
-static void _reorder_raid_disk_indexes(struct raid_set *rs)
+static void __reorder_raid_disk_indexes(struct raid_set *rs)
 {
 	int i = 0;
 	struct md_rdev *rdev;
@@ -2123,7 +2111,7 @@ static int rs_setup_takeover(struct raid_set *rs)
 	if (rt_is_raid10(rs->raid_type)) {
 		if (mddev->level == 0) {
 			/* Userpace reordered disks -> adjust raid_disk indexes */
-			_reorder_raid_disk_indexes(rs);
+			__reorder_raid_disk_indexes(rs);
 
 			/* raid0 -> raid10_far layout */
 			mddev->layout = raid10_format_to_md_layout(rs, ALGORITHM_RAID10_FAR,
@@ -2400,7 +2388,7 @@ static const char *decipher_sync_action(struct mddev *mddev)
  *  'a' = Alive but not in-sync
  *  'A' = Alive and in-sync
  */
-static const char *_raid_dev_status(struct md_rdev *rdev, bool array_in_sync)
+static const char *__raid_dev_status(struct md_rdev *rdev, bool array_in_sync)
 {
 	if (test_bit(Faulty, &rdev->flags))
 		return "D";
@@ -2484,7 +2472,7 @@ static sector_t rs_get_progress(struct raid_set *rs,
 }
 
 /* Helper to return @dev name or "-" if !@dev */
-static const char *_get_dev_name(struct dm_dev *dev)
+static const char *__get_dev_name(struct dm_dev *dev)
 {
 	return dev ? dev->name : "-";
 }
@@ -2526,7 +2514,7 @@ static void raid_status(struct dm_target *ti, status_type_t type,
 
 		/* HM FIXME: do we want another state char for raid0? It shows 'D' or 'A' now */
 		rdev_for_each(rdev, mddev)
-			DMEMIT(_raid_dev_status(rdev, array_in_sync));
+			DMEMIT(__raid_dev_status(rdev, array_in_sync));
 
 		/*
 		 * In-sync/Reshape ratio:
@@ -2634,8 +2622,8 @@ static void raid_status(struct dm_target *ti, status_type_t type,
 		rdev_for_each(rdev, mddev) {
 			struct raid_dev *rd = container_of(rdev, struct raid_dev, rdev);
 
-			DMEMIT(" %s %s", _get_dev_name(rd->meta_dev),
-					 _get_dev_name(rd->data_dev));
+			DMEMIT(" %s %s", __get_dev_name(rd->meta_dev),
+					 __get_dev_name(rd->data_dev));
 		}
 	}
 }
@@ -2792,8 +2780,7 @@ static void attempt_restore_of_faulty_devices(struct raid_set *rs)
 	}
 }
 
-/* Load the dirty region bitmap */
-static int _bitmap_load(struct raid_set *rs)
+static int __load_dirty_region_bitmap(struct raid_set *rs)
 {
 	int r = 0;
 
@@ -2819,7 +2806,7 @@ static int raid_preresume(struct dm_target *ti)
 
 	/*
 	 * The superblocks need to be updated on disk if the
-	 * array is new or _bitmap_load will overwrite them
+	 * array is new or __load_dirty_region_bitmap will overwrite them
 	 * in core with old data.
 	 *
 	 * In case the array got modified (takeover/reshape/resize)
@@ -2843,7 +2830,7 @@ static int raid_preresume(struct dm_target *ti)
 	configure_discard_support(rs);
 
 	/* Load the bitmap from disk unless raid0 */
-	return _bitmap_load(rs);
+	return __load_dirty_region_bitmap(rs);
 }
 
 static void raid_resume(struct dm_target *ti)
-- 
cgit v1.2.3-70-g09d2


From a30cbc0d1c80f2d07a1b973e1f4c58d925a3ff4a Mon Sep 17 00:00:00 2001
From: Heinz Mauelshagen <heinzm@redhat.com>
Date: Thu, 9 Jun 2016 16:42:16 +0200
Subject: dm raid: inverse check for flags from invalid to valid flags

It is more intuitive to manage each raid level's features in terms of
what is supported rather than what isn't supported.

Signed-off-by: Heinz Mauelshagen <heinzm@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-raid.c | 88 +++++++++++++++++++++++++++++++++-------------------
 1 file changed, 56 insertions(+), 32 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index f78a5e9d25c8..a32cddbb5f4f 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -113,26 +113,35 @@ struct raid_dev {
 				  CTR_FLAG_DELTA_DISKS | \
 				  CTR_FLAG_DATA_OFFSET)
 
-/* All ctr optional arguments */
-#define ALL_CTR_FLAGS		(CTR_FLAG_OPTIONS_NO_ARGS | \
-				 CTR_FLAG_OPTIONS_ONE_ARG)
-
-/* Invalid options definitions per raid level... */
-
-/* "raid0" does not accept any options */
-#define RAID0_INVALID_FLAGS ALL_CTR_FLAGS
+/* Valid options definitions per raid level... */
+
+/* "raid0" does only accept data offset */
+#define RAID0_VALID_FLAGS	(CTR_FLAG_DATA_OFFSET)
+
+/* "raid1" does not accept stripe cache, data offset, delta_disks or any raid10 options */
+#define RAID1_VALID_FLAGS	(CTR_FLAGS_ANY_SYNC | \
+				 CTR_FLAG_REBUILD | \
+				 CTR_FLAG_WRITE_MOSTLY | \
+				 CTR_FLAG_DAEMON_SLEEP | \
+				 CTR_FLAG_MIN_RECOVERY_RATE | \
+				 CTR_FLAG_MAX_RECOVERY_RATE | \
+				 CTR_FLAG_MAX_WRITE_BEHIND | \
+				 CTR_FLAG_REGION_SIZE | \
+				 CTR_FLAG_DATA_OFFSET)
 
-/* "raid1" does not accept stripe cache or any raid10 options */
-#define RAID1_INVALID_FLAGS	(CTR_FLAG_STRIPE_CACHE | \
+/* "raid10" does not accept any raid1 or stripe cache options */
+#define RAID10_VALID_FLAGS	(CTR_FLAGS_ANY_SYNC | \
+				 CTR_FLAG_REBUILD | \
+				 CTR_FLAG_DAEMON_SLEEP | \
+				 CTR_FLAG_MIN_RECOVERY_RATE | \
+				 CTR_FLAG_MAX_RECOVERY_RATE | \
+				 CTR_FLAG_REGION_SIZE | \
 				 CTR_FLAG_RAID10_COPIES | \
 				 CTR_FLAG_RAID10_FORMAT | \
 				 CTR_FLAG_DELTA_DISKS | \
-				 CTR_FLAG_DATA_OFFSET)
+				 CTR_FLAG_DATA_OFFSET | \
+				 CTR_FLAG_RAID10_USE_NEAR_SETS)
 
-/* "raid10" does not accept any raid1 or stripe cache options */
-#define RAID10_INVALID_FLAGS	(CTR_FLAG_WRITE_MOSTLY | \
-				 CTR_FLAG_MAX_WRITE_BEHIND | \
-				 CTR_FLAG_STRIPE_CACHE)
 /*
  * "raid4/5/6" do not accept any raid1 or raid10 specific options
  *
@@ -140,13 +149,28 @@ struct raid_dev {
  * that both parity and q-syndrome are being written properly with
  * any writes
  */
-#define RAID45_INVALID_FLAGS	(CTR_FLAG_WRITE_MOSTLY | \
+#define RAID45_VALID_FLAGS	(CTR_FLAGS_ANY_SYNC | \
+				 CTR_FLAG_REBUILD | \
+				 CTR_FLAG_DAEMON_SLEEP | \
+				 CTR_FLAG_MIN_RECOVERY_RATE | \
+				 CTR_FLAG_MAX_RECOVERY_RATE | \
 				 CTR_FLAG_MAX_WRITE_BEHIND | \
-				 CTR_FLAG_RAID10_FORMAT | \
-				 CTR_FLAG_RAID10_COPIES | \
-				 CTR_FLAG_RAID10_USE_NEAR_SETS)
-#define RAID6_INVALID_FLAGS	(CTR_FLAG_NOSYNC | RAID45_INVALID_FLAGS)
-/* ...invalid options definitions per raid level */
+				 CTR_FLAG_STRIPE_CACHE | \
+				 CTR_FLAG_REGION_SIZE | \
+				 CTR_FLAG_DELTA_DISKS | \
+				 CTR_FLAG_DATA_OFFSET)
+
+#define RAID6_VALID_FLAGS	(CTR_FLAG_SYNC | \
+				 CTR_FLAG_REBUILD | \
+				 CTR_FLAG_DAEMON_SLEEP | \
+				 CTR_FLAG_MIN_RECOVERY_RATE | \
+				 CTR_FLAG_MAX_RECOVERY_RATE | \
+				 CTR_FLAG_MAX_WRITE_BEHIND | \
+				 CTR_FLAG_STRIPE_CACHE | \
+				 CTR_FLAG_REGION_SIZE | \
+				 CTR_FLAG_DELTA_DISKS | \
+				 CTR_FLAG_DATA_OFFSET)
+/* ...valid options definitions per raid level */
 
 /*
  * Flags for rs->runtime_flags field
@@ -354,31 +378,31 @@ static bool rt_is_raid456(struct raid_type *rt)
 }
 /* END: raid level bools */
 
-/* Return invalid ctr flags for the raid level of @rs */
-static uint32_t __invalid_flags(struct raid_set *rs)
+/* Return valid ctr flags for the raid level of @rs */
+static unsigned long __valid_flags(struct raid_set *rs)
 {
 	if (rt_is_raid0(rs->raid_type))
-		return RAID0_INVALID_FLAGS;
+		return RAID0_VALID_FLAGS;
 	else if (rt_is_raid1(rs->raid_type))
-		return RAID1_INVALID_FLAGS;
+		return RAID1_VALID_FLAGS;
 	else if (rt_is_raid10(rs->raid_type))
-		return RAID10_INVALID_FLAGS;
+		return RAID10_VALID_FLAGS;
 	else if (rt_is_raid45(rs->raid_type))
-		return RAID45_INVALID_FLAGS;
+		return RAID45_VALID_FLAGS;
 	else if (rt_is_raid6(rs->raid_type))
-		return RAID6_INVALID_FLAGS;
+		return RAID6_VALID_FLAGS;
 
 	return ~0;
 }
 
 /*
- * Check for any invalid flags set on @rs defined by bitset @invalid_flags
+ * Check for valid flags set on @rs
  *
  * Has to be called after parsing of the ctr flags!
  */
-static int rs_check_for_invalid_flags(struct raid_set *rs)
+static int rs_check_for_valid_flags(struct raid_set *rs)
 {
-	if (rs->ctr_flags & __invalid_flags(rs)) {
+	if (rs->ctr_flags & ~__valid_flags(rs)) {
 		rs->ti->error = "Invalid flags combination";
 		return -EINVAL;
 	}
@@ -1282,7 +1306,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 	rs->md.external = 1;
 
 	/* Check, if any invalid ctr arguments have been passed in for the raid level */
-	return rs_check_for_invalid_flags(rs);
+	return rs_check_for_valid_flags(rs);
 }
 
 /* Return # of data stripes as kept in mddev as of @rs (i.e. as of superblock) */
-- 
cgit v1.2.3-70-g09d2


From 40ba37e5647c9241cbf5f63495815d7022e00a29 Mon Sep 17 00:00:00 2001
From: Heinz Mauelshagen <heinzm@redhat.com>
Date: Mon, 13 Jun 2016 17:55:13 +0200
Subject: dm raid: add prerequisite functions and definitions for reshaping

Add rs_is_reshapable(), rs_data_stripes(), rs_reshape_requested(),
rs_set_dev_and_array_sectors() and rs_adjust_data_offsets()

Remove superfluous check for reshape message

Correct runtime bit definitions to be incremental

Signed-off-by: Heinz Mauelshagen <heinzm@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-raid.c | 224 ++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 202 insertions(+), 22 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index a32cddbb5f4f..14835ae064c1 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -19,6 +19,11 @@
 #define DM_MSG_PREFIX "raid"
 #define	MAX_RAID_DEVICES	253 /* md-raid kernel limit */
 
+/*
+ * Minimum sectors of free reshape space per raid device
+ */
+#define	MIN_FREE_RESHAPE_SPACE to_sector(4*4096)
+
 static bool devices_handle_discard_safely = false;
 
 /*
@@ -180,10 +185,10 @@ struct raid_dev {
  * e.g. to prevent another resume from preresume processing
  * the raid set all over again.
  */
-#define RT_FLAG_RS_PRERESUMED		0x1
-#define RT_FLAG_RS_RESUMED		0x2
-#define RT_FLAG_RS_BITMAP_LOADED	0x4
-#define RT_FLAG_UPDATE_SBS		0x8
+#define RT_FLAG_RS_PRERESUMED		0
+#define RT_FLAG_RS_RESUMED		1
+#define RT_FLAG_RS_BITMAP_LOADED	2
+#define RT_FLAG_UPDATE_SBS		3
 
 /* Array elements of 64 bit needed for rebuild/write_mostly bits */
 #define DISKS_ARRAY_ELEMS ((MAX_RAID_DEVICES + (sizeof(uint64_t) * 8 - 1)) / sizeof(uint64_t) / 8)
@@ -337,6 +342,20 @@ static bool rs_is_raid10(struct raid_set *rs)
 	return rs->md.level == 10;
 }
 
+/* Return true, if raid set in @rs is level 4, 5 or 6 */
+static bool rs_is_raid456(struct raid_set *rs)
+{
+	return __within_range(rs->md.level, 4, 6);
+}
+
+/* Return true, if raid set in @rs is reshapable */
+static unsigned int __is_raid10_far(int layout);
+static bool rs_is_reshapable(struct raid_set *rs)
+{
+	return rs_is_raid456(rs) ||
+	       (rs_is_raid10(rs) && !__is_raid10_far(rs->md.new_layout));
+}
+
 /*
  * bool helpers to test for various raid levels of a raid type
  */
@@ -899,7 +918,7 @@ static int validate_raid_redundancy(struct raid_set *rs)
 					rebuilds_per_group = 0;
 				d = i % rs->md.raid_disks;
 				if ((!rs->dev[d].rdev.sb_page ||
-				     !test_bit(In_sync, &rs->dev[d].rdev.flags)) &&
+				    !test_bit(In_sync, &rs->dev[i].rdev.flags)) &&
 				    (++rebuilds_per_group >= copies))
 					goto too_many;
 			}
@@ -971,7 +990,6 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 	unsigned raid10_copies = 2;
 	unsigned i;
 	unsigned value, region_size = 0;
-	sector_t sectors_per_dev = rs->ti->len;
 	sector_t max_io_len;
 	const char *arg, *key;
 	struct raid_dev *rd;
@@ -1286,20 +1304,10 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 			return -EINVAL;
 		}
 
-		/* (Len * #mirrors) / #devices */
-		sectors_per_dev = rs->ti->len * raid10_copies;
-		sector_div(sectors_per_dev, rs->md.raid_disks);
-
-		rs->md.layout = raid10_format_to_md_layout(rs, raid10_format, raid10_copies);
-		rs->md.new_layout = rs->md.layout;
-	} else if (!rt_is_raid1(rt) &&
-		   sector_div(sectors_per_dev, (rs->md.raid_disks - rt->parity_devs))) {
-		rs->ti->error = "Target length not divisible by number of data devices";
-		return -EINVAL;
+		rs->md.layout = rs->md.new_layout;
 	}
 
 	rs->raid10_copies = raid10_copies;
-	rs->md.dev_sectors = sectors_per_dev;
 
 	/* Assume there are no metadata devices until the drives are parsed */
 	rs->md.persistent = 0;
@@ -1315,6 +1323,66 @@ static unsigned int mddev_data_stripes(struct raid_set *rs)
 	return rs->md.raid_disks - rs->raid_type->parity_devs;
 }
 
+/* Return # of data stripes of @rs (i.e. as of ctr) */
+static unsigned int rs_data_stripes(struct raid_set *rs)
+{
+	return rs->raid_disks - rs->raid_type->parity_devs;
+}
+
+/* Calculate the sectors per device and per array used for @rs */
+static int rs_set_dev_and_array_sectors(struct raid_set *rs, bool use_mddev)
+{
+	int delta_disks;
+	unsigned int data_stripes;
+	struct mddev *mddev = &rs->md;
+	struct md_rdev *rdev;
+	sector_t array_sectors = rs->ti->len, dev_sectors = rs->ti->len;
+
+	if (use_mddev) {
+		delta_disks = mddev->delta_disks;
+		data_stripes = mddev_data_stripes(rs);
+	} else {
+		delta_disks = rs->delta_disks;
+		data_stripes = rs_data_stripes(rs);
+	}
+
+	/* Special raid1 case w/o delta_disks support (yet) */
+	if (rt_is_raid1(rs->raid_type))
+		;
+	else if (rt_is_raid10(rs->raid_type)) {
+		if (rs->raid10_copies < 2 ||
+		    delta_disks < 0) {
+			rs->ti->error = "Bogus raid10 data copies or delta disks";
+			return EINVAL;
+		}
+
+		dev_sectors *= rs->raid10_copies;
+		if (sector_div(dev_sectors, data_stripes))
+			goto bad;
+
+		array_sectors = (data_stripes + delta_disks) * dev_sectors;
+		if (sector_div(array_sectors, rs->raid10_copies))
+			goto bad;
+
+	} else if (sector_div(dev_sectors, data_stripes))
+		goto bad;
+
+	else
+		/* Striped layouts */
+		array_sectors = (data_stripes + delta_disks) * dev_sectors;
+
+	rdev_for_each(rdev, mddev)
+		rdev->sectors = dev_sectors;
+
+	mddev->array_sectors = array_sectors;
+	mddev->dev_sectors = dev_sectors;
+
+	return 0;
+bad:
+	rs->ti->error = "Target length not divisible by number of data devices";
+	return EINVAL;
+}
+
 static void do_table_event(struct work_struct *ws)
 {
 	struct raid_set *rs = container_of(ws, struct raid_set, md.event_work);
@@ -1487,6 +1555,21 @@ static bool rs_takeover_requested(struct raid_set *rs)
 	return rs->md.new_level != rs->md.level;
 }
 
+/* True if @rs is requested to reshape by ctr */
+static bool rs_reshape_requested(struct raid_set *rs)
+{
+	struct mddev *mddev = &rs->md;
+
+	if (!mddev->level)
+		return false;
+
+	return !__is_raid10_far(mddev->new_layout) &&
+	       mddev->new_level == mddev->level &&
+	       (mddev->new_layout != mddev->layout ||
+		mddev->new_chunk_sectors != mddev->chunk_sectors ||
+		rs->raid_disks + rs->delta_disks != mddev->raid_disks);
+}
+
 /*  Features */
 #define	FEATURE_FLAG_SUPPORTS_V190	0x1 /* Supports extended superblock */
 
@@ -2110,6 +2193,97 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
 	return 0;
 }
 
+/*
+ * Adjust data_offset and new_data_offset on all disk members of @rs
+ * for out of place reshaping if requested by contructor
+ *
+ * We need free space at the beginning of each raid disk for forward
+ * and at the end for backward reshapes which userspace has to provide
+ * via remapping/reordering of space.
+ */
+static int rs_adjust_data_offsets(struct raid_set *rs)
+{
+	sector_t data_offset = 0, new_data_offset = 0;
+	struct md_rdev *rdev;
+
+	/* Constructor did not request data offset change */
+	if (!test_bit(__CTR_FLAG_DATA_OFFSET, &rs->ctr_flags)) {
+		if (!rs_is_reshapable(rs))
+			goto out;
+
+		return 0;
+	}
+
+	/* HM FIXME: get InSync raid_dev? */
+	rdev = &rs->dev[0].rdev;
+
+	if (rs->delta_disks < 0) {
+		/*
+		 * Removing disks (reshaping backwards):
+		 *
+		 * - before reshape: data is at offset 0 and free space
+		 *		     is at end of each component LV
+		 *
+		 * - after reshape: data is at offset rs->data_offset != 0 on each component LV
+		 */
+		data_offset = 0;
+		new_data_offset = rs->data_offset;
+
+	} else if (rs->delta_disks > 0) {
+		/*
+		 * Adding disks (reshaping forwards):
+		 *
+		 * - before reshape: data is at offset rs->data_offset != 0 and
+		 *		     free space is at begin of each component LV
+		 *
+		 * - after reshape: data is at offset 0 on each component LV
+		 */
+		data_offset = rs->data_offset;
+		new_data_offset = 0;
+
+	} else {
+		/*
+		 * User space passes in 0 for data offset after having removed reshape space
+		 *
+		 * - or - (data offset != 0)
+		 *
+		 * Changing RAID layout or chunk size -> toggle offsets
+		 *
+		 * - before reshape: data is at offset rs->data_offset 0 and
+		 *		     free space is at end of each component LV
+		 *		     -or-
+		 *                   data is at offset rs->data_offset != 0 and
+		 *		     free space is at begin of each component LV
+		 *
+		 * - after reshape: data is at offset 0 if i was at offset != 0
+		 *                  of at offset != 0 if it was at offset 0
+		 *                  on each component LV
+		 *
+		 */
+		data_offset = rs->data_offset ? rdev->data_offset : 0;
+		new_data_offset = data_offset ? 0 : rs->data_offset;
+		set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags);
+	}
+
+	/*
+	 * Make sure we got a minimum amount of free sectors per device
+	 */
+	if (rs->data_offset &&
+	    to_sector(i_size_read(rdev->bdev->bd_inode)) - rdev->sectors < MIN_FREE_RESHAPE_SPACE) {
+		rs->ti->error = data_offset ? "No space for forward reshape" :
+					      "No space for backward reshape";
+		return -ENOSPC;
+	}
+out:
+	/* Adjust data offsets on all rdevs */
+	rdev_for_each(rdev, &rs->md) {
+		rdev->data_offset = data_offset;
+		rdev->new_data_offset = new_data_offset;
+	}
+
+	return 0;
+}
+
 /* Userpace reordered disks -> adjust raid_disk indexes in @rs */
 static void __reorder_raid_disk_indexes(struct raid_set *rs)
 {
@@ -2286,6 +2460,10 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 
 	rs->md.sync_super = super_sync;
 
+	r = rs_set_dev_and_array_sectors(rs, false);
+	if (r)
+		return r;
+
 	/*
 	 * Backup any new raid set level, layout, ...
 	 * requested to be able to compare to superblock
@@ -2320,9 +2498,16 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 		/* Tell preresume to update superblocks with new layout */
 		set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags);
 		rs_set_new(rs);
+	} else if (rs_reshape_requested(rs)) {
+		rs_set_cur(rs); /* Dummy to reject, fill in */
 	} else
 		rs_set_cur(rs);
 
+	/* If constructor requested it, change data and new_data offsets */
+	r = rs_adjust_data_offsets(rs);
+	if (r)
+		return r;
+
 	/* Start raid set read-only and assumed clean to change in raid_resume() */
 	rs->md.ro = 1;
 	rs->md.in_sync = 1;
@@ -2657,11 +2842,6 @@ static int raid_message(struct dm_target *ti, unsigned argc, char **argv)
 	struct raid_set *rs = ti->private;
 	struct mddev *mddev = &rs->md;
 
-	if (!strcasecmp(argv[0], "reshape")) {
-		DMERR("Reshape not supported.");
-		return -EINVAL;
-	}
-
 	if (!mddev->pers || !mddev->pers->sync_request)
 		return -EINVAL;
 
-- 
cgit v1.2.3-70-g09d2


From 9dbd1aa3a81c6166608fec87994b6c464701f73a Mon Sep 17 00:00:00 2001
From: Heinz Mauelshagen <heinzm@redhat.com>
Date: Mon, 13 Jun 2016 17:55:14 +0200
Subject: dm raid: add reshaping support to the target

Add bool functions rs_is_recovering() and rs_is_reshaping()
to test for ongoing recovery/reshaping respectively in order
to reject respective requests on ongoing ones.

Remove ctr array size check, because ti->len and array
sectors will differ during disk addition/removal reshape.

Use __is_raid10_near() rather than type string compare.

Introduce rs_check_reshape() and rs_start_reshape(),
use the former in the ctr to reject bogus reshape requests
and the latter in preresume to actually start a reshape.

Signed-off-by: Heinz Mauelshagen <heinzm@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-raid.c | 505 +++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 447 insertions(+), 58 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 14835ae064c1..e4c41232107f 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -189,6 +189,7 @@ struct raid_dev {
 #define RT_FLAG_RS_RESUMED		1
 #define RT_FLAG_RS_BITMAP_LOADED	2
 #define RT_FLAG_UPDATE_SBS		3
+#define RT_FLAG_RESHAPE_RS		4
 
 /* Array elements of 64 bit needed for rebuild/write_mostly bits */
 #define DISKS_ARRAY_ELEMS ((MAX_RAID_DEVICES + (sizeof(uint64_t) * 8 - 1)) / sizeof(uint64_t) / 8)
@@ -206,6 +207,7 @@ struct raid_set {
 	struct dm_target *ti;
 
 	uint32_t bitmap_loaded;
+	uint32_t stripe_cache_entries;
 	unsigned long ctr_flags;
 	unsigned long runtime_flags;
 
@@ -219,25 +221,22 @@ struct raid_set {
 	struct mddev md;
 	struct raid_type *raid_type;
 	struct dm_target_callbacks callbacks;
-	struct rs_layout rs_layout;
 
 	struct raid_dev dev[0];
 };
 
-static void rs_config_backup(struct raid_set *rs)
+static void rs_config_backup(struct raid_set *rs, struct rs_layout *l)
 {
 	struct mddev *mddev = &rs->md;
-	struct rs_layout *l = &rs->rs_layout;
 
 	l->new_level = mddev->new_level;
 	l->new_layout = mddev->new_layout;
 	l->new_chunk_sectors = mddev->new_chunk_sectors;
 }
 
-static void rs_config_restore(struct raid_set *rs)
+static void rs_config_restore(struct raid_set *rs, struct rs_layout *l)
 {
 	struct mddev *mddev = &rs->md;
-	struct rs_layout *l = &rs->rs_layout;
 
 	mddev->new_level = l->new_level;
 	mddev->new_layout = l->new_layout;
@@ -336,6 +335,12 @@ static bool rs_is_raid0(struct raid_set *rs)
 	return !rs->md.level;
 }
 
+/* Return true, if raid set in @rs is raid1 */
+static bool rs_is_raid1(struct raid_set *rs)
+{
+	return rs->md.level == 1;
+}
+
 /* Return true, if raid set in @rs is raid10 */
 static bool rs_is_raid10(struct raid_set *rs)
 {
@@ -356,6 +361,20 @@ static bool rs_is_reshapable(struct raid_set *rs)
 	       (rs_is_raid10(rs) && !__is_raid10_far(rs->md.new_layout));
 }
 
+/* Return true, if raid set in @rs is recovering */
+static bool rs_is_recovering(struct raid_set *rs)
+{
+	smp_rmb();
+	return rs->md.recovery_cp != MaxSector;
+}
+
+/* Return true, if raid set in @rs is reshaping */
+static bool rs_is_reshaping(struct raid_set *rs)
+{
+	smp_rmb();
+	return rs->md.reshape_position != MaxSector;
+}
+
 /*
  * bool helpers to test for various raid levels of a raid type
  */
@@ -590,6 +609,24 @@ static struct raid_type *get_raid_type_by_ll(const int level, const int layout)
 	return NULL;
 }
 
+/*
+ * Conditionally change bdev capacity of @rs
+ * in case of a disk add/remove reshape
+ */
+static void rs_set_capacity(struct raid_set *rs)
+{
+	struct mddev *mddev = &rs->md;
+
+	/* Make sure we access most actual mddev properties */
+	smp_rmb();
+	if (rs->ti->len != mddev->array_sectors && !rs_is_reshaping(rs)) {
+		struct gendisk *gendisk = dm_disk(dm_table_get_md(rs->ti->table));
+
+		set_capacity(gendisk, mddev->array_sectors);
+		revalidate_disk(gendisk);
+	}
+}
+
 /*
  * Set the mddev properties in @rs to the current
  * ones retrieved from the freshest superblock
@@ -642,6 +679,7 @@ static struct raid_set *raid_set_alloc(struct dm_target *ti, struct raid_type *r
 
 	rs->ti = ti;
 	rs->raid_type = raid_type;
+	rs->stripe_cache_entries = 256;
 	rs->md.raid_disks = raid_devs;
 	rs->md.level = raid_type->level;
 	rs->md.new_level = rs->md.level;
@@ -874,7 +912,7 @@ static int validate_region_size(struct raid_set *rs, unsigned long region_size)
 static int validate_raid_redundancy(struct raid_set *rs)
 {
 	unsigned i, rebuild_cnt = 0;
-	unsigned rebuilds_per_group = 0, copies, d;
+	unsigned rebuilds_per_group = 0, copies;
 	unsigned group_size, last_group_start;
 
 	for (i = 0; i < rs->md.raid_disks; i++)
@@ -894,7 +932,7 @@ static int validate_raid_redundancy(struct raid_set *rs)
 			goto too_many;
 		break;
 	case 10:
-		copies = raid10_md_layout_to_copies(rs->md.layout);
+		copies = raid10_md_layout_to_copies(rs->md.new_layout);
 		if (rebuild_cnt < copies)
 			break;
 
@@ -912,12 +950,11 @@ static int validate_raid_redundancy(struct raid_set *rs)
 		 *	    A	 A    B	   B	C
 		 *	    C	 D    D	   E	E
 		 */
-		if (!strcmp("near", raid10_md_layout_to_format(rs->md.layout))) {
-			for (i = 0; i < rs->md.raid_disks * copies; i++) {
+		if (__is_raid10_near(rs->md.new_layout)) {
+			for (i = 0; i < rs->raid_disks; i++) {
 				if (!(i % copies))
 					rebuilds_per_group = 0;
-				d = i % rs->md.raid_disks;
-				if ((!rs->dev[d].rdev.sb_page ||
+				if ((!rs->dev[i].rdev.sb_page ||
 				    !test_bit(In_sync, &rs->dev[i].rdev.flags)) &&
 				    (++rebuilds_per_group >= copies))
 					goto too_many;
@@ -986,10 +1023,10 @@ too_many:
 static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 			     unsigned num_raid_params)
 {
-	int raid10_format = ALGORITHM_RAID10_DEFAULT;
+	int value, raid10_format = ALGORITHM_RAID10_DEFAULT;
 	unsigned raid10_copies = 2;
 	unsigned i;
-	unsigned value, region_size = 0;
+	unsigned region_size = 0;
 	sector_t max_io_len;
 	const char *arg, *key;
 	struct raid_dev *rd;
@@ -998,7 +1035,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 	arg = dm_shift_arg(as);
 	num_raid_params--; /* Account for chunk_size argument */
 
-	if (kstrtouint(arg, 10, &value) < 0) {
+	if (kstrtoint(arg, 10, &value) < 0) {
 		rs->ti->error = "Bad numerical argument given for chunk_size";
 		return -EINVAL;
 	}
@@ -1105,7 +1142,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 			continue;
 		}
 
-		if (kstrtouint(arg, 10, &value) < 0) {
+		if (kstrtoint(arg, 10, &value) < 0) {
 			rs->ti->error = "Bad numerical argument given in raid params";
 			return -EINVAL;
 		}
@@ -1207,21 +1244,12 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 				return -EINVAL;
 			}
 
-			/*
-			 * In device-mapper, we specify things in sectors, but
-			 * MD records this value in kB
-			 */
-			value /= 2;
-
 			if (!rt_is_raid456(rt)) {
 				rs->ti->error = "Inappropriate argument: stripe_cache";
 				return -EINVAL;
 			}
-			if (raid5_set_cache_size(&rs->md, (int)value)) {
-				rs->ti->error = "Bad stripe_cache size";
-				return -EINVAL;
-			}
 
+			rs->stripe_cache_entries = value;
 		} else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_MIN_RECOVERY_RATE))) {
 			if (test_and_set_bit(__CTR_FLAG_MIN_RECOVERY_RATE, &rs->ctr_flags)) {
 				rs->ti->error = "Only one min_recovery_rate argument pair allowed";
@@ -1303,8 +1331,6 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 			rs->ti->error = "RAID10 format 'near' and 'raid10_use_near_sets' are incompatible";
 			return -EINVAL;
 		}
-
-		rs->md.layout = rs->md.new_layout;
 	}
 
 	rs->raid10_copies = raid10_copies;
@@ -1317,6 +1343,46 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 	return rs_check_for_valid_flags(rs);
 }
 
+/* Set raid4/5/6 cache size */
+static int rs_set_raid456_stripe_cache(struct raid_set *rs)
+{
+	int r;
+	struct r5conf *conf;
+	struct mddev *mddev = &rs->md;
+	uint32_t min_stripes = max(mddev->chunk_sectors, mddev->new_chunk_sectors) / 2;
+	uint32_t nr_stripes = rs->stripe_cache_entries;
+
+	if (!rt_is_raid456(rs->raid_type)) {
+		rs->ti->error = "Inappropriate raid level; cannot change stripe_cache size";
+		return -EINVAL;
+	}
+
+	if (nr_stripes < min_stripes) {
+		DMINFO("Adjusting requested %u stripe cache entries to %u to suit stripe size",
+		       nr_stripes, min_stripes);
+		nr_stripes = min_stripes;
+	}
+
+	conf = mddev->private;
+	if (!conf) {
+		rs->ti->error = "Cannot change stripe_cache size on inactive RAID set";
+		return -EINVAL;
+	}
+
+	/* Try setting number of stripes in raid456 stripe cache */
+	if (conf->min_nr_stripes != nr_stripes) {
+		r = raid5_set_cache_size(mddev, nr_stripes);
+		if (r) {
+			rs->ti->error = "Failed to set raid4/5/6 stripe cache size";
+			return r;
+		}
+
+		DMINFO("%u stripe cache entries", nr_stripes);
+	}
+
+	return 0;
+}
+
 /* Return # of data stripes as kept in mddev as of @rs (i.e. as of superblock) */
 static unsigned int mddev_data_stripes(struct raid_set *rs)
 {
@@ -1337,6 +1403,7 @@ static int rs_set_dev_and_array_sectors(struct raid_set *rs, bool use_mddev)
 	struct mddev *mddev = &rs->md;
 	struct md_rdev *rdev;
 	sector_t array_sectors = rs->ti->len, dev_sectors = rs->ti->len;
+	sector_t cur_dev_sectors = rs->dev[0].rdev.sectors;
 
 	if (use_mddev) {
 		delta_disks = mddev->delta_disks;
@@ -1377,6 +1444,9 @@ static int rs_set_dev_and_array_sectors(struct raid_set *rs, bool use_mddev)
 	mddev->array_sectors = array_sectors;
 	mddev->dev_sectors = dev_sectors;
 
+	if (!rs_is_raid0(rs) && dev_sectors > cur_dev_sectors)
+		mddev->recovery_cp = dev_sectors;
+
 	return 0;
 bad:
 	rs->ti->error = "Target length not divisible by number of data devices";
@@ -1387,6 +1457,7 @@ static void do_table_event(struct work_struct *ws)
 {
 	struct raid_set *rs = container_of(ws, struct raid_set, md.event_work);
 
+	rs_set_capacity(rs);
 	dm_table_event(rs->ti->table);
 }
 
@@ -1410,6 +1481,17 @@ static int rs_check_takeover(struct raid_set *rs)
 	struct mddev *mddev = &rs->md;
 	unsigned int near_copies;
 
+	smp_rmb();
+	if (rs->md.degraded) {
+		rs->ti->error = "Can't takeover degraded raid set";
+		return -EPERM;
+	}
+
+	if (rs_is_reshaping(rs)) {
+		rs->ti->error = "Can't takeover reshaping raid set";
+		return -EPERM;
+	}
+
 	switch (mddev->level) {
 	case 0:
 		/* raid0 -> raid1/5 with one disk */
@@ -1419,7 +1501,7 @@ static int rs_check_takeover(struct raid_set *rs)
 
 		/* raid0 -> raid10 */
 		if (mddev->new_level == 10 &&
-		    !(rs->raid_disks % 2))
+		    !(rs->raid_disks % mddev->raid_disks))
 			return 0;
 
 		/* raid0 with multiple disks -> raid4/5/6 */
@@ -1658,6 +1740,39 @@ struct dm_raid_superblock {
 	/* Always set rest up to logical block size to 0 when writing (see get_metadata_device() below). */
 } __packed;
 
+/*
+ * Check for reshape constraints on raid set @rs:
+ *
+ * - reshape function non-existent
+ * - degraded set
+ * - ongoing recovery
+ * - ongoing reshape
+ *
+ * Returns 0 if none or -EPERM if given constraint
+ * and error message reference in @errmsg
+ */
+static int rs_check_reshape(struct raid_set *rs)
+{
+	struct mddev *mddev = &rs->md;
+
+	smp_rmb(); /* Make sure we access recent reshape position */
+
+	if (!mddev->pers || !mddev->pers->check_reshape)
+		rs->ti->error = "Reshape not supported";
+	else if (mddev->degraded)
+		rs->ti->error = "Can't reshape degraded raid set";
+	else if (rs_is_recovering(rs))
+		rs->ti->error = "Convert request on recovering raid set prohibited";
+	else if (mddev->reshape_position && rs_is_reshaping(rs))
+		rs->ti->error = "raid set already reshaping!";
+	else if (!(rs_is_raid10(rs) || rs_is_raid456(rs)))
+		rs->ti->error = "Reshaping only supported for raid4/5/6/10";
+	else
+		return 0;
+
+	return -EPERM;
+}
+
 static int read_disk_sb(struct md_rdev *rdev, int size)
 {
 	BUG_ON(!rdev->sb_page);
@@ -1936,6 +2051,10 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev)
 	 *    and the new device needs to be rebuilt - in which
 	 *    case the In_sync bit will /not/ be set and
 	 *    recovery_cp must be MaxSector.
+	 * 3) This is/are a new device(s) being added to an old
+	 *    raid set during takeover to a higher raid level
+	 *    to provide capacity for redundancy or during reshape
+	 *    to add capacity to grow the raid set.
 	 */
 	d = 0;
 	rdev_for_each(r, mddev) {
@@ -1961,9 +2080,9 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev)
 		if (new_devs == rs->raid_disks) {
 			DMINFO("Superblocks created for new raid set");
 			set_bit(MD_ARRAY_FIRST_USE, &mddev->flags);
-			set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags);
 			mddev->recovery_cp = 0;
-		} else if (new_devs && new_devs != rs->raid_disks && !rebuilds) {
+		} else if (new_devs != rebuilds &&
+			   new_devs != rs->delta_disks) {
 			DMERR("New device injected into existing raid set without "
 			      "'delta_disks' or 'rebuild' parameter specified");
 			return -EINVAL;
@@ -1978,12 +2097,13 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev)
 			DMERR("new device%s provided without 'rebuild'",
 			      new_devs > 1 ? "s" : "");
 			return -EINVAL;
-		} else if (mddev->recovery_cp != MaxSector) {
+		} else if (rs_is_recovering(rs)) {
 			DMERR("'rebuild' specified while raid set is not in-sync (recovery_cp=%llu)",
 			      (unsigned long long) mddev->recovery_cp);
 			return -EINVAL;
-		} else if (mddev->reshape_position != MaxSector) {
-			DMERR("'rebuild' specified while raid set is being reshaped");
+		} else if (rs_is_reshaping(rs)) {
+			DMERR("'rebuild' specified while raid set is being reshaped (reshape_position=%llu)",
+			      (unsigned long long) mddev->reshape_position);
 			return -EINVAL;
 		}
 	}
@@ -2082,7 +2202,7 @@ static int super_validate(struct raid_set *rs, struct md_rdev *rdev)
 		 * If no reshape in progress -> we're recovering single
 		 * disk(s) and have to set the device(s) to out-of-sync
 		 */
-		else if (rs->md.reshape_position == MaxSector)
+		else if (!rs_is_reshaping(rs))
 			clear_bit(In_sync, &rdev->flags); /* Mandatory for recovery */
 	}
 
@@ -2181,15 +2301,13 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
 	 * Validation of the freshest device provides the source of
 	 * validation for the remaining devices.
 	 */
-	if (super_validate(rs, freshest)) {
-		rs->ti->error = "Unable to assemble array: Invalid superblocks";
+	rs->ti->error = "Unable to assemble array: Invalid superblocks";
+	if (super_validate(rs, freshest))
 		return -EINVAL;
-	}
 
 	rdev_for_each(rdev, mddev)
 		if ((rdev != freshest) && super_validate(rs, rdev))
 			return -EINVAL;
-
 	return 0;
 }
 
@@ -2343,6 +2461,106 @@ static int rs_setup_takeover(struct raid_set *rs)
 	return 0;
 }
 
+/*
+ *
+ * - change raid layout
+ * - change chunk size
+ * - add disks
+ * - remove disks
+ */
+static int rs_setup_reshape(struct raid_set *rs)
+{
+	int r = 0;
+	unsigned int cur_raid_devs, d;
+	struct mddev *mddev = &rs->md;
+	struct md_rdev *rdev;
+
+	mddev->delta_disks = rs->delta_disks;
+	cur_raid_devs = mddev->raid_disks;
+
+	/* Ignore impossible layout change whilst adding/removing disks */
+	if (mddev->delta_disks &&
+	    mddev->layout != mddev->new_layout) {
+		DMINFO("Ignoring invalid layout change with delta_disks=%d", rs->delta_disks);
+		mddev->new_layout = mddev->layout;
+	}
+
+	/*
+	 * Adjust array size:
+	 *
+	 * - in case of adding disks, array size has
+	 *   to grow after the disk adding reshape,
+	 *   which'll hapen in the event handler;
+	 *   reshape will happen forward, so space has to
+	 *   be available at the beginning of each disk
+	 *
+	 * - in case of removing disks, array size
+	 *   has to shrink before starting the reshape,
+	 *   which'll happen here;
+	 *   reshape will happen backward, so space has to
+	 *   be available at the end of each disk
+	 *
+	 * - data_offset and new_data_offset are
+	 *   adjusted for afreentioned out of place
+	 *   reshaping based on userspace passing in
+	 *   the "data_offset <sectors>" key/value
+	 *   pair via te constructor
+	 */
+
+	/* Add disk(s) */
+	if (rs->delta_disks > 0) {
+		/* Prepare disks for check in raid4/5/6/10 {check|start}_reshape */
+		for (d = cur_raid_devs; d < rs->raid_disks; d++) {
+			rdev = &rs->dev[d].rdev;
+			clear_bit(In_sync, &rdev->flags);
+
+			/*
+			 * save_raid_disk needs to be -1, or recovery_offset will be set to 0
+			 * by md, which'll store that erroneously in the superblock on reshape
+			 */
+			rdev->saved_raid_disk = -1;
+			rdev->raid_disk = d;
+
+			rdev->sectors = mddev->dev_sectors;
+			rdev->recovery_offset = MaxSector;
+		}
+
+		mddev->reshape_backwards = 0; /* adding disks -> forward reshape */
+
+	/* Remove disk(s) */
+	} else if (rs->delta_disks < 0) {
+		r = rs_set_dev_and_array_sectors(rs, true);
+		mddev->reshape_backwards = 1; /* removing disk(s) -> backward reshape */
+
+	/* Change layout and/or chunk size */
+	} else {
+		/*
+		 * Reshape layout (e.g. raid5_ls -> raid5_n) and/or chunk size:
+		 *
+		 * keeping number of disks and do layout change ->
+		 *
+		 * toggle reshape_backward depending on data_offset:
+		 *
+		 * - free space upfront -> reshape forward
+		 *
+		 * - free space at the end -> reshape backward
+		 *
+		 *
+		 * This utilizes free reshape space avoiding the need
+		 * for userspace to move (parts of) LV segments in
+		 * case of layout/chunksize change  (for disk
+		 * adding/removing reshape space has to be at
+		 * the proper address (see above with delta_disks):
+		 *
+		 * add disk(s)   -> begin
+		 * remove disk(s)-> end
+		 */
+		mddev->reshape_backwards = rs->dev[0].rdev.data_offset ? 0 : 1;
+	}
+
+	return r;
+}
+
 /*
  * Enable/disable discard support on RAID set depending on
  * RAID level and discard properties of underlying RAID members.
@@ -2411,6 +2629,7 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	unsigned num_raid_params, num_raid_devs;
 	struct raid_set *rs = NULL;
 	const char *arg;
+	struct rs_layout rs_layout;
 	struct dm_arg_set as = { argc, argv }, as_nrd;
 	struct dm_arg _args[] = {
 		{ 0, as.argc, "Cannot understand number of raid parameters" },
@@ -2469,7 +2688,7 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	 * requested to be able to compare to superblock
 	 * members for conversion decisions.
 	 */
-	rs_config_backup(rs);
+	rs_config_backup(rs, &rs_layout);
 
 	r = analyse_superblocks(ti, rs);
 	if (r)
@@ -2480,13 +2699,23 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	ti->num_flush_bios = 1;
 
 	/* Restore any requested new layout for conversion decision */
-	rs_config_restore(rs);
+	rs_config_restore(rs, &rs_layout);
 
-	/*
-	 * If a takeover is needed, just set the level to
-	 * the new requested one and allow the raid set to run.
-	 */
-	if (rs_takeover_requested(rs)) {
+	if (test_bit(MD_ARRAY_FIRST_USE, &rs->md.flags)) {
+		set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags);
+		rs_set_new(rs);
+	} else if (rs_is_reshaping(rs))
+		; /* skip rs setup */
+	else if (rs_takeover_requested(rs)) {
+		if (rs_is_reshaping(rs)) {
+			ti->error = "Can't takeover a reshaping raid set";
+			return -EPERM;
+		}
+
+		/*
+		 * If a takeover is needed, just set the level to
+		 * the new requested one and allow the raid set to run.
+		 */
 		r = rs_check_takeover(rs);
 		if (r)
 			return r;
@@ -2495,11 +2724,55 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 		if (r)
 			return r;
 
-		/* Tell preresume to update superblocks with new layout */
 		set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags);
 		rs_set_new(rs);
 	} else if (rs_reshape_requested(rs)) {
-		rs_set_cur(rs); /* Dummy to reject, fill in */
+		if (rs_is_reshaping(rs)) {
+			ti->error = "raid set already reshaping!";
+			return -EPERM;
+		}
+
+		if (rs_is_raid10(rs)) {
+			if (rs->raid_disks != rs->md.raid_disks &&
+			    __is_raid10_near(rs->md.layout) &&
+			    rs->raid10_copies &&
+			    rs->raid10_copies != __raid10_near_copies(rs->md.layout)) {
+				/*
+				 * raid disk have to be multiple of data copies to allow this conversion,
+				 *
+				 * This is actually not a reshape it is a
+				 * rebuild of any additional mirrors per group
+				 */
+				if (rs->raid_disks % rs->raid10_copies) {
+					ti->error = "Can't reshape raid10 mirror groups";
+					return -EINVAL;
+				}
+
+				/* Userpace reordered disks to add/remove mirrors -> adjust raid_disk indexes */
+				__reorder_raid_disk_indexes(rs);
+				rs->md.layout = raid10_format_to_md_layout(rs, ALGORITHM_RAID10_NEAR,
+									   rs->raid10_copies);
+				rs->md.new_layout = rs->md.layout;
+
+			} else
+				set_bit(RT_FLAG_RESHAPE_RS, &rs->runtime_flags);
+
+		} else if (rs_is_raid456(rs))
+			set_bit(RT_FLAG_RESHAPE_RS, &rs->runtime_flags);
+
+		/*
+		 * HM FIXME: process raid1 via delta_disks as well?
+		 *           Would cause allocations in raid1->check_reshape
+		 *           though, thus more issues with potential failures
+		 */
+		else if (rs_is_raid1(rs))
+			rs->md.raid_disks = rs->raid_disks;
+
+		if (rs->md.raid_disks < rs->raid_disks)
+			set_bit(MD_ARRAY_FIRST_USE, &rs->md.flags);
+
+		set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags);
+		rs_set_cur(rs);
 	} else
 		rs_set_cur(rs);
 
@@ -2517,25 +2790,46 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	mddev_lock_nointr(&rs->md);
 	r = md_run(&rs->md);
 	rs->md.in_sync = 0; /* Assume already marked dirty */
-	mddev_unlock(&rs->md);
 
 	if (r) {
-		ti->error = "Fail to run raid array";
+		ti->error = "Failed to run raid array";
+		mddev_unlock(&rs->md);
 		goto bad;
 	}
 
-	if (ti->len != rs->md.array_sectors) {
-		ti->error = "Array size does not match requested target length";
-		r = -EINVAL;
-		goto size_mismatch;
-	}
 	rs->callbacks.congested_fn = raid_is_congested;
 	dm_table_add_target_callbacks(ti->table, &rs->callbacks);
 
 	mddev_suspend(&rs->md);
+
+	/* Try to adjust the raid4/5/6 stripe cache size to the stripe size */
+	if (rs_is_raid456(rs)) {
+		r = rs_set_raid456_stripe_cache(rs);
+		if (r)
+			goto bad_stripe_cache;
+	}
+
+	/* Now do an early reshape check */
+	if (test_bit(RT_FLAG_RESHAPE_RS, &rs->runtime_flags)) {
+		r = rs_check_reshape(rs);
+		if (r)
+			return r;
+
+		/* Restore new, ctr requested layout to perform check */
+		rs_config_restore(rs, &rs_layout);
+
+		r = rs->md.pers->check_reshape(&rs->md);
+		if (r) {
+			ti->error = "Reshape check failed";
+			goto bad_check_reshape;
+		}
+	}
+
+	mddev_unlock(&rs->md);
 	return 0;
 
-size_mismatch:
+bad_stripe_cache:
+bad_check_reshape:
 	md_stop(&rs->md);
 bad:
 	raid_set_free(rs);
@@ -2557,6 +2851,17 @@ static int raid_map(struct dm_target *ti, struct bio *bio)
 	struct raid_set *rs = ti->private;
 	struct mddev *mddev = &rs->md;
 
+	/*
+	 * If we're reshaping to add disk(s)), ti->len and
+	 * mddev->array_sectors will differ during the process
+	 * (ti->len > mddev->array_sectors), so we have to requeue
+	 * bios with addresses > mddev->array_sectors here or
+	 * or there will occur accesses past EOD of the component
+	 * data images thus erroring the raid set.
+	 */
+	if (unlikely(bio_end_sector(bio) > mddev->array_sectors))
+		return DM_MAPIO_REQUEUE;
+
 	mddev->pers->make_request(mddev, bio);
 
 	return DM_MAPIO_SUBMITTED;
@@ -2709,7 +3014,7 @@ static void raid_status(struct dm_target *ti, status_type_t type,
 		if (!rt)
 			return;
 
-		DMEMIT("%s %d ", rt ? rt->name : "unknown", mddev->raid_disks);
+		DMEMIT("%s %d ", rt->name, mddev->raid_disks);
 
 		/* Access most recent mddev properties for status output */
 		smp_rmb();
@@ -2718,7 +3023,7 @@ static void raid_status(struct dm_target *ti, status_type_t type,
 				      mddev->resync_max_sectors : mddev->dev_sectors;
 		progress = rs_get_progress(rs, resync_max_sectors, &array_in_sync);
 		resync_mismatches = (mddev->last_sync_action && !strcasecmp(mddev->last_sync_action, "check")) ?
-				    (unsigned int) atomic64_read(&mddev->resync_mismatches) : 0;
+				    atomic64_read(&mddev->resync_mismatches) : 0;
 		sync_action = decipher_sync_action(&rs->md);
 
 		/* HM FIXME: do we want another state char for raid0? It shows 'D' or 'A' now */
@@ -2925,6 +3230,8 @@ static void raid_postsuspend(struct dm_target *ti)
 	struct raid_set *rs = ti->private;
 
 	mddev_suspend(&rs->md);
+	rs->md.ro = 1;
+	clear_bit(RT_FLAG_RS_RESUMED, &rs->runtime_flags);
 }
 
 static void attempt_restore_of_faulty_devices(struct raid_set *rs)
@@ -2999,8 +3306,64 @@ static int __load_dirty_region_bitmap(struct raid_set *rs)
 	return r;
 }
 
+/*
+ * Reshape changes raid algorithm of @rs to new one within personality
+ * (e.g. raid6_zr -> raid6_nc), changes stripe size, adds/removes
+ * disks from a raid set thus growing/shrinking it or resizes the set
+ *
+ * Call mddev_lock_nointr() before!
+ */
+static int rs_start_reshape(struct raid_set *rs)
+{
+	int r;
+	struct mddev *mddev = &rs->md;
+	struct md_personality *pers = mddev->pers;
+
+	r = rs_setup_reshape(rs);
+	if (r)
+		return r;
+
+	/* Need to be resumed to be able to start reshape, recovery is frozen until raid_resume() though */
+	if (mddev->suspended)
+		mddev_resume(mddev);
+
+	/*
+	 * Check any reshape constraints enforced by the personalility
+	 *
+	 * May as well already kick the reshape off so that * pers->start_reshape() becomes optional.
+	 */
+	r = pers->check_reshape(mddev);
+	if (r) {
+		rs->ti->error = "pers->check_reshape() failed";
+		return r;
+	}
+
+	/*
+	 * Personality may not provide start reshape method in which
+	 * case check_reshape above has already covered everything
+	 */
+	if (pers->start_reshape) {
+		r = pers->start_reshape(mddev);
+		if (r) {
+			rs->ti->error = "pers->start_reshape() failed";
+			return r;
+		}
+	}
+
+	/* Suspend because a resume will happen in raid_resume() */
+	if (!mddev->suspended)
+		mddev_suspend(mddev);
+
+	mddev->ro = 0;
+	md_update_sb(mddev, 1);
+	mddev->ro = 1;
+
+	return 0;
+}
+
 static int raid_preresume(struct dm_target *ti)
 {
+	int r;
 	struct raid_set *rs = ti->private;
 	struct mddev *mddev = &rs->md;
 
@@ -3034,7 +3397,33 @@ static int raid_preresume(struct dm_target *ti)
 	configure_discard_support(rs);
 
 	/* Load the bitmap from disk unless raid0 */
-	return __load_dirty_region_bitmap(rs);
+	r = __load_dirty_region_bitmap(rs);
+	if (r)
+		return r;
+
+	/* Check for any resize/reshape on @rs and adjust/initiate */
+	/* Be prepared for mddev_resume() in raid_resume() */
+	set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+	if (mddev->recovery_cp && mddev->recovery_cp < MaxSector) {
+		set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
+		set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+		mddev->resync_min = mddev->recovery_cp;
+	}
+
+	rs_set_capacity(rs);
+
+	/* Check for any reshape request and region size change unless new raid set */
+	if (test_and_clear_bit(RT_FLAG_RESHAPE_RS, &rs->runtime_flags)) {
+		/* Initiate a reshape. */
+		mddev_lock_nointr(mddev);
+		r = rs_start_reshape(rs);
+		mddev_unlock(mddev);
+		if (r)
+			DMWARN("Failed to check/start reshape, continuing without change");
+		r = 0;
+	}
+
+	return r;
 }
 
 static void raid_resume(struct dm_target *ti)
-- 
cgit v1.2.3-70-g09d2


From 4257e085e26edaba0bf516ea231bd5122e3f3e6f Mon Sep 17 00:00:00 2001
From: Heinz Mauelshagen <heinzm@redhat.com>
Date: Tue, 14 Jun 2016 01:46:03 +0200
Subject: dm raid: support to change bitmap region size

Signed-off-by: Heinz Mauelshagen <heinzm@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-raid.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index e4c41232107f..7df450877423 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -217,6 +217,7 @@ struct raid_set {
 	int delta_disks;
 	int data_offset;
 	int raid10_copies;
+	int requested_bitmap_chunk_sectors;
 
 	struct mddev md;
 	struct raid_type *raid_type;
@@ -1277,6 +1278,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 			}
 
 			region_size = value;
+			rs->requested_bitmap_chunk_sectors = value;
 		} else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_COPIES))) {
 			if (test_and_set_bit(__CTR_FLAG_RAID10_COPIES, &rs->ctr_flags)) {
 				rs->ti->error = "Only one raid10_copies argument pair allowed";
@@ -3401,6 +3403,15 @@ static int raid_preresume(struct dm_target *ti)
 	if (r)
 		return r;
 
+	/* Resize bitmap to adjust to changed region size (aka MD bitmap chunksize) */
+	if (test_bit(RT_FLAG_RS_BITMAP_LOADED, &rs->runtime_flags) &&
+	    mddev->bitmap_info.chunksize != to_bytes(rs->requested_bitmap_chunk_sectors)) {
+		r = bitmap_resize(mddev->bitmap, mddev->dev_sectors,
+				  to_bytes(rs->requested_bitmap_chunk_sectors), 0);
+		if (r)
+			DMERR("Failed to resize bitmap");
+	}
+
 	/* Check for any resize/reshape on @rs and adjust/initiate */
 	/* Be prepared for mddev_resume() in raid_resume() */
 	set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
-- 
cgit v1.2.3-70-g09d2


From 6e20902e8f9e1551afa75bd499be853a95745b9f Mon Sep 17 00:00:00 2001
From: Heinz Mauelshagen <heinzm@redhat.com>
Date: Tue, 14 Jun 2016 15:23:13 -0400
Subject: dm raid: fix failed takeover/reshapes by keeping raid set frozen

Superblock updates were bogus, causing some takeovers/reshapes to fail.

Introduce new runtime flag (RT_FLAG_KEEP_RS_FROZEN) to keep a raid set
frozen when a layout change was requested.  Userpace will immediately
reload the table w/o the flags requesting such change once they made it
to the superblocks and any change of recovery/reshape offsets has to be
avoided until after read.

Signed-off-by: Heinz Mauelshagen <heinzm@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-raid.c | 85 ++++++++++++++++++++++++++++++++++------------------
 1 file changed, 56 insertions(+), 29 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 7df450877423..8d4865184b96 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -190,6 +190,7 @@ struct raid_dev {
 #define RT_FLAG_RS_BITMAP_LOADED	2
 #define RT_FLAG_UPDATE_SBS		3
 #define RT_FLAG_RESHAPE_RS		4
+#define RT_FLAG_KEEP_RS_FROZEN		5
 
 /* Array elements of 64 bit needed for rebuild/write_mostly bits */
 #define DISKS_ARRAY_ELEMS ((MAX_RAID_DEVICES + (sizeof(uint64_t) * 8 - 1)) / sizeof(uint64_t) / 8)
@@ -2727,6 +2728,7 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 			return r;
 
 		set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags);
+		set_bit(RT_FLAG_KEEP_RS_FROZEN, &rs->runtime_flags);
 		rs_set_new(rs);
 	} else if (rs_reshape_requested(rs)) {
 		if (rs_is_reshaping(rs)) {
@@ -2767,13 +2769,19 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 		 *           Would cause allocations in raid1->check_reshape
 		 *           though, thus more issues with potential failures
 		 */
-		else if (rs_is_raid1(rs))
+		else if (rs_is_raid1(rs)) {
+			set_bit(RT_FLAG_KEEP_RS_FROZEN, &rs->runtime_flags);
 			rs->md.raid_disks = rs->raid_disks;
+		}
+
+		if (test_bit(RT_FLAG_RESHAPE_RS, &rs->runtime_flags)) {
+			set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags);
+			set_bit(RT_FLAG_KEEP_RS_FROZEN, &rs->runtime_flags);
+		}
 
 		if (rs->md.raid_disks < rs->raid_disks)
 			set_bit(MD_ARRAY_FIRST_USE, &rs->md.flags);
 
-		set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags);
 		rs_set_cur(rs);
 	} else
 		rs_set_cur(rs);
@@ -3231,9 +3239,11 @@ static void raid_postsuspend(struct dm_target *ti)
 {
 	struct raid_set *rs = ti->private;
 
-	mddev_suspend(&rs->md);
-	rs->md.ro = 1;
-	clear_bit(RT_FLAG_RS_RESUMED, &rs->runtime_flags);
+	if (test_and_clear_bit(RT_FLAG_RS_RESUMED, &rs->runtime_flags)) {
+		if (!rs->md.suspended)
+			mddev_suspend(&rs->md);
+		rs->md.ro = 1;
+	}
 }
 
 static void attempt_restore_of_faulty_devices(struct raid_set *rs)
@@ -3308,6 +3318,18 @@ static int __load_dirty_region_bitmap(struct raid_set *rs)
 	return r;
 }
 
+/* Enforce updating all superblocks */
+static void rs_update_sbs(struct raid_set *rs)
+{
+	struct mddev *mddev = &rs->md;
+	int ro = mddev->ro;
+
+	set_bit(MD_CHANGE_DEVS, &mddev->flags);
+	mddev->ro = 0;
+	md_update_sb(mddev, 1);
+	mddev->ro = ro;
+}
+
 /*
  * Reshape changes raid algorithm of @rs to new one within personality
  * (e.g. raid6_zr -> raid6_nc), changes stripe size, adds/removes
@@ -3356,9 +3378,12 @@ static int rs_start_reshape(struct raid_set *rs)
 	if (!mddev->suspended)
 		mddev_suspend(mddev);
 
-	mddev->ro = 0;
-	md_update_sb(mddev, 1);
-	mddev->ro = 1;
+	/*
+	 * Now reshape got set up, update superblocks to
+	 * reflect the fact so that a table reload will
+	 * access proper superblock content in the ctr.
+	 */
+	rs_update_sbs(rs);
 
 	return 0;
 }
@@ -3375,22 +3400,12 @@ static int raid_preresume(struct dm_target *ti)
 
 	/*
 	 * The superblocks need to be updated on disk if the
-	 * array is new or __load_dirty_region_bitmap will overwrite them
-	 * in core with old data.
-	 *
-	 * In case the array got modified (takeover/reshape/resize)
-	 * or the data offsets on the component devices changed, they
-	 * have to be updated as well.
-	 *
-	 * Have to switch to readwrite and back in order to
-	 * allow for the superblock updates.
+	 * array is new or new devices got added (thus zeroed
+	 * out by userspace) or __load_dirty_region_bitmap
+	 * will overwrite them in core with old data or fail.
 	 */
-	if (test_and_clear_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags)) {
-		set_bit(MD_CHANGE_DEVS, &mddev->flags);
-		mddev->ro = 0;
-		md_update_sb(mddev, 1);
-		mddev->ro = 1;
-	}
+	if (test_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags))
+		rs_update_sbs(rs);
 
 	/*
 	 * Disable/enable discard support on raid set after any
@@ -3449,14 +3464,26 @@ static void raid_resume(struct dm_target *ti)
 		 * devices are reachable again.
 		 */
 		attempt_restore_of_faulty_devices(rs);
-	}
+	} else {
+		mddev->ro = 0;
+		mddev->in_sync = 0;
 
-	mddev->ro = 0;
-	mddev->in_sync = 0;
-	clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+		/*
+		 * When passing in flags to the ctr, we expect userspace
+		 * to reset them because they made it to the superblocks
+		 * and reload the mapping anyway.
+		 *
+		 * -> only unfreeze recovery in case of a table reload or
+		 *    we'll have a bogus recovery/reshape position
+		 *    retrieved from the superblock by the ctr because
+		 *    the ongoing recovery/reshape will change it after read.
+		 */
+		if (!test_bit(RT_FLAG_KEEP_RS_FROZEN, &rs->runtime_flags))
+			clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
 
-	if (mddev->suspended)
-		mddev_resume(mddev);
+		if (mddev->suspended)
+			mddev_resume(mddev);
+	}
 }
 
 static struct target_type raid_target = {
-- 
cgit v1.2.3-70-g09d2


From 68c1c4d5eafc65dda05bf7d3d172f10f6982e092 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 16 Jun 2016 11:03:25 +0200
Subject: dm raid: don't use 'const' in function return

A newly introduced function has 'const int' as the return type,
but as "make W=1" reports, that has no meaning:

drivers/md/dm-raid.c:510:18: error: type qualifiers ignored on function return type [-Werror=ignored-qualifiers]

This changes the return type to plain 'int'.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Fixes: 33e53f06850f ("dm raid: introduce extended superblock and new raid types to support takeover/reshaping")
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-raid.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 8d4865184b96..248053d84528 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -507,7 +507,7 @@ static const char *raid10_md_layout_to_format(int layout)
 }
 
 /* Return md raid10 algorithm for @name */
-static const int raid10_name_to_format(const char *name)
+static int raid10_name_to_format(const char *name)
 {
 	if (!strcasecmp(name, "near"))
 		return ALGORITHM_RAID10_NEAR;
-- 
cgit v1.2.3-70-g09d2


From 350b53932810840a0c68467d7d78795010929940 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@sandisk.com>
Date: Tue, 28 Jun 2016 16:32:32 +0200
Subject: dm crypt: Fix sparse complaints

Avoid that sparse complains about assigning a __le64 value to a u64
variable.  Remove the (u64) casts since these are superfluous.  This
patch does not change the behavior of the source code.

Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-crypt.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 96dd5d7e454a..8f2e3e2ffd26 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -683,7 +683,7 @@ static int crypt_iv_tcw_whitening(struct crypt_config *cc,
 				  u8 *data)
 {
 	struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw;
-	u64 sector = cpu_to_le64((u64)dmreq->iv_sector);
+	__le64 sector = cpu_to_le64(dmreq->iv_sector);
 	u8 buf[TCW_WHITENING_SIZE];
 	SHASH_DESC_ON_STACK(desc, tcw->crc32_tfm);
 	int i, r;
@@ -722,7 +722,7 @@ static int crypt_iv_tcw_gen(struct crypt_config *cc, u8 *iv,
 			    struct dm_crypt_request *dmreq)
 {
 	struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw;
-	u64 sector = cpu_to_le64((u64)dmreq->iv_sector);
+	__le64 sector = cpu_to_le64(dmreq->iv_sector);
 	u8 *src;
 	int r = 0;
 
-- 
cgit v1.2.3-70-g09d2


From 028b39e314dd8b4ab8d3ac401285db40ab1f39d8 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@sandisk.com>
Date: Tue, 28 Jun 2016 16:36:46 +0200
Subject: dm ioctl: Simplify parameter buffer management code

Merge the two DM_PARAMS_[KV]MALLOC flags into a single flag.

Doing so avoids the crashes seen with previous attempts to consolidate
buffer management to use kvfree() without first flagging that memory had
actually been allocated.

Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-ioctl.c | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index b59e34595ad8..96df89a31f42 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -1670,8 +1670,7 @@ static int check_version(unsigned int cmd, struct dm_ioctl __user *user)
 	return r;
 }
 
-#define DM_PARAMS_KMALLOC	0x0001	/* Params alloced with kmalloc */
-#define DM_PARAMS_VMALLOC	0x0002	/* Params alloced with vmalloc */
+#define DM_PARAMS_MALLOC	0x0001	/* Params allocated with kvmalloc() */
 #define DM_WIPE_BUFFER		0x0010	/* Wipe input buffer before returning from ioctl */
 
 static void free_params(struct dm_ioctl *param, size_t param_size, int param_flags)
@@ -1679,10 +1678,8 @@ static void free_params(struct dm_ioctl *param, size_t param_size, int param_fla
 	if (param_flags & DM_WIPE_BUFFER)
 		memset(param, 0, param_size);
 
-	if (param_flags & DM_PARAMS_KMALLOC)
-		kfree(param);
-	if (param_flags & DM_PARAMS_VMALLOC)
-		vfree(param);
+	if (param_flags & DM_PARAMS_MALLOC)
+		kvfree(param);
 }
 
 static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl *param_kernel,
@@ -1714,19 +1711,14 @@ static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl *param_kern
 	 * Use kmalloc() rather than vmalloc() when we can.
 	 */
 	dmi = NULL;
-	if (param_kernel->data_size <= KMALLOC_MAX_SIZE) {
+	if (param_kernel->data_size <= KMALLOC_MAX_SIZE)
 		dmi = kmalloc(param_kernel->data_size, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
-		if (dmi)
-			*param_flags |= DM_PARAMS_KMALLOC;
-	}
 
 	if (!dmi) {
 		unsigned noio_flag;
 		noio_flag = memalloc_noio_save();
 		dmi = __vmalloc(param_kernel->data_size, GFP_NOIO | __GFP_HIGH | __GFP_HIGHMEM, PAGE_KERNEL);
 		memalloc_noio_restore(noio_flag);
-		if (dmi)
-			*param_flags |= DM_PARAMS_VMALLOC;
 	}
 
 	if (!dmi) {
@@ -1735,6 +1727,8 @@ static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl *param_kern
 		return -ENOMEM;
 	}
 
+	*param_flags |= DM_PARAMS_MALLOC;
+
 	if (copy_from_user(dmi, user, param_kernel->data_size))
 		goto bad;
 
-- 
cgit v1.2.3-70-g09d2


From 602d1657c603eedd7379a8bcde1ad3a2972ecc5f Mon Sep 17 00:00:00 2001
From: Sami Tolvanen <samitolvanen@google.com>
Date: Tue, 21 Jun 2016 11:02:42 -0700
Subject: dm verity fec: fix block calculation

do_div was replaced with div64_u64 at some point, causing a bug with
block calculation due to incompatible semantics of the two functions.

Signed-off-by: Sami Tolvanen <samitolvanen@google.com>
Fixes: a739ff3f543a ("dm verity: add support for forward error correction")
Cc: stable@vger.kernel.org # v4.5+
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-verity-fec.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c
index 459a9f8905ed..0f0eb8a3d922 100644
--- a/drivers/md/dm-verity-fec.c
+++ b/drivers/md/dm-verity-fec.c
@@ -453,9 +453,7 @@ int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io,
 	 */
 
 	offset = block << v->data_dev_block_bits;
-
-	res = offset;
-	div64_u64(res, v->fec->rounds << v->data_dev_block_bits);
+	res = div64_u64(offset, v->fec->rounds << v->data_dev_block_bits);
 
 	/*
 	 * The base RS block we can feed to the interleaver to find out all
-- 
cgit v1.2.3-70-g09d2


From 7193a9defcab6f3d3f1eb64c68bad7534e5a39ad Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Wed, 6 Jul 2016 09:06:37 -0400
Subject: dm rq: check kthread_run return for .request_fn request-based DM

Check return value of kthread_run() in dm_old_init_request_queue().

Reported-by: Minfei Huang <mnghuan@gmail.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-rq.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index 266f7b674108..aa81539374a6 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -819,6 +819,8 @@ int dm_old_init_request_queue(struct mapped_device *md)
 	init_kthread_worker(&md->kworker);
 	md->kworker_task = kthread_run(kthread_worker_fn, &md->kworker,
 				       "kdmwork-%s", dm_device_name(md));
+	if (IS_ERR(md->kworker_task))
+		return PTR_ERR(md->kworker_task);
 
 	elv_register_queue(md->queue);
 
-- 
cgit v1.2.3-70-g09d2


From ff4a88bf1cedbe73ece1a6fad34650f21c06167c Mon Sep 17 00:00:00 2001
From: Heinz Mauelshagen <heinzm@redhat.com>
Date: Wed, 15 Jun 2016 18:39:17 +0200
Subject: dm raid: avoid superfluous memory barriers on static metadata

Signed-off-by: Heinz Mauelshagen <heinzm@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-raid.c | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 248053d84528..4bf7747a25a9 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -366,14 +366,12 @@ static bool rs_is_reshapable(struct raid_set *rs)
 /* Return true, if raid set in @rs is recovering */
 static bool rs_is_recovering(struct raid_set *rs)
 {
-	smp_rmb();
 	return rs->md.recovery_cp != MaxSector;
 }
 
 /* Return true, if raid set in @rs is reshaping */
 static bool rs_is_reshaping(struct raid_set *rs)
 {
-	smp_rmb();
 	return rs->md.reshape_position != MaxSector;
 }
 
@@ -1484,7 +1482,6 @@ static int rs_check_takeover(struct raid_set *rs)
 	struct mddev *mddev = &rs->md;
 	unsigned int near_copies;
 
-	smp_rmb();
 	if (rs->md.degraded) {
 		rs->ti->error = "Can't takeover degraded raid set";
 		return -EPERM;
@@ -1758,8 +1755,6 @@ static int rs_check_reshape(struct raid_set *rs)
 {
 	struct mddev *mddev = &rs->md;
 
-	smp_rmb(); /* Make sure we access recent reshape position */
-
 	if (!mddev->pers || !mddev->pers->check_reshape)
 		rs->ti->error = "Reshape not supported";
 	else if (mddev->degraded)
-- 
cgit v1.2.3-70-g09d2


From 0d851d14b8dfed601e165ccec2819cab492442d8 Mon Sep 17 00:00:00 2001
From: Heinz Mauelshagen <heinzm@redhat.com>
Date: Wed, 15 Jun 2016 18:43:55 +0200
Subject: dm raid: prohibit to pass in both sync and nosync ctr flags

Signed-off-by: Heinz Mauelshagen <heinzm@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-raid.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 4bf7747a25a9..e8e9b6abe133 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -1297,6 +1297,12 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 		}
 	}
 
+	if (test_bit(__CTR_FLAG_SYNC, &rs->ctr_flags) &&
+	    test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags)) {
+		rs->ti->error = "sync and nosync are mutually exclusive";
+		return -EINVAL;
+	}
+
 	if (validate_region_size(rs, region_size))
 		return -EINVAL;
 
-- 
cgit v1.2.3-70-g09d2


From 0a7b818892e27f6a6ac728a525bd0d8a307fe2bc Mon Sep 17 00:00:00 2001
From: Heinz Mauelshagen <heinzm@redhat.com>
Date: Wed, 15 Jun 2016 18:45:56 +0200
Subject: dm raid: the sync_page_io() metadata_op argument is bool

Signed-off-by: Heinz Mauelshagen <heinzm@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-raid.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index e8e9b6abe133..0eceee802571 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -1784,7 +1784,7 @@ static int read_disk_sb(struct md_rdev *rdev, int size)
 	if (rdev->sb_loaded)
 		return 0;
 
-	if (!sync_page_io(rdev, 0, size, rdev->sb_page, REQ_OP_READ, 0, 1)) {
+	if (!sync_page_io(rdev, 0, size, rdev->sb_page, REQ_OP_READ, 0, true)) {
 		DMERR("Failed to read superblock of device at position %d",
 		      rdev->raid_disk);
 		md_error(rdev->mddev, rdev);
@@ -3258,8 +3258,8 @@ static void attempt_restore_of_faulty_devices(struct raid_set *rs)
 	for (i = 0; i < rs->md.raid_disks; i++) {
 		r = &rs->dev[i].rdev;
 		if (test_bit(Faulty, &r->flags) && r->sb_page &&
-		    sync_page_io(r, 0, r->sb_size, r->sb_page, REQ_OP_READ, 0,
-				 1)) {
+		    sync_page_io(r, 0, r->sb_size, r->sb_page,
+				 REQ_OP_READ, 0, true)) {
 			DMINFO("Faulty %s device #%d has readable super block."
 			       "  Attempting to revive it.",
 			       rs->raid_type->name, i);
-- 
cgit v1.2.3-70-g09d2


From 5fa146b25ba86682f74e3060debd4c4d3073561b Mon Sep 17 00:00:00 2001
From: Heinz Mauelshagen <heinzm@redhat.com>
Date: Wed, 15 Jun 2016 18:50:18 +0200
Subject: dm raid: reject too many write_mostly devices

Signed-off-by: Heinz Mauelshagen <heinzm@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-raid.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 0eceee802571..a38298589db8 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -1025,7 +1025,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 {
 	int value, raid10_format = ALGORITHM_RAID10_DEFAULT;
 	unsigned raid10_copies = 2;
-	unsigned i;
+	unsigned i, write_mostly = 0;
 	unsigned region_size = 0;
 	sector_t max_io_len;
 	const char *arg, *key;
@@ -1179,6 +1179,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 				return -EINVAL;
 			}
 
+			write_mostly++;
 			set_bit(WriteMostly, &rs->dev[value].rdev.flags);
 			set_bit(__CTR_FLAG_WRITE_MOSTLY, &rs->ctr_flags);
 		} else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_MAX_WRITE_BEHIND))) {
@@ -1303,6 +1304,11 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 		return -EINVAL;
 	}
 
+	if (write_mostly >= rs->md.raid_disks) {
+		rs->ti->error = "Can't set all raid1 devices to write_mostly";
+		return -EINVAL;
+	}
+
 	if (validate_region_size(rs, region_size))
 		return -EINVAL;
 
-- 
cgit v1.2.3-70-g09d2


From 75dd3b9ecb027e90261ed4052e155e0c1236d717 Mon Sep 17 00:00:00 2001
From: Heinz Mauelshagen <heinzm@redhat.com>
Date: Wed, 15 Jun 2016 22:27:08 +0200
Subject: dm raid: more restricting data_offset value checks

Signed-off-by: Heinz Mauelshagen <heinzm@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-raid.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index a38298589db8..b1dbf28f36d2 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -1221,7 +1221,8 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 				return -EINVAL;
 			}
 			/* Ensure sensible data offset */
-			if (value < 0) {
+			if (value < 0 ||
+			    (value && (value < MIN_FREE_RESHAPE_SPACE || value % to_sector(PAGE_SIZE)))) {
 				rs->ti->error = "Bogus data_offset value";
 				return -EINVAL;
 			}
-- 
cgit v1.2.3-70-g09d2


From ae3c6cfff98864fd2c282500632e11e3c9c514e3 Mon Sep 17 00:00:00 2001
From: Heinz Mauelshagen <heinzm@redhat.com>
Date: Wed, 15 Jun 2016 22:27:40 +0200
Subject: dm raid: remove bogus comment and fix comment typos

Signed-off-by: Heinz Mauelshagen <heinzm@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-raid.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index b1dbf28f36d2..864e903cddda 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -1487,8 +1487,6 @@ static int raid_is_congested(struct dm_target_callbacks *cb, int bits)
  *
  * Conversions of raid sets from one MD personality to another
  * have to conform to restrictions which are enforced here.
- *
- * Degration is already checked for in rs_check_conversion() below.
  */
 static int rs_check_takeover(struct raid_set *rs)
 {
@@ -2512,10 +2510,10 @@ static int rs_setup_reshape(struct raid_set *rs)
 	 *   be available at the end of each disk
 	 *
 	 * - data_offset and new_data_offset are
-	 *   adjusted for afreentioned out of place
+	 *   adjusted for aforementioned out of place
 	 *   reshaping based on userspace passing in
 	 *   the "data_offset <sectors>" key/value
-	 *   pair via te constructor
+	 *   pair via the constructor
 	 */
 
 	/* Add disk(s) */
-- 
cgit v1.2.3-70-g09d2


From 6ee0bae9c847086b7025494e84a2fff0dfc83bdc Mon Sep 17 00:00:00 2001
From: Heinz Mauelshagen <heinzm@redhat.com>
Date: Wed, 15 Jun 2016 22:29:09 +0200
Subject: dm raid: enhance comments in takeover checks

Signed-off-by: Heinz Mauelshagen <heinzm@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-raid.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 864e903cddda..cc6eb7c27a7d 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -1610,7 +1610,7 @@ static int rs_check_takeover(struct raid_set *rs)
 		    mddev->raid_disks == 2)
 			return 0;
 
-		/* raid5 with parity N -> raid6 with parity N */
+		/* raid5_* ->  raid6_*_6 with Q-Syndrome N (e.g. raid5_ra -> raid6_ra_6 */
 		if (mddev->new_level == 6 &&
 		    ((mddev->layout == ALGORITHM_PARITY_N && mddev->new_layout == ALGORITHM_PARITY_N) ||
 		      __within_range(mddev->new_layout, ALGORITHM_LEFT_ASYMMETRIC_6, ALGORITHM_RIGHT_SYMMETRIC_6)))
@@ -1628,7 +1628,7 @@ static int rs_check_takeover(struct raid_set *rs)
 		    mddev->layout == ALGORITHM_PARITY_N)
 			return 0;
 
-		/* raid6_*_n with parity N -> raid5_* */
+		/* raid6_*_n with Q-Syndrome N -> raid5_* */
 		if (mddev->new_level == 5 &&
 		    ((mddev->layout == ALGORITHM_PARITY_N && mddev->new_layout == ALGORITHM_PARITY_N) ||
 		     __within_range(mddev->new_layout, ALGORITHM_LEFT_ASYMMETRIC, ALGORITHM_RIGHT_SYMMETRIC)))
-- 
cgit v1.2.3-70-g09d2


From 9d9d939c80eb96bb2072a5eaee51d9bf29a0910c Mon Sep 17 00:00:00 2001
From: Heinz Mauelshagen <heinzm@redhat.com>
Date: Thu, 16 Jun 2016 03:15:49 +0200
Subject: dm raid: make rs_set_capacity to work on shrinking reshape

Signed-off-by: Heinz Mauelshagen <heinzm@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-raid.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index cc6eb7c27a7d..62e31b47400c 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -617,9 +617,7 @@ static void rs_set_capacity(struct raid_set *rs)
 {
 	struct mddev *mddev = &rs->md;
 
-	/* Make sure we access most actual mddev properties */
-	smp_rmb();
-	if (rs->ti->len != mddev->array_sectors && !rs_is_reshaping(rs)) {
+	if (rs->ti->len != mddev->array_sectors) {
 		struct gendisk *gendisk = dm_disk(dm_table_get_md(rs->ti->table));
 
 		set_capacity(gendisk, mddev->array_sectors);
@@ -1471,7 +1469,9 @@ static void do_table_event(struct work_struct *ws)
 {
 	struct raid_set *rs = container_of(ws, struct raid_set, md.event_work);
 
-	rs_set_capacity(rs);
+	smp_rmb(); /* Make sure we access most actual mddev properties */
+	if (!rs_is_reshaping(rs))
+		rs_set_capacity(rs);
 	dm_table_event(rs->ti->table);
 }
 
-- 
cgit v1.2.3-70-g09d2


From 0095dbc98bfdcd5a3b6cda6d2dde70ae5ffefec7 Mon Sep 17 00:00:00 2001
From: Heinz Mauelshagen <heinzm@redhat.com>
Date: Fri, 24 Jun 2016 00:10:12 +0200
Subject: dm raid: fix rs_set_capacity on growing reshape

Signed-off-by: Heinz Mauelshagen <heinzm@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-raid.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 62e31b47400c..63883f4c550d 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -616,13 +616,10 @@ static struct raid_type *get_raid_type_by_ll(const int level, const int layout)
 static void rs_set_capacity(struct raid_set *rs)
 {
 	struct mddev *mddev = &rs->md;
+	struct gendisk *gendisk = dm_disk(dm_table_get_md(rs->ti->table));
 
-	if (rs->ti->len != mddev->array_sectors) {
-		struct gendisk *gendisk = dm_disk(dm_table_get_md(rs->ti->table));
-
-		set_capacity(gendisk, mddev->array_sectors);
-		revalidate_disk(gendisk);
-	}
+	set_capacity(gendisk, mddev->array_sectors);
+	revalidate_disk(gendisk);
 }
 
 /*
-- 
cgit v1.2.3-70-g09d2


From 4dff2f1e26f2621dc5b02436cb889df15400036b Mon Sep 17 00:00:00 2001
From: Heinz Mauelshagen <heinzm@redhat.com>
Date: Fri, 24 Jun 2016 00:21:09 +0200
Subject: dm raid: clarify and fix recovery

Add function rs_setup_recovery() to allow for defined setup of RAID set
recovery in the constructor.

Will be called with dev_sectors={0, rdev->sectors, MaxSector} to
recover a new or enforced-sync, a grown, or a not-to-be-synchronized
RAID set, respectively.

Prevents recovery on raid0, which doesn't support it.

Enforces recovery on raid6 to ensure properly defined Syndromes
mandatory for that MD personality are being created.

Signed-off-by: Heinz Mauelshagen <heinzm@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-raid.c | 64 ++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 55 insertions(+), 9 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 63883f4c550d..7e334b65b1c3 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -349,6 +349,12 @@ static bool rs_is_raid10(struct raid_set *rs)
 	return rs->md.level == 10;
 }
 
+/* Return true, if raid set in @rs is level 6 */
+static bool rs_is_raid6(struct raid_set *rs)
+{
+	return rs->md.level == 6;
+}
+
 /* Return true, if raid set in @rs is level 4, 5 or 6 */
 static bool rs_is_raid456(struct raid_set *rs)
 {
@@ -681,7 +687,7 @@ static struct raid_set *raid_set_alloc(struct dm_target *ti, struct raid_type *r
 	rs->md.layout = raid_type->algorithm;
 	rs->md.new_layout = rs->md.layout;
 	rs->md.delta_disks = 0;
-	rs->md.recovery_cp = rs_is_raid0(rs) ? MaxSector : 0;
+	rs->md.recovery_cp = MaxSector;
 
 	for (i = 0; i < raid_devs; i++)
 		md_rdev_init(&rs->dev[i].rdev);
@@ -1090,7 +1096,6 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 				rs->ti->error = "Only one 'nosync' argument allowed";
 				return -EINVAL;
 			}
-			rs->md.recovery_cp = MaxSector;
 			continue;
 		}
 		if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_SYNC))) {
@@ -1098,7 +1103,6 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 				rs->ti->error = "Only one 'sync' argument allowed";
 				return -EINVAL;
 			}
-			rs->md.recovery_cp = 0;
 			continue;
 		}
 		if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_USE_NEAR_SETS))) {
@@ -1412,7 +1416,6 @@ static int rs_set_dev_and_array_sectors(struct raid_set *rs, bool use_mddev)
 	struct mddev *mddev = &rs->md;
 	struct md_rdev *rdev;
 	sector_t array_sectors = rs->ti->len, dev_sectors = rs->ti->len;
-	sector_t cur_dev_sectors = rs->dev[0].rdev.sectors;
 
 	if (use_mddev) {
 		delta_disks = mddev->delta_disks;
@@ -1453,15 +1456,50 @@ static int rs_set_dev_and_array_sectors(struct raid_set *rs, bool use_mddev)
 	mddev->array_sectors = array_sectors;
 	mddev->dev_sectors = dev_sectors;
 
-	if (!rs_is_raid0(rs) && dev_sectors > cur_dev_sectors)
-		mddev->recovery_cp = dev_sectors;
-
 	return 0;
 bad:
 	rs->ti->error = "Target length not divisible by number of data devices";
 	return EINVAL;
 }
 
+/* Setup recovery on @rs */
+static void __rs_setup_recovery(struct raid_set *rs, sector_t dev_sectors)
+{
+	/* raid0 does not recover */
+	if (rs_is_raid0(rs))
+		rs->md.recovery_cp = MaxSector;
+	/*
+	 * A raid6 set has to be recovered either
+	 * completely or for the grown part to
+	 * ensure proper parity and Q-Syndrome
+	 */
+	else if (rs_is_raid6(rs))
+		rs->md.recovery_cp = dev_sectors;
+	/*
+	 * Other raid set types may skip recovery
+	 * depending on the 'nosync' flag.
+	 */
+	else
+		rs->md.recovery_cp = test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags)
+				     ? MaxSector : dev_sectors;
+}
+
+/* Setup recovery on @rs based on raid type, device size and 'nosync' flag */
+static void rs_setup_recovery(struct raid_set *rs, sector_t dev_sectors)
+{
+	if (!dev_sectors)
+		/* New raid set or 'sync' flag provided */
+		__rs_setup_recovery(rs, 0);
+	else if (dev_sectors == MaxSector)
+		/* Prevent recovery */
+		__rs_setup_recovery(rs, MaxSector);
+	else if (rs->dev[0].rdev.sectors < dev_sectors)
+		/* Grown raid set */
+		__rs_setup_recovery(rs, rs->dev[0].rdev.sectors);
+	else
+		__rs_setup_recovery(rs, MaxSector);
+}
+
 static void do_table_event(struct work_struct *ws)
 {
 	struct raid_set *rs = container_of(ws, struct raid_set, md.event_work);
@@ -2086,7 +2124,6 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev)
 		if (new_devs == rs->raid_disks) {
 			DMINFO("Superblocks created for new raid set");
 			set_bit(MD_ARRAY_FIRST_USE, &mddev->flags);
-			mddev->recovery_cp = 0;
 		} else if (new_devs != rebuilds &&
 			   new_devs != rs->delta_disks) {
 			DMERR("New device injected into existing raid set without "
@@ -2633,6 +2670,7 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	int r;
 	struct raid_type *rt;
 	unsigned num_raid_params, num_raid_devs;
+	sector_t calculated_dev_sectors;
 	struct raid_set *rs = NULL;
 	const char *arg;
 	struct rs_layout rs_layout;
@@ -2689,6 +2727,8 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	if (r)
 		return r;
 
+	calculated_dev_sectors = rs->dev[0].rdev.sectors;
+
 	/*
 	 * Backup any new raid set level, layout, ...
 	 * requested to be able to compare to superblock
@@ -2700,6 +2740,8 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	if (r)
 		goto bad;
 
+	rs_setup_recovery(rs, calculated_dev_sectors);
+
 	INIT_WORK(&rs->md.event_work, do_table_event);
 	ti->private = rs;
 	ti->num_flush_bios = 1;
@@ -2786,8 +2828,12 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 			set_bit(MD_ARRAY_FIRST_USE, &rs->md.flags);
 
 		rs_set_cur(rs);
-	} else
+		rs_setup_recovery(rs, MaxSector);
+	} else {
 		rs_set_cur(rs);
+		rs_setup_recovery(rs, test_bit(__CTR_FLAG_SYNC, &rs->ctr_flags) ?
+				      0 : calculated_dev_sectors);
+	}
 
 	/* If constructor requested it, change data and new_data offsets */
 	r = rs_adjust_data_offsets(rs);
-- 
cgit v1.2.3-70-g09d2


From 2d92a3c2a45c7c40c84a4a59e3ce2bf6b34a4195 Mon Sep 17 00:00:00 2001
From: Heinz Mauelshagen <heinzm@redhat.com>
Date: Fri, 24 Jun 2016 00:32:58 +0200
Subject: dm raid: prohibit 'nosync' on new raid6 and reject resize during
 reshape

Signed-off-by: Heinz Mauelshagen <heinzm@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-raid.c | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 7e334b65b1c3..6dc494055267 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -2752,9 +2752,21 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	if (test_bit(MD_ARRAY_FIRST_USE, &rs->md.flags)) {
 		set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags);
 		rs_set_new(rs);
-	} else if (rs_is_reshaping(rs))
-		; /* skip rs setup */
-	else if (rs_takeover_requested(rs)) {
+		/* A new raid6 set has to be recovered to ensure proper parity and Q-Syndrome */
+		if (rs_is_raid6(rs) &&
+		    test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags)) {
+			ti->error = "'nosync' not allowed for new raid6 set";
+			return -EINVAL;
+		}
+		rs_setup_recovery(rs, 0);
+	} else if (rs_is_reshaping(rs)) {
+		/* Have to reject size change request during reshape */
+		if (calculated_dev_sectors != rs->dev[0].rdev.sectors) {
+			ti->error = "Can't resize a reshaping raid set";
+			return -EPERM;
+		}
+		/* skip setup rs */
+	} else if (rs_takeover_requested(rs)) {
 		if (rs_is_reshaping(rs)) {
 			ti->error = "Can't takeover a reshaping raid set";
 			return -EPERM;
-- 
cgit v1.2.3-70-g09d2


From fbe6365bb4732199a36e0fe6da89086936505e07 Mon Sep 17 00:00:00 2001
From: Heinz Mauelshagen <heinzm@redhat.com>
Date: Fri, 24 Jun 2016 00:36:08 +0200
Subject: dm raid: fix raid10 device size error on out-of-place reshape

Signed-off-by: Heinz Mauelshagen <heinzm@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-raid.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 6dc494055267..8118f1e0218b 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -622,8 +622,16 @@ static struct raid_type *get_raid_type_by_ll(const int level, const int layout)
 static void rs_set_capacity(struct raid_set *rs)
 {
 	struct mddev *mddev = &rs->md;
+	struct md_rdev *rdev;
 	struct gendisk *gendisk = dm_disk(dm_table_get_md(rs->ti->table));
 
+	/*
+	 * raid10 sets rdev->sector to the device size, which
+	 * is unintended in case of out-of-place reshaping
+	 */
+	rdev_for_each(rdev, mddev)
+		rdev->sectors = mddev->dev_sectors;
+
 	set_capacity(gendisk, mddev->array_sectors);
 	revalidate_disk(gendisk);
 }
-- 
cgit v1.2.3-70-g09d2


From 2527b56e0d2f6c4f4a2a20a0ae773d96ba69d3fe Mon Sep 17 00:00:00 2001
From: Heinz Mauelshagen <heinzm@redhat.com>
Date: Fri, 24 Jun 2016 01:03:19 +0200
Subject: dm raid: add comments and fix typos

Signed-off-by: Heinz Mauelshagen <heinzm@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-raid.c | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 8118f1e0218b..457220217a93 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -1618,7 +1618,6 @@ static int rs_check_takeover(struct raid_set *rs)
 		/* raid1 -> raid10 */
 		if (mddev->new_level == 10)
 			return 0;
-
 		break;
 
 	case 4:
@@ -2424,8 +2423,8 @@ static int rs_adjust_data_offsets(struct raid_set *rs)
 		 *                   data is at offset rs->data_offset != 0 and
 		 *		     free space is at begin of each component LV
 		 *
-		 * - after reshape: data is at offset 0 if i was at offset != 0
-		 *                  of at offset != 0 if it was at offset 0
+		 * - after reshape: data is at offset 0 if it was at offset != 0
+		 *                  or at offset != 0 if it was at offset 0
 		 *                  on each component LV
 		 *
 		 */
@@ -2731,6 +2730,12 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 
 	rs->md.sync_super = super_sync;
 
+	/*
+	 * Calculate ctr requested array and device sizes to allow
+	 * for superblock analysis needing device sizes defined.
+	 *
+	 * Any existing superblock will overwrite the array and device sizes
+	 */
 	r = rs_set_dev_and_array_sectors(rs, false);
 	if (r)
 		return r;
@@ -2781,8 +2786,9 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 		}
 
 		/*
-		 * If a takeover is needed, just set the level to
-		 * the new requested one and allow the raid set to run.
+		 * If a takeover is needed, userspace sets any additional
+		 * devices to rebuild, so just set the level to the new
+		 * requested one and allow the raid set to run
 		 */
 		r = rs_check_takeover(rs);
 		if (r)
@@ -2845,7 +2851,7 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 		}
 
 		if (rs->md.raid_disks < rs->raid_disks)
-			set_bit(MD_ARRAY_FIRST_USE, &rs->md.flags);
+			set_bit(MD_ARRAY_FIRST_USE, &mddev->flags);
 
 		rs_set_cur(rs);
 		rs_setup_recovery(rs, MaxSector);
@@ -2935,7 +2941,7 @@ static int raid_map(struct dm_target *ti, struct bio *bio)
 	 * mddev->array_sectors will differ during the process
 	 * (ti->len > mddev->array_sectors), so we have to requeue
 	 * bios with addresses > mddev->array_sectors here or
-	 * or there will occur accesses past EOD of the component
+	 * there will occur accesses past EOD of the component
 	 * data images thus erroring the raid set.
 	 */
 	if (unlikely(bio_end_sector(bio) > mddev->array_sectors))
-- 
cgit v1.2.3-70-g09d2


From f6895fd5058910d010026e1b78c4e596754994be Mon Sep 17 00:00:00 2001
From: Heinz Mauelshagen <heinzm@redhat.com>
Date: Fri, 24 Jun 2016 01:06:28 +0200
Subject: dm raid: fix new superblock/bitmap creation on disk addition

Signed-off-by: Heinz Mauelshagen <heinzm@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-raid.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 457220217a93..5e41bf34eb87 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -2850,8 +2850,9 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 			set_bit(RT_FLAG_KEEP_RS_FROZEN, &rs->runtime_flags);
 		}
 
+		/* Create new superblocks and bitmaps, if any */
 		if (rs->md.raid_disks < rs->raid_disks)
-			set_bit(MD_ARRAY_FIRST_USE, &mddev->flags);
+			set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags);
 
 		rs_set_cur(rs);
 		rs_setup_recovery(rs, MaxSector);
-- 
cgit v1.2.3-70-g09d2


From 4348309a8ba535dfb1d4f6510739b56ccf4afead Mon Sep 17 00:00:00 2001
From: Heinz Mauelshagen <heinzm@redhat.com>
Date: Fri, 24 Jun 2016 01:36:06 +0200
Subject: dm raid: also reject size change during recovery

Signed-off-by: Heinz Mauelshagen <heinzm@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-raid.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 5e41bf34eb87..7a33af9cb78b 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -2772,10 +2772,10 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 			return -EINVAL;
 		}
 		rs_setup_recovery(rs, 0);
-	} else if (rs_is_reshaping(rs)) {
-		/* Have to reject size change request during reshape */
+	} else if (rs_is_recovering(rs) || rs_is_reshaping(rs)) {
+		/* Have to reject size change request during recovery/reshape */
 		if (calculated_dev_sectors != rs->dev[0].rdev.sectors) {
-			ti->error = "Can't resize a reshaping raid set";
+			ti->error = "Can't resize a recovering/reshaping raid set";
 			return -EPERM;
 		}
 		/* skip setup rs */
-- 
cgit v1.2.3-70-g09d2


From 65359ee6b106cfb74b50bd0f63714955371ef780 Mon Sep 17 00:00:00 2001
From: Heinz Mauelshagen <heinzm@redhat.com>
Date: Fri, 24 Jun 2016 21:32:25 +0200
Subject: dm raid: fix typo in write_mostly flag

Signed-off-by: Heinz Mauelshagen <heinzm@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-raid.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 7a33af9cb78b..109d698d1704 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -300,7 +300,7 @@ static struct arg_name_flag {
 	{ CTR_FLAG_MIN_RECOVERY_RATE, "min_recovery_rate"},
 	{ CTR_FLAG_MAX_RECOVERY_RATE, "max_recovery_rate"},
 	{ CTR_FLAG_MAX_WRITE_BEHIND, "max_write_behind"},
-	{ CTR_FLAG_WRITE_MOSTLY, "writemostly"},
+	{ CTR_FLAG_WRITE_MOSTLY, "write_mostly"},
 	{ CTR_FLAG_STRIPE_CACHE, "stripe_cache"},
 	{ CTR_FLAG_REGION_SIZE, "region_size"},
 	{ CTR_FLAG_RAID10_COPIES, "raid10_copies"},
-- 
cgit v1.2.3-70-g09d2


From b1956dc4fa5c055e2229b848de418f1528ae7990 Mon Sep 17 00:00:00 2001
From: Heinz Mauelshagen <heinzm@redhat.com>
Date: Fri, 24 Jun 2016 21:49:26 +0200
Subject: dm raid: fix ctr memory leaks on error paths

Signed-off-by: Heinz Mauelshagen <heinzm@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-raid.c | 25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 109d698d1704..2fb5a9bff1bb 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -2738,7 +2738,7 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	 */
 	r = rs_set_dev_and_array_sectors(rs, false);
 	if (r)
-		return r;
+		goto bad;
 
 	calculated_dev_sectors = rs->dev[0].rdev.sectors;
 
@@ -2769,20 +2769,23 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 		if (rs_is_raid6(rs) &&
 		    test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags)) {
 			ti->error = "'nosync' not allowed for new raid6 set";
-			return -EINVAL;
+			r = -EINVAL;
+			goto bad;
 		}
 		rs_setup_recovery(rs, 0);
 	} else if (rs_is_recovering(rs) || rs_is_reshaping(rs)) {
 		/* Have to reject size change request during recovery/reshape */
 		if (calculated_dev_sectors != rs->dev[0].rdev.sectors) {
 			ti->error = "Can't resize a recovering/reshaping raid set";
-			return -EPERM;
+			r = -EPERM;
+			goto bad;
 		}
 		/* skip setup rs */
 	} else if (rs_takeover_requested(rs)) {
 		if (rs_is_reshaping(rs)) {
 			ti->error = "Can't takeover a reshaping raid set";
-			return -EPERM;
+			r = -EPERM;
+			goto bad;
 		}
 
 		/*
@@ -2792,11 +2795,11 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 		 */
 		r = rs_check_takeover(rs);
 		if (r)
-			return r;
+			goto bad;
 
 		r = rs_setup_takeover(rs);
 		if (r)
-			return r;
+			goto bad;
 
 		set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags);
 		set_bit(RT_FLAG_KEEP_RS_FROZEN, &rs->runtime_flags);
@@ -2804,7 +2807,8 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	} else if (rs_reshape_requested(rs)) {
 		if (rs_is_reshaping(rs)) {
 			ti->error = "raid set already reshaping!";
-			return -EPERM;
+			r = -EPERM;
+			goto bad;
 		}
 
 		if (rs_is_raid10(rs)) {
@@ -2820,7 +2824,8 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 				 */
 				if (rs->raid_disks % rs->raid10_copies) {
 					ti->error = "Can't reshape raid10 mirror groups";
-					return -EINVAL;
+					r = -EINVAL;
+					goto bad;
 				}
 
 				/* Userpace reordered disks to add/remove mirrors -> adjust raid_disk indexes */
@@ -2865,7 +2870,7 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	/* If constructor requested it, change data and new_data offsets */
 	r = rs_adjust_data_offsets(rs);
 	if (r)
-		return r;
+		goto bad;
 
 	/* Start raid set read-only and assumed clean to change in raid_resume() */
 	rs->md.ro = 1;
@@ -2899,7 +2904,7 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	if (test_bit(RT_FLAG_RESHAPE_RS, &rs->runtime_flags)) {
 		r = rs_check_reshape(rs);
 		if (r)
-			return r;
+			goto bad_check_reshape;
 
 		/* Restore new, ctr requested layout to perform check */
 		rs_config_restore(rs, &rs_layout);
-- 
cgit v1.2.3-70-g09d2


From 37f10be15076737067a4d4d8f1cf7e9103765a0f Mon Sep 17 00:00:00 2001
From: Heinz Mauelshagen <heinzm@redhat.com>
Date: Fri, 24 Jun 2016 23:21:37 +0200
Subject: dm raid: fix rebuild and catch bogus sync/resync flags

Signed-off-by: Heinz Mauelshagen <heinzm@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-raid.c | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 2fb5a9bff1bb..14b3d93e84cb 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -1312,6 +1312,13 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 		return -EINVAL;
 	}
 
+	if (test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags) &&
+	    (test_bit(__CTR_FLAG_SYNC, &rs->ctr_flags) ||
+	     test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags))) {
+		rs->ti->error = "sync/nosync and rebuild are mutually exclusive";
+		return -EINVAL;
+	}
+
 	if (write_mostly >= rs->md.raid_disks) {
 		rs->ti->error = "Can't set all raid1 devices to write_mostly";
 		return -EINVAL;
@@ -2776,7 +2783,9 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	} else if (rs_is_recovering(rs) || rs_is_reshaping(rs)) {
 		/* Have to reject size change request during recovery/reshape */
 		if (calculated_dev_sectors != rs->dev[0].rdev.sectors) {
-			ti->error = "Can't resize a recovering/reshaping raid set";
+			ti->error = rs_is_recovering(rs) ?
+				    "Can't resize a recovering raid set" :
+				    "Can't resize a reshaping raid set";
 			r = -EPERM;
 			goto bad;
 		}
@@ -2863,8 +2872,12 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 		rs_setup_recovery(rs, MaxSector);
 	} else {
 		rs_set_cur(rs);
-		rs_setup_recovery(rs, test_bit(__CTR_FLAG_SYNC, &rs->ctr_flags) ?
-				      0 : calculated_dev_sectors);
+		if (test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags)) {
+			rs_setup_recovery(rs, MaxSector);
+			set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags);
+		} else
+			rs_setup_recovery(rs, test_bit(__CTR_FLAG_SYNC, &rs->ctr_flags) ?
+					      0 : calculated_dev_sectors);
 	}
 
 	/* If constructor requested it, change data and new_data offsets */
-- 
cgit v1.2.3-70-g09d2


From 345a6cdc250d1519c5d15576965629eab6858257 Mon Sep 17 00:00:00 2001
From: Heinz Mauelshagen <heinzm@redhat.com>
Date: Sat, 25 Jun 2016 02:42:54 +0200
Subject: dm raid: fix rs_is_recovering() to allow for lvextend

Signed-off-by: Heinz Mauelshagen <heinzm@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-raid.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 14b3d93e84cb..1ff469f14ff2 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -372,7 +372,7 @@ static bool rs_is_reshapable(struct raid_set *rs)
 /* Return true, if raid set in @rs is recovering */
 static bool rs_is_recovering(struct raid_set *rs)
 {
-	return rs->md.recovery_cp != MaxSector;
+	return rs->md.recovery_cp < rs->dev[0].rdev.sectors;
 }
 
 /* Return true, if raid set in @rs is reshaping */
@@ -3532,7 +3532,7 @@ static int raid_preresume(struct dm_target *ti)
 
 	rs_set_capacity(rs);
 
-	/* Check for any reshape request and region size change unless new raid set */
+	/* Check for any reshape request unless new raid set */
 	if (test_and_clear_bit(RT_FLAG_RESHAPE_RS, &rs->runtime_flags)) {
 		/* Initiate a reshape. */
 		mddev_lock_nointr(mddev);
-- 
cgit v1.2.3-70-g09d2


From 2a5556c2a86f77ff6085f2cae798728cda47e2c4 Mon Sep 17 00:00:00 2001
From: Heinz Mauelshagen <heinzm@redhat.com>
Date: Mon, 27 Jun 2016 14:44:09 +0200
Subject: dm raid: allow resize during recovery

Resizing a RAID set during recovery can be allowed, because the MD
resynchronization thread will either stop any ongoing recovery in case
of shrinking below the current recovery position or carry on recovery
to the new size if the set is growing.

Signed-off-by: Heinz Mauelshagen <heinzm@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-raid.c | 34 +++++++++++++++++++---------------
 1 file changed, 19 insertions(+), 15 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 1ff469f14ff2..846c58d2bcf9 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -2682,6 +2682,7 @@ static void configure_discard_support(struct raid_set *rs)
 static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 {
 	int r;
+	bool resize = false;
 	struct raid_type *rt;
 	unsigned num_raid_params, num_raid_devs;
 	sector_t calculated_dev_sectors;
@@ -2760,7 +2761,7 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	if (r)
 		goto bad;
 
-	rs_setup_recovery(rs, calculated_dev_sectors);
+	resize = calculated_dev_sectors != rs->dev[0].rdev.sectors;
 
 	INIT_WORK(&rs->md.event_work, do_table_event);
 	ti->private = rs;
@@ -2770,8 +2771,6 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	rs_config_restore(rs, &rs_layout);
 
 	if (test_bit(MD_ARRAY_FIRST_USE, &rs->md.flags)) {
-		set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags);
-		rs_set_new(rs);
 		/* A new raid6 set has to be recovered to ensure proper parity and Q-Syndrome */
 		if (rs_is_raid6(rs) &&
 		    test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags)) {
@@ -2780,16 +2779,18 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 			goto bad;
 		}
 		rs_setup_recovery(rs, 0);
-	} else if (rs_is_recovering(rs) || rs_is_reshaping(rs)) {
-		/* Have to reject size change request during recovery/reshape */
-		if (calculated_dev_sectors != rs->dev[0].rdev.sectors) {
-			ti->error = rs_is_recovering(rs) ?
-				    "Can't resize a recovering raid set" :
-				    "Can't resize a reshaping raid set";
+		set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags);
+		rs_set_new(rs);
+	} else if (rs_is_recovering(rs)) {
+		; /* skip setup rs */
+	} else if (rs_is_reshaping(rs)) {
+		/* Have to reject size change request during reshape */
+		if (resize) {
+			ti->error = "Can't resize a reshaping raid set";
 			r = -EPERM;
 			goto bad;
 		}
-		/* skip setup rs */
+		; /* skip setup rs */
 	} else if (rs_takeover_requested(rs)) {
 		if (rs_is_reshaping(rs)) {
 			ti->error = "Can't takeover a reshaping raid set";
@@ -2799,8 +2800,9 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 
 		/*
 		 * If a takeover is needed, userspace sets any additional
-		 * devices to rebuild, so just set the level to the new
-		 * requested one and allow the raid set to run
+		 * devices to rebuild, so set the level to the new requested
+		 * one, prohibit requesting recovery, allow the raid
+		 * set to run and store superblocks during resume.
 		 */
 		r = rs_check_takeover(rs);
 		if (r)
@@ -2812,6 +2814,7 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 
 		set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags);
 		set_bit(RT_FLAG_KEEP_RS_FROZEN, &rs->runtime_flags);
+		rs_setup_recovery(rs, MaxSector);
 		rs_set_new(rs);
 	} else if (rs_reshape_requested(rs)) {
 		if (rs_is_reshaping(rs)) {
@@ -2868,16 +2871,17 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 		if (rs->md.raid_disks < rs->raid_disks)
 			set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags);
 
-		rs_set_cur(rs);
 		rs_setup_recovery(rs, MaxSector);
-	} else {
 		rs_set_cur(rs);
+	} else {
+		/* May not set recovery when a device rebuild is requested */
 		if (test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags)) {
 			rs_setup_recovery(rs, MaxSector);
 			set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags);
 		} else
 			rs_setup_recovery(rs, test_bit(__CTR_FLAG_SYNC, &rs->ctr_flags) ?
-					      0 : calculated_dev_sectors);
+					      0 : (resize ? calculated_dev_sectors : MaxSector));
+		rs_set_cur(rs);
 	}
 
 	/* If constructor requested it, change data and new_data offsets */
-- 
cgit v1.2.3-70-g09d2


From 469b304b58c417874a68630c5f58cf076a34850c Mon Sep 17 00:00:00 2001
From: Heinz Mauelshagen <heinzm@redhat.com>
Date: Wed, 29 Jun 2016 18:13:58 +0200
Subject: dm raid: enhance reshape check and factor out reshape setup

Enhance rs_reshape_requested() check function to be more transparent and
fix its raid10 check.

Streamline the constructor by factoring out reshaping preparation into
function rs_prepare_reshape().

Signed-off-by: Heinz Mauelshagen <heinzm@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-raid.c | 167 ++++++++++++++++++++++++++++++++-------------------
 1 file changed, 106 insertions(+), 61 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 846c58d2bcf9..473c6d9765f0 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -1700,16 +1700,30 @@ static bool rs_takeover_requested(struct raid_set *rs)
 /* True if @rs is requested to reshape by ctr */
 static bool rs_reshape_requested(struct raid_set *rs)
 {
+	bool change;
 	struct mddev *mddev = &rs->md;
 
+	if (rs_takeover_requested(rs))
+		return false;
+
 	if (!mddev->level)
 		return false;
 
-	return !__is_raid10_far(mddev->new_layout) &&
-	       mddev->new_level == mddev->level &&
-	       (mddev->new_layout != mddev->layout ||
-		mddev->new_chunk_sectors != mddev->chunk_sectors ||
-		rs->raid_disks + rs->delta_disks != mddev->raid_disks);
+	change = mddev->new_layout != mddev->layout ||
+		 mddev->new_chunk_sectors != mddev->chunk_sectors ||
+		 rs->delta_disks;
+
+	/* Historical case to support raid1 reshape without delta disks */
+	if (mddev->level == 1)
+		return !change &&
+		       mddev->raid_disks != rs->raid_disks;
+
+	if (mddev->level == 10)
+		return change &&
+		       !__is_raid10_far(mddev->new_layout) &&
+		       rs->delta_disks >= 0;
+
+	return change;
 }
 
 /*  Features */
@@ -1821,7 +1835,7 @@ static int rs_check_reshape(struct raid_set *rs)
 		rs->ti->error = "Can't reshape degraded raid set";
 	else if (rs_is_recovering(rs))
 		rs->ti->error = "Convert request on recovering raid set prohibited";
-	else if (mddev->reshape_position && rs_is_reshaping(rs))
+	else if (rs_is_reshaping(rs))
 		rs->ti->error = "raid set already reshaping!";
 	else if (!(rs_is_raid10(rs) || rs_is_raid456(rs)))
 		rs->ti->error = "Reshaping only supported for raid4/5/6/10";
@@ -2518,6 +2532,69 @@ static int rs_setup_takeover(struct raid_set *rs)
 	return 0;
 }
 
+/* Prepare @rs for reshape */
+static int rs_prepare_reshape(struct raid_set *rs)
+{
+	bool reshape;
+	struct mddev *mddev = &rs->md;
+
+	if (rs_is_raid10(rs)) {
+		if (rs->raid_disks != mddev->raid_disks &&
+		    __is_raid10_near(mddev->layout) &&
+		    rs->raid10_copies &&
+		    rs->raid10_copies != __raid10_near_copies(mddev->layout)) {
+			/*
+			 * raid disk have to be multiple of data copies to allow this conversion,
+			 *
+			 * This is actually not a reshape it is a
+			 * rebuild of any additional mirrors per group
+			 */
+			if (rs->raid_disks % rs->raid10_copies) {
+				rs->ti->error = "Can't reshape raid10 mirror groups";
+				return -EINVAL;
+			}
+
+			/* Userpace reordered disks to add/remove mirrors -> adjust raid_disk indexes */
+			__reorder_raid_disk_indexes(rs);
+			mddev->layout = raid10_format_to_md_layout(rs, ALGORITHM_RAID10_NEAR,
+								   rs->raid10_copies);
+			mddev->new_layout = mddev->layout;
+			reshape = false;
+		} else
+			reshape = true;
+
+	} else if (rs_is_raid456(rs))
+		reshape = true;
+
+	/*
+	 * HM FIXME: process raid1 via delta_disks as well?
+	 *           Would cause allocations in raid1->check_reshape
+	 *           though, thus more issues with potential failures
+	 */
+	else if (rs_is_raid1(rs)) {
+		set_bit(RT_FLAG_KEEP_RS_FROZEN, &rs->runtime_flags);
+		mddev->raid_disks = rs->raid_disks;
+		reshape = false;
+
+	} else {
+		rs->ti->error = "Called with bogus raid type";
+		return -EINVAL;
+	}
+
+	if (reshape) {
+		set_bit(RT_FLAG_RESHAPE_RS, &rs->runtime_flags);
+		set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags);
+		set_bit(RT_FLAG_KEEP_RS_FROZEN, &rs->runtime_flags);
+	}
+	/* Create new superblocks and bitmaps, if any */
+	if (mddev->raid_disks < rs->raid_disks) {
+		set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags);
+		rs_set_cur(rs);
+	}
+
+	return 0;
+}
+
 /*
  *
  * - change raid layout
@@ -2682,7 +2759,7 @@ static void configure_discard_support(struct raid_set *rs)
 static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 {
 	int r;
-	bool resize = false;
+	bool resize;
 	struct raid_type *rt;
 	unsigned num_raid_params, num_raid_devs;
 	sector_t calculated_dev_sectors;
@@ -2770,6 +2847,12 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	/* Restore any requested new layout for conversion decision */
 	rs_config_restore(rs, &rs_layout);
 
+	/*
+	 * Now that we have any superblock metadata available,
+	 * check for new, recovering, reshaping, to be taken over,
+	 * to be reshaped or an existing, unchanged raid set to
+	 * run in sequence.
+	 */
 	if (test_bit(MD_ARRAY_FIRST_USE, &rs->md.flags)) {
 		/* A new raid6 set has to be recovered to ensure proper parity and Q-Syndrome */
 		if (rs_is_raid6(rs) &&
@@ -2782,6 +2865,7 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 		set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags);
 		rs_set_new(rs);
 	} else if (rs_is_recovering(rs)) {
+		/* A recovering raid set may be resized */
 		; /* skip setup rs */
 	} else if (rs_is_reshaping(rs)) {
 		/* Have to reject size change request during reshape */
@@ -2790,7 +2874,7 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 			r = -EPERM;
 			goto bad;
 		}
-		; /* skip setup rs */
+		/* skip setup rs */
 	} else if (rs_takeover_requested(rs)) {
 		if (rs_is_reshaping(rs)) {
 			ti->error = "Can't takeover a reshaping raid set";
@@ -2800,7 +2884,9 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 
 		/*
 		 * If a takeover is needed, userspace sets any additional
-		 * devices to rebuild, so set the level to the new requested
+		 * devices to rebuild and we can check for a valid request here.
+		 *
+		 * If acceptible, set the level to the new requested
 		 * one, prohibit requesting recovery, allow the raid
 		 * set to run and store superblocks during resume.
 		 */
@@ -2814,63 +2900,22 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 
 		set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags);
 		set_bit(RT_FLAG_KEEP_RS_FROZEN, &rs->runtime_flags);
+		/* Takeover ain't recovery, so disable recovery */
 		rs_setup_recovery(rs, MaxSector);
 		rs_set_new(rs);
 	} else if (rs_reshape_requested(rs)) {
-		if (rs_is_reshaping(rs)) {
-			ti->error = "raid set already reshaping!";
-			r = -EPERM;
-			goto bad;
-		}
-
-		if (rs_is_raid10(rs)) {
-			if (rs->raid_disks != rs->md.raid_disks &&
-			    __is_raid10_near(rs->md.layout) &&
-			    rs->raid10_copies &&
-			    rs->raid10_copies != __raid10_near_copies(rs->md.layout)) {
-				/*
-				 * raid disk have to be multiple of data copies to allow this conversion,
-				 *
-				 * This is actually not a reshape it is a
-				 * rebuild of any additional mirrors per group
-				 */
-				if (rs->raid_disks % rs->raid10_copies) {
-					ti->error = "Can't reshape raid10 mirror groups";
-					r = -EINVAL;
-					goto bad;
-				}
-
-				/* Userpace reordered disks to add/remove mirrors -> adjust raid_disk indexes */
-				__reorder_raid_disk_indexes(rs);
-				rs->md.layout = raid10_format_to_md_layout(rs, ALGORITHM_RAID10_NEAR,
-									   rs->raid10_copies);
-				rs->md.new_layout = rs->md.layout;
-
-			} else
-				set_bit(RT_FLAG_RESHAPE_RS, &rs->runtime_flags);
-
-		} else if (rs_is_raid456(rs))
-			set_bit(RT_FLAG_RESHAPE_RS, &rs->runtime_flags);
-
 		/*
-		 * HM FIXME: process raid1 via delta_disks as well?
-		 *           Would cause allocations in raid1->check_reshape
-		 *           though, thus more issues with potential failures
-		 */
-		else if (rs_is_raid1(rs)) {
-			set_bit(RT_FLAG_KEEP_RS_FROZEN, &rs->runtime_flags);
-			rs->md.raid_disks = rs->raid_disks;
-		}
-
-		if (test_bit(RT_FLAG_RESHAPE_RS, &rs->runtime_flags)) {
-			set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags);
-			set_bit(RT_FLAG_KEEP_RS_FROZEN, &rs->runtime_flags);
-		}
-
-		/* Create new superblocks and bitmaps, if any */
-		if (rs->md.raid_disks < rs->raid_disks)
-			set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags);
+		  * We can only prepare for a reshape here, because the
+		  * raid set needs to run to provide the repective reshape
+		  * check functions via its MD personality instance.
+		  *
+		  * So do the reshape check after md_run() succeeded.
+		  */
+		r = rs_prepare_reshape(rs);
+		if (r)
+			return r;
 
+		/* Reshaping ain't recovery, so disable recovery */
 		rs_setup_recovery(rs, MaxSector);
 		rs_set_cur(rs);
 	} else {
-- 
cgit v1.2.3-70-g09d2


From 7a7c330fc26652f71a4d73986d5308dcfdcef168 Mon Sep 17 00:00:00 2001
From: Heinz Mauelshagen <heinzm@redhat.com>
Date: Thu, 30 Jun 2016 13:57:08 +0200
Subject: dm raid: support delta_disks for raid1, fix table output

Add "delta_disks" constructor argument support to raid1 to allow for
consistent userspace disk addition/removal handling.

Fix raid_status() to report all raid disks with status and table output
on disk adding reshapes, not just the ones listed on the mddev; optimize
its rebuild and writemostly output.

Signed-off-by: Heinz Mauelshagen <heinzm@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-raid.c | 92 ++++++++++++++++++++++++++++------------------------
 1 file changed, 49 insertions(+), 43 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 473c6d9765f0..4caf51fe001e 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -132,6 +132,7 @@ struct raid_dev {
 				 CTR_FLAG_MAX_RECOVERY_RATE | \
 				 CTR_FLAG_MAX_WRITE_BEHIND | \
 				 CTR_FLAG_REGION_SIZE | \
+				 CTR_FLAG_DELTA_DISKS | \
 				 CTR_FLAG_DATA_OFFSET)
 
 /* "raid10" does not accept any raid1 or stripe cache options */
@@ -1714,9 +1715,13 @@ static bool rs_reshape_requested(struct raid_set *rs)
 		 rs->delta_disks;
 
 	/* Historical case to support raid1 reshape without delta disks */
-	if (mddev->level == 1)
+	if (mddev->level == 1) {
+		if (rs->delta_disks)
+			return !!rs->delta_disks;
+
 		return !change &&
 		       mddev->raid_disks != rs->raid_disks;
+	}
 
 	if (mddev->level == 10)
 		return change &&
@@ -1837,8 +1842,8 @@ static int rs_check_reshape(struct raid_set *rs)
 		rs->ti->error = "Convert request on recovering raid set prohibited";
 	else if (rs_is_reshaping(rs))
 		rs->ti->error = "raid set already reshaping!";
-	else if (!(rs_is_raid10(rs) || rs_is_raid456(rs)))
-		rs->ti->error = "Reshaping only supported for raid4/5/6/10";
+	else if (!(rs_is_raid1(rs) || rs_is_raid10(rs) || rs_is_raid456(rs)))
+		rs->ti->error = "Reshaping only supported for raid1/4/5/6/10";
 	else
 		return 0;
 
@@ -2566,16 +2571,17 @@ static int rs_prepare_reshape(struct raid_set *rs)
 	} else if (rs_is_raid456(rs))
 		reshape = true;
 
-	/*
-	 * HM FIXME: process raid1 via delta_disks as well?
-	 *           Would cause allocations in raid1->check_reshape
-	 *           though, thus more issues with potential failures
-	 */
 	else if (rs_is_raid1(rs)) {
-		set_bit(RT_FLAG_KEEP_RS_FROZEN, &rs->runtime_flags);
-		mddev->raid_disks = rs->raid_disks;
-		reshape = false;
-
+		if (rs->delta_disks) {
+			/* Process raid1 via delta_disks */
+			mddev->degraded = rs->delta_disks < 0 ? -rs->delta_disks : rs->delta_disks;
+			reshape = true;
+		} else {
+			/* Process raid1 without delta_disks */
+			mddev->raid_disks = rs->raid_disks;
+			set_bit(RT_FLAG_KEEP_RS_FROZEN, &rs->runtime_flags);
+			reshape = false;
+		}
 	} else {
 		rs->ti->error = "Called with bogus raid type";
 		return -EINVAL;
@@ -2585,12 +2591,9 @@ static int rs_prepare_reshape(struct raid_set *rs)
 		set_bit(RT_FLAG_RESHAPE_RS, &rs->runtime_flags);
 		set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags);
 		set_bit(RT_FLAG_KEEP_RS_FROZEN, &rs->runtime_flags);
-	}
-	/* Create new superblocks and bitmaps, if any */
-	if (mddev->raid_disks < rs->raid_disks) {
+	} else if (mddev->raid_disks < rs->raid_disks)
+		/* Create new superblocks and bitmaps, if any new disks */
 		set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags);
-		rs_set_cur(rs);
-	}
 
 	return 0;
 }
@@ -2656,7 +2659,7 @@ static int rs_setup_reshape(struct raid_set *rs)
 			rdev->raid_disk = d;
 
 			rdev->sectors = mddev->dev_sectors;
-			rdev->recovery_offset = MaxSector;
+			rdev->recovery_offset = rs_is_raid1(rs) ? 0 : MaxSector;
 		}
 
 		mddev->reshape_backwards = 0; /* adding disks -> forward reshape */
@@ -2971,10 +2974,12 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 		/* Restore new, ctr requested layout to perform check */
 		rs_config_restore(rs, &rs_layout);
 
-		r = rs->md.pers->check_reshape(&rs->md);
-		if (r) {
-			ti->error = "Reshape check failed";
-			goto bad_check_reshape;
+		if (rs->md.pers->start_reshape) {
+			r = rs->md.pers->check_reshape(&rs->md);
+			if (r) {
+				ti->error = "Reshape check failed";
+				goto bad_check_reshape;
+			}
 		}
 	}
 
@@ -3150,10 +3155,11 @@ static void raid_status(struct dm_target *ti, status_type_t type,
 	struct raid_set *rs = ti->private;
 	struct mddev *mddev = &rs->md;
 	struct r5conf *conf = mddev->private;
-	int max_nr_stripes = conf ? conf->max_nr_stripes : 0;
+	int i, max_nr_stripes = conf ? conf->max_nr_stripes : 0;
 	bool array_in_sync;
 	unsigned int raid_param_cnt = 1; /* at least 1 for chunksize */
 	unsigned int sz = 0;
+	unsigned int rebuild_disks;
 	unsigned int write_mostly_params = 0;
 	sector_t progress, resync_max_sectors, resync_mismatches;
 	const char *sync_action;
@@ -3181,7 +3187,8 @@ static void raid_status(struct dm_target *ti, status_type_t type,
 
 		/* HM FIXME: do we want another state char for raid0? It shows 'D' or 'A' now */
 		rdev_for_each(rdev, mddev)
-			DMEMIT(__raid_dev_status(rdev, array_in_sync));
+		for (i = 0; i < rs->raid_disks; i++)
+			DMEMIT(__raid_dev_status(&rs->dev[i].rdev, array_in_sync));
 
 		/*
 		 * In-sync/Reshape ratio:
@@ -3233,11 +3240,11 @@ static void raid_status(struct dm_target *ti, status_type_t type,
 		/* Report the table line string you would use to construct this raid set */
 
 		/* Calculate raid parameter count */
-		rdev_for_each(rdev, mddev)
-			if (test_bit(WriteMostly, &rdev->flags))
+		for (i = 0; i < rs->raid_disks; i++)
+			if (test_bit(WriteMostly, &rs->dev[i].rdev.flags))
 				write_mostly_params += 2;
-		raid_param_cnt += memweight(rs->rebuild_disks,
-					    DISKS_ARRAY_ELEMS * sizeof(*rs->rebuild_disks)) * 2 +
+		rebuild_disks = memweight(rs->rebuild_disks, DISKS_ARRAY_ELEMS * sizeof(*rs->rebuild_disks));
+		raid_param_cnt += rebuild_disks * 2 +
 				  write_mostly_params +
 				  hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_NO_ARGS) +
 				  hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_ONE_ARG) * 2;
@@ -3264,18 +3271,20 @@ static void raid_status(struct dm_target *ti, status_type_t type,
 					  mddev->bitmap_info.daemon_sleep);
 		if (test_bit(__CTR_FLAG_DELTA_DISKS, &rs->ctr_flags))
 			DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_DELTA_DISKS),
-					 mddev->delta_disks);
+					 max(rs->delta_disks, mddev->delta_disks));
 		if (test_bit(__CTR_FLAG_STRIPE_CACHE, &rs->ctr_flags))
 			DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_STRIPE_CACHE),
 					 max_nr_stripes);
-		rdev_for_each(rdev, mddev)
-			if (test_bit(rdev->raid_disk, (void *) rs->rebuild_disks))
-				DMEMIT(" %s %u", dm_raid_arg_name_by_flag(CTR_FLAG_REBUILD),
-						 rdev->raid_disk);
-		rdev_for_each(rdev, mddev)
-			if (test_bit(WriteMostly, &rdev->flags))
-				DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_WRITE_MOSTLY),
-						 rdev->raid_disk);
+		if (rebuild_disks)
+			for (i = 0; i < rs->raid_disks; i++)
+				if (test_bit(rs->dev[i].rdev.raid_disk, (void *) rs->rebuild_disks))
+					DMEMIT(" %s %u", dm_raid_arg_name_by_flag(CTR_FLAG_REBUILD),
+							 rs->dev[i].rdev.raid_disk);
+		if (write_mostly_params)
+			for (i = 0; i < rs->raid_disks; i++)
+				if (test_bit(WriteMostly, &rs->dev[i].rdev.flags))
+					DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_WRITE_MOSTLY),
+					       rs->dev[i].rdev.raid_disk);
 		if (test_bit(__CTR_FLAG_MAX_WRITE_BEHIND, &rs->ctr_flags))
 			DMEMIT(" %s %lu", dm_raid_arg_name_by_flag(CTR_FLAG_MAX_WRITE_BEHIND),
 					  mddev->bitmap_info.max_write_behind);
@@ -3286,12 +3295,9 @@ static void raid_status(struct dm_target *ti, status_type_t type,
 			DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_MIN_RECOVERY_RATE),
 					 mddev->sync_speed_min);
 		DMEMIT(" %d", rs->raid_disks);
-		rdev_for_each(rdev, mddev) {
-			struct raid_dev *rd = container_of(rdev, struct raid_dev, rdev);
-
-			DMEMIT(" %s %s", __get_dev_name(rd->meta_dev),
-					 __get_dev_name(rd->data_dev));
-		}
+		for (i = 0; i < rs->raid_disks; i++)
+			DMEMIT(" %s %s", __get_dev_name(rs->dev[i].meta_dev),
+					 __get_dev_name(rs->dev[i].data_dev));
 	}
 }
 
-- 
cgit v1.2.3-70-g09d2


From ffeeac75157e48cf135c4c8b0c8377dd312e9036 Mon Sep 17 00:00:00 2001
From: Heinz Mauelshagen <heinzm@redhat.com>
Date: Thu, 30 Jun 2016 14:37:50 +0200
Subject: dm raid: use rs->raid_disks to avoid memory leaks on free

Also makes code more consistent throughout.

Signed-off-by: Heinz Mauelshagen <heinzm@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-raid.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 4caf51fe001e..2ea3982dace9 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -717,7 +717,7 @@ static void raid_set_free(struct raid_set *rs)
 {
 	int i;
 
-	for (i = 0; i < rs->md.raid_disks; i++) {
+	for (i = 0; i < rs->raid_disks; i++) {
 		if (rs->dev[i].meta_dev)
 			dm_put_device(rs->ti, rs->dev[i].meta_dev);
 		md_rdev_clear(&rs->dev[i].rdev);
@@ -757,7 +757,7 @@ static int parse_dev_params(struct raid_set *rs, struct dm_arg_set *as)
 	if (!arg)
 		return -EINVAL;
 
-	for (i = 0; i < rs->md.raid_disks; i++) {
+	for (i = 0; i < rs->raid_disks; i++) {
 		rs->dev[i].rdev.raid_disk = i;
 
 		rs->dev[i].meta_dev = NULL;
@@ -961,7 +961,7 @@ static int validate_raid_redundancy(struct raid_set *rs)
 		 *	    C	 D    D	   E	E
 		 */
 		if (__is_raid10_near(rs->md.new_layout)) {
-			for (i = 0; i < rs->raid_disks; i++) {
+			for (i = 0; i < rs->md.raid_disks; i++) {
 				if (!(i % copies))
 					rebuilds_per_group = 0;
 				if ((!rs->dev[i].rdev.sb_page ||
@@ -1085,7 +1085,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
 	 * What is found in the superblocks of the devices is always
 	 * authoritative, unless 'rebuild' or '[no]sync' was specified.
 	 */
-	for (i = 0; i < rs->md.raid_disks; i++) {
+	for (i = 0; i < rs->raid_disks; i++) {
 		set_bit(In_sync, &rs->dev[i].rdev.flags);
 		rs->dev[i].rdev.recovery_offset = MaxSector;
 	}
@@ -2714,7 +2714,7 @@ static void configure_discard_support(struct raid_set *rs)
 	/* RAID level 4,5,6 require discard_zeroes_data for data integrity! */
 	raid456 = (rs->md.level == 4 || rs->md.level == 5 || rs->md.level == 6);
 
-	for (i = 0; i < rs->md.raid_disks; i++) {
+	for (i = 0; i < rs->raid_disks; i++) {
 		struct request_queue *q;
 
 		if (!rs->dev[i].rdev.bdev)
@@ -3186,7 +3186,6 @@ static void raid_status(struct dm_target *ti, status_type_t type,
 		sync_action = decipher_sync_action(&rs->md);
 
 		/* HM FIXME: do we want another state char for raid0? It shows 'D' or 'A' now */
-		rdev_for_each(rdev, mddev)
 		for (i = 0; i < rs->raid_disks; i++)
 			DMEMIT(__raid_dev_status(&rs->dev[i].rdev, array_in_sync));
 
-- 
cgit v1.2.3-70-g09d2


From 326824099fa4a3989ce649449021545397b462cb Mon Sep 17 00:00:00 2001
From: Heinz Mauelshagen <heinzm@redhat.com>
Date: Thu, 30 Jun 2016 21:32:20 +0200
Subject: dm raid: use rdev_for_each in status

Signed-off-by: Heinz Mauelshagen <heinzm@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-raid.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 2ea3982dace9..e4d0bc185c99 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -3186,8 +3186,8 @@ static void raid_status(struct dm_target *ti, status_type_t type,
 		sync_action = decipher_sync_action(&rs->md);
 
 		/* HM FIXME: do we want another state char for raid0? It shows 'D' or 'A' now */
-		for (i = 0; i < rs->raid_disks; i++)
-			DMEMIT(__raid_dev_status(&rs->dev[i].rdev, array_in_sync));
+		rdev_for_each(rdev, mddev)
+			DMEMIT(__raid_dev_status(rdev, array_in_sync));
 
 		/*
 		 * In-sync/Reshape ratio:
-- 
cgit v1.2.3-70-g09d2


From d7ccc2e2a0e5a4cdd024f4a1349a033f7cd3aa7d Mon Sep 17 00:00:00 2001
From: Heinz Mauelshagen <heinzm@redhat.com>
Date: Wed, 6 Jul 2016 18:29:22 +0200
Subject: dm raid: change logical functions to actually return bool

Signed-off-by: Heinz Mauelshagen <heinzm@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-raid.c | 29 ++++++++++++++---------------
 1 file changed, 14 insertions(+), 15 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index e4d0bc185c99..0aaf4ef7152c 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -69,7 +69,7 @@ struct raid_dev {
 #define __CTR_FLAG_RAID10_COPIES	10 /* 2 */ /* Only with raid10 */
 #define __CTR_FLAG_RAID10_FORMAT	11 /* 2 */ /* Only with raid10 */
 /* New for v1.9.0 */
-#define __CTR_FLAG_DELTA_DISKS		12 /* 2 */ /* Only with reshapable raid4/5/6/10! */
+#define __CTR_FLAG_DELTA_DISKS		12 /* 2 */ /* Only with reshapable raid1/4/5/6/10! */
 #define __CTR_FLAG_DATA_OFFSET		13 /* 2 */ /* Only with reshapable raid4/5/6/10! */
 #define __CTR_FLAG_RAID10_USE_NEAR_SETS 14 /* 2 */ /* Only with raid10! */
 
@@ -193,7 +193,7 @@ struct raid_dev {
 #define RT_FLAG_RESHAPE_RS		4
 #define RT_FLAG_KEEP_RS_FROZEN		5
 
-/* Array elements of 64 bit needed for rebuild/write_mostly bits */
+/* Array elements of 64 bit needed for rebuild/failed disk bits */
 #define DISKS_ARRAY_ELEMS ((MAX_RAID_DEVICES + (sizeof(uint64_t) * 8 - 1)) / sizeof(uint64_t) / 8)
 
 /*
@@ -328,8 +328,8 @@ static const char *dm_raid_arg_name_by_flag(const uint32_t flag)
 }
 
 /*
- * bool helpers to test for various raid levels of a raid set,
- * is. it's level as reported by the superblock rather than
+ * Bool helpers to test for various raid levels of a raid set.
+ * It's level as reported by the superblock rather than
  * the requested raid_type passed to the constructor.
  */
 /* Return true, if raid set in @rs is raid0 */
@@ -363,7 +363,7 @@ static bool rs_is_raid456(struct raid_set *rs)
 }
 
 /* Return true, if raid set in @rs is reshapable */
-static unsigned int __is_raid10_far(int layout);
+static bool __is_raid10_far(int layout);
 static bool rs_is_reshapable(struct raid_set *rs)
 {
 	return rs_is_raid456(rs) ||
@@ -383,7 +383,7 @@ static bool rs_is_reshaping(struct raid_set *rs)
 }
 
 /*
- * bool helpers to test for various raid levels of a raid type
+ * bool helpers to test for various raid levels of a raid type @rt
  */
 
 /* Return true, if raid type in @rt is raid0 */
@@ -437,7 +437,7 @@ static unsigned long __valid_flags(struct raid_set *rs)
 	else if (rt_is_raid6(rs->raid_type))
 		return RAID6_VALID_FLAGS;
 
-	return ~0;
+	return 0;
 }
 
 /*
@@ -474,19 +474,19 @@ static unsigned int __raid10_far_copies(int layout)
 }
 
 /* Return true if md raid10 offset for @layout */
-static unsigned int __is_raid10_offset(int layout)
+static bool __is_raid10_offset(int layout)
 {
-	return layout & RAID10_OFFSET;
+	return !!(layout & RAID10_OFFSET);
 }
 
 /* Return true if md raid10 near for @layout */
-static unsigned int __is_raid10_near(int layout)
+static bool __is_raid10_near(int layout)
 {
 	return !__is_raid10_offset(layout) && __raid10_near_copies(layout) > 1;
 }
 
 /* Return true if md raid10 far for @layout */
-static unsigned int __is_raid10_far(int layout)
+static bool __is_raid10_far(int layout)
 {
 	return !__is_raid10_offset(layout) && __raid10_far_copies(layout) > 1;
 }
@@ -527,8 +527,7 @@ static int raid10_name_to_format(const char *name)
 /* Return md raid10 copies for @layout */
 static unsigned int raid10_md_layout_to_copies(int layout)
 {
-	return __raid10_near_copies(layout) > 1 ?
-		__raid10_near_copies(layout) : __raid10_far_copies(layout);
+	return max(__raid10_near_copies(layout), __raid10_far_copies(layout));
 }
 
 /* Return md raid10 format id for @format string */
@@ -570,7 +569,7 @@ static int raid10_format_to_md_layout(struct raid_set *rs,
 /* END: MD raid10 bit definitions and helpers */
 
 /* Check for any of the raid10 algorithms */
-static int __got_raid10(struct raid_type *rtp, const int layout)
+static bool __got_raid10(struct raid_type *rtp, const int layout)
 {
 	if (rtp->level == 10) {
 		switch (rtp->algorithm) {
@@ -586,7 +585,7 @@ static int __got_raid10(struct raid_type *rtp, const int layout)
 		}
 	}
 
-	return 0;
+	return false;
 }
 
 /* Return raid_type for @name */
-- 
cgit v1.2.3-70-g09d2


From bd9f55ea1cf6e14eb054b06ea877d2d1fa339514 Mon Sep 17 00:00:00 2001
From: Tahsin Erdogan <tahsin@google.com>
Date: Fri, 15 Jul 2016 06:27:08 -0700
Subject: dm: fix second blk_delay_queue() parameter to be in msec units not
 jiffies

Commit d548b34b062 ("dm: reduce the queue delay used in dm_request_fn
from 100ms to 10ms") always intended the value to be 10 msecs -- it
just expressed it in jiffies because earlier commit 7eaceaccab ("block:
remove per-queue plugging") did.

Signed-off-by: Tahsin Erdogan <tahsin@google.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Fixes: d548b34b062 ("dm: reduce the queue delay used in dm_request_fn from 100ms to 10ms")
Cc: stable@vger.kernel.org # 4.1+ -- stable@ backports must be applied to drivers/md/dm.c
---
 drivers/md/dm-rq.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index aa81539374a6..7a9661868496 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -785,7 +785,7 @@ static void dm_old_request_fn(struct request_queue *q)
 		     md_in_flight(md) && rq->bio && rq->bio->bi_vcnt == 1 &&
 		     md->last_rq_pos == pos && md->last_rq_rw == rq_data_dir(rq)) ||
 		    (ti->type->busy && ti->type->busy(ti))) {
-			blk_delay_queue(q, HZ / 100);
+			blk_delay_queue(q, 10);
 			return;
 		}
 
-- 
cgit v1.2.3-70-g09d2


From 9c72bad1f31af96d9012025639552cd5732bb0a5 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 8 Jul 2016 21:23:51 +0900
Subject: dm: call PR reserve/unreserve on each underlying device

So far we tried to rely on the SCSI 'all target ports' bit to register
all paths, but for many setups this didn't work properly as the different
paths are seen as separate initiators to the target instead of multiple
ports of the same initiator.  Because of that we'll stop setting the
'all target ports' bit in SCSI, and let device mapper handle iterating
over the underlying device of each path and registering them manually.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Mike Christie <mchristi@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 65 insertions(+), 15 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 2c907bc10fe9..7538b8972820 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -2521,26 +2521,76 @@ void dm_free_md_mempools(struct dm_md_mempools *pools)
 	kfree(pools);
 }
 
-static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key,
-			  u32 flags)
+struct dm_pr {
+	u64	old_key;
+	u64	new_key;
+	u32	flags;
+	bool	fail_early;
+};
+
+static int dm_call_pr(struct block_device *bdev, iterate_devices_callout_fn fn,
+		      void *data)
 {
 	struct mapped_device *md = bdev->bd_disk->private_data;
-	const struct pr_ops *ops;
-	fmode_t mode;
-	int r;
+	struct dm_table *table;
+	struct dm_target *ti;
+	int ret = -ENOTTY, srcu_idx;
 
-	r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);
-	if (r < 0)
-		return r;
+	table = dm_get_live_table(md, &srcu_idx);
+	if (!table || !dm_table_get_size(table))
+		goto out;
 
-	ops = bdev->bd_disk->fops->pr_ops;
-	if (ops && ops->pr_register)
-		r = ops->pr_register(bdev, old_key, new_key, flags);
-	else
-		r = -EOPNOTSUPP;
+	/* We only support devices that have a single target */
+	if (dm_table_get_num_targets(table) != 1)
+		goto out;
+	ti = dm_table_get_target(table, 0);
 
-	bdput(bdev);
-	return r;
+	ret = -EINVAL;
+	if (!ti->type->iterate_devices)
+		goto out;
+
+	ret = ti->type->iterate_devices(ti, fn, data);
+out:
+	dm_put_live_table(md, srcu_idx);
+	return ret;
+}
+
+/*
+ * For register / unregister we need to manually call out to every path.
+ */
+static int __dm_pr_register(struct dm_target *ti, struct dm_dev *dev,
+			    sector_t start, sector_t len, void *data)
+{
+	struct dm_pr *pr = data;
+	const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops;
+
+	if (!ops || !ops->pr_register)
+		return -EOPNOTSUPP;
+	return ops->pr_register(dev->bdev, pr->old_key, pr->new_key, pr->flags);
+}
+
+static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key,
+			  u32 flags)
+{
+	struct dm_pr pr = {
+		.old_key	= old_key,
+		.new_key	= new_key,
+		.flags		= flags,
+		.fail_early	= true,
+	};
+	int ret;
+
+	ret = dm_call_pr(bdev, __dm_pr_register, &pr);
+	if (ret && new_key) {
+		/* unregister all paths if we failed to register any path */
+		pr.old_key = new_key;
+		pr.new_key = 0;
+		pr.flags = 0;
+		pr.fail_early = false;
+		dm_call_pr(bdev, __dm_pr_register, &pr);
+	}
+
+	return ret;
 }
 
 static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type,
-- 
cgit v1.2.3-70-g09d2


From 094f394df6a33f959888d445b362a9086823a2fb Mon Sep 17 00:00:00 2001
From: Heinz Mauelshagen <heinzm@redhat.com>
Date: Tue, 19 Jul 2016 14:03:51 +0200
Subject: dm raid: address checkpatch.pl complaints

Use 'unsigned int' where appropriate.
Return negative errors.
Correct an indentation.

Signed-off-by: Heinz Mauelshagen <heinzm@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-raid.c | 42 +++++++++++++++++++++---------------------
 1 file changed, 21 insertions(+), 21 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 0aaf4ef7152c..10c136f789b9 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -256,10 +256,10 @@ static void rs_config_restore(struct raid_set *rs, struct rs_layout *l)
 static struct raid_type {
 	const char *name;		/* RAID algorithm. */
 	const char *descr;		/* Descriptor text for logging. */
-	const unsigned parity_devs;	/* # of parity devices. */
-	const unsigned minimal_devs;	/* minimal # of devices in set. */
-	const unsigned level;		/* RAID level. */
-	const unsigned algorithm;	/* RAID algorithm. */
+	const unsigned int parity_devs;	/* # of parity devices. */
+	const unsigned int minimal_devs;/* minimal # of devices in set. */
+	const unsigned int level;	/* RAID level. */
+	const unsigned int algorithm;	/* RAID algorithm. */
 } raid_types[] = {
 	{"raid0",	  "raid0 (striping)",			    0, 2, 0,  0 /* NONE */},
 	{"raid1",	  "raid1 (mirroring)",			    0, 2, 1,  0 /* NONE */},
@@ -665,9 +665,9 @@ static void rs_set_new(struct raid_set *rs)
 }
 
 static struct raid_set *raid_set_alloc(struct dm_target *ti, struct raid_type *raid_type,
-				       unsigned raid_devs)
+				       unsigned int raid_devs)
 {
-	unsigned i;
+	unsigned int i;
 	struct raid_set *rs;
 
 	if (raid_devs <= raid_type->parity_devs) {
@@ -920,9 +920,9 @@ static int validate_region_size(struct raid_set *rs, unsigned long region_size)
  */
 static int validate_raid_redundancy(struct raid_set *rs)
 {
-	unsigned i, rebuild_cnt = 0;
-	unsigned rebuilds_per_group = 0, copies;
-	unsigned group_size, last_group_start;
+	unsigned int i, rebuild_cnt = 0;
+	unsigned int rebuilds_per_group = 0, copies;
+	unsigned int group_size, last_group_start;
 
 	for (i = 0; i < rs->md.raid_disks; i++)
 		if (!test_bit(In_sync, &rs->dev[i].rdev.flags) ||
@@ -1030,12 +1030,12 @@ too_many:
  *    [raid10_format <near|far|offset>] Layout algorithm.  (Default: near)
  */
 static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
-			     unsigned num_raid_params)
+			     unsigned int num_raid_params)
 {
 	int value, raid10_format = ALGORITHM_RAID10_DEFAULT;
-	unsigned raid10_copies = 2;
-	unsigned i, write_mostly = 0;
-	unsigned region_size = 0;
+	unsigned int raid10_copies = 2;
+	unsigned int i, write_mostly = 0;
+	unsigned int region_size = 0;
 	sector_t max_io_len;
 	const char *arg, *key;
 	struct raid_dev *rd;
@@ -1447,7 +1447,7 @@ static int rs_set_dev_and_array_sectors(struct raid_set *rs, bool use_mddev)
 		if (rs->raid10_copies < 2 ||
 		    delta_disks < 0) {
 			rs->ti->error = "Bogus raid10 data copies or delta disks";
-			return EINVAL;
+			return -EINVAL;
 		}
 
 		dev_sectors *= rs->raid10_copies;
@@ -1474,7 +1474,7 @@ static int rs_set_dev_and_array_sectors(struct raid_set *rs, bool use_mddev)
 	return 0;
 bad:
 	rs->ti->error = "Target length not divisible by number of data devices";
-	return EINVAL;
+	return -EINVAL;
 }
 
 /* Setup recovery on @rs */
@@ -2511,7 +2511,7 @@ static int rs_setup_takeover(struct raid_set *rs)
 			/* raid1 -> raid10_near layout */
 			mddev->layout = raid10_format_to_md_layout(rs, ALGORITHM_RAID10_NEAR,
 								   rs->raid_disks);
-		 else
+		else
 			return -EINVAL;
 
 	}
@@ -2758,12 +2758,12 @@ static void configure_discard_support(struct raid_set *rs)
  * enforce recreation based on the passed in table parameters.
  *
  */
-static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
+static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 {
 	int r;
 	bool resize;
 	struct raid_type *rt;
-	unsigned num_raid_params, num_raid_devs;
+	unsigned int num_raid_params, num_raid_devs;
 	sector_t calculated_dev_sectors;
 	struct raid_set *rs = NULL;
 	const char *arg;
@@ -3299,7 +3299,7 @@ static void raid_status(struct dm_target *ti, status_type_t type,
 	}
 }
 
-static int raid_message(struct dm_target *ti, unsigned argc, char **argv)
+static int raid_message(struct dm_target *ti, unsigned int argc, char **argv)
 {
 	struct raid_set *rs = ti->private;
 	struct mddev *mddev = &rs->md;
@@ -3351,7 +3351,7 @@ static int raid_iterate_devices(struct dm_target *ti,
 				iterate_devices_callout_fn fn, void *data)
 {
 	struct raid_set *rs = ti->private;
-	unsigned i;
+	unsigned int i;
 	int r = 0;
 
 	for (i = 0; !r && i < rs->md.raid_disks; i++)
@@ -3368,7 +3368,7 @@ static int raid_iterate_devices(struct dm_target *ti,
 static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits)
 {
 	struct raid_set *rs = ti->private;
-	unsigned chunk_size = rs->md.chunk_sectors << 9;
+	unsigned int chunk_size = rs->md.chunk_sectors << 9;
 	struct r5conf *conf = rs->md.private;
 
 	blk_limits_io_min(limits, chunk_size);
-- 
cgit v1.2.3-70-g09d2


From 89d3d9a1e38a6bd453038cfdc7e1576ef2d19719 Mon Sep 17 00:00:00 2001
From: Heinz Mauelshagen <heinzm@redhat.com>
Date: Tue, 19 Jul 2016 13:16:24 +0200
Subject: dm raid: fix random optimal_io_size for raid0

raid_io_hints() was retrieving the number of data stripes used for the
calculation of io_opt from struct r5conf, which is not defined for raid0
mappings.

Base the calculation on the in-core raid_set structure instead.

Also, adjust to use to_bytes() for the sector -> bytes conversion
throughout.

Signed-off-by: Heinz Mauelshagen <heinzm@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-raid.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 10c136f789b9..84983549b5e1 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -904,7 +904,7 @@ static int validate_region_size(struct raid_set *rs, unsigned long region_size)
 	/*
 	 * Convert sectors to bytes.
 	 */
-	rs->md.bitmap_info.chunksize = (region_size << 9);
+	rs->md.bitmap_info.chunksize = to_bytes(region_size);
 
 	return 0;
 }
@@ -3368,11 +3368,10 @@ static int raid_iterate_devices(struct dm_target *ti,
 static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits)
 {
 	struct raid_set *rs = ti->private;
-	unsigned int chunk_size = rs->md.chunk_sectors << 9;
-	struct r5conf *conf = rs->md.private;
+	unsigned int chunk_size = to_bytes(rs->md.chunk_sectors);
 
 	blk_limits_io_min(limits, chunk_size);
-	blk_limits_io_opt(limits, chunk_size * (conf->raid_disks - conf->max_degraded));
+	blk_limits_io_opt(limits, chunk_size * mddev_data_stripes(rs));
 }
 
 static void raid_presuspend(struct dm_target *ti)
-- 
cgit v1.2.3-70-g09d2


From e7e0f730477dea190fbc18c2d93338dacee82cea Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Fri, 1 Jul 2016 11:09:13 +0100
Subject: dm btree: fix a bug in dm_btree_lookup_next_single()

dm_btree_lookup_next_single() can short-circuit the search for a block
with a return of -ENODATA if all entries are higher than the search key
passed to lower_bound().

This hasn't been a problem because of the way the btree has been used by
DM thinp.  But it must be fixed now in preparation for fixing the race
in DM thinp's handling of simultaneous block discard vs allocation.
Otherwise, once that fix is in place, some of the blocks in a discard
would not be unmapped as expected.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/persistent-data/dm-btree.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c
index ea3d3b656fd0..2cc1877804c2 100644
--- a/drivers/md/persistent-data/dm-btree.c
+++ b/drivers/md/persistent-data/dm-btree.c
@@ -429,7 +429,14 @@ static int dm_btree_lookup_next_single(struct dm_btree_info *info, dm_block_t ro
 
 	if (flags & INTERNAL_NODE) {
 		i = lower_bound(n, key);
-		if (i < 0 || i >= nr_entries) {
+		if (i < 0) {
+			/*
+			 * avoid early -ENODATA return when all entries are
+			 * higher than the search @key.
+			 */
+			i = 0;
+		}
+		if (i >= nr_entries) {
 			r = -ENODATA;
 			goto out;
 		}
-- 
cgit v1.2.3-70-g09d2


From 2a0fbffb1e50939a969d5efe495667a3aa0f72f7 Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Fri, 1 Jul 2016 14:00:02 +0100
Subject: dm thin: fix a race condition between discarding and provisioning a
 block

The discard passdown was being issued after the block was unmapped,
which meant the block could be reprovisioned whilst the passdown discard
was still in flight.

We can only identify unshared blocks (those it is safe to pass a discard
down to) once they're unmapped and their ref count hits zero.  Block ref
counts are now used to guard against concurrent allocation of the
blocks that are being discarded.  So now we unmap the block, issue
passdown discards, and then immediately increment ref counts for regions
that have been discarded via passdown (this is safe because
allocation occurs within the same thread).  We then decrement ref counts
once the passdown discard IO is complete -- signaling these blocks may
now be allocated.

This fixes the potential for corruption that was reported here:
https://www.redhat.com/archives/dm-devel/2016-June/msg00311.html

Reported-by: Dennis Yang <dennisyang@qnap.com>
Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-thin-metadata.c |  30 +++++++++++++
 drivers/md/dm-thin-metadata.h |   3 ++
 drivers/md/dm-thin.c          | 102 +++++++++++++++++++++++++++++++++++++-----
 3 files changed, 124 insertions(+), 11 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index 43824d73366d..a15091a0d40c 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -1677,6 +1677,36 @@ int dm_pool_block_is_used(struct dm_pool_metadata *pmd, dm_block_t b, bool *resu
 	return r;
 }
 
+int dm_pool_inc_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_t e)
+{
+	int r = 0;
+
+	down_write(&pmd->root_lock);
+	for (; b != e; b++) {
+		r = dm_sm_inc_block(pmd->data_sm, b);
+		if (r)
+			break;
+	}
+	up_write(&pmd->root_lock);
+
+	return r;
+}
+
+int dm_pool_dec_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_t e)
+{
+	int r = 0;
+
+	down_write(&pmd->root_lock);
+	for (; b != e; b++) {
+		r = dm_sm_dec_block(pmd->data_sm, b);
+		if (r)
+			break;
+	}
+	up_write(&pmd->root_lock);
+
+	return r;
+}
+
 bool dm_thin_changed_this_transaction(struct dm_thin_device *td)
 {
 	int r;
diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h
index a938babe4258..35e954ea20a9 100644
--- a/drivers/md/dm-thin-metadata.h
+++ b/drivers/md/dm-thin-metadata.h
@@ -197,6 +197,9 @@ int dm_pool_get_data_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result);
 
 int dm_pool_block_is_used(struct dm_pool_metadata *pmd, dm_block_t b, bool *result);
 
+int dm_pool_inc_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_t e);
+int dm_pool_dec_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_t e);
+
 /*
  * Returns -ENOSPC if the new size is too small and already allocated
  * blocks would be lost.
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 5f9e3d799d66..197ea2003400 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -253,6 +253,7 @@ struct pool {
 	struct bio_list deferred_flush_bios;
 	struct list_head prepared_mappings;
 	struct list_head prepared_discards;
+	struct list_head prepared_discards_pt2;
 	struct list_head active_thins;
 
 	struct dm_deferred_set *shared_read_ds;
@@ -269,6 +270,7 @@ struct pool {
 
 	process_mapping_fn process_prepared_mapping;
 	process_mapping_fn process_prepared_discard;
+	process_mapping_fn process_prepared_discard_pt2;
 
 	struct dm_bio_prison_cell **cell_sort_array;
 };
@@ -1001,7 +1003,8 @@ static void process_prepared_discard_no_passdown(struct dm_thin_new_mapping *m)
 
 /*----------------------------------------------------------------*/
 
-static void passdown_double_checking_shared_status(struct dm_thin_new_mapping *m)
+static void passdown_double_checking_shared_status(struct dm_thin_new_mapping *m,
+						   struct bio *discard_parent)
 {
 	/*
 	 * We've already unmapped this range of blocks, but before we
@@ -1014,7 +1017,7 @@ static void passdown_double_checking_shared_status(struct dm_thin_new_mapping *m
 	dm_block_t b = m->data_block, e, end = m->data_block + m->virt_end - m->virt_begin;
 	struct discard_op op;
 
-	begin_discard(&op, tc, m->bio);
+	begin_discard(&op, tc, discard_parent);
 	while (b != end) {
 		/* find start of unmapped run */
 		for (; b < end; b++) {
@@ -1049,28 +1052,101 @@ out:
 	end_discard(&op, r);
 }
 
-static void process_prepared_discard_passdown(struct dm_thin_new_mapping *m)
+static void queue_passdown_pt2(struct dm_thin_new_mapping *m)
+{
+	unsigned long flags;
+	struct pool *pool = m->tc->pool;
+
+	spin_lock_irqsave(&pool->lock, flags);
+	list_add_tail(&m->list, &pool->prepared_discards_pt2);
+	spin_unlock_irqrestore(&pool->lock, flags);
+	wake_worker(pool);
+}
+
+static void passdown_endio(struct bio *bio)
+{
+	/*
+	 * It doesn't matter if the passdown discard failed, we still want
+	 * to unmap (we ignore err).
+	 */
+	queue_passdown_pt2(bio->bi_private);
+}
+
+static void process_prepared_discard_passdown_pt1(struct dm_thin_new_mapping *m)
 {
 	int r;
 	struct thin_c *tc = m->tc;
 	struct pool *pool = tc->pool;
+	struct bio *discard_parent;
+	dm_block_t data_end = m->data_block + (m->virt_end - m->virt_begin);
 
+	/*
+	 * Only this thread allocates blocks, so we can be sure that the
+	 * newly unmapped blocks will not be allocated before the end of
+	 * the function.
+	 */
 	r = dm_thin_remove_range(tc->td, m->virt_begin, m->virt_end);
 	if (r) {
 		metadata_operation_failed(pool, "dm_thin_remove_range", r);
 		bio_io_error(m->bio);
+		cell_defer_no_holder(tc, m->cell);
+		mempool_free(m, pool->mapping_pool);
+		return;
+	}
 
-	} else if (m->maybe_shared) {
-		passdown_double_checking_shared_status(m);
+	discard_parent = bio_alloc(GFP_NOIO, 1);
+	if (!discard_parent) {
+		DMWARN("%s: unable to allocate top level discard bio for passdown. Skipping passdown.",
+		       dm_device_name(tc->pool->pool_md));
+		queue_passdown_pt2(m);
 
 	} else {
-		struct discard_op op;
-		begin_discard(&op, tc, m->bio);
-		r = issue_discard(&op, m->data_block,
-				  m->data_block + (m->virt_end - m->virt_begin));
-		end_discard(&op, r);
+		discard_parent->bi_end_io = passdown_endio;
+		discard_parent->bi_private = m;
+
+		if (m->maybe_shared)
+			passdown_double_checking_shared_status(m, discard_parent);
+		else {
+			struct discard_op op;
+
+			begin_discard(&op, tc, discard_parent);
+			r = issue_discard(&op, m->data_block, data_end);
+			end_discard(&op, r);
+		}
 	}
 
+	/*
+	 * Increment the unmapped blocks.  This prevents a race between the
+	 * passdown io and reallocation of freed blocks.
+	 */
+	r = dm_pool_inc_data_range(pool->pmd, m->data_block, data_end);
+	if (r) {
+		metadata_operation_failed(pool, "dm_pool_inc_data_range", r);
+		bio_io_error(m->bio);
+		cell_defer_no_holder(tc, m->cell);
+		mempool_free(m, pool->mapping_pool);
+		return;
+	}
+}
+
+static void process_prepared_discard_passdown_pt2(struct dm_thin_new_mapping *m)
+{
+	int r;
+	struct thin_c *tc = m->tc;
+	struct pool *pool = tc->pool;
+
+	/*
+	 * The passdown has completed, so now we can decrement all those
+	 * unmapped blocks.
+	 */
+	r = dm_pool_dec_data_range(pool->pmd, m->data_block,
+				   m->data_block + (m->virt_end - m->virt_begin));
+	if (r) {
+		metadata_operation_failed(pool, "dm_pool_dec_data_range", r);
+		bio_io_error(m->bio);
+	} else
+		bio_endio(m->bio);
+
 	cell_defer_no_holder(tc, m->cell);
 	mempool_free(m, pool->mapping_pool);
 }
@@ -2215,6 +2291,8 @@ static void do_worker(struct work_struct *ws)
 	throttle_work_update(&pool->throttle);
 	process_prepared(pool, &pool->prepared_discards, &pool->process_prepared_discard);
 	throttle_work_update(&pool->throttle);
+	process_prepared(pool, &pool->prepared_discards_pt2, &pool->process_prepared_discard_pt2);
+	throttle_work_update(&pool->throttle);
 	process_deferred_bios(pool);
 	throttle_work_complete(&pool->throttle);
 }
@@ -2343,7 +2421,8 @@ static void set_discard_callbacks(struct pool *pool)
 
 	if (passdown_enabled(pt)) {
 		pool->process_discard_cell = process_discard_cell_passdown;
-		pool->process_prepared_discard = process_prepared_discard_passdown;
+		pool->process_prepared_discard = process_prepared_discard_passdown_pt1;
+		pool->process_prepared_discard_pt2 = process_prepared_discard_passdown_pt2;
 	} else {
 		pool->process_discard_cell = process_discard_cell_no_passdown;
 		pool->process_prepared_discard = process_prepared_discard_no_passdown;
@@ -2830,6 +2909,7 @@ static struct pool *pool_create(struct mapped_device *pool_md,
 	bio_list_init(&pool->deferred_flush_bios);
 	INIT_LIST_HEAD(&pool->prepared_mappings);
 	INIT_LIST_HEAD(&pool->prepared_discards);
+	INIT_LIST_HEAD(&pool->prepared_discards_pt2);
 	INIT_LIST_HEAD(&pool->active_thins);
 	pool->low_water_triggered = false;
 	pool->suspended = true;
-- 
cgit v1.2.3-70-g09d2


From 545ed20e6df68a4d2584a29a2a28ee8b2f7e9547 Mon Sep 17 00:00:00 2001
From: Toshi Kani <toshi.kani@hpe.com>
Date: Wed, 22 Jun 2016 17:54:53 -0600
Subject: dm: add infrastructure for DAX support

Change the mapped device to implement a direct_access function,
dm_blk_direct_access(), which calls the target's direct_access function.
'struct target_type' is extended with a target direct_access interface.
dm_blk_direct_access() limits the directly accessible size to the
dm_target's limit using max_io_len().

Add dm_table_supports_dax() to iterate all targets and associated block
devices to check for DAX support.  To add DAX support to a DM target the
target must only implement the direct_access function.

Add a new dm type, DM_TYPE_DAX_BIO_BASED, which indicates that the mapped
device supports DAX and is bio based.  This new type is used to ensure
that all target devices have DAX support and remain that way after
QUEUE_FLAG_DAX is set on the mapped device.

At initial table load, QUEUE_FLAG_DAX is set on the mapped device when
its type is set to DM_TYPE_DAX_BIO_BASED.  Any subsequent table load to
the mapped device must have the same type, or else it fails per the check
in table_load().

Signed-off-by: Toshi Kani <toshi.kani@hpe.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-table.c         | 44 ++++++++++++++++++++++++++++++++++++++++++-
 drivers/md/dm.c               | 38 +++++++++++++++++++++++++++++++++++--
 drivers/md/dm.h               |  1 +
 include/linux/device-mapper.h | 10 ++++++++++
 include/uapi/linux/dm-ioctl.h |  4 ++--
 5 files changed, 92 insertions(+), 5 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 88f01744ac16..ee6f37eafbc3 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -827,6 +827,12 @@ void dm_consume_args(struct dm_arg_set *as, unsigned num_args)
 }
 EXPORT_SYMBOL(dm_consume_args);
 
+static bool __table_type_bio_based(unsigned table_type)
+{
+	return (table_type == DM_TYPE_BIO_BASED ||
+		table_type == DM_TYPE_DAX_BIO_BASED);
+}
+
 static bool __table_type_request_based(unsigned table_type)
 {
 	return (table_type == DM_TYPE_REQUEST_BASED ||
@@ -839,6 +845,34 @@ void dm_table_set_type(struct dm_table *t, unsigned type)
 }
 EXPORT_SYMBOL_GPL(dm_table_set_type);
 
+static int device_supports_dax(struct dm_target *ti, struct dm_dev *dev,
+			       sector_t start, sector_t len, void *data)
+{
+	struct request_queue *q = bdev_get_queue(dev->bdev);
+
+	return q && blk_queue_dax(q);
+}
+
+static bool dm_table_supports_dax(struct dm_table *t)
+{
+	struct dm_target *ti;
+	unsigned i = 0;
+
+	/* Ensure that all targets support DAX. */
+	while (i < dm_table_get_num_targets(t)) {
+		ti = dm_table_get_target(t, i++);
+
+		if (!ti->type->direct_access)
+			return false;
+
+		if (!ti->type->iterate_devices ||
+		    !ti->type->iterate_devices(ti, device_supports_dax, NULL))
+			return false;
+	}
+
+	return true;
+}
+
 static int dm_table_determine_type(struct dm_table *t)
 {
 	unsigned i;
@@ -853,6 +887,7 @@ static int dm_table_determine_type(struct dm_table *t)
 		/* target already set the table's type */
 		if (t->type == DM_TYPE_BIO_BASED)
 			return 0;
+		BUG_ON(t->type == DM_TYPE_DAX_BIO_BASED);
 		goto verify_rq_based;
 	}
 
@@ -887,6 +922,8 @@ static int dm_table_determine_type(struct dm_table *t)
 	if (bio_based) {
 		/* We must use this table as bio-based */
 		t->type = DM_TYPE_BIO_BASED;
+		if (dm_table_supports_dax(t))
+			t->type = DM_TYPE_DAX_BIO_BASED;
 		return 0;
 	}
 
@@ -979,6 +1016,11 @@ struct dm_target *dm_table_get_wildcard_target(struct dm_table *t)
 	return NULL;
 }
 
+bool dm_table_bio_based(struct dm_table *t)
+{
+	return __table_type_bio_based(dm_table_get_type(t));
+}
+
 bool dm_table_request_based(struct dm_table *t)
 {
 	return __table_type_request_based(dm_table_get_type(t));
@@ -1001,7 +1043,7 @@ static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *
 		return -EINVAL;
 	}
 
-	if (type == DM_TYPE_BIO_BASED)
+	if (__table_type_bio_based(type))
 		for (i = 0; i < t->num_targets; i++) {
 			tgt = t->targets + i;
 			per_io_data_size = max(per_io_data_size, tgt->per_io_data_size);
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 7538b8972820..4dca5a792e4b 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -905,6 +905,33 @@ int dm_set_target_max_io_len(struct dm_target *ti, sector_t len)
 }
 EXPORT_SYMBOL_GPL(dm_set_target_max_io_len);
 
+static long dm_blk_direct_access(struct block_device *bdev, sector_t sector,
+				 void __pmem **kaddr, pfn_t *pfn, long size)
+{
+	struct mapped_device *md = bdev->bd_disk->private_data;
+	struct dm_table *map;
+	struct dm_target *ti;
+	int srcu_idx;
+	long len, ret = -EIO;
+
+	map = dm_get_live_table(md, &srcu_idx);
+	if (!map)
+		goto out;
+
+	ti = dm_table_find_target(map, sector);
+	if (!dm_target_is_valid(ti))
+		goto out;
+
+	len = max_io_len(sector, ti) << SECTOR_SHIFT;
+	size = min(len, size);
+
+	if (ti->type->direct_access)
+		ret = ti->type->direct_access(ti, sector, kaddr, pfn, size);
+out:
+	dm_put_live_table(md, srcu_idx);
+	return min(ret, size);
+}
+
 /*
  * A target may call dm_accept_partial_bio only from the map routine.  It is
  * allowed for all bio types except REQ_PREFLUSH.
@@ -1548,7 +1575,7 @@ static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
 
 	if (md->bs) {
 		/* The md already has necessary mempools. */
-		if (dm_table_get_type(t) == DM_TYPE_BIO_BASED) {
+		if (dm_table_bio_based(t)) {
 			/*
 			 * Reload bioset because front_pad may have changed
 			 * because a different table was loaded.
@@ -1744,8 +1771,9 @@ EXPORT_SYMBOL_GPL(dm_get_queue_limits);
 int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
 {
 	int r;
+	unsigned type = dm_get_md_type(md);
 
-	switch (dm_get_md_type(md)) {
+	switch (type) {
 	case DM_TYPE_REQUEST_BASED:
 		r = dm_old_init_request_queue(md);
 		if (r) {
@@ -1761,6 +1789,7 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
 		}
 		break;
 	case DM_TYPE_BIO_BASED:
+	case DM_TYPE_DAX_BIO_BASED:
 		dm_init_normal_md_queue(md);
 		blk_queue_make_request(md->queue, dm_make_request);
 		/*
@@ -1769,6 +1798,9 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
 		 */
 		bioset_free(md->queue->bio_split);
 		md->queue->bio_split = NULL;
+
+		if (type == DM_TYPE_DAX_BIO_BASED)
+			queue_flag_set_unlocked(QUEUE_FLAG_DAX, md->queue);
 		break;
 	}
 
@@ -2465,6 +2497,7 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned t
 
 	switch (type) {
 	case DM_TYPE_BIO_BASED:
+	case DM_TYPE_DAX_BIO_BASED:
 		cachep = _io_cache;
 		pool_size = dm_get_reserved_bio_based_ios();
 		front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
@@ -2691,6 +2724,7 @@ static const struct block_device_operations dm_blk_dops = {
 	.open = dm_blk_open,
 	.release = dm_blk_close,
 	.ioctl = dm_blk_ioctl,
+	.direct_access = dm_blk_direct_access,
 	.getgeo = dm_blk_getgeo,
 	.pr_ops = &dm_pr_ops,
 	.owner = THIS_MODULE
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 2e0e4a53a312..f0aad08b9654 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -68,6 +68,7 @@ unsigned dm_table_get_type(struct dm_table *t);
 struct target_type *dm_table_get_immutable_target_type(struct dm_table *t);
 struct dm_target *dm_table_get_immutable_target(struct dm_table *t);
 struct dm_target *dm_table_get_wildcard_target(struct dm_table *t);
+bool dm_table_bio_based(struct dm_table *t);
 bool dm_table_request_based(struct dm_table *t);
 bool dm_table_all_blk_mq_devices(struct dm_table *t);
 void dm_table_free_md_mempools(struct dm_table *t);
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 2ce339212b6e..b0db857f334b 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -26,6 +26,7 @@ struct bio_vec;
 #define DM_TYPE_BIO_BASED		1
 #define DM_TYPE_REQUEST_BASED		2
 #define DM_TYPE_MQ_REQUEST_BASED	3
+#define DM_TYPE_DAX_BIO_BASED		4
 
 typedef enum { STATUSTYPE_INFO, STATUSTYPE_TABLE } status_type_t;
 
@@ -124,6 +125,14 @@ typedef void (*dm_io_hints_fn) (struct dm_target *ti,
  */
 typedef int (*dm_busy_fn) (struct dm_target *ti);
 
+/*
+ * Returns:
+ *  < 0 : error
+ * >= 0 : the number of bytes accessible at the address
+ */
+typedef long (*dm_direct_access_fn) (struct dm_target *ti, sector_t sector,
+				     void __pmem **kaddr, pfn_t *pfn, long size);
+
 void dm_error(const char *message);
 
 struct dm_dev {
@@ -170,6 +179,7 @@ struct target_type {
 	dm_busy_fn busy;
 	dm_iterate_devices_fn iterate_devices;
 	dm_io_hints_fn io_hints;
+	dm_direct_access_fn direct_access;
 
 	/* For internal device-mapper use. */
 	struct list_head list;
diff --git a/include/uapi/linux/dm-ioctl.h b/include/uapi/linux/dm-ioctl.h
index 30afd0a23c4b..4bf9f1eabffc 100644
--- a/include/uapi/linux/dm-ioctl.h
+++ b/include/uapi/linux/dm-ioctl.h
@@ -267,9 +267,9 @@ enum {
 #define DM_DEV_SET_GEOMETRY	_IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl)
 
 #define DM_VERSION_MAJOR	4
-#define DM_VERSION_MINOR	34
+#define DM_VERSION_MINOR	35
 #define DM_VERSION_PATCHLEVEL	0
-#define DM_VERSION_EXTRA	"-ioctl (2015-10-28)"
+#define DM_VERSION_EXTRA	"-ioctl (2016-06-23)"
 
 /* Status bits */
 #define DM_READONLY_FLAG	(1 << 0) /* In/Out */
-- 
cgit v1.2.3-70-g09d2


From 84b22f8378cf493524043a0a8dd567c58c64546f Mon Sep 17 00:00:00 2001
From: Toshi Kani <toshi.kani@hpe.com>
Date: Wed, 22 Jun 2016 17:54:54 -0600
Subject: dm linear: add DAX support

Change dm-linear to implement a direct_access function,
linear_direct_access(), which maps the sector and calls the
direct_access function of its underlying block device.

Signed-off-by: Toshi Kani <toshi.kani@hpe.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-linear.c | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 05c35aacb3aa..6d35dd4e9efb 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -141,9 +141,27 @@ static int linear_iterate_devices(struct dm_target *ti,
 	return fn(ti, lc->dev, lc->start, ti->len, data);
 }
 
+static long linear_direct_access(struct dm_target *ti, sector_t sector,
+				 void __pmem **kaddr, pfn_t *pfn, long size)
+{
+	struct linear_c *lc = ti->private;
+	struct block_device *bdev = lc->dev->bdev;
+	struct blk_dax_ctl dax = {
+		.sector = linear_map_sector(ti, sector),
+		.size = size,
+	};
+	long ret;
+	/* DAX hook: sector was remapped above; now query the underlying bdev. */
+	ret = bdev_direct_access(bdev, &dax);
+	*kaddr = dax.addr;
+	*pfn = dax.pfn;
+	/* Stored unconditionally; ret is bdev_direct_access()'s return value. */
+	return ret;
+}
+
 static struct target_type linear_target = {
 	.name   = "linear",
-	.version = {1, 2, 1},
+	.version = {1, 3, 0},
 	.module = THIS_MODULE,
 	.ctr    = linear_ctr,
 	.dtr    = linear_dtr,
@@ -151,6 +169,7 @@ static struct target_type linear_target = {
 	.status = linear_status,
 	.prepare_ioctl = linear_prepare_ioctl,
 	.iterate_devices = linear_iterate_devices,
+	.direct_access = linear_direct_access,
 };
 
 int __init dm_linear_init(void)
-- 
cgit v1.2.3-70-g09d2


From f8df1fdf18839cb4ef2035310bb9b6ec02025598 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Fri, 24 Jun 2016 17:09:35 -0400
Subject: dm error: add DAX support

Allow the error target to replace an existing DAX-enabled target.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-table.c  | 3 ++-
 drivers/md/dm-target.c | 9 ++++++++-
 2 files changed, 10 insertions(+), 2 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index ee6f37eafbc3..3e407a9cde1f 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -922,7 +922,8 @@ static int dm_table_determine_type(struct dm_table *t)
 	if (bio_based) {
 		/* We must use this table as bio-based */
 		t->type = DM_TYPE_BIO_BASED;
-		if (dm_table_supports_dax(t))
+		if (dm_table_supports_dax(t) ||
+		    (list_empty(devices) && live_md_type == DM_TYPE_DAX_BIO_BASED))
 			t->type = DM_TYPE_DAX_BIO_BASED;
 		return 0;
 	}
diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c
index 5c826b450aad..6eecd6b36f76 100644
--- a/drivers/md/dm-target.c
+++ b/drivers/md/dm-target.c
@@ -148,9 +148,15 @@ static void io_err_release_clone_rq(struct request *clone)
 {
 }
 
+static long io_err_direct_access(struct dm_target *ti, sector_t sector,
+				 void __pmem **kaddr, pfn_t *pfn, long size)
+{
+	return -EIO;	/* error target: every direct access request fails */
+}
+
 static struct target_type error_target = {
 	.name = "error",
-	.version = {1, 4, 0},
+	.version = {1, 5, 0},
 	.features = DM_TARGET_WILDCARD,
 	.ctr  = io_err_ctr,
 	.dtr  = io_err_dtr,
@@ -158,6 +164,7 @@ static struct target_type error_target = {
 	.map_rq = io_err_map_rq,
 	.clone_and_map_rq = io_err_clone_and_map_rq,
 	.release_clone_rq = io_err_release_clone_rq,
+	.direct_access = io_err_direct_access,
 };
 
 int __init dm_target_init(void)
-- 
cgit v1.2.3-70-g09d2


From beec25b4573bc310f5a12cf33207b20ecc30945c Mon Sep 17 00:00:00 2001
From: Toshi Kani <toshi.kani@hpe.com>
Date: Fri, 24 Jun 2016 12:23:30 -0600
Subject: dm stripe: add DAX support

Change dm-stripe to implement the direct_access function,
stripe_direct_access(), which maps the bdev and sector and
calls the direct_access function of its physical target device.

Signed-off-by: Toshi Kani <toshi.kani@hpe.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-stripe.c | 26 +++++++++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index 48f1c01d7b9f..01bb9cf2a8c2 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -308,6 +308,29 @@ static int stripe_map(struct dm_target *ti, struct bio *bio)
 	return DM_MAPIO_REMAPPED;
 }
 
+static long stripe_direct_access(struct dm_target *ti, sector_t sector,
+				 void __pmem **kaddr, pfn_t *pfn, long size)
+{
+	struct stripe_c *sc = ti->private;
+	uint32_t stripe;
+	struct block_device *bdev;
+	struct blk_dax_ctl dax = {
+		.size = size,
+	};
+	long ret;
+	/* stripe_map_sector() fills in the stripe index and dax.sector. */
+	stripe_map_sector(sc, sector, &stripe, &dax.sector);
+	/* Rebase onto that stripe's device by adding its physical_start. */
+	dax.sector += sc->stripe[stripe].physical_start;
+	bdev = sc->stripe[stripe].dev->bdev;
+	/* Query the chosen device; kaddr/pfn are stored whatever ret is. */
+	ret = bdev_direct_access(bdev, &dax);
+	*kaddr = dax.addr;
+	*pfn = dax.pfn;
+
+	return ret;
+}
+
 /*
  * Stripe status:
  *
@@ -416,7 +439,7 @@ static void stripe_io_hints(struct dm_target *ti,
 
 static struct target_type stripe_target = {
 	.name   = "striped",
-	.version = {1, 5, 1},
+	.version = {1, 6, 0},
 	.module = THIS_MODULE,
 	.ctr    = stripe_ctr,
 	.dtr    = stripe_dtr,
@@ -425,6 +448,7 @@ static struct target_type stripe_target = {
 	.status = stripe_status,
 	.iterate_devices = stripe_iterate_devices,
 	.io_hints = stripe_io_hints,
+	.direct_access = stripe_direct_access,
 };
 
 int __init dm_stripe_init(void)
-- 
cgit v1.2.3-70-g09d2


From f6e629bd2379dceb547be93915314307871a7f6c Mon Sep 17 00:00:00 2001
From: Toshi Kani <toshi.kani@hpe.com>
Date: Tue, 28 Jun 2016 13:37:16 -0600
Subject: dm snap: add fake origin_direct_access

A dax-capable mapped-device is marked as DM_TYPE_DAX_BIO_BASED,
which supports both dax and bio-based operations.  dm-snap
needs to work with a dax-capable device when bio-based operation
is used.

Add a fake origin_direct_access() to the origin target so that its
origin device is also marked as DM_TYPE_DAX_BIO_BASED for a
dax-capable device.  This allows the target's DM table to be extended.
dm-snap works normally when bio-based operation is used.

dm-snap does not support dax operation, and mounting a target device
or snapshot device with the dax option fails.

Signed-off-by: Toshi Kani <toshi.kani@hpe.com>
Cc: Mike Snitzer <snitzer@redhat.com>
Cc: Alasdair Kergon <agk@redhat.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-snap.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 69ab1ff5f5c9..c472f0465f0e 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -2301,6 +2301,13 @@ static int origin_map(struct dm_target *ti, struct bio *bio)
 	return do_origin(o->dev, bio);
 }
 
+static long origin_direct_access(struct dm_target *ti, sector_t sector,
+		void __pmem **kaddr, pfn_t *pfn, long size)
+{
+	DMWARN("device does not support dax.");
+	return -EIO;	/* fake hook: always warns and fails, outputs untouched */
+}
+
 /*
  * Set the target "max_io_len" field to the minimum of all the snapshots'
  * chunk sizes.
@@ -2360,6 +2367,7 @@ static struct target_type origin_target = {
 	.postsuspend = origin_postsuspend,
 	.status  = origin_status,
 	.iterate_devices = origin_iterate_devices,
+	.direct_access = origin_direct_access,
 };
 
 static struct target_type snapshot_target = {
-- 
cgit v1.2.3-70-g09d2


From b5ab4a9ba5574430870391e93b125ada31217c4c Mon Sep 17 00:00:00 2001
From: Toshi Kani <toshi.kani@hpe.com>
Date: Tue, 28 Jun 2016 13:37:15 -0600
Subject: dm: allow bio-based table to be upgraded to bio-based with DAX
 support

Allow a table of type DM_TYPE_BIO_BASED to be upgraded to
DM_TYPE_DAX_BIO_BASED, since DM_TYPE_DAX_BIO_BASED supports bio-based requests.

This is needed to allow a snapshot of an LV with DAX support to be
removed.  One of the intermediate table reloads that lvm2 does switches
from DM_TYPE_BIO_BASED to DM_TYPE_DAX_BIO_BASED.  There is no known reason
to disallow this, so allow it.

Signed-off-by: Toshi Kani <toshi.kani@hpe.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-ioctl.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 96df89a31f42..966eb4b61aed 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -1267,6 +1267,15 @@ static int populate_table(struct dm_table *table,
 	return dm_table_complete(table);
 }
 
+static bool is_valid_type(unsigned cur, unsigned new)
+{
+	if (cur == new ||
+	    (cur == DM_TYPE_BIO_BASED && new == DM_TYPE_DAX_BIO_BASED))
+		return true;
+	/* Only "unchanged" or bio-based -> DAX bio-based is accepted. */
+	return false;
+}
+
 static int table_load(struct dm_ioctl *param, size_t param_size)
 {
 	int r;
@@ -1309,7 +1318,7 @@ static int table_load(struct dm_ioctl *param, size_t param_size)
 			DMWARN("unable to set up device queue for new table.");
 			goto err_unlock_md_type;
 		}
-	} else if (dm_get_md_type(md) != dm_table_get_type(t)) {
+	} else if (!is_valid_type(dm_get_md_type(md), dm_table_get_type(t))) {
 		DMWARN("can't change device type after initial table load.");
 		r = -EINVAL;
 		goto err_unlock_md_type;
-- 
cgit v1.2.3-70-g09d2