summaryrefslogtreecommitdiff
path: root/drivers/md
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/Kconfig2
-rw-r--r--drivers/md/bcache/alloc.c9
-rw-r--r--drivers/md/bcache/bcache.h6
-rw-r--r--drivers/md/bcache/bset.c61
-rw-r--r--drivers/md/bcache/btree.c53
-rw-r--r--drivers/md/bcache/btree.h2
-rw-r--r--drivers/md/bcache/io.c12
-rw-r--r--drivers/md/bcache/journal.c141
-rw-r--r--drivers/md/bcache/journal.h4
-rw-r--r--drivers/md/bcache/super.c227
-rw-r--r--drivers/md/bcache/sysfs.c67
-rw-r--r--drivers/md/bcache/util.h2
-rw-r--r--drivers/md/bcache/writeback.c8
-rw-r--r--drivers/md/dm-bufio.c4
-rw-r--r--drivers/md/dm-crypt.c101
-rw-r--r--drivers/md/dm-flakey.c5
-rw-r--r--drivers/md/dm-init.c2
-rw-r--r--drivers/md/dm-integrity.c7
-rw-r--r--drivers/md/dm-linear.c5
-rw-r--r--drivers/md/dm-log-writes.c4
-rw-r--r--drivers/md/dm-raid.c2
-rw-r--r--drivers/md/dm-rq.c2
-rw-r--r--drivers/md/dm-snap.c186
-rw-r--r--drivers/md/dm-table.c24
-rw-r--r--drivers/md/dm-thin-metadata.c7
-rw-r--r--drivers/md/dm-zoned-metadata.c16
-rw-r--r--drivers/md/dm.c11
-rw-r--r--drivers/md/dm.h5
-rw-r--r--drivers/md/md-bitmap.c20
-rw-r--r--drivers/md/md.c132
-rw-r--r--drivers/md/md.h23
-rw-r--r--drivers/md/raid1-10.c30
-rw-r--r--drivers/md/raid1.c119
-rw-r--r--drivers/md/raid10.c86
-rw-r--r--drivers/md/raid5.c12
35 files changed, 1057 insertions, 340 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 45254b3ef715..3834332f4963 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -453,7 +453,7 @@ config DM_INIT
Enable "dm-mod.create=" parameter to create mapped devices at init time.
This option is useful to allow mounting rootfs without requiring an
initramfs.
- See Documentation/device-mapper/dm-init.txt for dm-mod.create="..."
+ See Documentation/admin-guide/device-mapper/dm-init.rst for dm-mod.create="..."
format.
If unsure, say N.
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index f8986effcb50..6f776823b9ba 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -393,6 +393,11 @@ long bch_bucket_alloc(struct cache *ca, unsigned int reserve, bool wait)
struct bucket *b;
long r;
+
+ /* No allocation if CACHE_SET_IO_DISABLE bit is set */
+ if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &ca->set->flags)))
+ return -1;
+
/* fastpath */
if (fifo_pop(&ca->free[RESERVE_NONE], r) ||
fifo_pop(&ca->free[reserve], r))
@@ -484,6 +489,10 @@ int __bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve,
{
int i;
+ /* No allocation if CACHE_SET_IO_DISABLE bit is set */
+ if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &c->flags)))
+ return -1;
+
lockdep_assert_held(&c->bucket_lock);
BUG_ON(!n || n > c->caches_loaded || n > MAX_CACHES_PER_SET);
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index fdf75352e16a..013e35a9e317 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -705,8 +705,8 @@ struct cache_set {
atomic_long_t writeback_keys_failed;
atomic_long_t reclaim;
+ atomic_long_t reclaimed_journal_buckets;
atomic_long_t flush_write;
- atomic_long_t retry_flush_write;
enum {
ON_ERROR_UNREGISTER,
@@ -726,8 +726,6 @@ struct cache_set {
#define BUCKET_HASH_BITS 12
struct hlist_head bucket_hash[1 << BUCKET_HASH_BITS];
-
- DECLARE_HEAP(struct btree *, flush_btree);
};
struct bbio {
@@ -1006,7 +1004,7 @@ int bch_flash_dev_create(struct cache_set *c, uint64_t size);
int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
uint8_t *set_uuid);
void bch_cached_dev_detach(struct cached_dev *dc);
-void bch_cached_dev_run(struct cached_dev *dc);
+int bch_cached_dev_run(struct cached_dev *dc);
void bcache_device_stop(struct bcache_device *d);
void bch_cache_set_unregister(struct cache_set *c);
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index 268f1b685084..08768796b543 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -347,22 +347,19 @@ EXPORT_SYMBOL(bch_btree_keys_alloc);
void bch_btree_keys_init(struct btree_keys *b, const struct btree_keys_ops *ops,
bool *expensive_debug_checks)
{
- unsigned int i;
-
b->ops = ops;
b->expensive_debug_checks = expensive_debug_checks;
b->nsets = 0;
b->last_set_unwritten = 0;
- /* XXX: shouldn't be needed */
- for (i = 0; i < MAX_BSETS; i++)
- b->set[i].size = 0;
/*
- * Second loop starts at 1 because b->keys[0]->data is the memory we
- * allocated
+ * struct btree_keys in embedded in struct btree, and struct
+ * bset_tree is embedded into struct btree_keys. They are all
+ * initialized as 0 by kzalloc() in mca_bucket_alloc(), and
+ * b->set[0].data is allocated in bch_btree_keys_alloc(), so we
+ * don't have to initiate b->set[].size and b->set[].data here
+ * any more.
*/
- for (i = 1; i < MAX_BSETS; i++)
- b->set[i].data = NULL;
}
EXPORT_SYMBOL(bch_btree_keys_init);
@@ -970,45 +967,25 @@ static struct bset_search_iter bset_search_tree(struct bset_tree *t,
unsigned int inorder, j, n = 1;
do {
- /*
- * A bit trick here.
- * If p < t->size, (int)(p - t->size) is a minus value and
- * the most significant bit is set, right shifting 31 bits
- * gets 1. If p >= t->size, the most significant bit is
- * not set, right shifting 31 bits gets 0.
- * So the following 2 lines equals to
- * if (p >= t->size)
- * p = 0;
- * but a branch instruction is avoided.
- */
unsigned int p = n << 4;
- p &= ((int) (p - t->size)) >> 31;
-
- prefetch(&t->tree[p]);
+ if (p < t->size)
+ prefetch(&t->tree[p]);
j = n;
f = &t->tree[j];
- /*
- * Similar bit trick, use subtract operation to avoid a branch
- * instruction.
- *
- * n = (f->mantissa > bfloat_mantissa())
- * ? j * 2
- * : j * 2 + 1;
- *
- * We need to subtract 1 from f->mantissa for the sign bit trick
- * to work - that's done in make_bfloat()
- */
- if (likely(f->exponent != 127))
- n = j * 2 + (((unsigned int)
- (f->mantissa -
- bfloat_mantissa(search, f))) >> 31);
- else
- n = (bkey_cmp(tree_to_bkey(t, j), search) > 0)
- ? j * 2
- : j * 2 + 1;
+ if (likely(f->exponent != 127)) {
+ if (f->mantissa >= bfloat_mantissa(search, f))
+ n = j * 2;
+ else
+ n = j * 2 + 1;
+ } else {
+ if (bkey_cmp(tree_to_bkey(t, j), search) > 0)
+ n = j * 2;
+ else
+ n = j * 2 + 1;
+ }
} while (n < t->size);
inorder = to_inorder(j, t);
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 773f5fdad25f..ba434d9ac720 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -35,7 +35,7 @@
#include <linux/rcupdate.h>
#include <linux/sched/clock.h>
#include <linux/rculist.h>
-
+#include <linux/delay.h>
#include <trace/events/bcache.h>
/*
@@ -613,6 +613,10 @@ static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp)
static struct btree *mca_bucket_alloc(struct cache_set *c,
struct bkey *k, gfp_t gfp)
{
+ /*
+ * kzalloc() is necessary here for initialization,
+ * see code comments in bch_btree_keys_init().
+ */
struct btree *b = kzalloc(sizeof(struct btree), gfp);
if (!b)
@@ -655,7 +659,25 @@ static int mca_reap(struct btree *b, unsigned int min_order, bool flush)
up(&b->io_mutex);
}
+retry:
+ /*
+ * BTREE_NODE_dirty might be cleared in btree_flush_btree() by
+ * __bch_btree_node_write(). To avoid an extra flush, acquire
+ * b->write_lock before checking BTREE_NODE_dirty bit.
+ */
mutex_lock(&b->write_lock);
+ /*
+ * If this btree node is selected in btree_flush_write() by journal
+ * code, delay and retry until the node is flushed by journal code
+ * and BTREE_NODE_journal_flush bit cleared by btree_flush_write().
+ */
+ if (btree_node_journal_flush(b)) {
+ pr_debug("bnode %p is flushing by journal, retry", b);
+ mutex_unlock(&b->write_lock);
+ udelay(1);
+ goto retry;
+ }
+
if (btree_node_dirty(b))
__bch_btree_node_write(b, &cl);
mutex_unlock(&b->write_lock);
@@ -778,10 +800,15 @@ void bch_btree_cache_free(struct cache_set *c)
while (!list_empty(&c->btree_cache)) {
b = list_first_entry(&c->btree_cache, struct btree, list);
- if (btree_node_dirty(b))
+ /*
+ * This function is called by cache_set_free(), no I/O
+ * request on cache now, it is unnecessary to acquire
+ * b->write_lock before clearing BTREE_NODE_dirty anymore.
+ */
+ if (btree_node_dirty(b)) {
btree_complete_write(b, btree_current_write(b));
- clear_bit(BTREE_NODE_dirty, &b->flags);
-
+ clear_bit(BTREE_NODE_dirty, &b->flags);
+ }
mca_data_free(b);
}
@@ -1067,11 +1094,25 @@ static void btree_node_free(struct btree *b)
BUG_ON(b == b->c->root);
+retry:
mutex_lock(&b->write_lock);
+ /*
+ * If the btree node is selected and flushing in btree_flush_write(),
+ * delay and retry until the BTREE_NODE_journal_flush bit cleared,
+ * then it is safe to free the btree node here. Otherwise this btree
+ * node will be in race condition.
+ */
+ if (btree_node_journal_flush(b)) {
+ mutex_unlock(&b->write_lock);
+ pr_debug("bnode %p journal_flush set, retry", b);
+ udelay(1);
+ goto retry;
+ }
- if (btree_node_dirty(b))
+ if (btree_node_dirty(b)) {
btree_complete_write(b, btree_current_write(b));
- clear_bit(BTREE_NODE_dirty, &b->flags);
+ clear_bit(BTREE_NODE_dirty, &b->flags);
+ }
mutex_unlock(&b->write_lock);
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index d1c72ef64edf..76cfd121a486 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -158,11 +158,13 @@ enum btree_flags {
BTREE_NODE_io_error,
BTREE_NODE_dirty,
BTREE_NODE_write_idx,
+ BTREE_NODE_journal_flush,
};
BTREE_FLAG(io_error);
BTREE_FLAG(dirty);
BTREE_FLAG(write_idx);
+BTREE_FLAG(journal_flush);
static inline struct btree_write *btree_current_write(struct btree *b)
{
diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c
index c25097968319..4d93f07f63e5 100644
--- a/drivers/md/bcache/io.c
+++ b/drivers/md/bcache/io.c
@@ -58,6 +58,18 @@ void bch_count_backing_io_errors(struct cached_dev *dc, struct bio *bio)
WARN_ONCE(!dc, "NULL pointer of struct cached_dev");
+ /*
+ * Read-ahead requests on a degrading and recovering md raid
+ * (e.g. raid6) device might be failured immediately by md
+ * raid code, which is not a real hardware media failure. So
+ * we shouldn't count failed REQ_RAHEAD bio to dc->io_errors.
+ */
+ if (bio->bi_opf & REQ_RAHEAD) {
+ pr_warn_ratelimited("%s: Read-ahead I/O failed on backing device, ignore",
+ dc->backing_dev_name);
+ return;
+ }
+
errors = atomic_add_return(1, &dc->io_errors);
if (errors < dc->error_limit)
pr_err("%s: IO error on backing device, unrecoverable",
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 12dae9348147..be2a2a201603 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -100,6 +100,20 @@ reread: left = ca->sb.bucket_size - offset;
blocks = set_blocks(j, block_bytes(ca->set));
+ /*
+ * Nodes in 'list' are in linear increasing order of
+ * i->j.seq, the node on head has the smallest (oldest)
+ * journal seq, the node on tail has the biggest
+ * (latest) journal seq.
+ */
+
+ /*
+ * Check from the oldest jset for last_seq. If
+ * i->j.seq < j->last_seq, it means the oldest jset
+ * in list is expired and useless, remove it from
+ * this list. Otherwise, j is a condidate jset for
+ * further following checks.
+ */
while (!list_empty(list)) {
i = list_first_entry(list,
struct journal_replay, list);
@@ -109,13 +123,22 @@ reread: left = ca->sb.bucket_size - offset;
kfree(i);
}
+ /* iterate list in reverse order (from latest jset) */
list_for_each_entry_reverse(i, list, list) {
if (j->seq == i->j.seq)
goto next_set;
+ /*
+ * if j->seq is less than any i->j.last_seq
+ * in list, j is an expired and useless jset.
+ */
if (j->seq < i->j.last_seq)
goto next_set;
+ /*
+ * 'where' points to first jset in list which
+ * is elder then j.
+ */
if (j->seq > i->j.seq) {
where = &i->list;
goto add;
@@ -129,10 +152,12 @@ add:
if (!i)
return -ENOMEM;
memcpy(&i->j, j, bytes);
+ /* Add to the location after 'where' points to */
list_add(&i->list, where);
ret = 1;
- ja->seq[bucket_index] = j->seq;
+ if (j->seq > ja->seq[bucket_index])
+ ja->seq[bucket_index] = j->seq;
next_set:
offset += blocks * ca->sb.block_size;
len -= blocks * ca->sb.block_size;
@@ -268,7 +293,7 @@ bsearch:
struct journal_replay,
list)->j.seq;
- return ret;
+ return 0;
#undef read_bucket
}
@@ -391,60 +416,90 @@ err:
}
/* Journalling */
-#define journal_max_cmp(l, r) \
- (fifo_idx(&c->journal.pin, btree_current_write(l)->journal) < \
- fifo_idx(&(c)->journal.pin, btree_current_write(r)->journal))
-#define journal_min_cmp(l, r) \
- (fifo_idx(&c->journal.pin, btree_current_write(l)->journal) > \
- fifo_idx(&(c)->journal.pin, btree_current_write(r)->journal))
static void btree_flush_write(struct cache_set *c)
{
- /*
- * Try to find the btree node with that references the oldest journal
- * entry, best is our current candidate and is locked if non NULL:
- */
- struct btree *b;
- int i;
+ struct btree *b, *t, *btree_nodes[BTREE_FLUSH_NR];
+ unsigned int i, n;
+
+ if (c->journal.btree_flushing)
+ return;
+
+ spin_lock(&c->journal.flush_write_lock);
+ if (c->journal.btree_flushing) {
+ spin_unlock(&c->journal.flush_write_lock);
+ return;
+ }
+ c->journal.btree_flushing = true;
+ spin_unlock(&c->journal.flush_write_lock);
atomic_long_inc(&c->flush_write);
+ memset(btree_nodes, 0, sizeof(btree_nodes));
+ n = 0;
-retry:
- spin_lock(&c->journal.lock);
- if (heap_empty(&c->flush_btree)) {
- for_each_cached_btree(b, c, i)
- if (btree_current_write(b)->journal) {
- if (!heap_full(&c->flush_btree))
- heap_add(&c->flush_btree, b,
- journal_max_cmp);
- else if (journal_max_cmp(b,
- heap_peek(&c->flush_btree))) {
- c->flush_btree.data[0] = b;
- heap_sift(&c->flush_btree, 0,
- journal_max_cmp);
- }
- }
+ mutex_lock(&c->bucket_lock);
+ list_for_each_entry_safe_reverse(b, t, &c->btree_cache, list) {
+ if (btree_node_journal_flush(b))
+ pr_err("BUG: flush_write bit should not be set here!");
+
+ mutex_lock(&b->write_lock);
- for (i = c->flush_btree.used / 2 - 1; i >= 0; --i)
- heap_sift(&c->flush_btree, i, journal_min_cmp);
+ if (!btree_node_dirty(b)) {
+ mutex_unlock(&b->write_lock);
+ continue;
+ }
+
+ if (!btree_current_write(b)->journal) {
+ mutex_unlock(&b->write_lock);
+ continue;
+ }
+
+ set_btree_node_journal_flush(b);
+
+ mutex_unlock(&b->write_lock);
+
+ btree_nodes[n++] = b;
+ if (n == BTREE_FLUSH_NR)
+ break;
}
+ mutex_unlock(&c->bucket_lock);
- b = NULL;
- heap_pop(&c->flush_btree, b, journal_min_cmp);
- spin_unlock(&c->journal.lock);
+ for (i = 0; i < n; i++) {
+ b = btree_nodes[i];
+ if (!b) {
+ pr_err("BUG: btree_nodes[%d] is NULL", i);
+ continue;
+ }
+
+ /* safe to check without holding b->write_lock */
+ if (!btree_node_journal_flush(b)) {
+ pr_err("BUG: bnode %p: journal_flush bit cleaned", b);
+ continue;
+ }
- if (b) {
mutex_lock(&b->write_lock);
if (!btree_current_write(b)->journal) {
+ clear_bit(BTREE_NODE_journal_flush, &b->flags);
+ mutex_unlock(&b->write_lock);
+ pr_debug("bnode %p: written by others", b);
+ continue;
+ }
+
+ if (!btree_node_dirty(b)) {
+ clear_bit(BTREE_NODE_journal_flush, &b->flags);
mutex_unlock(&b->write_lock);
- /* We raced */
- atomic_long_inc(&c->retry_flush_write);
- goto retry;
+ pr_debug("bnode %p: dirty bit cleaned by others", b);
+ continue;
}
__bch_btree_node_write(b, NULL);
+ clear_bit(BTREE_NODE_journal_flush, &b->flags);
mutex_unlock(&b->write_lock);
}
+
+ spin_lock(&c->journal.flush_write_lock);
+ c->journal.btree_flushing = false;
+ spin_unlock(&c->journal.flush_write_lock);
}
#define last_seq(j) ((j)->seq - fifo_used(&(j)->pin) + 1)
@@ -559,6 +614,7 @@ static void journal_reclaim(struct cache_set *c)
k->ptr[n++] = MAKE_PTR(0,
bucket_to_sector(c, ca->sb.d[ja->cur_idx]),
ca->sb.nr_this_dev);
+ atomic_long_inc(&c->reclaimed_journal_buckets);
}
if (n) {
@@ -811,6 +867,10 @@ atomic_t *bch_journal(struct cache_set *c,
struct journal_write *w;
atomic_t *ret;
+ /* No journaling if CACHE_SET_IO_DISABLE set already */
+ if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &c->flags)))
+ return NULL;
+
if (!CACHE_SYNC(&c->sb))
return NULL;
@@ -855,7 +915,6 @@ void bch_journal_free(struct cache_set *c)
free_pages((unsigned long) c->journal.w[1].data, JSET_BITS);
free_pages((unsigned long) c->journal.w[0].data, JSET_BITS);
free_fifo(&c->journal.pin);
- free_heap(&c->flush_btree);
}
int bch_journal_alloc(struct cache_set *c)
@@ -863,6 +922,7 @@ int bch_journal_alloc(struct cache_set *c)
struct journal *j = &c->journal;
spin_lock_init(&j->lock);
+ spin_lock_init(&j->flush_write_lock);
INIT_DELAYED_WORK(&j->work, journal_write_work);
c->journal_delay_ms = 100;
@@ -870,8 +930,7 @@ int bch_journal_alloc(struct cache_set *c)
j->w[0].c = c;
j->w[1].c = c;
- if (!(init_heap(&c->flush_btree, 128, GFP_KERNEL)) ||
- !(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
+ if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
!(j->w[0].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)) ||
!(j->w[1].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)))
return -ENOMEM;
diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h
index 66f0facff84b..f2ea34d5f431 100644
--- a/drivers/md/bcache/journal.h
+++ b/drivers/md/bcache/journal.h
@@ -103,6 +103,8 @@ struct journal_write {
/* Embedded in struct cache_set */
struct journal {
spinlock_t lock;
+ spinlock_t flush_write_lock;
+ bool btree_flushing;
/* used when waiting because the journal was full */
struct closure_waitlist wait;
struct closure io;
@@ -154,6 +156,8 @@ struct journal_device {
struct bio_vec bv[8];
};
+#define BTREE_FLUSH_NR 8
+
#define journal_pin_cmp(c, l, r) \
(fifo_idx(&(c)->journal.pin, (l)) > fifo_idx(&(c)->journal.pin, (r)))
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 1b63ac876169..26e374fbf57c 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -40,6 +40,7 @@ static const char invalid_uuid[] = {
static struct kobject *bcache_kobj;
struct mutex bch_register_lock;
+bool bcache_is_reboot;
LIST_HEAD(bch_cache_sets);
static LIST_HEAD(uncached_devices);
@@ -49,6 +50,7 @@ static wait_queue_head_t unregister_wait;
struct workqueue_struct *bcache_wq;
struct workqueue_struct *bch_journal_wq;
+
#define BTREE_MAX_PAGES (256 * 1024 / PAGE_SIZE)
/* limitation of partitions number on single bcache device */
#define BCACHE_MINORS 128
@@ -197,7 +199,9 @@ err:
static void write_bdev_super_endio(struct bio *bio)
{
struct cached_dev *dc = bio->bi_private;
- /* XXX: error checking */
+
+ if (bio->bi_status)
+ bch_count_backing_io_errors(dc, bio);
closure_put(&dc->sb_write);
}
@@ -691,6 +695,7 @@ static void bcache_device_link(struct bcache_device *d, struct cache_set *c,
{
unsigned int i;
struct cache *ca;
+ int ret;
for_each_cache(ca, d->c, i)
bd_link_disk_holder(ca->bdev, d->disk);
@@ -698,9 +703,13 @@ static void bcache_device_link(struct bcache_device *d, struct cache_set *c,
snprintf(d->name, BCACHEDEVNAME_SIZE,
"%s%u", name, d->id);
- WARN(sysfs_create_link(&d->kobj, &c->kobj, "cache") ||
- sysfs_create_link(&c->kobj, &d->kobj, d->name),
- "Couldn't create device <-> cache set symlinks");
+ ret = sysfs_create_link(&d->kobj, &c->kobj, "cache");
+ if (ret < 0)
+ pr_err("Couldn't create device -> cache set symlink");
+
+ ret = sysfs_create_link(&c->kobj, &d->kobj, d->name);
+ if (ret < 0)
+ pr_err("Couldn't create cache set -> device symlink");
clear_bit(BCACHE_DEV_UNLINK_DONE, &d->flags);
}
@@ -908,7 +917,7 @@ static int cached_dev_status_update(void *arg)
}
-void bch_cached_dev_run(struct cached_dev *dc)
+int bch_cached_dev_run(struct cached_dev *dc)
{
struct bcache_device *d = &dc->disk;
char *buf = kmemdup_nul(dc->sb.label, SB_LABEL_SIZE, GFP_KERNEL);
@@ -919,11 +928,19 @@ void bch_cached_dev_run(struct cached_dev *dc)
NULL,
};
+ if (dc->io_disable) {
+ pr_err("I/O disabled on cached dev %s",
+ dc->backing_dev_name);
+ return -EIO;
+ }
+
if (atomic_xchg(&dc->running, 1)) {
kfree(env[1]);
kfree(env[2]);
kfree(buf);
- return;
+ pr_info("cached dev %s is running already",
+ dc->backing_dev_name);
+ return -EBUSY;
}
if (!d->c &&
@@ -949,8 +966,11 @@ void bch_cached_dev_run(struct cached_dev *dc)
kfree(buf);
if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") ||
- sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache"))
- pr_debug("error creating sysfs link");
+ sysfs_create_link(&disk_to_dev(d->disk)->kobj,
+ &d->kobj, "bcache")) {
+ pr_err("Couldn't create bcache dev <-> disk sysfs symlinks");
+ return -ENOMEM;
+ }
dc->status_update_thread = kthread_run(cached_dev_status_update,
dc, "bcache_status_update");
@@ -959,6 +979,8 @@ void bch_cached_dev_run(struct cached_dev *dc)
"continue to run without monitoring backing "
"device status");
}
+
+ return 0;
}
/*
@@ -996,7 +1018,6 @@ static void cached_dev_detach_finish(struct work_struct *w)
BUG_ON(!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags));
BUG_ON(refcount_read(&dc->count));
- mutex_lock(&bch_register_lock);
if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags))
cancel_writeback_rate_update_dwork(dc);
@@ -1012,6 +1033,8 @@ static void cached_dev_detach_finish(struct work_struct *w)
bch_write_bdev_super(dc, &cl);
closure_sync(&cl);
+ mutex_lock(&bch_register_lock);
+
calc_cached_dev_sectors(dc->disk.c);
bcache_device_detach(&dc->disk);
list_move(&dc->list, &uncached_devices);
@@ -1054,6 +1077,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
uint32_t rtime = cpu_to_le32((u32)ktime_get_real_seconds());
struct uuid_entry *u;
struct cached_dev *exist_dc, *t;
+ int ret = 0;
if ((set_uuid && memcmp(set_uuid, c->sb.set_uuid, 16)) ||
(!set_uuid && memcmp(dc->sb.set_uuid, c->sb.set_uuid, 16)))
@@ -1153,6 +1177,8 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
down_write(&dc->writeback_lock);
if (bch_cached_dev_writeback_start(dc)) {
up_write(&dc->writeback_lock);
+ pr_err("Couldn't start writeback facilities for %s",
+ dc->disk.disk->disk_name);
return -ENOMEM;
}
@@ -1163,7 +1189,22 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
bch_sectors_dirty_init(&dc->disk);
- bch_cached_dev_run(dc);
+ ret = bch_cached_dev_run(dc);
+ if (ret && (ret != -EBUSY)) {
+ up_write(&dc->writeback_lock);
+ /*
+ * bch_register_lock is held, bcache_device_stop() is not
+ * able to be directly called. The kthread and kworker
+ * created previously in bch_cached_dev_writeback_start()
+ * have to be stopped manually here.
+ */
+ kthread_stop(dc->writeback_thread);
+ cancel_writeback_rate_update_dwork(dc);
+ pr_err("Couldn't run cached device %s",
+ dc->backing_dev_name);
+ return ret;
+ }
+
bcache_device_link(&dc->disk, c, "bdev");
atomic_inc(&c->attached_dev_nr);
@@ -1190,18 +1231,16 @@ static void cached_dev_free(struct closure *cl)
{
struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
- mutex_lock(&bch_register_lock);
-
if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags))
cancel_writeback_rate_update_dwork(dc);
if (!IS_ERR_OR_NULL(dc->writeback_thread))
kthread_stop(dc->writeback_thread);
- if (dc->writeback_write_wq)
- destroy_workqueue(dc->writeback_write_wq);
if (!IS_ERR_OR_NULL(dc->status_update_thread))
kthread_stop(dc->status_update_thread);
+ mutex_lock(&bch_register_lock);
+
if (atomic_read(&dc->running))
bd_unlink_disk_holder(dc->bdev, dc->disk.disk);
bcache_device_free(&dc->disk);
@@ -1290,6 +1329,7 @@ static int register_bdev(struct cache_sb *sb, struct page *sb_page,
{
const char *err = "cannot allocate memory";
struct cache_set *c;
+ int ret = -ENOMEM;
bdevname(bdev, dc->backing_dev_name);
memcpy(&dc->sb, sb, sizeof(struct cache_sb));
@@ -1319,14 +1359,18 @@ static int register_bdev(struct cache_sb *sb, struct page *sb_page,
bch_cached_dev_attach(dc, c, NULL);
if (BDEV_STATE(&dc->sb) == BDEV_STATE_NONE ||
- BDEV_STATE(&dc->sb) == BDEV_STATE_STALE)
- bch_cached_dev_run(dc);
+ BDEV_STATE(&dc->sb) == BDEV_STATE_STALE) {
+ err = "failed to run cached device";
+ ret = bch_cached_dev_run(dc);
+ if (ret)
+ goto err;
+ }
return 0;
err:
pr_notice("error %s: %s", dc->backing_dev_name, err);
bcache_device_stop(&dc->disk);
- return -EIO;
+ return ret;
}
/* Flash only volumes */
@@ -1437,8 +1481,6 @@ int bch_flash_dev_create(struct cache_set *c, uint64_t size)
bool bch_cached_dev_error(struct cached_dev *dc)
{
- struct cache_set *c;
-
if (!dc || test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags))
return false;
@@ -1449,21 +1491,6 @@ bool bch_cached_dev_error(struct cached_dev *dc)
pr_err("stop %s: too many IO errors on backing device %s\n",
dc->disk.disk->disk_name, dc->backing_dev_name);
- /*
- * If the cached device is still attached to a cache set,
- * even dc->io_disable is true and no more I/O requests
- * accepted, cache device internal I/O (writeback scan or
- * garbage collection) may still prevent bcache device from
- * being stopped. So here CACHE_SET_IO_DISABLE should be
- * set to c->flags too, to make the internal I/O to cache
- * device rejected and stopped immediately.
- * If c is NULL, that means the bcache device is not attached
- * to any cache set, then no CACHE_SET_IO_DISABLE bit to set.
- */
- c = dc->disk.c;
- if (c && test_and_set_bit(CACHE_SET_IO_DISABLE, &c->flags))
- pr_info("CACHE_SET_IO_DISABLE already set");
-
bcache_device_stop(&dc->disk);
return true;
}
@@ -1564,19 +1591,23 @@ static void cache_set_flush(struct closure *cl)
kobject_put(&c->internal);
kobject_del(&c->kobj);
- if (c->gc_thread)
+ if (!IS_ERR_OR_NULL(c->gc_thread))
kthread_stop(c->gc_thread);
if (!IS_ERR_OR_NULL(c->root))
list_add(&c->root->list, &c->btree_cache);
- /* Should skip this if we're unregistering because of an error */
- list_for_each_entry(b, &c->btree_cache, list) {
- mutex_lock(&b->write_lock);
- if (btree_node_dirty(b))
- __bch_btree_node_write(b, NULL);
- mutex_unlock(&b->write_lock);
- }
+ /*
+ * Avoid flushing cached nodes if cache set is retiring
+ * due to too many I/O errors detected.
+ */
+ if (!test_bit(CACHE_SET_IO_DISABLE, &c->flags))
+ list_for_each_entry(b, &c->btree_cache, list) {
+ mutex_lock(&b->write_lock);
+ if (btree_node_dirty(b))
+ __bch_btree_node_write(b, NULL);
+ mutex_unlock(&b->write_lock);
+ }
for_each_cache(ca, c, i)
if (ca->alloc_thread)
@@ -1849,6 +1880,23 @@ static int run_cache_set(struct cache_set *c)
if (bch_btree_check(c))
goto err;
+ /*
+ * bch_btree_check() may occupy too much system memory which
+ * has negative effects to user space application (e.g. data
+ * base) performance. Shrink the mca cache memory proactively
+ * here to avoid competing memory with user space workloads..
+ */
+ if (!c->shrinker_disabled) {
+ struct shrink_control sc;
+
+ sc.gfp_mask = GFP_KERNEL;
+ sc.nr_to_scan = c->btree_cache_used * c->btree_pages;
+ /* first run to clear b->accessed tag */
+ c->shrink.scan_objects(&c->shrink, &sc);
+ /* second run to reap non-accessed nodes */
+ c->shrink.scan_objects(&c->shrink, &sc);
+ }
+
bch_journal_mark(c, &journal);
bch_initial_gc_finish(c);
pr_debug("btree_check() done");
@@ -1957,7 +2005,7 @@ err:
}
closure_sync(&cl);
- /* XXX: test this, it's broken */
+
bch_cache_set_error(c, "%s", err);
return -EIO;
@@ -2251,9 +2299,13 @@ err:
static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
const char *buffer, size_t size);
+static ssize_t bch_pending_bdevs_cleanup(struct kobject *k,
+ struct kobj_attribute *attr,
+ const char *buffer, size_t size);
kobj_attribute_write(register, register_bcache);
kobj_attribute_write(register_quiet, register_bcache);
+kobj_attribute_write(pendings_cleanup, bch_pending_bdevs_cleanup);
static bool bch_is_open_backing(struct block_device *bdev)
{
@@ -2301,6 +2353,11 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
if (!try_module_get(THIS_MODULE))
return -EBUSY;
+ /* For latest state of bcache_is_reboot */
+ smp_mb();
+ if (bcache_is_reboot)
+ return -EBUSY;
+
path = kstrndup(buffer, size, GFP_KERNEL);
if (!path)
goto err;
@@ -2378,8 +2435,61 @@ err:
goto out;
}
+
+struct pdev {
+ struct list_head list;
+ struct cached_dev *dc;
+};
+
+static ssize_t bch_pending_bdevs_cleanup(struct kobject *k,
+ struct kobj_attribute *attr,
+ const char *buffer,
+ size_t size)
+{
+ LIST_HEAD(pending_devs);
+ ssize_t ret = size;
+ struct cached_dev *dc, *tdc;
+ struct pdev *pdev, *tpdev;
+ struct cache_set *c, *tc;
+
+ mutex_lock(&bch_register_lock);
+ list_for_each_entry_safe(dc, tdc, &uncached_devices, list) {
+ pdev = kmalloc(sizeof(struct pdev), GFP_KERNEL);
+ if (!pdev)
+ break;
+ pdev->dc = dc;
+ list_add(&pdev->list, &pending_devs);
+ }
+
+ list_for_each_entry_safe(pdev, tpdev, &pending_devs, list) {
+ list_for_each_entry_safe(c, tc, &bch_cache_sets, list) {
+ char *pdev_set_uuid = pdev->dc->sb.set_uuid;
+ char *set_uuid = c->sb.uuid;
+
+ if (!memcmp(pdev_set_uuid, set_uuid, 16)) {
+ list_del(&pdev->list);
+ kfree(pdev);
+ break;
+ }
+ }
+ }
+ mutex_unlock(&bch_register_lock);
+
+ list_for_each_entry_safe(pdev, tpdev, &pending_devs, list) {
+ pr_info("delete pdev %p", pdev);
+ list_del(&pdev->list);
+ bcache_device_stop(&pdev->dc->disk);
+ kfree(pdev);
+ }
+
+ return ret;
+}
+
static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x)
{
+ if (bcache_is_reboot)
+ return NOTIFY_DONE;
+
if (code == SYS_DOWN ||
code == SYS_HALT ||
code == SYS_POWER_OFF) {
@@ -2392,19 +2502,45 @@ static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x)
mutex_lock(&bch_register_lock);
+ if (bcache_is_reboot)
+ goto out;
+
+ /* New registration is rejected since now */
+ bcache_is_reboot = true;
+ /*
+ * Make registering caller (if there is) on other CPU
+ * core know bcache_is_reboot set to true earlier
+ */
+ smp_mb();
+
if (list_empty(&bch_cache_sets) &&
list_empty(&uncached_devices))
goto out;
+ mutex_unlock(&bch_register_lock);
+
pr_info("Stopping all devices:");
+ /*
+ * The reason bch_register_lock is not held to call
+ * bch_cache_set_stop() and bcache_device_stop() is to
+ * avoid potential deadlock during reboot, because cache
+ * set or bcache device stopping process will acqurie
+ * bch_register_lock too.
+ *
+ * We are safe here because bcache_is_reboot sets to
+ * true already, register_bcache() will reject new
+ * registration now. bcache_is_reboot also makes sure
+ * bcache_reboot() won't be re-entered on by other thread,
+ * so there is no race in following list iteration by
+ * list_for_each_entry_safe().
+ */
list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
bch_cache_set_stop(c);
list_for_each_entry_safe(dc, tdc, &uncached_devices, list)
bcache_device_stop(&dc->disk);
- mutex_unlock(&bch_register_lock);
/*
* Give an early chance for other kthreads and
@@ -2496,6 +2632,7 @@ static int __init bcache_init(void)
static const struct attribute *files[] = {
&ksysfs_register.attr,
&ksysfs_register_quiet.attr,
+ &ksysfs_pendings_cleanup.attr,
NULL
};
@@ -2531,6 +2668,8 @@ static int __init bcache_init(void)
bch_debug_init();
closure_debug_init();
+ bcache_is_reboot = false;
+
return 0;
err:
bcache_exit();
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index bfb437ffb13c..9f0826712845 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -16,33 +16,31 @@
#include <linux/sort.h>
#include <linux/sched/clock.h>
+extern bool bcache_is_reboot;
+
/* Default is 0 ("writethrough") */
static const char * const bch_cache_modes[] = {
"writethrough",
"writeback",
"writearound",
- "none",
- NULL
+ "none"
};
/* Default is 0 ("auto") */
static const char * const bch_stop_on_failure_modes[] = {
"auto",
- "always",
- NULL
+ "always"
};
static const char * const cache_replacement_policies[] = {
"lru",
"fifo",
- "random",
- NULL
+ "random"
};
static const char * const error_actions[] = {
"unregister",
- "panic",
- NULL
+ "panic"
};
write_attribute(attach);
@@ -84,8 +82,8 @@ read_attribute(bset_tree_stats);
read_attribute(state);
read_attribute(cache_read_races);
read_attribute(reclaim);
+read_attribute(reclaimed_journal_buckets);
read_attribute(flush_write);
-read_attribute(retry_flush_write);
read_attribute(writeback_keys_done);
read_attribute(writeback_keys_failed);
read_attribute(io_errors);
@@ -180,7 +178,7 @@ SHOW(__bch_cached_dev)
var_print(writeback_percent);
sysfs_hprint(writeback_rate,
wb ? atomic_long_read(&dc->writeback_rate.rate) << 9 : 0);
- sysfs_hprint(io_errors, atomic_read(&dc->io_errors));
+ sysfs_printf(io_errors, "%i", atomic_read(&dc->io_errors));
sysfs_printf(io_error_limit, "%i", dc->error_limit);
sysfs_printf(io_disable, "%i", dc->io_disable);
var_print(writeback_rate_update_seconds);
@@ -271,6 +269,10 @@ STORE(__cached_dev)
struct cache_set *c;
struct kobj_uevent_env *env;
+ /* no user space access if system is rebooting */
+ if (bcache_is_reboot)
+ return -EBUSY;
+
#define d_strtoul(var) sysfs_strtoul(var, dc->var)
#define d_strtoul_nonzero(var) sysfs_strtoul_clamp(var, dc->var, 1, INT_MAX)
#define d_strtoi_h(var) sysfs_hatoi(var, dc->var)
@@ -329,11 +331,14 @@ STORE(__cached_dev)
bch_cache_accounting_clear(&dc->accounting);
if (attr == &sysfs_running &&
- strtoul_or_return(buf))
- bch_cached_dev_run(dc);
+ strtoul_or_return(buf)) {
+ v = bch_cached_dev_run(dc);
+ if (v)
+ return v;
+ }
if (attr == &sysfs_cache_mode) {
- v = __sysfs_match_string(bch_cache_modes, -1, buf);
+ v = sysfs_match_string(bch_cache_modes, buf);
if (v < 0)
return v;
@@ -344,7 +349,7 @@ STORE(__cached_dev)
}
if (attr == &sysfs_stop_when_cache_set_failed) {
- v = __sysfs_match_string(bch_stop_on_failure_modes, -1, buf);
+ v = sysfs_match_string(bch_stop_on_failure_modes, buf);
if (v < 0)
return v;
@@ -408,6 +413,10 @@ STORE(bch_cached_dev)
struct cached_dev *dc = container_of(kobj, struct cached_dev,
disk.kobj);
+ /* no user space access if system is rebooting */
+ if (bcache_is_reboot)
+ return -EBUSY;
+
mutex_lock(&bch_register_lock);
size = __cached_dev_store(kobj, attr, buf, size);
@@ -464,7 +473,7 @@ static struct attribute *bch_cached_dev_files[] = {
&sysfs_writeback_rate_p_term_inverse,
&sysfs_writeback_rate_minimum,
&sysfs_writeback_rate_debug,
- &sysfs_errors,
+ &sysfs_io_errors,
&sysfs_io_error_limit,
&sysfs_io_disable,
&sysfs_dirty_data,
@@ -511,6 +520,10 @@ STORE(__bch_flash_dev)
kobj);
struct uuid_entry *u = &d->c->uuids[d->id];
+ /* no user space access if system is rebooting */
+ if (bcache_is_reboot)
+ return -EBUSY;
+
sysfs_strtoul(data_csum, d->data_csum);
if (attr == &sysfs_size) {
@@ -693,12 +706,12 @@ SHOW(__bch_cache_set)
sysfs_print(reclaim,
atomic_long_read(&c->reclaim));
+ sysfs_print(reclaimed_journal_buckets,
+ atomic_long_read(&c->reclaimed_journal_buckets));
+
sysfs_print(flush_write,
atomic_long_read(&c->flush_write));
- sysfs_print(retry_flush_write,
- atomic_long_read(&c->retry_flush_write));
-
sysfs_print(writeback_keys_done,
atomic_long_read(&c->writeback_keys_done));
sysfs_print(writeback_keys_failed,
@@ -746,6 +759,10 @@ STORE(__bch_cache_set)
struct cache_set *c = container_of(kobj, struct cache_set, kobj);
ssize_t v;
+ /* no user space access if system is rebooting */
+ if (bcache_is_reboot)
+ return -EBUSY;
+
if (attr == &sysfs_unregister)
bch_cache_set_unregister(c);
@@ -799,7 +816,7 @@ STORE(__bch_cache_set)
0, UINT_MAX);
if (attr == &sysfs_errors) {
- v = __sysfs_match_string(error_actions, -1, buf);
+ v = sysfs_match_string(error_actions, buf);
if (v < 0)
return v;
@@ -865,6 +882,10 @@ STORE(bch_cache_set_internal)
{
struct cache_set *c = container_of(kobj, struct cache_set, internal);
+ /* no user space access if system is rebooting */
+ if (bcache_is_reboot)
+ return -EBUSY;
+
return bch_cache_set_store(&c->kobj, attr, buf, size);
}
@@ -914,8 +935,8 @@ static struct attribute *bch_cache_set_internal_files[] = {
&sysfs_bset_tree_stats,
&sysfs_cache_read_races,
&sysfs_reclaim,
+ &sysfs_reclaimed_journal_buckets,
&sysfs_flush_write,
- &sysfs_retry_flush_write,
&sysfs_writeback_keys_done,
&sysfs_writeback_keys_failed,
@@ -1050,6 +1071,10 @@ STORE(__bch_cache)
struct cache *ca = container_of(kobj, struct cache, kobj);
ssize_t v;
+ /* no user space access if system is rebooting */
+ if (bcache_is_reboot)
+ return -EBUSY;
+
if (attr == &sysfs_discard) {
bool v = strtoul_or_return(buf);
@@ -1063,7 +1088,7 @@ STORE(__bch_cache)
}
if (attr == &sysfs_cache_replacement_policy) {
- v = __sysfs_match_string(cache_replacement_policies, -1, buf);
+ v = sysfs_match_string(cache_replacement_policies, buf);
if (v < 0)
return v;
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index 1fbced94e4cc..c029f7443190 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -113,8 +113,6 @@ do { \
#define heap_full(h) ((h)->used == (h)->size)
-#define heap_empty(h) ((h)->used == 0)
-
#define DECLARE_FIFO(type, name) \
struct { \
size_t front, back, size, mask; \
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 73f0efac2b9f..d60268fe49e1 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -122,6 +122,9 @@ static void __update_writeback_rate(struct cached_dev *dc)
static bool set_at_max_writeback_rate(struct cache_set *c,
struct cached_dev *dc)
{
+ /* Don't set max writeback rate if gc is running */
+ if (!c->gc_mark_valid)
+ return false;
/*
* Idle_counter is increased everytime when update_writeback_rate() is
* called. If all backing devices attached to the same cache set have
@@ -735,6 +738,10 @@ static int bch_writeback_thread(void *arg)
}
}
+ if (dc->writeback_write_wq) {
+ flush_workqueue(dc->writeback_write_wq);
+ destroy_workqueue(dc->writeback_write_wq);
+ }
cached_dev_put(dc);
wait_for_kthread_stop();
@@ -830,6 +837,7 @@ int bch_cached_dev_writeback_start(struct cached_dev *dc)
"bcache_writeback");
if (IS_ERR(dc->writeback_thread)) {
cached_dev_put(dc);
+ destroy_workqueue(dc->writeback_write_wq);
return PTR_ERR(dc->writeback_thread);
}
dc->writeback_running = true;
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index 2a48ea3f1b30..b6b5acc92ca2 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -1599,9 +1599,7 @@ dm_bufio_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
unsigned long freed;
c = container_of(shrink, struct dm_bufio_client, shrinker);
- if (sc->gfp_mask & __GFP_FS)
- dm_bufio_lock(c);
- else if (!dm_bufio_trylock(c))
+ if (!dm_bufio_trylock(c))
return SHRINK_STOP;
freed = __scan(c, sc->nr_to_scan, sc->gfp_mask);
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 1b16d34bb785..d5216bcc4649 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -120,6 +120,10 @@ struct iv_tcw_private {
u8 *whitening;
};
+struct iv_eboiv_private {
+ struct crypto_cipher *tfm;
+};
+
/*
* Crypt: maps a linear range of a block device
* and encrypts / decrypts at the same time.
@@ -159,6 +163,7 @@ struct crypt_config {
struct iv_benbi_private benbi;
struct iv_lmk_private lmk;
struct iv_tcw_private tcw;
+ struct iv_eboiv_private eboiv;
} iv_gen_private;
u64 iv_offset;
unsigned int iv_size;
@@ -291,8 +296,9 @@ static struct crypto_aead *any_tfm_aead(struct crypt_config *cc)
* Note that this encryption scheme is vulnerable to watermarking attacks
* and should be used for old compatible containers access only.
*
- * plumb: unimplemented, see:
- * http://article.gmane.org/gmane.linux.kernel.device-mapper.dm-crypt/454
+ * eboiv: Encrypted byte-offset IV (used in Bitlocker in CBC mode)
+ * The IV is encrypted little-endian byte-offset (with the same key
+ * and cipher as the volume).
*/
static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv,
@@ -841,6 +847,67 @@ static int crypt_iv_random_gen(struct crypt_config *cc, u8 *iv,
return 0;
}
+static void crypt_iv_eboiv_dtr(struct crypt_config *cc)
+{
+ struct iv_eboiv_private *eboiv = &cc->iv_gen_private.eboiv;
+
+ crypto_free_cipher(eboiv->tfm);
+ eboiv->tfm = NULL;
+}
+
+static int crypt_iv_eboiv_ctr(struct crypt_config *cc, struct dm_target *ti,
+ const char *opts)
+{
+ struct iv_eboiv_private *eboiv = &cc->iv_gen_private.eboiv;
+ struct crypto_cipher *tfm;
+
+ tfm = crypto_alloc_cipher(cc->cipher, 0, 0);
+ if (IS_ERR(tfm)) {
+ ti->error = "Error allocating crypto tfm for EBOIV";
+ return PTR_ERR(tfm);
+ }
+
+ if (crypto_cipher_blocksize(tfm) != cc->iv_size) {
+ ti->error = "Block size of EBOIV cipher does "
+ "not match IV size of block cipher";
+ crypto_free_cipher(tfm);
+ return -EINVAL;
+ }
+
+ eboiv->tfm = tfm;
+ return 0;
+}
+
+static int crypt_iv_eboiv_init(struct crypt_config *cc)
+{
+ struct iv_eboiv_private *eboiv = &cc->iv_gen_private.eboiv;
+ int err;
+
+ err = crypto_cipher_setkey(eboiv->tfm, cc->key, cc->key_size);
+ if (err)
+ return err;
+
+ return 0;
+}
+
+static int crypt_iv_eboiv_wipe(struct crypt_config *cc)
+{
+ /* Called after cc->key is set to random key in crypt_wipe() */
+ return crypt_iv_eboiv_init(cc);
+}
+
+static int crypt_iv_eboiv_gen(struct crypt_config *cc, u8 *iv,
+ struct dm_crypt_request *dmreq)
+{
+ struct iv_eboiv_private *eboiv = &cc->iv_gen_private.eboiv;
+
+ memset(iv, 0, cc->iv_size);
+ *(__le64 *)iv = cpu_to_le64(dmreq->iv_sector * cc->sector_size);
+ crypto_cipher_encrypt_one(eboiv->tfm, iv, iv);
+
+ return 0;
+}
+
static const struct crypt_iv_operations crypt_iv_plain_ops = {
.generator = crypt_iv_plain_gen
};
@@ -893,6 +960,14 @@ static struct crypt_iv_operations crypt_iv_random_ops = {
.generator = crypt_iv_random_gen
};
+static struct crypt_iv_operations crypt_iv_eboiv_ops = {
+ .ctr = crypt_iv_eboiv_ctr,
+ .dtr = crypt_iv_eboiv_dtr,
+ .init = crypt_iv_eboiv_init,
+ .wipe = crypt_iv_eboiv_wipe,
+ .generator = crypt_iv_eboiv_gen
+};
+
/*
* Integrity extensions
*/
@@ -2158,6 +2233,14 @@ static int crypt_wipe_key(struct crypt_config *cc)
clear_bit(DM_CRYPT_KEY_VALID, &cc->flags);
get_random_bytes(&cc->key, cc->key_size);
+
+ /* Wipe IV private keys */
+ if (cc->iv_gen_ops && cc->iv_gen_ops->wipe) {
+ r = cc->iv_gen_ops->wipe(cc);
+ if (r)
+ return r;
+ }
+
kzfree(cc->key_string);
cc->key_string = NULL;
r = crypt_setkey(cc);
@@ -2288,6 +2371,8 @@ static int crypt_ctr_ivmode(struct dm_target *ti, const char *ivmode)
cc->iv_gen_ops = &crypt_iv_benbi_ops;
else if (strcmp(ivmode, "null") == 0)
cc->iv_gen_ops = &crypt_iv_null_ops;
+ else if (strcmp(ivmode, "eboiv") == 0)
+ cc->iv_gen_ops = &crypt_iv_eboiv_ops;
else if (strcmp(ivmode, "lmk") == 0) {
cc->iv_gen_ops = &crypt_iv_lmk_ops;
/*
@@ -2699,7 +2784,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
return -EINVAL;
}
- cc = kzalloc(sizeof(*cc) + key_size * sizeof(u8), GFP_KERNEL);
+ cc = kzalloc(struct_size(cc, key, key_size), GFP_KERNEL);
if (!cc) {
ti->error = "Cannot allocate encryption context";
return -ENOMEM;
@@ -3050,14 +3135,8 @@ static int crypt_message(struct dm_target *ti, unsigned argc, char **argv,
memset(cc->key, 0, cc->key_size * sizeof(u8));
return ret;
}
- if (argc == 2 && !strcasecmp(argv[1], "wipe")) {
- if (cc->iv_gen_ops && cc->iv_gen_ops->wipe) {
- ret = cc->iv_gen_ops->wipe(cc);
- if (ret)
- return ret;
- }
+ if (argc == 2 && !strcasecmp(argv[1], "wipe"))
return crypt_wipe_key(cc);
- }
}
error:
@@ -3094,7 +3173,7 @@ static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits)
static struct target_type crypt_target = {
.name = "crypt",
- .version = {1, 18, 1},
+ .version = {1, 19, 0},
.module = THIS_MODULE,
.ctr = crypt_ctr,
.dtr = crypt_dtr,
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c
index a9bc518156f2..2900fbde89b3 100644
--- a/drivers/md/dm-flakey.c
+++ b/drivers/md/dm-flakey.c
@@ -461,15 +461,14 @@ static int flakey_prepare_ioctl(struct dm_target *ti, struct block_device **bdev
#ifdef CONFIG_BLK_DEV_ZONED
static int flakey_report_zones(struct dm_target *ti, sector_t sector,
- struct blk_zone *zones, unsigned int *nr_zones,
- gfp_t gfp_mask)
+ struct blk_zone *zones, unsigned int *nr_zones)
{
struct flakey_c *fc = ti->private;
int ret;
/* Do report and remap it */
ret = blkdev_report_zones(fc->dev->bdev, flakey_map_sector(ti, sector),
- zones, nr_zones, gfp_mask);
+ zones, nr_zones);
if (ret != 0)
return ret;
diff --git a/drivers/md/dm-init.c b/drivers/md/dm-init.c
index 728733a514c7..b869316d3722 100644
--- a/drivers/md/dm-init.c
+++ b/drivers/md/dm-init.c
@@ -25,7 +25,7 @@ static char *create;
* Format: dm-mod.create=<name>,<uuid>,<minor>,<flags>,<table>[,<table>+][;<name>,<uuid>,<minor>,<flags>,<table>[,<table>+]+]
* Table format: <start_sector> <num_sectors> <target_type> <target_args>
*
- * See Documentation/device-mapper/dm-init.txt for dm-mod.create="..." format
+ * See Documentation/admin-guide/device-mapper/dm-init.rst for dm-mod.create="..." format
* details.
*/
diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c
index 44e76cda087a..b1b0de402dfc 100644
--- a/drivers/md/dm-integrity.c
+++ b/drivers/md/dm-integrity.c
@@ -476,6 +476,9 @@ static int sync_rw_sb(struct dm_integrity_c *ic, int op, int op_flags)
io_loc.sector = ic->start;
io_loc.count = SB_SECTORS;
+ if (op == REQ_OP_WRITE)
+ sb_set_version(ic);
+
return dm_io(&io_req, 1, &io_loc, NULL);
}
@@ -2317,7 +2320,6 @@ static void recalc_write_super(struct dm_integrity_c *ic)
if (dm_integrity_failed(ic))
return;
- sb_set_version(ic);
r = sync_rw_sb(ic, REQ_OP_WRITE, 0);
if (unlikely(r))
dm_integrity_io_error(ic, "writing superblock", r);
@@ -3358,7 +3360,7 @@ static int create_journal(struct dm_integrity_c *ic, char **error)
goto bad;
}
- crypt_iv = kmalloc(ivsize, GFP_KERNEL);
+ crypt_iv = kzalloc(ivsize, GFP_KERNEL);
if (!crypt_iv) {
*error = "Could not allocate iv";
r = -ENOMEM;
@@ -3387,7 +3389,6 @@ static int create_journal(struct dm_integrity_c *ic, char **error)
sg_set_buf(&sg[i], va, PAGE_SIZE);
}
sg_set_buf(&sg[i], &ic->commit_ids, sizeof ic->commit_ids);
- memset(crypt_iv, 0x00, ivsize);
skcipher_request_set_crypt(req, sg, sg,
PAGE_SIZE * ic->journal_pages + sizeof ic->commit_ids, crypt_iv);
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index ad980a38fb1e..ecefe6703736 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -137,15 +137,14 @@ static int linear_prepare_ioctl(struct dm_target *ti, struct block_device **bdev
#ifdef CONFIG_BLK_DEV_ZONED
static int linear_report_zones(struct dm_target *ti, sector_t sector,
- struct blk_zone *zones, unsigned int *nr_zones,
- gfp_t gfp_mask)
+ struct blk_zone *zones, unsigned int *nr_zones)
{
struct linear_c *lc = (struct linear_c *) ti->private;
int ret;
/* Do report and remap it */
ret = blkdev_report_zones(lc->dev->bdev, linear_map_sector(ti, sector),
- zones, nr_zones, gfp_mask);
+ zones, nr_zones);
if (ret != 0)
return ret;
diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c
index e549392e0ea5..99721c76225d 100644
--- a/drivers/md/dm-log-writes.c
+++ b/drivers/md/dm-log-writes.c
@@ -40,7 +40,7 @@
*
* Would result in the log looking like this:
*
- * c,a,flush,fuad,b,<other writes>,<next flush>
+ * c,a,b,flush,fuad,<other writes>,<next flush>
*
* This is meant to help expose problems where file systems do not properly wait
* on data being written before invoking a FLUSH. FUA bypasses cache so once it
@@ -699,7 +699,7 @@ static int log_writes_map(struct dm_target *ti, struct bio *bio)
if (discard_bio)
alloc_size = sizeof(struct pending_block);
else
- alloc_size = sizeof(struct pending_block) + sizeof(struct bio_vec) * bio_segments(bio);
+ alloc_size = struct_size(block, vecs, bio_segments(bio));
block = kzalloc(alloc_size, GFP_NOIO);
if (!block) {
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 9fdef6897316..8a60a4a070ac 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -3558,7 +3558,7 @@ static void raid_status(struct dm_target *ti, status_type_t type,
* v1.5.0+:
*
* Sync action:
- * See Documentation/device-mapper/dm-raid.txt for
+ * See Documentation/admin-guide/device-mapper/dm-raid.rst for
* information on each of these states.
*/
DMEMIT(" %s", sync_action);
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index 5f7063f05ae0..c9e44ac1f9a6 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -115,7 +115,7 @@ static void end_clone_bio(struct bio *clone)
/*
* Update the original request.
- * Do not use blk_end_request() here, because it may complete
+ * Do not use blk_mq_end_request() here, because it may complete
* the original request before the clone, and break the ordering.
*/
if (is_last)
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 3107f2b1988b..63916e1dc569 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -1,6 +1,4 @@
/*
- * dm-snapshot.c
- *
* Copyright (C) 2001-2002 Sistina Software (UK) Limited.
*
* This file is released under the GPL.
@@ -134,7 +132,10 @@ struct dm_snapshot {
* - I/O error while merging
* => stop merging; set merge_failed; process I/O normally.
*/
- int merge_failed;
+ bool merge_failed:1;
+
+ bool discard_zeroes_cow:1;
+ bool discard_passdown_origin:1;
/*
* Incoming bios that overlap with chunks being merged must wait
@@ -1173,12 +1174,64 @@ static void stop_merge(struct dm_snapshot *s)
clear_bit(SHUTDOWN_MERGE, &s->state_bits);
}
+static int parse_snapshot_features(struct dm_arg_set *as, struct dm_snapshot *s,
+ struct dm_target *ti)
+{
+ int r;
+ unsigned argc;
+ const char *arg_name;
+
+ static const struct dm_arg _args[] = {
+ {0, 2, "Invalid number of feature arguments"},
+ };
+
+ /*
+ * No feature arguments supplied.
+ */
+ if (!as->argc)
+ return 0;
+
+ r = dm_read_arg_group(_args, as, &argc, &ti->error);
+ if (r)
+ return -EINVAL;
+
+ while (argc && !r) {
+ arg_name = dm_shift_arg(as);
+ argc--;
+
+ if (!strcasecmp(arg_name, "discard_zeroes_cow"))
+ s->discard_zeroes_cow = true;
+
+ else if (!strcasecmp(arg_name, "discard_passdown_origin"))
+ s->discard_passdown_origin = true;
+
+ else {
+ ti->error = "Unrecognised feature requested";
+ r = -EINVAL;
+ break;
+ }
+ }
+
+ if (!s->discard_zeroes_cow && s->discard_passdown_origin) {
+ /*
+ * TODO: really these are disjoint.. but ti->num_discard_bios
+ * and dm_bio_get_target_bio_nr() require rigid constraints.
+ */
+ ti->error = "discard_passdown_origin feature depends on discard_zeroes_cow";
+ r = -EINVAL;
+ }
+
+ return r;
+}
+
/*
- * Construct a snapshot mapping: <origin_dev> <COW-dev> <p|po|n> <chunk-size>
+ * Construct a snapshot mapping:
+ * <origin_dev> <COW-dev> <p|po|n> <chunk-size> [<# feature args> [<arg>]*]
*/
static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
struct dm_snapshot *s;
+ struct dm_arg_set as;
int i;
int r = -EINVAL;
char *origin_path, *cow_path;
@@ -1186,8 +1239,8 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
unsigned args_used, num_flush_bios = 1;
fmode_t origin_mode = FMODE_READ;
- if (argc != 4) {
- ti->error = "requires exactly 4 arguments";
+ if (argc < 4) {
+ ti->error = "requires 4 or more arguments";
r = -EINVAL;
goto bad;
}
@@ -1204,6 +1257,13 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
goto bad;
}
+ as.argc = argc;
+ as.argv = argv;
+ dm_consume_args(&as, 4);
+ r = parse_snapshot_features(&as, s, ti);
+ if (r)
+ goto bad_features;
+
origin_path = argv[0];
argv++;
argc--;
@@ -1289,6 +1349,8 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
ti->private = s;
ti->num_flush_bios = num_flush_bios;
+ if (s->discard_zeroes_cow)
+ ti->num_discard_bios = (s->discard_passdown_origin ? 2 : 1);
ti->per_io_data_size = sizeof(struct dm_snap_tracked_chunk);
/* Add snapshot to the list of snapshots for this origin */
@@ -1336,29 +1398,22 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
bad_read_metadata:
unregister_snapshot(s);
-
bad_load_and_register:
mempool_exit(&s->pending_pool);
-
bad_pending_pool:
dm_kcopyd_client_destroy(s->kcopyd_client);
-
bad_kcopyd:
dm_exception_table_exit(&s->pending, pending_cache);
dm_exception_table_exit(&s->complete, exception_cache);
-
bad_hash_tables:
dm_exception_store_destroy(s->store);
-
bad_store:
dm_put_device(ti, s->cow);
-
bad_cow:
dm_put_device(ti, s->origin);
-
bad_origin:
+bad_features:
kfree(s);
-
bad:
return r;
}
@@ -1806,6 +1861,37 @@ static void remap_exception(struct dm_snapshot *s, struct dm_exception *e,
(bio->bi_iter.bi_sector & s->store->chunk_mask);
}
+static void zero_callback(int read_err, unsigned long write_err, void *context)
+{
+ struct bio *bio = context;
+ struct dm_snapshot *s = bio->bi_private;
+
+ up(&s->cow_count);
+ bio->bi_status = write_err ? BLK_STS_IOERR : 0;
+ bio_endio(bio);
+}
+
+static void zero_exception(struct dm_snapshot *s, struct dm_exception *e,
+ struct bio *bio, chunk_t chunk)
+{
+ struct dm_io_region dest;
+
+ dest.bdev = s->cow->bdev;
+ dest.sector = bio->bi_iter.bi_sector;
+ dest.count = s->store->chunk_size;
+
+ down(&s->cow_count);
+ WARN_ON_ONCE(bio->bi_private);
+ bio->bi_private = s;
+ dm_kcopyd_zero(s->kcopyd_client, 1, &dest, 0, zero_callback, bio);
+}
+
+static bool io_overlaps_chunk(struct dm_snapshot *s, struct bio *bio)
+{
+ return bio->bi_iter.bi_size ==
+ (s->store->chunk_size << SECTOR_SHIFT);
+}
+
static int snapshot_map(struct dm_target *ti, struct bio *bio)
{
struct dm_exception *e;
@@ -1839,10 +1925,43 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
goto out_unlock;
}
+ if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) {
+ if (s->discard_passdown_origin && dm_bio_get_target_bio_nr(bio)) {
+ /*
+ * passdown discard to origin (without triggering
+ * snapshot exceptions via do_origin; doing so would
+ * defeat the goal of freeing space in origin that is
+ * implied by the "discard_passdown_origin" feature)
+ */
+ bio_set_dev(bio, s->origin->bdev);
+ track_chunk(s, bio, chunk);
+ goto out_unlock;
+ }
+ /* discard to snapshot (target_bio_nr == 0) zeroes exceptions */
+ }
+
/* If the block is already remapped - use that, else remap it */
e = dm_lookup_exception(&s->complete, chunk);
if (e) {
remap_exception(s, e, bio, chunk);
+ if (unlikely(bio_op(bio) == REQ_OP_DISCARD) &&
+ io_overlaps_chunk(s, bio)) {
+ dm_exception_table_unlock(&lock);
+ up_read(&s->lock);
+ zero_exception(s, e, bio, chunk);
+ r = DM_MAPIO_SUBMITTED; /* discard is not issued */
+ goto out;
+ }
+ goto out_unlock;
+ }
+
+ if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) {
+ /*
+ * If no exception exists, complete discard immediately
+ * otherwise it'll trigger copy-out.
+ */
+ bio_endio(bio);
+ r = DM_MAPIO_SUBMITTED;
goto out_unlock;
}
@@ -1890,9 +2009,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
r = DM_MAPIO_SUBMITTED;
- if (!pe->started &&
- bio->bi_iter.bi_size ==
- (s->store->chunk_size << SECTOR_SHIFT)) {
+ if (!pe->started && io_overlaps_chunk(s, bio)) {
pe->started = 1;
dm_exception_table_unlock(&lock);
@@ -2138,6 +2255,7 @@ static void snapshot_status(struct dm_target *ti, status_type_t type,
{
unsigned sz = 0;
struct dm_snapshot *snap = ti->private;
+ unsigned num_features;
switch (type) {
case STATUSTYPE_INFO:
@@ -2178,8 +2296,16 @@ static void snapshot_status(struct dm_target *ti, status_type_t type,
* make sense.
*/
DMEMIT("%s %s", snap->origin->name, snap->cow->name);
- snap->store->type->status(snap->store, type, result + sz,
- maxlen - sz);
+ sz += snap->store->type->status(snap->store, type, result + sz,
+ maxlen - sz);
+ num_features = snap->discard_zeroes_cow + snap->discard_passdown_origin;
+ if (num_features) {
+ DMEMIT(" %u", num_features);
+ if (snap->discard_zeroes_cow)
+ DMEMIT(" discard_zeroes_cow");
+ if (snap->discard_passdown_origin)
+ DMEMIT(" discard_passdown_origin");
+ }
break;
}
}
@@ -2198,6 +2324,22 @@ static int snapshot_iterate_devices(struct dm_target *ti,
return r;
}
+static void snapshot_io_hints(struct dm_target *ti, struct queue_limits *limits)
+{
+ struct dm_snapshot *snap = ti->private;
+
+ if (snap->discard_zeroes_cow) {
+ struct dm_snapshot *snap_src = NULL, *snap_dest = NULL;
+
+ (void) __find_snapshots_sharing_cow(snap, &snap_src, &snap_dest, NULL);
+ if (snap_src && snap_dest)
+ snap = snap_src;
+
+ /* All discards are split on chunk_size boundary */
+ limits->discard_granularity = snap->store->chunk_size;
+ limits->max_discard_sectors = snap->store->chunk_size;
+ }
+}
/*-----------------------------------------------------------------
* Origin methods
@@ -2522,7 +2664,7 @@ static struct target_type origin_target = {
static struct target_type snapshot_target = {
.name = "snapshot",
- .version = {1, 15, 0},
+ .version = {1, 16, 0},
.module = THIS_MODULE,
.ctr = snapshot_ctr,
.dtr = snapshot_dtr,
@@ -2532,11 +2674,12 @@ static struct target_type snapshot_target = {
.resume = snapshot_resume,
.status = snapshot_status,
.iterate_devices = snapshot_iterate_devices,
+ .io_hints = snapshot_io_hints,
};
static struct target_type merge_target = {
.name = dm_snapshot_merge_target_name,
- .version = {1, 4, 0},
+ .version = {1, 5, 0},
.module = THIS_MODULE,
.ctr = snapshot_ctr,
.dtr = snapshot_dtr,
@@ -2547,6 +2690,7 @@ static struct target_type merge_target = {
.resume = snapshot_merge_resume,
.status = snapshot_status,
.iterate_devices = snapshot_iterate_devices,
+ .io_hints = snapshot_io_hints,
};
static int __init dm_snapshot_init(void)
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index ec8b27e20de3..caaee8032afe 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -881,7 +881,7 @@ void dm_table_set_type(struct dm_table *t, enum dm_queue_mode type)
EXPORT_SYMBOL_GPL(dm_table_set_type);
/* validate the dax capability of the target device span */
-static int device_supports_dax(struct dm_target *ti, struct dm_dev *dev,
+int device_supports_dax(struct dm_target *ti, struct dm_dev *dev,
sector_t start, sector_t len, void *data)
{
int blocksize = *(int *) data;
@@ -890,7 +890,15 @@ static int device_supports_dax(struct dm_target *ti, struct dm_dev *dev,
start, len);
}
-bool dm_table_supports_dax(struct dm_table *t, int blocksize)
+/* Check devices support synchronous DAX */
+static int device_synchronous(struct dm_target *ti, struct dm_dev *dev,
+ sector_t start, sector_t len, void *data)
+{
+ return dax_synchronous(dev->dax_dev);
+}
+
+bool dm_table_supports_dax(struct dm_table *t,
+ iterate_devices_callout_fn iterate_fn, int *blocksize)
{
struct dm_target *ti;
unsigned i;
@@ -903,8 +911,7 @@ bool dm_table_supports_dax(struct dm_table *t, int blocksize)
return false;
if (!ti->type->iterate_devices ||
- !ti->type->iterate_devices(ti, device_supports_dax,
- &blocksize))
+ !ti->type->iterate_devices(ti, iterate_fn, blocksize))
return false;
}
@@ -940,6 +947,7 @@ static int dm_table_determine_type(struct dm_table *t)
struct dm_target *tgt;
struct list_head *devices = dm_table_get_devices(t);
enum dm_queue_mode live_md_type = dm_get_md_type(t->md);
+ int page_size = PAGE_SIZE;
if (t->type != DM_TYPE_NONE) {
/* target already set the table's type */
@@ -984,7 +992,7 @@ static int dm_table_determine_type(struct dm_table *t)
verify_bio_based:
/* We must use this table as bio-based */
t->type = DM_TYPE_BIO_BASED;
- if (dm_table_supports_dax(t, PAGE_SIZE) ||
+ if (dm_table_supports_dax(t, device_supports_dax, &page_size) ||
(list_empty(devices) && live_md_type == DM_TYPE_DAX_BIO_BASED)) {
t->type = DM_TYPE_DAX_BIO_BASED;
} else {
@@ -1883,6 +1891,7 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
struct queue_limits *limits)
{
bool wc = false, fua = false;
+ int page_size = PAGE_SIZE;
/*
* Copy table's limits to the DM device's request_queue
@@ -1910,8 +1919,11 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
}
blk_queue_write_cache(q, wc, fua);
- if (dm_table_supports_dax(t, PAGE_SIZE))
+ if (dm_table_supports_dax(t, device_supports_dax, &page_size)) {
blk_queue_flag_set(QUEUE_FLAG_DAX, q);
+ if (dm_table_supports_dax(t, device_synchronous, NULL))
+ set_dax_synchronous(t->md->dax_dev);
+ }
else
blk_queue_flag_clear(QUEUE_FLAG_DAX, q);
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index 7f0840601737..4c68a7b93d5e 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -2046,16 +2046,19 @@ int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd,
int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd)
{
- int r;
+ int r = -EINVAL;
struct dm_block *sblock;
struct thin_disk_superblock *disk_super;
pmd_write_lock(pmd);
+ if (pmd->fail_io)
+ goto out;
+
pmd->flags |= THIN_METADATA_NEEDS_CHECK_FLAG;
r = superblock_lock(pmd, &sblock);
if (r) {
- DMERR("couldn't read superblock");
+ DMERR("couldn't lock superblock");
goto out;
}
diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c
index d8334cd45d7c..9faf3e49c7af 100644
--- a/drivers/md/dm-zoned-metadata.c
+++ b/drivers/md/dm-zoned-metadata.c
@@ -8,6 +8,7 @@
#include <linux/module.h>
#include <linux/crc32.h>
+#include <linux/sched/mm.h>
#define DM_MSG_PREFIX "zoned metadata"
@@ -1162,8 +1163,7 @@ static int dmz_init_zones(struct dmz_metadata *zmd)
while (sector < dev->capacity) {
/* Get zone information */
nr_blkz = DMZ_REPORT_NR_ZONES;
- ret = blkdev_report_zones(dev->bdev, sector, blkz,
- &nr_blkz, GFP_KERNEL);
+ ret = blkdev_report_zones(dev->bdev, sector, blkz, &nr_blkz);
if (ret) {
dmz_dev_err(dev, "Report zones failed %d", ret);
goto out;
@@ -1201,12 +1201,20 @@ out:
static int dmz_update_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
{
unsigned int nr_blkz = 1;
+ unsigned int noio_flag;
struct blk_zone blkz;
int ret;
- /* Get zone information from disk */
+ /*
+ * Get zone information from disk. Since blkdev_report_zones() uses
+ * GFP_KERNEL by default for memory allocations, set the per-task
+ * PF_MEMALLOC_NOIO flag so that all allocations are done as if
+ * GFP_NOIO was specified.
+ */
+ noio_flag = memalloc_noio_save();
ret = blkdev_report_zones(zmd->dev->bdev, dmz_start_sect(zmd, zone),
- &blkz, &nr_blkz, GFP_NOIO);
+ &blkz, &nr_blkz);
+ memalloc_noio_restore(noio_flag);
if (!nr_blkz)
ret = -EIO;
if (ret) {
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 5475081dcbd6..d0beef033e2f 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -441,8 +441,7 @@ static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
}
static int dm_blk_report_zones(struct gendisk *disk, sector_t sector,
- struct blk_zone *zones, unsigned int *nr_zones,
- gfp_t gfp_mask)
+ struct blk_zone *zones, unsigned int *nr_zones)
{
#ifdef CONFIG_BLK_DEV_ZONED
struct mapped_device *md = disk->private_data;
@@ -480,8 +479,7 @@ static int dm_blk_report_zones(struct gendisk *disk, sector_t sector,
* So there is no need to loop here trying to fill the entire array
* of zones.
*/
- ret = tgt->type->report_zones(tgt, sector, zones,
- nr_zones, gfp_mask);
+ ret = tgt->type->report_zones(tgt, sector, zones, nr_zones);
out:
dm_put_live_table(md, srcu_idx);
@@ -1119,7 +1117,7 @@ static bool dm_dax_supported(struct dax_device *dax_dev, struct block_device *bd
if (!map)
return false;
- ret = dm_table_supports_dax(map, blocksize);
+ ret = dm_table_supports_dax(map, device_supports_dax, &blocksize);
dm_put_live_table(md, srcu_idx);
@@ -1991,7 +1989,8 @@ static struct mapped_device *alloc_dev(int minor)
sprintf(md->disk->disk_name, "dm-%d", minor);
if (IS_ENABLED(CONFIG_DAX_DRIVER)) {
- md->dax_dev = alloc_dax(md, md->disk->disk_name, &dm_dax_ops);
+ md->dax_dev = alloc_dax(md, md->disk->disk_name,
+ &dm_dax_ops, 0);
if (!md->dax_dev)
goto bad;
}
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 17e3db54404c..0475673337f3 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -72,7 +72,10 @@ bool dm_table_bio_based(struct dm_table *t);
bool dm_table_request_based(struct dm_table *t);
void dm_table_free_md_mempools(struct dm_table *t);
struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t);
-bool dm_table_supports_dax(struct dm_table *t, int blocksize);
+bool dm_table_supports_dax(struct dm_table *t, iterate_devices_callout_fn fn,
+ int *blocksize);
+int device_supports_dax(struct dm_target *ti, struct dm_dev *dev,
+ sector_t start, sector_t len, void *data);
void dm_lock_md_type(struct mapped_device *md);
void dm_unlock_md_type(struct mapped_device *md);
diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index c01d41198f5e..b092c7b5282f 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -1790,6 +1790,8 @@ void md_bitmap_destroy(struct mddev *mddev)
return;
md_bitmap_wait_behind_writes(mddev);
+ mempool_destroy(mddev->wb_info_pool);
+ mddev->wb_info_pool = NULL;
mutex_lock(&mddev->bitmap_info.mutex);
spin_lock(&mddev->lock);
@@ -1900,10 +1902,14 @@ int md_bitmap_load(struct mddev *mddev)
sector_t start = 0;
sector_t sector = 0;
struct bitmap *bitmap = mddev->bitmap;
+ struct md_rdev *rdev;
if (!bitmap)
goto out;
+ rdev_for_each(rdev, mddev)
+ mddev_create_wb_pool(mddev, rdev, true);
+
if (mddev_is_clustered(mddev))
md_cluster_ops->load_bitmaps(mddev, mddev->bitmap_info.nodes);
@@ -2462,12 +2468,26 @@ static ssize_t
backlog_store(struct mddev *mddev, const char *buf, size_t len)
{
unsigned long backlog;
+ unsigned long old_mwb = mddev->bitmap_info.max_write_behind;
int rv = kstrtoul(buf, 10, &backlog);
if (rv)
return rv;
if (backlog > COUNTER_MAX)
return -EINVAL;
mddev->bitmap_info.max_write_behind = backlog;
+ if (!backlog && mddev->wb_info_pool) {
+ /* wb_info_pool is not needed if backlog is zero */
+ mempool_destroy(mddev->wb_info_pool);
+ mddev->wb_info_pool = NULL;
+ } else if (backlog && !mddev->wb_info_pool) {
+ /* wb_info_pool is needed since backlog is not zero */
+ struct md_rdev *rdev;
+
+ rdev_for_each(rdev, mddev)
+ mddev_create_wb_pool(mddev, rdev, false);
+ }
+ if (old_mwb != backlog)
+ md_bitmap_update_sb(mddev->bitmap);
return len;
}
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 9801d540fea1..24638ccedce4 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -37,6 +37,7 @@
*/
+#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/kthread.h>
#include <linux/blkdev.h>
@@ -124,6 +125,77 @@ static inline int speed_max(struct mddev *mddev)
mddev->sync_speed_max : sysctl_speed_limit_max;
}
+static int rdev_init_wb(struct md_rdev *rdev)
+{
+ if (rdev->bdev->bd_queue->nr_hw_queues == 1)
+ return 0;
+
+ spin_lock_init(&rdev->wb_list_lock);
+ INIT_LIST_HEAD(&rdev->wb_list);
+ init_waitqueue_head(&rdev->wb_io_wait);
+ set_bit(WBCollisionCheck, &rdev->flags);
+
+ return 1;
+}
+
+/*
+ * Create wb_info_pool if rdev is the first multi-queue device flaged
+ * with writemostly, also write-behind mode is enabled.
+ */
+void mddev_create_wb_pool(struct mddev *mddev, struct md_rdev *rdev,
+ bool is_suspend)
+{
+ if (mddev->bitmap_info.max_write_behind == 0)
+ return;
+
+ if (!test_bit(WriteMostly, &rdev->flags) || !rdev_init_wb(rdev))
+ return;
+
+ if (mddev->wb_info_pool == NULL) {
+ unsigned int noio_flag;
+
+ if (!is_suspend)
+ mddev_suspend(mddev);
+ noio_flag = memalloc_noio_save();
+ mddev->wb_info_pool = mempool_create_kmalloc_pool(NR_WB_INFOS,
+ sizeof(struct wb_info));
+ memalloc_noio_restore(noio_flag);
+ if (!mddev->wb_info_pool)
+ pr_err("can't alloc memory pool for writemostly\n");
+ if (!is_suspend)
+ mddev_resume(mddev);
+ }
+}
+EXPORT_SYMBOL_GPL(mddev_create_wb_pool);
+
+/*
+ * destroy wb_info_pool if rdev is the last device flaged with WBCollisionCheck.
+ */
+static void mddev_destroy_wb_pool(struct mddev *mddev, struct md_rdev *rdev)
+{
+ if (!test_and_clear_bit(WBCollisionCheck, &rdev->flags))
+ return;
+
+ if (mddev->wb_info_pool) {
+ struct md_rdev *temp;
+ int num = 0;
+
+ /*
+ * Check if other rdevs need wb_info_pool.
+ */
+ rdev_for_each(temp, mddev)
+ if (temp != rdev &&
+ test_bit(WBCollisionCheck, &temp->flags))
+ num++;
+ if (!num) {
+ mddev_suspend(rdev->mddev);
+ mempool_destroy(mddev->wb_info_pool);
+ mddev->wb_info_pool = NULL;
+ mddev_resume(rdev->mddev);
+ }
+ }
+}
+
static struct ctl_table_header *raid_table_header;
static struct ctl_table raid_table[] = {
@@ -2210,6 +2282,9 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
rdev->mddev = mddev;
pr_debug("md: bind<%s>\n", b);
+ if (mddev->raid_disks)
+ mddev_create_wb_pool(mddev, rdev, false);
+
if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b)))
goto fail;
@@ -2246,6 +2321,7 @@ static void unbind_rdev_from_array(struct md_rdev *rdev)
bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk);
list_del_rcu(&rdev->same_set);
pr_debug("md: unbind<%s>\n", bdevname(rdev->bdev,b));
+ mddev_destroy_wb_pool(rdev->mddev, rdev);
rdev->mddev = NULL;
sysfs_remove_link(&rdev->kobj, "block");
sysfs_put(rdev->sysfs_state);
@@ -2758,8 +2834,10 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
}
} else if (cmd_match(buf, "writemostly")) {
set_bit(WriteMostly, &rdev->flags);
+ mddev_create_wb_pool(rdev->mddev, rdev, false);
err = 0;
} else if (cmd_match(buf, "-writemostly")) {
+ mddev_destroy_wb_pool(rdev->mddev, rdev);
clear_bit(WriteMostly, &rdev->flags);
err = 0;
} else if (cmd_match(buf, "blocked")) {
@@ -3356,7 +3434,7 @@ rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
if (!entry->show)
return -EIO;
if (!rdev->mddev)
- return -EBUSY;
+ return -ENODEV;
return entry->show(rdev, page);
}
@@ -5238,7 +5316,8 @@ int mddev_init_writes_pending(struct mddev *mddev)
{
if (mddev->writes_pending.percpu_count_ptr)
return 0;
- if (percpu_ref_init(&mddev->writes_pending, no_op, 0, GFP_KERNEL) < 0)
+ if (percpu_ref_init(&mddev->writes_pending, no_op,
+ PERCPU_REF_ALLOW_REINIT, GFP_KERNEL) < 0)
return -ENOMEM;
/* We want to start with the refcount at zero */
percpu_ref_put(&mddev->writes_pending);
@@ -5588,15 +5667,28 @@ int md_run(struct mddev *mddev)
mddev->bitmap = bitmap;
}
- if (err) {
- mddev_detach(mddev);
- if (mddev->private)
- pers->free(mddev, mddev->private);
- mddev->private = NULL;
- module_put(pers->owner);
- md_bitmap_destroy(mddev);
- goto abort;
+ if (err)
+ goto bitmap_abort;
+
+ if (mddev->bitmap_info.max_write_behind > 0) {
+ bool creat_pool = false;
+
+ rdev_for_each(rdev, mddev) {
+ if (test_bit(WriteMostly, &rdev->flags) &&
+ rdev_init_wb(rdev))
+ creat_pool = true;
+ }
+ if (creat_pool && mddev->wb_info_pool == NULL) {
+ mddev->wb_info_pool =
+ mempool_create_kmalloc_pool(NR_WB_INFOS,
+ sizeof(struct wb_info));
+ if (!mddev->wb_info_pool) {
+ err = -ENOMEM;
+ goto bitmap_abort;
+ }
+ }
}
+
if (mddev->queue) {
bool nonrot = true;
@@ -5639,8 +5731,7 @@ int md_run(struct mddev *mddev)
spin_unlock(&mddev->lock);
rdev_for_each(rdev, mddev)
if (rdev->raid_disk >= 0)
- if (sysfs_link_rdev(mddev, rdev))
- /* failure here is OK */;
+ sysfs_link_rdev(mddev, rdev); /* failure here is OK */
if (mddev->degraded && !mddev->ro)
/* This ensures that recovering status is reported immediately
@@ -5658,6 +5749,13 @@ int md_run(struct mddev *mddev)
sysfs_notify(&mddev->kobj, NULL, "degraded");
return 0;
+bitmap_abort:
+ mddev_detach(mddev);
+ if (mddev->private)
+ pers->free(mddev, mddev->private);
+ mddev->private = NULL;
+ module_put(pers->owner);
+ md_bitmap_destroy(mddev);
abort:
bioset_exit(&mddev->bio_set);
bioset_exit(&mddev->sync_set);
@@ -5826,6 +5924,8 @@ static void __md_stop_writes(struct mddev *mddev)
mddev->in_sync = 1;
md_update_sb(mddev, 1);
}
+ mempool_destroy(mddev->wb_info_pool);
+ mddev->wb_info_pool = NULL;
}
void md_stop_writes(struct mddev *mddev)
@@ -8198,8 +8298,7 @@ void md_do_sync(struct md_thread *thread)
{
struct mddev *mddev = thread->mddev;
struct mddev *mddev2;
- unsigned int currspeed = 0,
- window;
+ unsigned int currspeed = 0, window;
sector_t max_sectors,j, io_sectors, recovery_done;
unsigned long mark[SYNC_MARKS];
unsigned long update_time;
@@ -8256,7 +8355,7 @@ void md_do_sync(struct md_thread *thread)
* 0 == not engaged in resync at all
* 2 == checking that there is no conflict with another sync
* 1 == like 2, but have yielded to allow conflicting resync to
- * commense
+ * commence
* other == active in resync - this many blocks
*
* Before starting a resync we must have set curr_resync to
@@ -8387,7 +8486,7 @@ void md_do_sync(struct md_thread *thread)
/*
* Tune reconstruction:
*/
- window = 32*(PAGE_SIZE/512);
+ window = 32 * (PAGE_SIZE / 512);
pr_debug("md: using %dk window, over a total of %lluk.\n",
window/2, (unsigned long long)max_sectors/2);
@@ -9200,7 +9299,6 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
* perform resync with the new activated disk */
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread);
-
}
/* device faulty
* We just want to do the minimum to mark the disk
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 7c930c091193..10f98200e2f8 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -109,6 +109,14 @@ struct md_rdev {
* for reporting to userspace and storing
* in superblock.
*/
+
+ /*
+ * The members for check collision of write behind IOs.
+ */
+ struct list_head wb_list;
+ spinlock_t wb_list_lock;
+ wait_queue_head_t wb_io_wait;
+
struct work_struct del_work; /* used for delayed sysfs removal */
struct kernfs_node *sysfs_state; /* handle for 'state'
@@ -193,6 +201,10 @@ enum flag_bits {
* it didn't fail, so don't use FailFast
* any more for metadata
*/
+ WBCollisionCheck, /*
+ * multiqueue device should check if there
+ * is collision between write behind bios.
+ */
};
static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors,
@@ -245,6 +257,14 @@ enum mddev_sb_flags {
MD_SB_NEED_REWRITE, /* metadata write needs to be repeated */
};
+#define NR_WB_INFOS 8
+/* record current range of write behind IOs */
+struct wb_info {
+ sector_t lo;
+ sector_t hi;
+ struct list_head list;
+};
+
struct mddev {
void *private;
struct md_personality *pers;
@@ -461,6 +481,7 @@ struct mddev {
*/
struct work_struct flush_work;
struct work_struct event_work; /* used by dm to report failure event */
+ mempool_t *wb_info_pool;
void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);
struct md_cluster_info *cluster_info;
unsigned int good_device_nr; /* good device num within cluster raid */
@@ -709,6 +730,8 @@ extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
extern void md_reload_sb(struct mddev *mddev, int raid_disk);
extern void md_update_sb(struct mddev *mddev, int force);
extern void md_kick_rdev_from_array(struct md_rdev * rdev);
+extern void mddev_create_wb_pool(struct mddev *mddev, struct md_rdev *rdev,
+ bool is_suspend);
struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr);
struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev);
diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c
index 400001b815db..54db34163968 100644
--- a/drivers/md/raid1-10.c
+++ b/drivers/md/raid1-10.c
@@ -3,12 +3,42 @@
#define RESYNC_BLOCK_SIZE (64*1024)
#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
+/*
+ * Number of guaranteed raid bios in case of extreme VM load:
+ */
+#define NR_RAID_BIOS 256
+
+/* when we get a read error on a read-only array, we redirect to another
+ * device without failing the first device, or trying to over-write to
+ * correct the read error. To keep track of bad blocks on a per-bio
+ * level, we store IO_BLOCKED in the appropriate 'bios' pointer
+ */
+#define IO_BLOCKED ((struct bio *)1)
+/* When we successfully write to a known bad-block, we need to remove the
+ * bad-block marking which must be done from process context. So we record
+ * the success by setting devs[n].bio to IO_MADE_GOOD
+ */
+#define IO_MADE_GOOD ((struct bio *)2)
+
+#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
+
+/* When there are this many requests queue to be written by
+ * the raid thread, we become 'congested' to provide back-pressure
+ * for writeback.
+ */
+static int max_queued_requests = 1024;
+
/* for managing resync I/O pages */
struct resync_pages {
void *raid_bio;
struct page *pages[RESYNC_PAGES];
};
+static void rbio_pool_free(void *rbio, void *data)
+{
+ kfree(rbio);
+}
+
static inline int resync_alloc_pages(struct resync_pages *rp,
gfp_t gfp_flags)
{
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 2aa36e570e04..34e26834ad28 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -42,31 +42,6 @@
(1L << MD_HAS_PPL) | \
(1L << MD_HAS_MULTIPLE_PPLS))
-/*
- * Number of guaranteed r1bios in case of extreme VM load:
- */
-#define NR_RAID1_BIOS 256
-
-/* when we get a read error on a read-only array, we redirect to another
- * device without failing the first device, or trying to over-write to
- * correct the read error. To keep track of bad blocks on a per-bio
- * level, we store IO_BLOCKED in the appropriate 'bios' pointer
- */
-#define IO_BLOCKED ((struct bio *)1)
-/* When we successfully write to a known bad-block, we need to remove the
- * bad-block marking which must be done from process context. So we record
- * the success by setting devs[n].bio to IO_MADE_GOOD
- */
-#define IO_MADE_GOOD ((struct bio *)2)
-
-#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
-
-/* When there are this many requests queue to be written by
- * the raid1 thread, we become 'congested' to provide back-pressure
- * for writeback.
- */
-static int max_queued_requests = 1024;
-
static void allow_barrier(struct r1conf *conf, sector_t sector_nr);
static void lower_barrier(struct r1conf *conf, sector_t sector_nr);
@@ -75,6 +50,57 @@ static void lower_barrier(struct r1conf *conf, sector_t sector_nr);
#include "raid1-10.c"
+static int check_and_add_wb(struct md_rdev *rdev, sector_t lo, sector_t hi)
+{
+ struct wb_info *wi, *temp_wi;
+ unsigned long flags;
+ int ret = 0;
+ struct mddev *mddev = rdev->mddev;
+
+ wi = mempool_alloc(mddev->wb_info_pool, GFP_NOIO);
+
+ spin_lock_irqsave(&rdev->wb_list_lock, flags);
+ list_for_each_entry(temp_wi, &rdev->wb_list, list) {
+ /* collision happened */
+ if (hi > temp_wi->lo && lo < temp_wi->hi) {
+ ret = -EBUSY;
+ break;
+ }
+ }
+
+ if (!ret) {
+ wi->lo = lo;
+ wi->hi = hi;
+ list_add(&wi->list, &rdev->wb_list);
+ } else
+ mempool_free(wi, mddev->wb_info_pool);
+ spin_unlock_irqrestore(&rdev->wb_list_lock, flags);
+
+ return ret;
+}
+
+static void remove_wb(struct md_rdev *rdev, sector_t lo, sector_t hi)
+{
+ struct wb_info *wi;
+ unsigned long flags;
+ int found = 0;
+ struct mddev *mddev = rdev->mddev;
+
+ spin_lock_irqsave(&rdev->wb_list_lock, flags);
+ list_for_each_entry(wi, &rdev->wb_list, list)
+ if (hi == wi->hi && lo == wi->lo) {
+ list_del(&wi->list);
+ mempool_free(wi, mddev->wb_info_pool);
+ found = 1;
+ break;
+ }
+
+ if (!found)
+ WARN(1, "The write behind IO is not recorded\n");
+ spin_unlock_irqrestore(&rdev->wb_list_lock, flags);
+ wake_up(&rdev->wb_io_wait);
+}
+
/*
* for resync bio, r1bio pointer can be retrieved from the per-bio
* 'struct resync_pages'.
@@ -93,11 +119,6 @@ static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
return kzalloc(size, gfp_flags);
}
-static void r1bio_pool_free(void *r1_bio, void *data)
-{
- kfree(r1_bio);
-}
-
#define RESYNC_DEPTH 32
#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
#define RESYNC_WINDOW (RESYNC_BLOCK_SIZE * RESYNC_DEPTH)
@@ -173,7 +194,7 @@ out_free_bio:
kfree(rps);
out_free_r1bio:
- r1bio_pool_free(r1_bio, data);
+ rbio_pool_free(r1_bio, data);
return NULL;
}
@@ -193,7 +214,7 @@ static void r1buf_pool_free(void *__r1_bio, void *data)
/* resync pages array stored in the 1st bio's .bi_private */
kfree(rp);
- r1bio_pool_free(r1bio, data);
+ rbio_pool_free(r1bio, data);
}
static void put_all_bios(struct r1conf *conf, struct r1bio *r1_bio)
@@ -476,6 +497,12 @@ static void raid1_end_write_request(struct bio *bio)
}
if (behind) {
+ if (test_bit(WBCollisionCheck, &rdev->flags)) {
+ sector_t lo = r1_bio->sector;
+ sector_t hi = r1_bio->sector + r1_bio->sectors;
+
+ remove_wb(rdev, lo, hi);
+ }
if (test_bit(WriteMostly, &rdev->flags))
atomic_dec(&r1_bio->behind_remaining);
@@ -1449,7 +1476,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
if (!r1_bio->bios[i])
continue;
-
if (first_clone) {
/* do behind I/O ?
* Not if there are too many, or cannot
@@ -1474,7 +1500,16 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
if (r1_bio->behind_master_bio) {
- if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags))
+ struct md_rdev *rdev = conf->mirrors[i].rdev;
+
+ if (test_bit(WBCollisionCheck, &rdev->flags)) {
+ sector_t lo = r1_bio->sector;
+ sector_t hi = r1_bio->sector + r1_bio->sectors;
+
+ wait_event(rdev->wb_io_wait,
+ check_and_add_wb(rdev, lo, hi) == 0);
+ }
+ if (test_bit(WriteMostly, &rdev->flags))
atomic_inc(&r1_bio->behind_remaining);
}
@@ -1729,9 +1764,8 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
first = last = rdev->saved_raid_disk;
for (mirror = first; mirror <= last; mirror++) {
- p = conf->mirrors+mirror;
+ p = conf->mirrors + mirror;
if (!p->rdev) {
-
if (mddev->gendisk)
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
@@ -2888,7 +2922,6 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
if (read_targets == 1)
bio->bi_opf &= ~MD_FAILFAST;
generic_make_request(bio);
-
}
return nr_sectors;
}
@@ -2947,8 +2980,8 @@ static struct r1conf *setup_conf(struct mddev *mddev)
if (!conf->poolinfo)
goto abort;
conf->poolinfo->raid_disks = mddev->raid_disks * 2;
- err = mempool_init(&conf->r1bio_pool, NR_RAID1_BIOS, r1bio_pool_alloc,
- r1bio_pool_free, conf->poolinfo);
+ err = mempool_init(&conf->r1bio_pool, NR_RAID_BIOS, r1bio_pool_alloc,
+ rbio_pool_free, conf->poolinfo);
if (err)
goto abort;
@@ -3089,7 +3122,7 @@ static int raid1_run(struct mddev *mddev)
}
mddev->degraded = 0;
- for (i=0; i < conf->raid_disks; i++)
+ for (i = 0; i < conf->raid_disks; i++)
if (conf->mirrors[i].rdev == NULL ||
!test_bit(In_sync, &conf->mirrors[i].rdev->flags) ||
test_bit(Faulty, &conf->mirrors[i].rdev->flags))
@@ -3124,7 +3157,7 @@ static int raid1_run(struct mddev *mddev)
mddev->queue);
}
- ret = md_integrity_register(mddev);
+ ret = md_integrity_register(mddev);
if (ret) {
md_unregister_thread(&mddev->thread);
raid1_free(mddev, conf);
@@ -3232,8 +3265,8 @@ static int raid1_reshape(struct mddev *mddev)
newpoolinfo->mddev = mddev;
newpoolinfo->raid_disks = raid_disks * 2;
- ret = mempool_init(&newpool, NR_RAID1_BIOS, r1bio_pool_alloc,
- r1bio_pool_free, newpoolinfo);
+ ret = mempool_init(&newpool, NR_RAID_BIOS, r1bio_pool_alloc,
+ rbio_pool_free, newpoolinfo);
if (ret) {
kfree(newpoolinfo);
return ret;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index aea11476fee6..8a1354a08a1a 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -64,31 +64,6 @@
* [B A] [D C] [B A] [E C D]
*/
-/*
- * Number of guaranteed r10bios in case of extreme VM load:
- */
-#define NR_RAID10_BIOS 256
-
-/* when we get a read error on a read-only array, we redirect to another
- * device without failing the first device, or trying to over-write to
- * correct the read error. To keep track of bad blocks on a per-bio
- * level, we store IO_BLOCKED in the appropriate 'bios' pointer
- */
-#define IO_BLOCKED ((struct bio *)1)
-/* When we successfully write to a known bad-block, we need to remove the
- * bad-block marking which must be done from process context. So we record
- * the success by setting devs[n].bio to IO_MADE_GOOD
- */
-#define IO_MADE_GOOD ((struct bio *)2)
-
-#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
-
-/* When there are this many requests queued to be written by
- * the raid10 thread, we become 'congested' to provide back-pressure
- * for writeback.
- */
-static int max_queued_requests = 1024;
-
static void allow_barrier(struct r10conf *conf);
static void lower_barrier(struct r10conf *conf);
static int _enough(struct r10conf *conf, int previous, int ignore);
@@ -123,11 +98,6 @@ static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
return kzalloc(size, gfp_flags);
}
-static void r10bio_pool_free(void *r10_bio, void *data)
-{
- kfree(r10_bio);
-}
-
#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
/* amount of memory to reserve for resync requests */
#define RESYNC_WINDOW (1024*1024)
@@ -233,7 +203,7 @@ out_free_bio:
}
kfree(rps);
out_free_r10bio:
- r10bio_pool_free(r10_bio, conf);
+ rbio_pool_free(r10_bio, conf);
return NULL;
}
@@ -261,7 +231,7 @@ static void r10buf_pool_free(void *__r10_bio, void *data)
/* resync pages array stored in the 1st bio's .bi_private */
kfree(rp);
- r10bio_pool_free(r10bio, conf);
+ rbio_pool_free(r10bio, conf);
}
static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio)
@@ -737,15 +707,19 @@ static struct md_rdev *read_balance(struct r10conf *conf,
int sectors = r10_bio->sectors;
int best_good_sectors;
sector_t new_distance, best_dist;
- struct md_rdev *best_rdev, *rdev = NULL;
+ struct md_rdev *best_dist_rdev, *best_pending_rdev, *rdev = NULL;
int do_balance;
- int best_slot;
+ int best_dist_slot, best_pending_slot;
+ bool has_nonrot_disk = false;
+ unsigned int min_pending;
struct geom *geo = &conf->geo;
raid10_find_phys(conf, r10_bio);
rcu_read_lock();
- best_slot = -1;
- best_rdev = NULL;
+ best_dist_slot = -1;
+ min_pending = UINT_MAX;
+ best_dist_rdev = NULL;
+ best_pending_rdev = NULL;
best_dist = MaxSector;
best_good_sectors = 0;
do_balance = 1;
@@ -767,6 +741,8 @@ static struct md_rdev *read_balance(struct r10conf *conf,
sector_t first_bad;
int bad_sectors;
sector_t dev_sector;
+ unsigned int pending;
+ bool nonrot;
if (r10_bio->devs[slot].bio == IO_BLOCKED)
continue;
@@ -803,8 +779,8 @@ static struct md_rdev *read_balance(struct r10conf *conf,
first_bad - dev_sector;
if (good_sectors > best_good_sectors) {
best_good_sectors = good_sectors;
- best_slot = slot;
- best_rdev = rdev;
+ best_dist_slot = slot;
+ best_dist_rdev = rdev;
}
if (!do_balance)
/* Must read from here */
@@ -817,14 +793,23 @@ static struct md_rdev *read_balance(struct r10conf *conf,
if (!do_balance)
break;
- if (best_slot >= 0)
+ nonrot = blk_queue_nonrot(bdev_get_queue(rdev->bdev));
+ has_nonrot_disk |= nonrot;
+ pending = atomic_read(&rdev->nr_pending);
+ if (min_pending > pending && nonrot) {
+ min_pending = pending;
+ best_pending_slot = slot;
+ best_pending_rdev = rdev;
+ }
+
+ if (best_dist_slot >= 0)
/* At least 2 disks to choose from so failfast is OK */
set_bit(R10BIO_FailFast, &r10_bio->state);
/* This optimisation is debatable, and completely destroys
* sequential read speed for 'far copies' arrays. So only
* keep it for 'near' arrays, and review those later.
*/
- if (geo->near_copies > 1 && !atomic_read(&rdev->nr_pending))
+ if (geo->near_copies > 1 && !pending)
new_distance = 0;
/* for far > 1 always use the lowest address */
@@ -833,15 +818,21 @@ static struct md_rdev *read_balance(struct r10conf *conf,
else
new_distance = abs(r10_bio->devs[slot].addr -
conf->mirrors[disk].head_position);
+
if (new_distance < best_dist) {
best_dist = new_distance;
- best_slot = slot;
- best_rdev = rdev;
+ best_dist_slot = slot;
+ best_dist_rdev = rdev;
}
}
if (slot >= conf->copies) {
- slot = best_slot;
- rdev = best_rdev;
+ if (has_nonrot_disk) {
+ slot = best_pending_slot;
+ rdev = best_pending_rdev;
+ } else {
+ slot = best_dist_slot;
+ rdev = best_dist_rdev;
+ }
}
if (slot >= 0) {
@@ -3675,8 +3666,8 @@ static struct r10conf *setup_conf(struct mddev *mddev)
conf->geo = geo;
conf->copies = copies;
- err = mempool_init(&conf->r10bio_pool, NR_RAID10_BIOS, r10bio_pool_alloc,
- r10bio_pool_free, conf);
+ err = mempool_init(&conf->r10bio_pool, NR_RAID_BIOS, r10bio_pool_alloc,
+ rbio_pool_free, conf);
if (err)
goto out;
@@ -4780,8 +4771,7 @@ static int handle_reshape_read_error(struct mddev *mddev,
int idx = 0;
struct page **pages;
- r10b = kmalloc(sizeof(*r10b) +
- sizeof(struct r10dev) * conf->copies, GFP_NOIO);
+ r10b = kmalloc(struct_size(r10b, devs, conf->copies), GFP_NOIO);
if (!r10b) {
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
return -ENOMEM;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index b83bce2beb66..3de4e13bde98 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5251,7 +5251,6 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
rcu_read_unlock();
raid_bio->bi_next = (void*)rdev;
bio_set_dev(align_bi, rdev->bdev);
- bio_clear_flag(align_bi, BIO_SEG_VALID);
if (is_badblock(rdev, align_bi->bi_iter.bi_sector,
bio_sectors(align_bi),
@@ -7672,7 +7671,7 @@ abort:
static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
{
struct r5conf *conf = mddev->private;
- int err = -EEXIST;
+ int ret, err = -EEXIST;
int disk;
struct disk_info *p;
int first = 0;
@@ -7687,7 +7686,14 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
* The array is in readonly mode if journal is missing, so no
* write requests running. We should be safe
*/
- log_init(conf, rdev, false);
+ ret = log_init(conf, rdev, false);
+ if (ret)
+ return ret;
+
+ ret = r5l_start(conf->log);
+ if (ret)
+ return ret;
+
return 0;
}
if (mddev->recovery_disabled == conf->recovery_disabled)