From 280249b9d9b9a62562ddeb5429a7d29d2f03ba1c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 16 Jan 2021 15:40:33 -0500 Subject: bcachefs: Correctly order flushes and journal writes on multi device filesystems All writes prior to a journal write need to be flushed before the journal write itself happens. On single-device filesystems, it suffices to mark the write with REQ_PREFLUSH|REQ_FUA, but on multi-device filesystems we need to issue flushes to every device - and wait for them to complete - before issuing the journal writes. Previously, we were issuing flushes to every device, but we weren't waiting for them to complete before issuing the journal writes. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/io.c | 3 -- fs/bcachefs/journal.c | 1 + fs/bcachefs/journal.h | 5 --- fs/bcachefs/journal_io.c | 99 ++++++++++++++++++++++++++++----------------- fs/bcachefs/journal_types.h | 1 + 5 files changed, 65 insertions(+), 44 deletions(-) (limited to 'fs/bcachefs') diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index bc1e2dc04850..8a4d05eee381 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -509,9 +509,6 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, n->submit_time = local_clock(); n->bio.bi_iter.bi_sector = ptr->offset; - if (!journal_flushes_device(ca)) - n->bio.bi_opf |= REQ_FUA; - if (likely(n->have_ioref)) { this_cpu_add(ca->io_done->sectors[WRITE][type], bio_sectors(&n->bio)); diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index e90fe042302f..6f84a5dd06bc 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -81,6 +81,7 @@ static void bch2_journal_buf_init(struct journal *j) bkey_extent_init(&buf->key); buf->noflush = false; buf->must_flush = false; + buf->separate_flush = false; memset(buf->has_inode, 0, sizeof(buf->has_inode)); diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index df353a18011b..547c735ce3cb 100644 --- a/fs/bcachefs/journal.h +++
b/fs/bcachefs/journal.h @@ -496,11 +496,6 @@ static inline int bch2_journal_error(struct journal *j) struct bch_dev; -static inline bool journal_flushes_device(struct bch_dev *ca) -{ - return true; -} - static inline void bch2_journal_set_replay_done(struct journal *j) { BUG_ON(!test_bit(JOURNAL_STARTED, &j->flags)); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index f6c9681badea..40da18d778a3 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1188,6 +1188,51 @@ static void journal_write_endio(struct bio *bio) percpu_ref_put(&ca->io_ref); } +static void do_journal_write(struct closure *cl) +{ + struct journal *j = container_of(cl, struct journal, io); + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct bch_dev *ca; + struct journal_buf *w = journal_last_unwritten_buf(j); + struct bch_extent_ptr *ptr; + struct bio *bio; + unsigned sectors = vstruct_sectors(w->data, c->block_bits); + + extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) { + ca = bch_dev_bkey_exists(c, ptr->dev); + if (!percpu_ref_tryget(&ca->io_ref)) { + /* XXX: fix this */ + bch_err(c, "missing device for journal write\n"); + continue; + } + + this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal], + sectors); + + bio = ca->journal.bio; + bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META); + bio->bi_iter.bi_sector = ptr->offset; + bio->bi_end_io = journal_write_endio; + bio->bi_private = ca; + + if (!JSET_NO_FLUSH(w->data)) + bio->bi_opf |= REQ_FUA; + if (!JSET_NO_FLUSH(w->data) && !w->separate_flush) + bio->bi_opf |= REQ_PREFLUSH; + + bch2_bio_map(bio, w->data, sectors << 9); + + trace_journal_write(bio); + closure_bio_submit(bio, cl); + + ca->journal.bucket_seq[ca->journal.cur_idx] = + le64_to_cpu(w->data->seq); + } + + continue_at(cl, journal_write_done, system_highpri_wq); + return; +} + void bch2_journal_write(struct closure *cl) { struct journal *j = container_of(cl, struct journal, io); @@ -1197,9 +1242,8 @@ void 
bch2_journal_write(struct closure *cl) struct jset_entry *start, *end; struct jset *jset; struct bio *bio; - struct bch_extent_ptr *ptr; bool validate_before_checksum = false; - unsigned i, sectors, bytes, u64s; + unsigned i, sectors, bytes, u64s, nr_rw_members = 0; int ret; BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); @@ -1329,45 +1373,28 @@ retry_alloc: if (c->opts.nochanges) goto no_io; - extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) { - ca = bch_dev_bkey_exists(c, ptr->dev); - if (!percpu_ref_tryget(&ca->io_ref)) { - /* XXX: fix this */ - bch_err(c, "missing device for journal write\n"); - continue; - } - - this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal], - sectors); + for_each_rw_member(ca, c, i) + nr_rw_members++; - bio = ca->journal.bio; - bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META); - bio->bi_iter.bi_sector = ptr->offset; - bio->bi_end_io = journal_write_endio; - bio->bi_private = ca; - if (!JSET_NO_FLUSH(jset)) - bio->bi_opf |= REQ_PREFLUSH|REQ_FUA; - bch2_bio_map(bio, jset, sectors << 9); + if (nr_rw_members > 1) + w->separate_flush = true; - trace_journal_write(bio); - closure_bio_submit(bio, cl); + if (!JSET_NO_FLUSH(jset) && w->separate_flush) { + for_each_rw_member(ca, c, i) { + percpu_ref_get(&ca->io_ref); - ca->journal.bucket_seq[ca->journal.cur_idx] = le64_to_cpu(jset->seq); + bio = ca->journal.bio; + bio_reset(bio, ca->disk_sb.bdev, REQ_OP_FLUSH); + bio->bi_end_io = journal_write_endio; + bio->bi_private = ca; + closure_bio_submit(bio, cl); + } } - if (!JSET_NO_FLUSH(jset)) { - for_each_rw_member(ca, c, i) - if (journal_flushes_device(ca) && - !bch2_bkey_has_device(bkey_i_to_s_c(&w->key), i)) { - percpu_ref_get(&ca->io_ref); - - bio = ca->journal.bio; - bio_reset(bio, ca->disk_sb.bdev, REQ_OP_FLUSH); - bio->bi_end_io = journal_write_endio; - bio->bi_private = ca; - closure_bio_submit(bio, cl); - } - } + bch2_bucket_seq_cleanup(c); + + continue_at(cl, do_journal_write, system_highpri_wq); + return; no_io: 
bch2_bucket_seq_cleanup(c); diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 150e691d5317..8ad10e46dd5d 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -31,6 +31,7 @@ struct journal_buf { unsigned u64s_reserved; bool noflush; /* write has already been kicked off, and was noflush */ bool must_flush; /* something wants a flush */ + bool separate_flush; /* bloom filter: */ unsigned long has_inode[1024 / sizeof(unsigned long)]; }; -- cgit v1.2.3-70-g09d2