From 5fa89fb9a86bcc0f0b3f21ab6087a8a4170dcd2c Mon Sep 17 00:00:00 2001 From: Michael Lyle Date: Fri, 13 Oct 2017 16:35:35 -0700 Subject: bcache: don't write back data if reading it failed If an IO operation fails, and we didn't successfully read data from the cache, don't writeback invalid/partial data to the backing disk. Signed-off-by: Michael Lyle Reviewed-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/writeback.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) (limited to 'drivers/md/bcache/writeback.c') diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index e663ca082183..5e65a392287d 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -179,13 +179,21 @@ static void write_dirty(struct closure *cl) struct dirty_io *io = container_of(cl, struct dirty_io, cl); struct keybuf_key *w = io->bio.bi_private; - dirty_init(w); - bio_set_op_attrs(&io->bio, REQ_OP_WRITE, 0); - io->bio.bi_iter.bi_sector = KEY_START(&w->key); - bio_set_dev(&io->bio, io->dc->bdev); - io->bio.bi_end_io = dirty_endio; + /* + * IO errors are signalled using the dirty bit on the key. + * If we failed to read, we should not attempt to write to the + * backing device. Instead, immediately go to write_dirty_finish + * to clean up. + */ + if (KEY_DIRTY(&w->key)) { + dirty_init(w); + bio_set_op_attrs(&io->bio, REQ_OP_WRITE, 0); + io->bio.bi_iter.bi_sector = KEY_START(&w->key); + bio_set_dev(&io->bio, io->dc->bdev); + io->bio.bi_end_io = dirty_endio; - closure_bio_submit(&io->bio, cl); + closure_bio_submit(&io->bio, cl); + } continue_at(cl, write_dirty_finish, io->dc->writeback_write_wq); } -- cgit v1.2.3-70-g09d2 From 1d316e658374f700fdfff9299c70ce65d8d145e6 Mon Sep 17 00:00:00 2001 From: Michael Lyle Date: Fri, 13 Oct 2017 16:35:36 -0700 Subject: bcache: implement PI controller for writeback rate bcache uses a control system to attempt to keep the amount of dirty data in cache at a user-configured level, while not responding excessively to transients and variations in write rate. Previously, the system was a PD controller; but the output from it was integrated, turning the Proportional term into an Integral term, and turning the Derivative term into a crude Proportional term. Performance of the controller has been uneven in production, and it has tended to respond slowly, oscillate, and overshoot. This patch set replaces the current control system with an explicit PI controller and tuning that should be correct for most hardware. By default, it attempts to write at a rate that would retire 1/40th of the current excess blocks per second. An integral term in turn works to remove steady state errors. IMO, this yields benefits in simplicity (removing weighted average filtering, etc) and system performance. Another small change is a tunable parameter is introduced to allow the user to specify a minimum rate at which dirty blocks are retired. There is a slight difference from earlier versions of the patch in integral handling to prevent excessive negative integral windup. Signed-off-by: Michael Lyle Reviewed-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/bcache.h | 9 ++--- drivers/md/bcache/sysfs.c | 18 +++++---- drivers/md/bcache/writeback.c | 91 ++++++++++++++++++++++++------------------- 3 files changed, 66 insertions(+), 52 deletions(-) (limited to 'drivers/md/bcache/writeback.c') diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 2ed9bd231d84..eb83be693d60 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -265,9 +265,6 @@ struct bcache_device { atomic_t *stripe_sectors_dirty; unsigned long *full_dirty_stripes; - unsigned long sectors_dirty_last; - long sectors_dirty_derivative; - struct bio_set *bio_split; unsigned data_csum:1; @@ -362,12 +359,14 @@ struct cached_dev { uint64_t writeback_rate_target; int64_t writeback_rate_proportional; - int64_t writeback_rate_derivative; + int64_t writeback_rate_integral; + int64_t writeback_rate_integral_scaled; int64_t writeback_rate_change; unsigned writeback_rate_update_seconds; - unsigned writeback_rate_d_term; + unsigned writeback_rate_i_term_inverse; unsigned writeback_rate_p_term_inverse; + unsigned writeback_rate_minimum; }; enum alloc_reserve { diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 69f355b9650c..2290bffd4922 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -81,8 +81,9 @@ rw_attribute(writeback_delay); rw_attribute(writeback_rate); rw_attribute(writeback_rate_update_seconds); -rw_attribute(writeback_rate_d_term); +rw_attribute(writeback_rate_i_term_inverse); rw_attribute(writeback_rate_p_term_inverse); +rw_attribute(writeback_rate_minimum); read_attribute(writeback_rate_debug); read_attribute(stripe_size); @@ -130,15 +131,16 @@ SHOW(__bch_cached_dev) sysfs_hprint(writeback_rate, dc->writeback_rate.rate << 9); var_print(writeback_rate_update_seconds); - var_print(writeback_rate_d_term); + var_print(writeback_rate_i_term_inverse); var_print(writeback_rate_p_term_inverse); + var_print(writeback_rate_minimum); if (attr == &sysfs_writeback_rate_debug) { char rate[20]; char dirty[20]; char target[20]; char proportional[20]; - char derivative[20]; + char integral[20]; char change[20]; s64 next_io; @@ -146,7 +148,7 @@ SHOW(__bch_cached_dev) bch_hprint(dirty, bcache_dev_sectors_dirty(&dc->disk) << 9); bch_hprint(target, dc->writeback_rate_target << 9); bch_hprint(proportional,dc->writeback_rate_proportional << 9); - bch_hprint(derivative, dc->writeback_rate_derivative << 9); + bch_hprint(integral, dc->writeback_rate_integral_scaled << 9); bch_hprint(change, dc->writeback_rate_change << 9); next_io = div64_s64(dc->writeback_rate.next - local_clock(), @@ -157,11 +159,11 @@ SHOW(__bch_cached_dev) "dirty:\t\t%s\n" "target:\t\t%s\n" "proportional:\t%s\n" - "derivative:\t%s\n" + "integral:\t%s\n" "change:\t\t%s/sec\n" "next io:\t%llims\n", rate, dirty, target, proportional, - derivative, change, next_io); + integral, change, next_io); } sysfs_hprint(dirty_data, @@ -213,7 +215,7 @@ STORE(__cached_dev) dc->writeback_rate.rate, 1, INT_MAX); d_strtoul_nonzero(writeback_rate_update_seconds); - d_strtoul(writeback_rate_d_term); + d_strtoul(writeback_rate_i_term_inverse); d_strtoul_nonzero(writeback_rate_p_term_inverse); d_strtoi_h(sequential_cutoff); @@ -319,7 +321,7 @@ static struct attribute *bch_cached_dev_files[] = { &sysfs_writeback_percent, &sysfs_writeback_rate, &sysfs_writeback_rate_update_seconds, - &sysfs_writeback_rate_d_term, + &sysfs_writeback_rate_i_term_inverse, &sysfs_writeback_rate_p_term_inverse, &sysfs_writeback_rate_debug, &sysfs_dirty_data, diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 5e65a392287d..cac8678da5d0 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -25,48 +25,62 @@ static void __update_writeback_rate(struct cached_dev *dc) bcache_flash_devs_sectors_dirty(c); uint64_t cache_dirty_target = div_u64(cache_sectors * dc->writeback_percent, 100); - int64_t target = div64_u64(cache_dirty_target * bdev_sectors(dc->bdev), c->cached_dev_sectors); - /* PD controller */ - + /* + * PI controller: + * Figures out the amount that should be written per second. + * + * First, the error (number of sectors that are dirty beyond our + * target) is calculated. The error is accumulated (numerically + * integrated). + * + * Then, the proportional value and integral value are scaled + * based on configured values. These are stored as inverses to + * avoid fixed point math and to make configuration easy-- e.g. + * the default value of 40 for writeback_rate_p_term_inverse + * attempts to write at a rate that would retire all the dirty + * blocks in 40 seconds. + * + * The writeback_rate_i_inverse value of 10000 means that 1/10000th + * of the error is accumulated in the integral term per second. + * This acts as a slow, long-term average that is not subject to + * variations in usage like the p term. + */ int64_t dirty = bcache_dev_sectors_dirty(&dc->disk); - int64_t derivative = dirty - dc->disk.sectors_dirty_last; - int64_t proportional = dirty - target; - int64_t change; - - dc->disk.sectors_dirty_last = dirty; - - /* Scale to sectors per second */ - - proportional *= dc->writeback_rate_update_seconds; - proportional = div_s64(proportional, dc->writeback_rate_p_term_inverse); - - derivative = div_s64(derivative, dc->writeback_rate_update_seconds); - - derivative = ewma_add(dc->disk.sectors_dirty_derivative, derivative, - (dc->writeback_rate_d_term / - dc->writeback_rate_update_seconds) ?: 1, 0); - - derivative *= dc->writeback_rate_d_term; - derivative = div_s64(derivative, dc->writeback_rate_p_term_inverse); - - change = proportional + derivative; + int64_t error = dirty - target; + int64_t proportional_scaled = + div_s64(error, dc->writeback_rate_p_term_inverse); + int64_t integral_scaled, new_rate; + + if ((error < 0 && dc->writeback_rate_integral > 0) || + (error > 0 && time_before64(local_clock(), + dc->writeback_rate.next + NSEC_PER_MSEC))) { + /* + * Only decrease the integral term if it's more than + * zero. Only increase the integral term if the device + * is keeping up. (Don't wind up the integral + * ineffectively in either case). + * + * It's necessary to scale this by + * writeback_rate_update_seconds to keep the integral + * term dimensioned properly. + */ + dc->writeback_rate_integral += error * + dc->writeback_rate_update_seconds; + } - /* Don't increase writeback rate if the device isn't keeping up */ - if (change > 0 && - time_after64(local_clock(), - dc->writeback_rate.next + NSEC_PER_MSEC)) - change = 0; + integral_scaled = div_s64(dc->writeback_rate_integral, + dc->writeback_rate_i_term_inverse); - dc->writeback_rate.rate = - clamp_t(int64_t, (int64_t) dc->writeback_rate.rate + change, - 1, NSEC_PER_MSEC); + new_rate = clamp_t(int64_t, (proportional_scaled + integral_scaled), + dc->writeback_rate_minimum, NSEC_PER_MSEC); - dc->writeback_rate_proportional = proportional; - dc->writeback_rate_derivative = derivative; - dc->writeback_rate_change = change; + dc->writeback_rate_proportional = proportional_scaled; + dc->writeback_rate_integral_scaled = integral_scaled; + dc->writeback_rate_change = new_rate - dc->writeback_rate.rate; + dc->writeback_rate.rate = new_rate; dc->writeback_rate_target = target; } @@ -499,8 +513,6 @@ void bch_sectors_dirty_init(struct bcache_device *d) bch_btree_map_keys(&op.op, d->c, &KEY(op.inode, 0, 0), sectors_dirty_init_fn, 0); - - d->sectors_dirty_last = bcache_dev_sectors_dirty(d); } void bch_cached_dev_writeback_init(struct cached_dev *dc) @@ -514,10 +526,11 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc) dc->writeback_percent = 10; dc->writeback_delay = 30; dc->writeback_rate.rate = 1024; + dc->writeback_rate_minimum = 1; dc->writeback_rate_update_seconds = 5; - dc->writeback_rate_d_term = 30; - dc->writeback_rate_p_term_inverse = 6000; + dc->writeback_rate_p_term_inverse = 40; + dc->writeback_rate_i_term_inverse = 10000; INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate); } -- cgit v1.2.3-70-g09d2 From ae82ddbfeb359fcffa97be5fb5bcd59165f2864f Mon Sep 17 00:00:00 2001 From: Michael Lyle Date: Fri, 13 Oct 2017 16:35:37 -0700 Subject: bcache: smooth writeback rate control This works in conjunction with the new PI controller. Currently, in real-world workloads, the rate controller attempts to write back 1 sector per second. In practice, these minimum-rate writebacks are between 4k and 60k in test scenarios, since bcache aggregates and attempts to do contiguous writes and because filesystems on top of bcachefs typically write 4k or more. Previously, bcache used to guarantee to write at least once per second. This means that the actual writeback rate would exceed the configured amount by a factor of 8-120 or more. This patch adjusts to be willing to sleep up to 2.5 seconds, and to target writing 4k/second. On the smallest writes, it will sleep 1 second like before, but many times it will sleep longer and load the backing device less. This keeps the loading on the cache and backing device related to writeback more consistent when writing back at low rates. Signed-off-by: Michael Lyle Reviewed-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/util.c | 10 ++++++++-- drivers/md/bcache/writeback.c | 2 +- 2 files changed, 9 insertions(+), 3 deletions(-) (limited to 'drivers/md/bcache/writeback.c') diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c index 176d3c2ef5f5..4dbe37e82877 100644 --- a/drivers/md/bcache/util.c +++ b/drivers/md/bcache/util.c @@ -232,8 +232,14 @@ uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done) d->next += div_u64(done * NSEC_PER_SEC, d->rate); - if (time_before64(now + NSEC_PER_SEC, d->next)) - d->next = now + NSEC_PER_SEC; + /* Bound the time. Don't let us fall further than 2 seconds behind + * (this prevents unnecessary backlog that would make it impossible + * to catch up). If we're ahead of the desired writeback rate, + * don't let us sleep more than 2.5 seconds (so we can notice/respond + * if the control system tells us to speed up!). + */ + if (time_before64(now + NSEC_PER_SEC * 5 / 2, d->next)) + d->next = now + NSEC_PER_SEC * 5 / 2; if (time_after64(now - NSEC_PER_SEC * 2, d->next)) d->next = now - NSEC_PER_SEC * 2; diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index cac8678da5d0..8deb721c355e 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -526,7 +526,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc) dc->writeback_percent = 10; dc->writeback_delay = 30; dc->writeback_rate.rate = 1024; - dc->writeback_rate_minimum = 1; + dc->writeback_rate_minimum = 8; dc->writeback_rate_update_seconds = 5; dc->writeback_rate_p_term_inverse = 40; -- cgit v1.2.3-70-g09d2 From e41166c5c44e30dbd620f7c77a27efe5d5cc551a Mon Sep 17 00:00:00 2001 From: Michael Lyle Date: Fri, 13 Oct 2017 16:35:38 -0700 Subject: bcache: writeback rate shouldn't artifically clamp The previous code artificially limited writeback rate to 1000000 blocks/second (NSEC_PER_MSEC), which is a rate that can be met on fast hardware. The rate limiting code works fine (though with decreased precision) up to 3 orders of magnitude faster, so use NSEC_PER_SEC. Additionally, ensure that uint32_t is used as a type for rate throughout the rate management so that type checking/clamp_t can work properly. bch_next_delay should be rewritten for increased precision and better handling of high rates and long sleep periods, but this is adequate for now. Signed-off-by: Michael Lyle Reported-by: Coly Li Reviewed-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/bcache.h | 2 +- drivers/md/bcache/util.h | 4 ++-- drivers/md/bcache/writeback.c | 7 ++++--- 3 files changed, 7 insertions(+), 6 deletions(-) (limited to 'drivers/md/bcache/writeback.c') diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index eb83be693d60..d77c4829c497 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -361,7 +361,7 @@ struct cached_dev { int64_t writeback_rate_proportional; int64_t writeback_rate_integral; int64_t writeback_rate_integral_scaled; - int64_t writeback_rate_change; + int32_t writeback_rate_change; unsigned writeback_rate_update_seconds; unsigned writeback_rate_i_term_inverse; diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index cb8d2ccbb6c6..8f509290bb02 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -441,10 +441,10 @@ struct bch_ratelimit { uint64_t next; /* - * Rate at which we want to do work, in units per nanosecond + * Rate at which we want to do work, in units per second * The units here correspond to the units passed to bch_next_delay() */ - unsigned rate; + uint32_t rate; }; static inline void bch_ratelimit_reset(struct bch_ratelimit *d) diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 8deb721c355e..897d28050656 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -52,7 +52,8 @@ static void __update_writeback_rate(struct cached_dev *dc) int64_t error = dirty - target; int64_t proportional_scaled = div_s64(error, dc->writeback_rate_p_term_inverse); - int64_t integral_scaled, new_rate; + int64_t integral_scaled; + uint32_t new_rate; if ((error < 0 && dc->writeback_rate_integral > 0) || (error > 0 && time_before64(local_clock(), @@ -74,8 +75,8 @@ static void __update_writeback_rate(struct cached_dev *dc) integral_scaled = div_s64(dc->writeback_rate_integral, dc->writeback_rate_i_term_inverse); - new_rate = clamp_t(int64_t, (proportional_scaled + integral_scaled), - dc->writeback_rate_minimum, NSEC_PER_MSEC); + new_rate = clamp_t(int32_t, (proportional_scaled + integral_scaled), + dc->writeback_rate_minimum, NSEC_PER_SEC); dc->writeback_rate_proportional = proportional_scaled; dc->writeback_rate_integral_scaled = integral_scaled; -- cgit v1.2.3-70-g09d2 From a8500fc816b19795756d27c762daa5e19f5e1b6f Mon Sep 17 00:00:00 2001 From: Michael Lyle Date: Fri, 13 Oct 2017 16:35:39 -0700 Subject: bcache: rearrange writeback main thread ratelimit The time spent searching for things to write back "counts" for the actual rate achieved, so don't flush the accumulated rate with each chunk. This will maintain better fidelity to user-commanded rates, but it may slightly increase the burstiness of writeback. The writeback lock needs improvement to help mitigate this. Signed-off-by: Michael Lyle Reviewed-by: Kent Overstreet Signed-off-by: Jens Axboe --- drivers/md/bcache/writeback.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'drivers/md/bcache/writeback.c') diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 897d28050656..9b770b13bdf6 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -440,6 +440,8 @@ static int bch_writeback_thread(void *arg) struct cached_dev *dc = arg; bool searched_full_index; + bch_ratelimit_reset(&dc->writeback_rate); + while (!kthread_should_stop()) { down_write(&dc->writeback_lock); if (!atomic_read(&dc->has_dirty) || @@ -467,7 +469,6 @@ static int bch_writeback_thread(void *arg) up_write(&dc->writeback_lock); - bch_ratelimit_reset(&dc->writeback_rate); read_dirty(dc); if (searched_full_index) { @@ -477,6 +478,8 @@ static int bch_writeback_thread(void *arg) !kthread_should_stop() && !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)) delay = schedule_timeout_interruptible(delay); + + bch_ratelimit_reset(&dc->writeback_rate); } } -- cgit v1.2.3-70-g09d2