author      Linus Torvalds <torvalds@linux-foundation.org>    2021-08-31 14:55:09 -0700
committer   Linus Torvalds <torvalds@linux-foundation.org>    2021-08-31 14:55:09 -0700
commit      efa916af13206eb15916e102c45c99a13ea78f33
tree        ef890b2cb3f982427c3a10ada5904a095f5898ea /drivers/md/dm-writecache.c
parent      a998a62be9cdb509491731ffe81575aa09943a32
parent      d3703ef331297b6daa97f5228cbe2a657d0cfd21
Merge tag 'for-5.15/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm
Pull device mapper updates from Mike Snitzer:
- Add DM infrastructure for IMA-based remote attestation. These changes
are the basis for deploying DM-based storage in a "cloud" that must
validate configurations end-users run to maintain trust. These DM
changes allow supported DM targets' configurations to be measured via
IMA. But the policy and enforcement (of which configurations are
valid) is managed by something outside the kernel (e.g. Keylime).
- Fix DM crypt scalability regression on systems with many cpus due to
percpu_counter spinlock contention in crypt_page_alloc().
- Use in_hardirq() instead of deprecated in_irq() in DM crypt.
- Add event counters to DM writecache to allow users to further assess
how the writecache is performing (see the usage sketch after this list).
- Various code cleanup in DM writecache's main IO mapping function.
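A rough usage sketch of the new counters (the device name "wc0" is hypothetical;
the field order follows the STATUSTYPE_INFO change in the diff below):

  # dmsetup status appends ten counters after the existing
  # <errors> <n_blocks> <freelist_size> <writeback_size> fields:
  #   reads read_hits writes write_hits_uncommitted write_hits_committed
  #   writes_around writes_allocate writes_blocked_on_freelist flushes discards
  dmsetup status wc0

  # reset the counters via the new "clear_stats" target message
  dmsetup message wc0 0 clear_stats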
* tag 'for-5.15/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm:
dm crypt: use in_hardirq() instead of deprecated in_irq()
dm ima: update dm documentation for ima measurement support
dm ima: update dm target attributes for ima measurements
dm ima: add a warning in dm_init if duplicate ima events are not measured
dm ima: prefix ima event name related to device mapper with dm_
dm ima: add version info to dm related events in ima log
dm ima: prefix dm table hashes in ima log with hash algorithm
dm crypt: Avoid percpu_counter spinlock contention in crypt_page_alloc()
dm: add documentation for IMA measurement support
dm: update target status functions to support IMA measurement
dm ima: measure data on device rename
dm ima: measure data on table clear
dm ima: measure data on device remove
dm ima: measure data on device resume
dm ima: measure data on table load
dm writecache: add event counters
dm writecache: report invalid return from writecache_map helpers
dm writecache: further writecache_map() cleanup
dm writecache: factor out writecache_map_remap_origin()
dm writecache: split up writecache_map() to improve code readability
Diffstat (limited to 'drivers/md/dm-writecache.c')
-rw-r--r--    drivers/md/dm-writecache.c    467
1 file changed, 285 insertions, 182 deletions
diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c
index 3d2cf811ec3e..18320444fb0a 100644
--- a/drivers/md/dm-writecache.c
+++ b/drivers/md/dm-writecache.c
@@ -206,6 +206,19 @@ struct dm_writecache {
     struct bio_set bio_set;
     mempool_t copy_pool;
+
+    struct {
+        unsigned long long reads;
+        unsigned long long read_hits;
+        unsigned long long writes;
+        unsigned long long write_hits_uncommitted;
+        unsigned long long write_hits_committed;
+        unsigned long long writes_around;
+        unsigned long long writes_allocate;
+        unsigned long long writes_blocked_on_freelist;
+        unsigned long long flushes;
+        unsigned long long discards;
+    } stats;
 };
 
 #define WB_LIST_INLINE 16
 
@@ -1157,6 +1170,18 @@ static int process_cleaner_mesg(unsigned argc, char **argv, struct dm_writecache
     return 0;
 }
 
+static int process_clear_stats_mesg(unsigned argc, char **argv, struct dm_writecache *wc)
+{
+    if (argc != 1)
+        return -EINVAL;
+
+    wc_lock(wc);
+    memset(&wc->stats, 0, sizeof wc->stats);
+    wc_unlock(wc);
+
+    return 0;
+}
+
 static int writecache_message(struct dm_target *ti, unsigned argc, char **argv,
                               char *result, unsigned maxlen)
 {
@@ -1169,6 +1194,8 @@ static int writecache_message(struct dm_target *ti, unsigned argc, char **argv,
         r = process_flush_on_suspend_mesg(argc, argv, wc);
     else if (!strcasecmp(argv[0], "cleaner"))
         r = process_cleaner_mesg(argc, argv, wc);
+    else if (!strcasecmp(argv[0], "clear_stats"))
+        r = process_clear_stats_mesg(argc, argv, wc);
     else
         DMERR("unrecognised message received: %s", argv[0]);
 
@@ -1293,216 +1320,278 @@ static void writecache_offload_bio(struct dm_writecache *wc, struct bio *bio)
     bio_list_add(&wc->flush_list, bio);
 }
 
-static int writecache_map(struct dm_target *ti, struct bio *bio)
+enum wc_map_op {
+    WC_MAP_SUBMIT,
+    WC_MAP_REMAP,
+    WC_MAP_REMAP_ORIGIN,
+    WC_MAP_RETURN,
+    WC_MAP_ERROR,
+};
+
+static enum wc_map_op writecache_map_remap_origin(struct dm_writecache *wc, struct bio *bio,
+                                                  struct wc_entry *e)
 {
-    struct wc_entry *e;
-    struct dm_writecache *wc = ti->private;
+    if (e) {
+        sector_t next_boundary =
+            read_original_sector(wc, e) - bio->bi_iter.bi_sector;
+        if (next_boundary < bio->bi_iter.bi_size >> SECTOR_SHIFT)
+            dm_accept_partial_bio(bio, next_boundary);
+    }
 
-    bio->bi_private = NULL;
+    return WC_MAP_REMAP_ORIGIN;
+}
 
-    wc_lock(wc);
+static enum wc_map_op writecache_map_read(struct dm_writecache *wc, struct bio *bio)
+{
+    enum wc_map_op map_op;
+    struct wc_entry *e;
 
-    if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
-        if (writecache_has_error(wc))
-            goto unlock_error;
+read_next_block:
+    wc->stats.reads++;
+    e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING);
+    if (e && read_original_sector(wc, e) == bio->bi_iter.bi_sector) {
+        wc->stats.read_hits++;
         if (WC_MODE_PMEM(wc)) {
-            writecache_flush(wc);
-            if (writecache_has_error(wc))
-                goto unlock_error;
-            if (unlikely(wc->cleaner) || unlikely(wc->metadata_only))
-                goto unlock_remap_origin;
-            goto unlock_submit;
+            bio_copy_block(wc, bio, memory_data(wc, e));
+            if (bio->bi_iter.bi_size)
+                goto read_next_block;
+            map_op = WC_MAP_SUBMIT;
         } else {
-            if (dm_bio_get_target_bio_nr(bio))
-                goto unlock_remap_origin;
-            writecache_offload_bio(wc, bio);
-            goto unlock_return;
+            dm_accept_partial_bio(bio, wc->block_size >> SECTOR_SHIFT);
+            bio_set_dev(bio, wc->ssd_dev->bdev);
+            bio->bi_iter.bi_sector = cache_sector(wc, e);
+            if (!writecache_entry_is_committed(wc, e))
+                writecache_wait_for_ios(wc, WRITE);
+            map_op = WC_MAP_REMAP;
         }
+    } else {
+        map_op = writecache_map_remap_origin(wc, bio, e);
     }
 
-    bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);
+    return map_op;
+}
 
-    if (unlikely((((unsigned)bio->bi_iter.bi_sector | bio_sectors(bio)) &
-                (wc->block_size / 512 - 1)) != 0)) {
-        DMERR("I/O is not aligned, sector %llu, size %u, block size %u",
-              (unsigned long long)bio->bi_iter.bi_sector,
-              bio->bi_iter.bi_size, wc->block_size);
-        goto unlock_error;
-    }
+static enum wc_map_op writecache_bio_copy_ssd(struct dm_writecache *wc, struct bio *bio,
+                                              struct wc_entry *e, bool search_used)
+{
+    unsigned bio_size = wc->block_size;
+    sector_t start_cache_sec = cache_sector(wc, e);
+    sector_t current_cache_sec = start_cache_sec + (bio_size >> SECTOR_SHIFT);
 
-    if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) {
-        if (writecache_has_error(wc))
-            goto unlock_error;
-        if (WC_MODE_PMEM(wc)) {
-            writecache_discard(wc, bio->bi_iter.bi_sector, bio_end_sector(bio));
-            goto unlock_remap_origin;
+    while (bio_size < bio->bi_iter.bi_size) {
+        if (!search_used) {
+            struct wc_entry *f = writecache_pop_from_freelist(wc, current_cache_sec);
+            if (!f)
+                break;
+            write_original_sector_seq_count(wc, f, bio->bi_iter.bi_sector +
+                                            (bio_size >> SECTOR_SHIFT), wc->seq_count);
+            writecache_insert_entry(wc, f);
+            wc->uncommitted_blocks++;
         } else {
-            writecache_offload_bio(wc, bio);
-            goto unlock_return;
+            struct wc_entry *f;
+            struct rb_node *next = rb_next(&e->rb_node);
+            if (!next)
+                break;
+            f = container_of(next, struct wc_entry, rb_node);
+            if (f != e + 1)
+                break;
+            if (read_original_sector(wc, f) !=
+                read_original_sector(wc, e) + (wc->block_size >> SECTOR_SHIFT))
+                break;
+            if (unlikely(f->write_in_progress))
+                break;
+            if (writecache_entry_is_committed(wc, f))
+                wc->overwrote_committed = true;
+            e = f;
         }
+        bio_size += wc->block_size;
+        current_cache_sec += wc->block_size >> SECTOR_SHIFT;
     }
 
-    if (bio_data_dir(bio) == READ) {
-read_next_block:
-        e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING);
-        if (e && read_original_sector(wc, e) == bio->bi_iter.bi_sector) {
-            if (WC_MODE_PMEM(wc)) {
-                bio_copy_block(wc, bio, memory_data(wc, e));
-                if (bio->bi_iter.bi_size)
-                    goto read_next_block;
-                goto unlock_submit;
-            } else {
-                dm_accept_partial_bio(bio, wc->block_size >> SECTOR_SHIFT);
-                bio_set_dev(bio, wc->ssd_dev->bdev);
-                bio->bi_iter.bi_sector = cache_sector(wc, e);
-                if (!writecache_entry_is_committed(wc, e))
-                    writecache_wait_for_ios(wc, WRITE);
-                goto unlock_remap;
+    bio_set_dev(bio, wc->ssd_dev->bdev);
+    bio->bi_iter.bi_sector = start_cache_sec;
+    dm_accept_partial_bio(bio, bio_size >> SECTOR_SHIFT);
+
+    if (unlikely(wc->uncommitted_blocks >= wc->autocommit_blocks)) {
+        wc->uncommitted_blocks = 0;
+        queue_work(wc->writeback_wq, &wc->flush_work);
+    } else {
+        writecache_schedule_autocommit(wc);
+    }
+
+    return WC_MAP_REMAP;
+}
+
+static enum wc_map_op writecache_map_write(struct dm_writecache *wc, struct bio *bio)
+{
+    struct wc_entry *e;
+
+    do {
+        bool found_entry = false;
+        bool search_used = false;
+        wc->stats.writes++;
+        if (writecache_has_error(wc))
+            return WC_MAP_ERROR;
+        e = writecache_find_entry(wc, bio->bi_iter.bi_sector, 0);
+        if (e) {
+            if (!writecache_entry_is_committed(wc, e)) {
+                wc->stats.write_hits_uncommitted++;
+                search_used = true;
+                goto bio_copy;
             }
-        } else {
-            if (e) {
-                sector_t next_boundary =
-                    read_original_sector(wc, e) - bio->bi_iter.bi_sector;
-                if (next_boundary < bio->bi_iter.bi_size >> SECTOR_SHIFT) {
-                    dm_accept_partial_bio(bio, next_boundary);
-                }
+            wc->stats.write_hits_committed++;
+            if (!WC_MODE_PMEM(wc) && !e->write_in_progress) {
+                wc->overwrote_committed = true;
+                search_used = true;
+                goto bio_copy;
             }
-            goto unlock_remap_origin;
+            found_entry = true;
+        } else {
+            if (unlikely(wc->cleaner) ||
+                (wc->metadata_only && !(bio->bi_opf & REQ_META)))
+                goto direct_write;
         }
-    } else {
-        do {
-            bool found_entry = false;
-            bool search_used = false;
-            if (writecache_has_error(wc))
-                goto unlock_error;
-            e = writecache_find_entry(wc, bio->bi_iter.bi_sector, 0);
-            if (e) {
-                if (!writecache_entry_is_committed(wc, e)) {
-                    search_used = true;
-                    goto bio_copy;
-                }
-                if (!WC_MODE_PMEM(wc) && !e->write_in_progress) {
-                    wc->overwrote_committed = true;
-                    search_used = true;
-                    goto bio_copy;
-                }
-                found_entry = true;
-            } else {
-                if (unlikely(wc->cleaner) ||
-                    (wc->metadata_only && !(bio->bi_opf & REQ_META)))
-                    goto direct_write;
-            }
-            e = writecache_pop_from_freelist(wc, (sector_t)-1);
-            if (unlikely(!e)) {
-                if (!WC_MODE_PMEM(wc) && !found_entry) {
+        e = writecache_pop_from_freelist(wc, (sector_t)-1);
+        if (unlikely(!e)) {
+            if (!WC_MODE_PMEM(wc) && !found_entry) {
 direct_write:
-                    e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING);
-                    if (e) {
-                        sector_t next_boundary = read_original_sector(wc, e) - bio->bi_iter.bi_sector;
-                        BUG_ON(!next_boundary);
-                        if (next_boundary < bio->bi_iter.bi_size >> SECTOR_SHIFT) {
-                            dm_accept_partial_bio(bio, next_boundary);
-                        }
-                    }
-                    goto unlock_remap_origin;
-                }
-                writecache_wait_on_freelist(wc);
-                continue;
+                wc->stats.writes_around++;
+                e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING);
+                return writecache_map_remap_origin(wc, bio, e);
             }
-            write_original_sector_seq_count(wc, e, bio->bi_iter.bi_sector, wc->seq_count);
-            writecache_insert_entry(wc, e);
-            wc->uncommitted_blocks++;
+            wc->stats.writes_blocked_on_freelist++;
+            writecache_wait_on_freelist(wc);
+            continue;
+        }
+        write_original_sector_seq_count(wc, e, bio->bi_iter.bi_sector, wc->seq_count);
+        writecache_insert_entry(wc, e);
+        wc->uncommitted_blocks++;
+        wc->stats.writes_allocate++;
 bio_copy:
-            if (WC_MODE_PMEM(wc)) {
-                bio_copy_block(wc, bio, memory_data(wc, e));
-            } else {
-                unsigned bio_size = wc->block_size;
-                sector_t start_cache_sec = cache_sector(wc, e);
-                sector_t current_cache_sec = start_cache_sec + (bio_size >> SECTOR_SHIFT);
-
-                while (bio_size < bio->bi_iter.bi_size) {
-                    if (!search_used) {
-                        struct wc_entry *f = writecache_pop_from_freelist(wc, current_cache_sec);
-                        if (!f)
-                            break;
-                        write_original_sector_seq_count(wc, f, bio->bi_iter.bi_sector +
-                                                        (bio_size >> SECTOR_SHIFT), wc->seq_count);
-                        writecache_insert_entry(wc, f);
-                        wc->uncommitted_blocks++;
-                    } else {
-                        struct wc_entry *f;
-                        struct rb_node *next = rb_next(&e->rb_node);
-                        if (!next)
-                            break;
-                        f = container_of(next, struct wc_entry, rb_node);
-                        if (f != e + 1)
-                            break;
-                        if (read_original_sector(wc, f) !=
-                            read_original_sector(wc, e) + (wc->block_size >> SECTOR_SHIFT))
-                            break;
-                        if (unlikely(f->write_in_progress))
-                            break;
-                        if (writecache_entry_is_committed(wc, f))
-                            wc->overwrote_committed = true;
-                        e = f;
-                    }
-                    bio_size += wc->block_size;
-                    current_cache_sec += wc->block_size >> SECTOR_SHIFT;
-                }
+        if (WC_MODE_PMEM(wc))
+            bio_copy_block(wc, bio, memory_data(wc, e));
+        else
+            return writecache_bio_copy_ssd(wc, bio, e, search_used);
+    } while (bio->bi_iter.bi_size);
 
-                bio_set_dev(bio, wc->ssd_dev->bdev);
-                bio->bi_iter.bi_sector = start_cache_sec;
-                dm_accept_partial_bio(bio, bio_size >> SECTOR_SHIFT);
+    if (unlikely(bio->bi_opf & REQ_FUA || wc->uncommitted_blocks >= wc->autocommit_blocks))
+        writecache_flush(wc);
+    else
+        writecache_schedule_autocommit(wc);
 
-                if (unlikely(wc->uncommitted_blocks >= wc->autocommit_blocks)) {
-                    wc->uncommitted_blocks = 0;
-                    queue_work(wc->writeback_wq, &wc->flush_work);
-                } else {
-                    writecache_schedule_autocommit(wc);
-                }
-                goto unlock_remap;
-            }
-        } while (bio->bi_iter.bi_size);
+    return WC_MAP_SUBMIT;
+}
 
-        if (unlikely(bio->bi_opf & REQ_FUA ||
-                     wc->uncommitted_blocks >= wc->autocommit_blocks))
-            writecache_flush(wc);
-        else
-            writecache_schedule_autocommit(wc);
-        goto unlock_submit;
+static enum wc_map_op writecache_map_flush(struct dm_writecache *wc, struct bio *bio)
+{
+    if (writecache_has_error(wc))
+        return WC_MAP_ERROR;
+
+    if (WC_MODE_PMEM(wc)) {
+        wc->stats.flushes++;
+        writecache_flush(wc);
+        if (writecache_has_error(wc))
+            return WC_MAP_ERROR;
+        else if (unlikely(wc->cleaner) || unlikely(wc->metadata_only))
+            return WC_MAP_REMAP_ORIGIN;
+        return WC_MAP_SUBMIT;
     }
+    /* SSD: */
+    if (dm_bio_get_target_bio_nr(bio))
+        return WC_MAP_REMAP_ORIGIN;
+    wc->stats.flushes++;
+    writecache_offload_bio(wc, bio);
+    return WC_MAP_RETURN;
+}
 
-unlock_remap_origin:
-    if (likely(wc->pause != 0)) {
-        if (bio_op(bio) == REQ_OP_WRITE) {
-            dm_iot_io_begin(&wc->iot, 1);
-            bio->bi_private = (void *)2;
-        }
+static enum wc_map_op writecache_map_discard(struct dm_writecache *wc, struct bio *bio)
+{
+    wc->stats.discards++;
+
+    if (writecache_has_error(wc))
+        return WC_MAP_ERROR;
+
+    if (WC_MODE_PMEM(wc)) {
+        writecache_discard(wc, bio->bi_iter.bi_sector, bio_end_sector(bio));
+        return WC_MAP_REMAP_ORIGIN;
     }
-    bio_set_dev(bio, wc->dev->bdev);
-    wc_unlock(wc);
-    return DM_MAPIO_REMAPPED;
+    /* SSD: */
+    writecache_offload_bio(wc, bio);
+    return WC_MAP_RETURN;
+}
 
-unlock_remap:
-    /* make sure that writecache_end_io decrements bio_in_progress: */
-    bio->bi_private = (void *)1;
-    atomic_inc(&wc->bio_in_progress[bio_data_dir(bio)]);
-    wc_unlock(wc);
-    return DM_MAPIO_REMAPPED;
+static int writecache_map(struct dm_target *ti, struct bio *bio)
+{
+    struct dm_writecache *wc = ti->private;
+    enum wc_map_op map_op;
 
-unlock_submit:
-    wc_unlock(wc);
-    bio_endio(bio);
-    return DM_MAPIO_SUBMITTED;
+    bio->bi_private = NULL;
 
-unlock_return:
-    wc_unlock(wc);
-    return DM_MAPIO_SUBMITTED;
+    wc_lock(wc);
 
-unlock_error:
-    wc_unlock(wc);
-    bio_io_error(bio);
-    return DM_MAPIO_SUBMITTED;
+    if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
+        map_op = writecache_map_flush(wc, bio);
+        goto done;
+    }
+
+    bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);
+
+    if (unlikely((((unsigned)bio->bi_iter.bi_sector | bio_sectors(bio)) &
+                (wc->block_size / 512 - 1)) != 0)) {
+        DMERR("I/O is not aligned, sector %llu, size %u, block size %u",
+              (unsigned long long)bio->bi_iter.bi_sector,
+              bio->bi_iter.bi_size, wc->block_size);
+        map_op = WC_MAP_ERROR;
+        goto done;
+    }
+
+    if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) {
+        map_op = writecache_map_discard(wc, bio);
+        goto done;
+    }
+
+    if (bio_data_dir(bio) == READ)
+        map_op = writecache_map_read(wc, bio);
+    else
+        map_op = writecache_map_write(wc, bio);
+done:
+    switch (map_op) {
+    case WC_MAP_REMAP_ORIGIN:
+        if (likely(wc->pause != 0)) {
+            if (bio_op(bio) == REQ_OP_WRITE) {
+                dm_iot_io_begin(&wc->iot, 1);
+                bio->bi_private = (void *)2;
+            }
+        }
+        bio_set_dev(bio, wc->dev->bdev);
+        wc_unlock(wc);
+        return DM_MAPIO_REMAPPED;
+
+    case WC_MAP_REMAP:
+        /* make sure that writecache_end_io decrements bio_in_progress: */
+        bio->bi_private = (void *)1;
+        atomic_inc(&wc->bio_in_progress[bio_data_dir(bio)]);
+        wc_unlock(wc);
+        return DM_MAPIO_REMAPPED;
+
+    case WC_MAP_SUBMIT:
+        wc_unlock(wc);
+        bio_endio(bio);
+        return DM_MAPIO_SUBMITTED;
+
+    case WC_MAP_RETURN:
+        wc_unlock(wc);
+        return DM_MAPIO_SUBMITTED;
+
+    case WC_MAP_ERROR:
+        wc_unlock(wc);
+        bio_io_error(bio);
+        return DM_MAPIO_SUBMITTED;
+
+    default:
+        BUG();
+        return -1;
+    }
 }
 
 static int writecache_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *status)
@@ -2568,9 +2657,20 @@ static void writecache_status(struct dm_target *ti, status_type_t type,
 
     switch (type) {
     case STATUSTYPE_INFO:
-        DMEMIT("%ld %llu %llu %llu", writecache_has_error(wc),
+        DMEMIT("%ld %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu",
+               writecache_has_error(wc),
               (unsigned long long)wc->n_blocks, (unsigned long long)wc->freelist_size,
-              (unsigned long long)wc->writeback_size);
+              (unsigned long long)wc->writeback_size,
+              wc->stats.reads,
+              wc->stats.read_hits,
+              wc->stats.writes,
+              wc->stats.write_hits_uncommitted,
+              wc->stats.write_hits_committed,
+              wc->stats.writes_around,
+              wc->stats.writes_allocate,
+              wc->stats.writes_blocked_on_freelist,
+              wc->stats.flushes,
+              wc->stats.discards);
         break;
     case STATUSTYPE_TABLE:
         DMEMIT("%c %s %s %u ", WC_MODE_PMEM(wc) ? 'p' : 's',
@@ -2623,12 +2723,15 @@ static void writecache_status(struct dm_target *ti, status_type_t type,
         if (wc->pause_set)
             DMEMIT(" pause_writeback %u", wc->pause_value);
         break;
+    case STATUSTYPE_IMA:
+        *result = '\0';
+        break;
     }
 }
 
 static struct target_type writecache_target = {
     .name = "writecache",
-    .version = {1, 5, 0},
+    .version = {1, 6, 0},
     .module = THIS_MODULE,
     .ctr = writecache_ctr,
    .dtr = writecache_dtr,