Diffstat (limited to 'fs/bcachefs/btree_cache.c')
-rw-r--r-- | fs/bcachefs/btree_cache.c | 273 |
1 file changed, 180 insertions, 93 deletions
diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
index e52a06d3418c..6e4afb2b5441 100644
--- a/fs/bcachefs/btree_cache.c
+++ b/fs/bcachefs/btree_cache.c
@@ -15,11 +15,12 @@
 
 #include <linux/prefetch.h>
 #include <linux/sched/mm.h>
+#include <linux/swap.h>
 
 #define BTREE_CACHE_NOT_FREED_INCREMENT(counter) \
 do { \
 	if (shrinker_counter) \
-		bc->not_freed_##counter++; \
+		bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_##counter]++; \
 } while (0)
 
 const char * const bch2_btree_node_flags[] = {
@@ -31,24 +32,29 @@ const char * const bch2_btree_node_flags[] = {
 
 void bch2_recalc_btree_reserve(struct bch_fs *c)
 {
-	unsigned i, reserve = 16;
+	unsigned reserve = 16;
 
 	if (!c->btree_roots_known[0].b)
 		reserve += 8;
 
-	for (i = 0; i < btree_id_nr_alive(c); i++) {
+	for (unsigned i = 0; i < btree_id_nr_alive(c); i++) {
 		struct btree_root *r = bch2_btree_id_root(c, i);
 
 		if (r->b)
 			reserve += min_t(unsigned, 1, r->b->c.level) * 8;
 	}
 
-	c->btree_cache.reserve = reserve;
+	c->btree_cache.nr_reserve = reserve;
 }
 
-static inline unsigned btree_cache_can_free(struct btree_cache *bc)
+static inline size_t btree_cache_can_free(struct btree_cache_list *list)
 {
-	return max_t(int, 0, bc->used - bc->reserve);
+	struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]);
+
+	size_t can_free = list->nr;
+	if (!list->idx)
+		can_free = max_t(ssize_t, 0, can_free - bc->nr_reserve);
+	return can_free;
 }
 
 static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b)
@@ -63,6 +69,18 @@ static void btree_node_data_free(struct bch_fs *c, struct btree *b)
 {
 	struct btree_cache *bc = &c->btree_cache;
 
+	BUG_ON(btree_node_hashed(b));
+
+	/*
+	 * This should really be done in slub/vmalloc, but we're using the
+	 * kmalloc_large() path, so we're working around a slub bug by doing
+	 * this here:
+	 */
+	if (b->data)
+		mm_account_reclaimed_pages(btree_buf_bytes(b) / PAGE_SIZE);
+	if (b->aux_data)
+		mm_account_reclaimed_pages(btree_aux_data_bytes(b) / PAGE_SIZE);
+
 	EBUG_ON(btree_node_write_in_flight(b));
 
 	clear_btree_node_just_written(b);
@@ -76,7 +94,7 @@ static void btree_node_data_free(struct bch_fs *c, struct btree *b)
 #endif
 	b->aux_data = NULL;
 
-	bc->used--;
+	bc->nr_freeable--;
 
 	btree_node_to_freedlist(bc, b);
 }
@@ -102,6 +120,8 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
 {
 	BUG_ON(b->data || b->aux_data);
 
+	gfp |= __GFP_ACCOUNT|__GFP_RECLAIMABLE;
+
 	b->data = kvmalloc(btree_buf_bytes(b), gfp);
 	if (!b->data)
 		return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
@@ -154,7 +174,7 @@ struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
 
 	bch2_btree_lock_init(&b->c, 0);
 
-	bc->used++;
+	bc->nr_freeable++;
 	list_add(&b->list, &bc->freeable);
 	return b;
 }
@@ -169,10 +189,56 @@ void bch2_btree_node_to_freelist(struct bch_fs *c, struct btree *b)
 	six_unlock_intent(&b->c.lock);
 }
 
+static inline bool __btree_node_pinned(struct btree_cache *bc, struct btree *b)
+{
+	struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p);
+
+	u64 mask = bc->pinned_nodes_mask[!!b->c.level];
+
+	return ((mask & BIT_ULL(b->c.btree_id)) &&
+		bbpos_cmp(bc->pinned_nodes_start, pos) < 0 &&
+		bbpos_cmp(bc->pinned_nodes_end, pos) >= 0);
+}
+
+void bch2_node_pin(struct bch_fs *c, struct btree *b)
+{
+	struct btree_cache *bc = &c->btree_cache;
+
+	mutex_lock(&bc->lock);
+	BUG_ON(!__btree_node_pinned(bc, b));
+	if (b != btree_node_root(c, b) && !btree_node_pinned(b)) {
+		set_btree_node_pinned(b);
+		list_move(&b->list, &bc->live[1].list);
+		bc->live[0].nr--;
+		bc->live[1].nr++;
+	}
+	mutex_unlock(&bc->lock);
+}
+
+void bch2_btree_cache_unpin(struct bch_fs *c)
+{
+	struct btree_cache *bc = &c->btree_cache;
+	struct btree *b, *n;
+
+	mutex_lock(&bc->lock);
+	c->btree_cache.pinned_nodes_mask[0] = 0;
+	c->btree_cache.pinned_nodes_mask[1] = 0;
+
+	list_for_each_entry_safe(b, n, &bc->live[1].list, list) {
+		clear_btree_node_pinned(b);
+		list_move(&b->list, &bc->live[0].list);
+		bc->live[0].nr++;
+		bc->live[1].nr--;
+	}
+
+	mutex_unlock(&bc->lock);
+}
+
 /* Btree in memory cache - hash table */
 
 void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
 {
+	lockdep_assert_held(&bc->lock);
 	int ret = rhashtable_remove_fast(&bc->table, &b->hash,
 					 bch_btree_cache_params);
 	BUG_ON(ret);
@@ -181,7 +247,11 @@ void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
 	b->hash_val = 0;
 
 	if (b->c.btree_id < BTREE_ID_NR)
-		--bc->used_by_btree[b->c.btree_id];
+		--bc->nr_by_btree[b->c.btree_id];
+
+	bc->live[btree_node_pinned(b)].nr--;
+	bc->nr_freeable++;
+	list_move(&b->list, &bc->freeable);
 }
 
 int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
@@ -191,23 +261,30 @@ int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
 
 	int ret = rhashtable_lookup_insert_fast(&bc->table, &b->hash,
 						bch_btree_cache_params);
-	if (!ret && b->c.btree_id < BTREE_ID_NR)
-		bc->used_by_btree[b->c.btree_id]++;
-	return ret;
+	if (ret)
+		return ret;
+
+	if (b->c.btree_id < BTREE_ID_NR)
+		bc->nr_by_btree[b->c.btree_id]++;
+
+	bool p = __btree_node_pinned(bc, b);
+	mod_bit(BTREE_NODE_pinned, &b->flags, p);
+
+	list_move_tail(&b->list, &bc->live[p].list);
+	bc->live[p].nr++;
+
+	bc->nr_freeable--;
+	return 0;
 }
 
 int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
 				unsigned level, enum btree_id id)
 {
-	int ret;
-
 	b->c.level = level;
 	b->c.btree_id = id;
 
 	mutex_lock(&bc->lock);
-	ret = __bch2_btree_node_hash_insert(bc, b);
-	if (!ret)
-		list_add_tail(&b->list, &bc->live);
+	int ret = __bch2_btree_node_hash_insert(bc, b);
 	mutex_unlock(&bc->lock);
 
 	return ret;
@@ -261,18 +338,6 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush, b
 	int ret = 0;
 
 	lockdep_assert_held(&bc->lock);
-
-	struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p);
-
-	u64 mask = b->c.level
-		? bc->pinned_nodes_interior_mask
-		: bc->pinned_nodes_leaf_mask;
-
-	if ((mask & BIT_ULL(b->c.btree_id)) &&
-	    bbpos_cmp(bc->pinned_nodes_start, pos) < 0 &&
-	    bbpos_cmp(bc->pinned_nodes_end, pos) >= 0)
-		return -BCH_ERR_ENOMEM_btree_node_reclaim;
-
 wait_on_io:
 	if (b->flags & ((1U << BTREE_NODE_dirty)|
 			(1U << BTREE_NODE_read_in_flight)|
@@ -377,8 +442,9 @@ static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
 static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
 					   struct shrink_control *sc)
 {
-	struct bch_fs *c = shrink->private_data;
-	struct btree_cache *bc = &c->btree_cache;
+	struct btree_cache_list *list = shrink->private_data;
+	struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]);
+	struct bch_fs *c = container_of(bc, struct bch_fs, btree_cache);
 	struct btree *b, *t;
 	unsigned long nr = sc->nr_to_scan;
@@ -386,8 +452,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
 	unsigned long touched = 0;
 	unsigned i, flags;
 	unsigned long ret = SHRINK_STOP;
-	bool trigger_writes = atomic_read(&bc->dirty) + nr >=
-		bc->used * 3 / 4;
+	bool trigger_writes = atomic_long_read(&bc->nr_dirty) + nr >= list->nr * 3 / 4;
 
 	if (bch2_btree_shrinker_disabled)
 		return SHRINK_STOP;
@@ -402,7 +467,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
 	 * succeed, so that inserting keys into the btree can always succeed and
 	 * IO can always make forward progress:
 	 */
-	can_free = btree_cache_can_free(bc);
+	can_free = btree_cache_can_free(list);
 	nr = min_t(unsigned long, nr, can_free);
 
 	i = 0;
@@ -424,22 +489,24 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
 			six_unlock_write(&b->c.lock);
 			six_unlock_intent(&b->c.lock);
 			freed++;
-			bc->freed++;
+			bc->nr_freed++;
 		}
 	}
 restart:
-	list_for_each_entry_safe(b, t, &bc->live, list) {
+	list_for_each_entry_safe(b, t, &list->list, list) {
 		touched++;
 
 		if (btree_node_accessed(b)) {
			clear_btree_node_accessed(b);
-			bc->not_freed_access_bit++;
+			bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_access_bit]++;
+			--touched;;
 		} else if (!btree_node_reclaim(c, b, true)) {
+			bch2_btree_node_hash_remove(bc, b);
+
 			freed++;
 			btree_node_data_free(c, b);
-			bc->freed++;
+			bc->nr_freed++;
 
-			bch2_btree_node_hash_remove(bc, b);
 			six_unlock_write(&b->c.lock);
 			six_unlock_intent(&b->c.lock);
 
@@ -450,7 +517,7 @@ restart:
 			   !btree_node_will_make_reachable(b) &&
 			   !btree_node_write_blocked(b) &&
 			   six_trylock_read(&b->c.lock)) {
-			list_move(&bc->live, &b->list);
+			list_move(&list->list, &b->list);
 			mutex_unlock(&bc->lock);
 			__bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
 			six_unlock_read(&b->c.lock);
@@ -464,8 +531,8 @@ restart:
 			break;
 	}
 out_rotate:
-	if (&t->list != &bc->live)
-		list_move_tail(&bc->live, &t->list);
+	if (&t->list != &list->list)
+		list_move_tail(&list->list, &t->list);
 out:
 	mutex_unlock(&bc->lock);
 out_nounlock:
@@ -478,44 +545,45 @@ out_nounlock:
 static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
 					    struct shrink_control *sc)
 {
-	struct bch_fs *c = shrink->private_data;
-	struct btree_cache *bc = &c->btree_cache;
+	struct btree_cache_list *list = shrink->private_data;
 
 	if (bch2_btree_shrinker_disabled)
 		return 0;
 
-	return btree_cache_can_free(bc);
+	return btree_cache_can_free(list);
 }
 
 void bch2_fs_btree_cache_exit(struct bch_fs *c)
 {
 	struct btree_cache *bc = &c->btree_cache;
-	struct btree *b;
-	unsigned i, flags;
+	struct btree *b, *t;
+	unsigned long flags;
 
-	shrinker_free(bc->shrink);
+	shrinker_free(bc->live[1].shrink);
+	shrinker_free(bc->live[0].shrink);
 
 	/* vfree() can allocate memory: */
 	flags = memalloc_nofs_save();
 	mutex_lock(&bc->lock);
 
 	if (c->verify_data)
-		list_move(&c->verify_data->list, &bc->live);
+		list_move(&c->verify_data->list, &bc->live[0].list);
 
 	kvfree(c->verify_ondisk);
 
-	for (i = 0; i < btree_id_nr_alive(c); i++) {
+	for (unsigned i = 0; i < btree_id_nr_alive(c); i++) {
 		struct btree_root *r = bch2_btree_id_root(c, i);
 
 		if (r->b)
-			list_add(&r->b->list, &bc->live);
+			list_add(&r->b->list, &bc->live[0].list);
 	}
 
-	list_splice(&bc->freeable, &bc->live);
-
-	while (!list_empty(&bc->live)) {
-		b = list_first_entry(&bc->live, struct btree, list);
+	list_for_each_entry_safe(b, t, &bc->live[1].list, list)
+		bch2_btree_node_hash_remove(bc, b);
+	list_for_each_entry_safe(b, t, &bc->live[0].list, list)
+		bch2_btree_node_hash_remove(bc, b);
 
+	list_for_each_entry_safe(b, t, &bc->freeable, list) {
 		BUG_ON(btree_node_read_in_flight(b) ||
 		       btree_node_write_in_flight(b));
@@ -523,12 +591,11 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
 	}
 
 	BUG_ON(!bch2_journal_error(&c->journal) &&
-	       atomic_read(&c->btree_cache.dirty));
+	       atomic_long_read(&c->btree_cache.nr_dirty));
 
 	list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu);
 
-	while (!list_empty(&bc->freed_nonpcpu)) {
-		b = list_first_entry(&bc->freed_nonpcpu, struct btree, list);
+	list_for_each_entry_safe(b, t, &bc->freed_nonpcpu, list) {
 		list_del(&b->list);
 		six_lock_exit(&b->c.lock);
 		kfree(b);
@@ -537,6 +604,12 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
 	mutex_unlock(&bc->lock);
 	memalloc_nofs_restore(flags);
 
+	for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++)
+		BUG_ON(bc->nr_by_btree[i]);
+	BUG_ON(bc->live[0].nr);
+	BUG_ON(bc->live[1].nr);
+	BUG_ON(bc->nr_freeable);
+
 	if (bc->table_init_done)
 		rhashtable_destroy(&bc->table);
 }
@@ -556,22 +629,32 @@ int bch2_fs_btree_cache_init(struct bch_fs *c)
 
 	bch2_recalc_btree_reserve(c);
 
-	for (i = 0; i < bc->reserve; i++)
+	for (i = 0; i < bc->nr_reserve; i++)
 		if (!__bch2_btree_node_mem_alloc(c))
 			goto err;
 
-	list_splice_init(&bc->live, &bc->freeable);
+	list_splice_init(&bc->live[0].list, &bc->freeable);
 
 	mutex_init(&c->verify_lock);
 
 	shrink = shrinker_alloc(0, "%s-btree_cache", c->name);
 	if (!shrink)
 		goto err;
-	bc->shrink = shrink;
+	bc->live[0].shrink = shrink;
+	shrink->count_objects = bch2_btree_cache_count;
+	shrink->scan_objects = bch2_btree_cache_scan;
+	shrink->seeks = 2;
+	shrink->private_data = &bc->live[0];
+	shrinker_register(shrink);
+
+	shrink = shrinker_alloc(0, "%s-btree_cache-pinned", c->name);
+	if (!shrink)
+		goto err;
+	bc->live[1].shrink = shrink;
 	shrink->count_objects = bch2_btree_cache_count;
 	shrink->scan_objects = bch2_btree_cache_scan;
-	shrink->seeks = 4;
-	shrink->private_data = c;
+	shrink->seeks = 8;
+	shrink->private_data = &bc->live[1];
 	shrinker_register(shrink);
 
 	return 0;
@@ -582,7 +665,10 @@ err:
 void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
 {
 	mutex_init(&bc->lock);
-	INIT_LIST_HEAD(&bc->live);
+	for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++) {
+		bc->live[i].idx = i;
+		INIT_LIST_HEAD(&bc->live[i].list);
+	}
 	INIT_LIST_HEAD(&bc->freeable);
 	INIT_LIST_HEAD(&bc->freed_pcpu);
 	INIT_LIST_HEAD(&bc->freed_nonpcpu);
@@ -644,14 +730,16 @@ static struct btree *btree_node_cannibalize(struct bch_fs *c)
 	struct btree_cache *bc = &c->btree_cache;
 	struct btree *b;
 
-	list_for_each_entry_reverse(b, &bc->live, list)
-		if (!btree_node_reclaim(c, b, false))
-			return b;
+	for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++)
+		list_for_each_entry_reverse(b, &bc->live[i].list, list)
+			if (!btree_node_reclaim(c, b, false))
+				return b;
 
 	while (1) {
-		list_for_each_entry_reverse(b, &bc->live, list)
-			if (!btree_node_write_and_reclaim(c, b))
-				return b;
+		for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++)
+			list_for_each_entry_reverse(b, &bc->live[i].list, list)
+				if (!btree_node_write_and_reclaim(c, b))
+					return b;
 
 		/*
 		 * Rare case: all nodes were intent-locked.
@@ -671,9 +759,7 @@ struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_rea
 		: &bc->freed_nonpcpu;
 	struct btree *b, *b2;
 	u64 start_time = local_clock();
-	unsigned flags;
 
-	flags = memalloc_nofs_save();
 	mutex_lock(&bc->lock);
 
 	/*
@@ -725,7 +811,7 @@ got_node:
 	}
 
 	mutex_lock(&bc->lock);
-	bc->used++;
+	bc->nr_freeable++;
 got_mem:
 	mutex_unlock(&bc->lock);
 
@@ -745,8 +831,6 @@ out:
 	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
			       start_time);
 
-	memalloc_nofs_restore(flags);
-
 	int ret = bch2_trans_relock(trans);
 	if (unlikely(ret)) {
 		bch2_btree_node_to_freelist(c, b);
@@ -781,7 +865,6 @@ err:
 	}
 
 	mutex_unlock(&bc->lock);
-	memalloc_nofs_restore(flags);
 
 	return ERR_PTR(-BCH_ERR_ENOMEM_btree_node_mem_alloc);
 }
@@ -1269,8 +1352,8 @@ wait_on_io:
 	BUG_ON(btree_node_dirty(b));
 
 	mutex_lock(&bc->lock);
-	btree_node_data_free(c, b);
 	bch2_btree_node_hash_remove(bc, b);
+	btree_node_data_free(c, b);
 	mutex_unlock(&bc->lock);
 out:
 	six_unlock_write(&b->c.lock);
@@ -1342,13 +1425,20 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struc
 }
 
 static void prt_btree_cache_line(struct printbuf *out, const struct bch_fs *c,
-				 const char *label, unsigned nr)
+				 const char *label, size_t nr)
 {
 	prt_printf(out, "%s\t", label);
 	prt_human_readable_u64(out, nr * c->opts.btree_node_size);
-	prt_printf(out, " (%u)\n", nr);
+	prt_printf(out, " (%zu)\n", nr);
 }
 
+static const char * const bch2_btree_cache_not_freed_reasons_strs[] = {
+#define x(n) #n,
+	BCH_BTREE_CACHE_NOT_FREED_REASONS()
+#undef x
+	NULL
+};
+
 void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc)
 {
 	struct bch_fs *c = container_of(bc, struct bch_fs, btree_cache);
@@ -1356,24 +1446,21 @@ void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc
 	if (!out->nr_tabstops)
 		printbuf_tabstop_push(out, 32);
 
-	prt_btree_cache_line(out, c, "total:", bc->used);
-	prt_btree_cache_line(out, c, "nr dirty:", atomic_read(&bc->dirty));
+	prt_btree_cache_line(out, c, "live:", bc->live[0].nr);
+	prt_btree_cache_line(out, c, "pinned:", bc->live[1].nr);
+	prt_btree_cache_line(out, c, "freeable:", bc->nr_freeable);
+	prt_btree_cache_line(out, c, "dirty:", atomic_long_read(&bc->nr_dirty));
 	prt_printf(out, "cannibalize lock:\t%p\n", bc->alloc_lock);
 	prt_newline(out);
 
-	for (unsigned i = 0; i < ARRAY_SIZE(bc->used_by_btree); i++)
-		prt_btree_cache_line(out, c, bch2_btree_id_str(i), bc->used_by_btree[i]);
+	for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++)
+		prt_btree_cache_line(out, c, bch2_btree_id_str(i), bc->nr_by_btree[i]);
 
 	prt_newline(out);
-	prt_printf(out, "freed:\t%u\n", bc->freed);
+	prt_printf(out, "freed:\t%zu\n", bc->nr_freed);
 	prt_printf(out, "not freed:\n");
-	prt_printf(out, "  dirty\t%u\n", bc->not_freed_dirty);
-	prt_printf(out, "  write in flight\t%u\n", bc->not_freed_write_in_flight);
-	prt_printf(out, "  read in flight\t%u\n", bc->not_freed_read_in_flight);
-	prt_printf(out, "  lock intent failed\t%u\n", bc->not_freed_lock_intent);
-	prt_printf(out, "  lock write failed\t%u\n", bc->not_freed_lock_write);
-	prt_printf(out, "  access bit\t%u\n", bc->not_freed_access_bit);
-	prt_printf(out, "  no evict failed\t%u\n", bc->not_freed_noevict);
-	prt_printf(out, "  write blocked\t%u\n", bc->not_freed_write_blocked);
-	prt_printf(out, "  will make reachable\t%u\n", bc->not_freed_will_make_reachable);
+
+	for (unsigned i = 0; i < ARRAY_SIZE(bc->not_freed); i++)
+		prt_printf(out, "  %s\t%llu\n",
			   bch2_btree_cache_not_freed_reasons_strs[i], bc->not_freed[i]);
 }