diff options
author | Kent Overstreet <kent.overstreet@linux.dev> | 2024-06-06 13:25:28 -0400 |
---|---|---|
committer | Kent Overstreet <kent.overstreet@linux.dev> | 2024-07-14 19:00:15 -0400 |
commit | 2574e95a8b78ef853100d6889f154883fec989a2 (patch) | |
tree | ddc315624c4ba849ea387100f917f9e8762ea6d6 /fs/bcachefs | |
parent | b5597347a5c7182dba80f9fb561c404abb3cead8 (diff) |
bcachefs: Refactor disk accounting data structures
Break up the percpu counter allocations into individual allocations for
each disk accounting counter; this fixes an issue on large systems where
we have too many replica entries for the percpu allocator's max
practical size.
Also, use just one eytzinger tree for the normal set of counters and the
gc counters; this simplifies accounting_gc_done() where we need the same
set of counters to be present in both tables.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Diffstat (limited to 'fs/bcachefs')
-rw-r--r-- | fs/bcachefs/bcachefs.h | 2 | ||||
-rw-r--r-- | fs/bcachefs/btree_gc.c | 7 | ||||
-rw-r--r-- | fs/bcachefs/disk_accounting.c | 184 | ||||
-rw-r--r-- | fs/bcachefs/disk_accounting.h | 31 | ||||
-rw-r--r-- | fs/bcachefs/disk_accounting_types.h | 11 |
5 files changed, 109 insertions, 126 deletions
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index e0e9afb08ef6..ea4bf11fb8dd 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -742,7 +742,7 @@ struct bch_fs { struct bch_dev __rcu *devs[BCH_SB_MEMBERS_MAX]; - struct bch_accounting_mem accounting[2]; + struct bch_accounting_mem accounting; struct bch_replicas_cpu replicas; struct bch_replicas_cpu replicas_gc; diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 88f7c7d64a1d..dfc842d6e822 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -735,7 +735,7 @@ static int bch2_mark_superblocks(struct bch_fs *c) static void bch2_gc_free(struct bch_fs *c) { - bch2_accounting_free(&c->accounting[1]); + bch2_accounting_gc_free(c); genradix_free(&c->reflink_gc_table); genradix_free(&c->gc_stripes); @@ -1105,7 +1105,8 @@ int bch2_check_allocations(struct bch_fs *c) bch2_btree_interior_updates_flush(c); - ret = bch2_gc_start(c) ?: + ret = bch2_gc_accounting_start(c) ?: + bch2_gc_start(c) ?: bch2_gc_alloc_start(c) ?: bch2_gc_reflink_start(c); if (ret) @@ -1125,7 +1126,7 @@ int bch2_check_allocations(struct bch_fs *c) c->gc_count++; ret = bch2_gc_alloc_done(c) ?: - bch2_accounting_gc_done(c) ?: + bch2_gc_accounting_done(c) ?: bch2_gc_stripes_done(c) ?: bch2_gc_reflink_done(c); out: diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index 29bc4c816f95..3327d465908d 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -218,57 +218,56 @@ int bch2_accounting_update_sb(struct btree_trans *trans) return 0; } -static int __bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a, bool gc) +static int __bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a) { - struct bch_replicas_padded r; - - if (accounting_to_replicas(&r.e, a.k->p) && - !bch2_replicas_marked_locked(c, &r.e)) - return -BCH_ERR_btree_insert_need_mark_replicas; + struct bch_accounting_mem *acc = &c->accounting; /* 
raced with another insert, already present: */ - struct bch_accounting_mem *acc = &c->accounting[gc]; if (eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), accounting_pos_cmp, &a.k->p) < acc->k.nr) return 0; - unsigned new_nr_counters = acc->nr_counters + bch2_accounting_counters(a.k); - - u64 __percpu *new_counters = __alloc_percpu_gfp(new_nr_counters * sizeof(u64), - sizeof(u64), GFP_KERNEL); - if (!new_counters) - return -BCH_ERR_ENOMEM_disk_accounting; - - preempt_disable(); - memcpy(this_cpu_ptr(new_counters), - bch2_acc_percpu_u64s(acc->v, acc->nr_counters), - acc->nr_counters * sizeof(u64)); - preempt_enable(); - - struct accounting_pos_offset n = { + struct accounting_mem_entry n = { .pos = a.k->p, .version = a.k->version, - .offset = acc->nr_counters, .nr_counters = bch2_accounting_counters(a.k), + .v[0] = __alloc_percpu_gfp(n.nr_counters * sizeof(u64), + sizeof(u64), GFP_KERNEL), }; - if (darray_push(&acc->k, n)) { - free_percpu(new_counters); - return -BCH_ERR_ENOMEM_disk_accounting; + + if (!n.v[0]) + goto err; + + if (acc->gc_running) { + n.v[1] = __alloc_percpu_gfp(n.nr_counters * sizeof(u64), + sizeof(u64), GFP_KERNEL); + if (!n.v[1]) + goto err; } - eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), accounting_pos_cmp, NULL); + if (darray_push(&acc->k, n)) + goto err; - free_percpu(acc->v); - acc->v = new_counters; - acc->nr_counters = new_nr_counters; + eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), + accounting_pos_cmp, NULL); return 0; +err: + free_percpu(n.v[1]); + free_percpu(n.v[0]); + return -BCH_ERR_ENOMEM_disk_accounting; } int bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a, bool gc) { + struct bch_replicas_padded r; + + if (accounting_to_replicas(&r.e, a.k->p) && + !bch2_replicas_marked_locked(c, &r.e)) + return -BCH_ERR_btree_insert_need_mark_replicas; + percpu_up_read(&c->mark_lock); percpu_down_write(&c->mark_lock); - int ret = __bch2_accounting_mem_insert(c, a, gc); 
+ int ret = __bch2_accounting_mem_insert(c, a); percpu_up_write(&c->mark_lock); percpu_down_read(&c->mark_lock); return ret; @@ -284,7 +283,7 @@ int bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a, b */ int bch2_fs_replicas_usage_read(struct bch_fs *c, darray_char *usage) { - struct bch_accounting_mem *acc = &c->accounting[0]; + struct bch_accounting_mem *acc = &c->accounting; int ret = 0; darray_init(usage); @@ -300,7 +299,7 @@ int bch2_fs_replicas_usage_read(struct bch_fs *c, darray_char *usage) continue; u64 sectors; - bch2_accounting_mem_read(c, i->pos, &sectors, 1); + bch2_accounting_mem_read_counters(acc, i - acc->k.data, &sectors, 1, false); u.r.sectors = sectors; ret = darray_make_room(usage, replicas_usage_bytes(&u.r)); @@ -320,7 +319,7 @@ int bch2_fs_replicas_usage_read(struct bch_fs *c, darray_char *usage) int bch2_fs_accounting_read(struct bch_fs *c, darray_char *out_buf, unsigned accounting_types_mask) { - struct bch_accounting_mem *acc = &c->accounting[0]; + struct bch_accounting_mem *acc = &c->accounting; int ret = 0; darray_init(out_buf); @@ -342,7 +341,8 @@ int bch2_fs_accounting_read(struct bch_fs *c, darray_char *out_buf, unsigned acc bkey_accounting_init((void *) &darray_top(*out_buf)); set_bkey_val_u64s(&a_out->k, i->nr_counters); a_out->k.p = i->pos; - bch2_accounting_mem_read(c, i->pos, a_out->v.d, i->nr_counters); + bch2_accounting_mem_read_counters(acc, i - acc->k.data, + a_out->v.d, i->nr_counters, false); if (!bch2_accounting_key_is_zero(accounting_i_to_s_c(a_out))) out_buf->nr += bkey_bytes(&a_out->k); @@ -357,7 +357,7 @@ int bch2_fs_accounting_read(struct bch_fs *c, darray_char *out_buf, unsigned acc void bch2_fs_accounting_to_text(struct printbuf *out, struct bch_fs *c) { - struct bch_accounting_mem *acc = &c->accounting[0]; + struct bch_accounting_mem *acc = &c->accounting; percpu_down_read(&c->mark_lock); out->atomic++; @@ -369,7 +369,7 @@ void bch2_fs_accounting_to_text(struct printbuf *out, struct bch_fs *c) 
bch2_accounting_key_to_text(out, &acc_k); u64 v[BCH_ACCOUNTING_MAX_COUNTERS]; - bch2_accounting_mem_read_counters(c, i, v, ARRAY_SIZE(v), false); + bch2_accounting_mem_read_counters(acc, i, v, ARRAY_SIZE(v), false); prt_str(out, ":"); for (unsigned j = 0; j < acc->k.data[i].nr_counters; j++) @@ -381,81 +381,56 @@ void bch2_fs_accounting_to_text(struct printbuf *out, struct bch_fs *c) percpu_up_read(&c->mark_lock); } -/* Ensures all counters in @src exist in @dst: */ -static int copy_counters(struct bch_accounting_mem *dst, - struct bch_accounting_mem *src) +static void bch2_accounting_free_counters(struct bch_accounting_mem *acc, bool gc) { - unsigned orig_dst_k_nr = dst->k.nr; - unsigned dst_counters = dst->nr_counters; - - darray_for_each(src->k, i) - if (eytzinger0_find(dst->k.data, orig_dst_k_nr, sizeof(dst->k.data[0]), - accounting_pos_cmp, &i->pos) >= orig_dst_k_nr) { - if (darray_push(&dst->k, ((struct accounting_pos_offset) { - .pos = i->pos, - .offset = dst_counters, - .nr_counters = i->nr_counters }))) - goto err; - - dst_counters += i->nr_counters; - } - - if (dst->k.nr == orig_dst_k_nr) - return 0; - - u64 __percpu *new_counters = __alloc_percpu_gfp(dst_counters * sizeof(u64), - sizeof(u64), GFP_KERNEL); - if (!new_counters) - goto err; + darray_for_each(acc->k, e) { + free_percpu(e->v[gc]); + e->v[gc] = NULL; + } +} - preempt_disable(); - memcpy(this_cpu_ptr(new_counters), - bch2_acc_percpu_u64s(dst->v, dst->nr_counters), - dst->nr_counters * sizeof(u64)); - preempt_enable(); +int bch2_gc_accounting_start(struct bch_fs *c) +{ + struct bch_accounting_mem *acc = &c->accounting; + int ret = 0; - free_percpu(dst->v); - dst->v = new_counters; - dst->nr_counters = dst_counters; + percpu_down_write(&c->mark_lock); + darray_for_each(acc->k, e) { + e->v[1] = __alloc_percpu_gfp(e->nr_counters * sizeof(u64), + sizeof(u64), GFP_KERNEL); + if (!e->v[1]) { + bch2_accounting_free_counters(acc, true); + ret = -BCH_ERR_ENOMEM_disk_accounting; + break; + } + } - 
eytzinger0_sort(dst->k.data, dst->k.nr, sizeof(dst->k.data[0]), accounting_pos_cmp, NULL); + acc->gc_running = !ret; + percpu_up_write(&c->mark_lock); - return 0; -err: - dst->k.nr = orig_dst_k_nr; - return -BCH_ERR_ENOMEM_disk_accounting; + return ret; } -int bch2_accounting_gc_done(struct bch_fs *c) +int bch2_gc_accounting_done(struct bch_fs *c) { - struct bch_accounting_mem *dst = &c->accounting[0]; - struct bch_accounting_mem *src = &c->accounting[1]; + struct bch_accounting_mem *acc = &c->accounting; struct btree_trans *trans = bch2_trans_get(c); struct printbuf buf = PRINTBUF; int ret = 0; - percpu_down_write(&c->mark_lock); - - ret = copy_counters(dst, src) ?: - copy_counters(src, dst); - if (ret) - goto err; - - BUG_ON(dst->k.nr != src->k.nr); - - for (unsigned i = 0; i < src->k.nr; i++) { - BUG_ON(src->k.data[i].nr_counters != dst->k.data[i].nr_counters); - BUG_ON(!bpos_eq(dst->k.data[i].pos, src->k.data[i].pos)); + percpu_down_read(&c->mark_lock); + darray_for_each(acc->k, e) { struct disk_accounting_pos acc_k; - bpos_to_disk_accounting_pos(&acc_k, src->k.data[i].pos); + bpos_to_disk_accounting_pos(&acc_k, e->pos); - unsigned nr = src->k.data[i].nr_counters; u64 src_v[BCH_ACCOUNTING_MAX_COUNTERS]; u64 dst_v[BCH_ACCOUNTING_MAX_COUNTERS]; - bch2_accounting_mem_read_counters(c, i, dst_v, nr, false); - bch2_accounting_mem_read_counters(c, i, src_v, nr, true); + unsigned idx = e - acc->k.data; + unsigned nr = e->nr_counters; + bch2_accounting_mem_read_counters(acc, idx, dst_v, nr, false); + bch2_accounting_mem_read_counters(acc, idx, src_v, nr, true); if (memcmp(dst_v, src_v, nr * sizeof(u64))) { printbuf_reset(&buf); @@ -497,7 +472,7 @@ int bch2_accounting_gc_done(struct bch_fs *c) } err: fsck_err: - percpu_up_write(&c->mark_lock); + percpu_up_read(&c->mark_lock); printbuf_exit(&buf); bch2_trans_put(trans); bch_err_fn(c, ret); @@ -540,7 +515,7 @@ fsck_err: */ int bch2_accounting_read(struct bch_fs *c) { - struct bch_accounting_mem *acc = &c->accounting[0]; + 
struct bch_accounting_mem *acc = &c->accounting; struct btree_trans *trans = bch2_trans_get(c); int ret = for_each_btree_key(trans, iter, @@ -600,7 +575,7 @@ int bch2_accounting_read(struct bch_fs *c) bpos_to_disk_accounting_pos(&k, acc->k.data[i].pos); u64 v[BCH_ACCOUNTING_MAX_COUNTERS]; - bch2_accounting_mem_read_counters(c, i, v, ARRAY_SIZE(v), false); + bch2_accounting_mem_read_counters(acc, i, v, ARRAY_SIZE(v), false); switch (k.type) { case BCH_DISK_ACCOUNTING_persistent_reserved: @@ -754,15 +729,20 @@ void bch2_verify_accounting_clean(struct bch_fs *c) WARN_ON(mismatch); } -void bch2_accounting_free(struct bch_accounting_mem *acc) +void bch2_accounting_gc_free(struct bch_fs *c) { - darray_exit(&acc->k); - free_percpu(acc->v); - acc->v = NULL; - acc->nr_counters = 0; + lockdep_assert_held(&c->mark_lock); + + struct bch_accounting_mem *acc = &c->accounting; + + bch2_accounting_free_counters(acc, true); + acc->gc_running = false; } void bch2_fs_accounting_exit(struct bch_fs *c) { - bch2_accounting_free(&c->accounting[0]); + struct bch_accounting_mem *acc = &c->accounting; + + bch2_accounting_free_counters(acc, false); + darray_exit(&acc->k); } diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h index 4b37f0c24b4e..81dab01d1eb8 100644 --- a/fs/bcachefs/disk_accounting.h +++ b/fs/bcachefs/disk_accounting.h @@ -108,9 +108,11 @@ int bch2_accounting_mem_insert(struct bch_fs *, struct bkey_s_c_accounting, bool static inline int __bch2_accounting_mem_mod(struct bch_fs *c, struct bkey_s_c_accounting a, bool gc) { - struct bch_accounting_mem *acc = &c->accounting[gc]; + struct bch_accounting_mem *acc = &c->accounting; unsigned idx; + EBUG_ON(gc && !acc->gc_running); + while ((idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), accounting_pos_cmp, &a.k->p)) >= acc->k.nr) { int ret = bch2_accounting_mem_insert(c, a, gc); @@ -118,12 +120,12 @@ static inline int __bch2_accounting_mem_mod(struct bch_fs *c, struct bkey_s_c_ac return 
ret; } - unsigned offset = acc->k.data[idx].offset; + struct accounting_mem_entry *e = &acc->k.data[idx]; - EBUG_ON(bch2_accounting_counters(a.k) != acc->k.data[idx].nr_counters); + EBUG_ON(bch2_accounting_counters(a.k) != e->nr_counters); for (unsigned i = 0; i < bch2_accounting_counters(a.k); i++) - this_cpu_add(acc->v[offset + i], a.v->d[i]); + this_cpu_add(e->v[gc][i], a.v->d[i]); return 0; } @@ -170,37 +172,38 @@ static inline int bch2_accounting_mem_add(struct btree_trans *trans, struct bkey return ret; } -static inline void bch2_accounting_mem_read_counters(struct bch_fs *c, unsigned idx, - u64 *v, unsigned nr, bool gc) +static inline void bch2_accounting_mem_read_counters(struct bch_accounting_mem *acc, + unsigned idx, u64 *v, unsigned nr, bool gc) { memset(v, 0, sizeof(*v) * nr); - struct bch_accounting_mem *acc = &c->accounting[gc]; if (unlikely(idx >= acc->k.nr)) return; - unsigned offset = acc->k.data[idx].offset; - nr = min_t(unsigned, nr, acc->k.data[idx].nr_counters); + struct accounting_mem_entry *e = &acc->k.data[idx]; + + nr = min_t(unsigned, nr, e->nr_counters); for (unsigned i = 0; i < nr; i++) - v[i] = percpu_u64_get(acc->v + offset + i); + v[i] = percpu_u64_get(e->v[gc] + i); } static inline void bch2_accounting_mem_read(struct bch_fs *c, struct bpos p, u64 *v, unsigned nr) { - struct bch_accounting_mem *acc = &c->accounting[0]; + struct bch_accounting_mem *acc = &c->accounting; unsigned idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), accounting_pos_cmp, &p); - bch2_accounting_mem_read_counters(c, idx, v, nr, false); + bch2_accounting_mem_read_counters(acc, idx, v, nr, false); } int bch2_fs_replicas_usage_read(struct bch_fs *, darray_char *); int bch2_fs_accounting_read(struct bch_fs *, darray_char *, unsigned); void bch2_fs_accounting_to_text(struct printbuf *, struct bch_fs *); -int bch2_accounting_gc_done(struct bch_fs *); +int bch2_gc_accounting_start(struct bch_fs *); +int bch2_gc_accounting_done(struct bch_fs *); 
int bch2_accounting_read(struct bch_fs *); @@ -209,7 +212,7 @@ int bch2_dev_usage_init(struct bch_dev *, bool); void bch2_verify_accounting_clean(struct bch_fs *c); -void bch2_accounting_free(struct bch_accounting_mem *); +void bch2_accounting_gc_free(struct bch_fs *); void bch2_fs_accounting_exit(struct bch_fs *); #endif /* _BCACHEFS_DISK_ACCOUNTING_H */ diff --git a/fs/bcachefs/disk_accounting_types.h b/fs/bcachefs/disk_accounting_types.h index 5656ac540a10..1687a45177a7 100644 --- a/fs/bcachefs/disk_accounting_types.h +++ b/fs/bcachefs/disk_accounting_types.h @@ -4,17 +4,16 @@ #include "darray.h" -struct accounting_pos_offset { +struct accounting_mem_entry { struct bpos pos; struct bversion version; - u32 offset:24, - nr_counters:8; + unsigned nr_counters; + u64 __percpu *v[2]; }; struct bch_accounting_mem { - DARRAY(struct accounting_pos_offset) k; - u64 __percpu *v; - unsigned nr_counters; + DARRAY(struct accounting_mem_entry) k; + bool gc_running; }; #endif /* _BCACHEFS_DISK_ACCOUNTING_TYPES_H */ |