diff options
author | Kent Overstreet <kent.overstreet@linux.dev> | 2023-12-27 18:31:46 -0500 |
---|---|---|
committer | Kent Overstreet <kent.overstreet@linux.dev> | 2024-07-14 19:00:13 -0400 |
commit | 2744e5c9eb1a1090b5f61c955e934c70bfe6b04c (patch) | |
tree | 5ba22b05e8865c1718b0f405c302a950090e721a | |
parent | 929d954330142a6273697b2cbf855f0f904a12f5 (diff) |
bcachefs: KEY_TYPE_accounting
New key type for the disk space accounting rewrite.
- Holds a variable sized array of u64s (may be more than one for
accounting e.g. compressed and uncompressed size, or buckets and
sectors for a given data type)
- Updates are deltas, not new versions of the key: this means updates
to accounting can happen via the btree write buffer, which we'll be
teaching to accumulate deltas.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
-rw-r--r-- | fs/bcachefs/Makefile | 3 | ||||
-rw-r--r-- | fs/bcachefs/bcachefs_format.h | 56 | ||||
-rw-r--r-- | fs/bcachefs/bkey_methods.c | 1 | ||||
-rw-r--r-- | fs/bcachefs/disk_accounting.c | 70 | ||||
-rw-r--r-- | fs/bcachefs/disk_accounting.h | 52 | ||||
-rw-r--r-- | fs/bcachefs/disk_accounting_format.h | 144 | ||||
-rw-r--r-- | fs/bcachefs/recovery.c | 1 | ||||
-rw-r--r-- | fs/bcachefs/sb-downgrade.c | 25 |
8 files changed, 303 insertions, 49 deletions
diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index 66ca0bbee639..0ab533a2b03b 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -29,10 +29,11 @@ bcachefs-y := \ clock.o \ compress.o \ darray.o \ + data_update.o \ debug.o \ dirent.o \ + disk_accounting.o \ disk_groups.o \ - data_update.o \ ec.o \ errcode.o \ error.o \ diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index cdcb9bc4cc14..6ad5104e97a1 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -417,7 +417,8 @@ static inline void bkey_init(struct bkey *k) x(bucket_gens, 30) \ x(snapshot_tree, 31) \ x(logged_op_truncate, 32) \ - x(logged_op_finsert, 33) + x(logged_op_finsert, 33) \ + x(accounting, 34) enum bch_bkey_type { #define x(name, nr) KEY_TYPE_##name = nr, @@ -505,6 +506,9 @@ struct bch_sb_field { x(downgrade, 14) #include "alloc_background_format.h" +#include "dirent_format.h" +#include "disk_accounting_format.h" +#include "disk_groups_format.h" #include "extents_format.h" #include "ec_format.h" #include "dirent_format.h" @@ -602,49 +606,6 @@ LE64_BITMASK(BCH_KDF_SCRYPT_N, struct bch_sb_field_crypt, kdf_flags, 0, 16); LE64_BITMASK(BCH_KDF_SCRYPT_R, struct bch_sb_field_crypt, kdf_flags, 16, 32); LE64_BITMASK(BCH_KDF_SCRYPT_P, struct bch_sb_field_crypt, kdf_flags, 32, 48); -#define BCH_DATA_TYPES() \ - x(free, 0) \ - x(sb, 1) \ - x(journal, 2) \ - x(btree, 3) \ - x(user, 4) \ - x(cached, 5) \ - x(parity, 6) \ - x(stripe, 7) \ - x(need_gc_gens, 8) \ - x(need_discard, 9) \ - x(unstriped, 10) - -enum bch_data_type { -#define x(t, n) BCH_DATA_##t, - BCH_DATA_TYPES() -#undef x - BCH_DATA_NR -}; - -static inline bool data_type_is_empty(enum bch_data_type type) -{ - switch (type) { - case BCH_DATA_free: - case BCH_DATA_need_gc_gens: - case BCH_DATA_need_discard: - return true; - default: - return false; - } -} - -static inline bool data_type_is_hidden(enum bch_data_type type) -{ - switch (type) { - case BCH_DATA_sb: - case BCH_DATA_journal: - return true; - default: - return false; - } -} - /* * On clean shutdown, store btree roots and current journal sequence number in * the superblock: @@ -724,7 +685,8 @@ struct bch_sb_field_ext { x(subvolume_fs_parent, BCH_VERSION(1, 5)) \ x(btree_subvolume_children, BCH_VERSION(1, 6)) \ x(mi_btree_bitmap, BCH_VERSION(1, 7)) \ - x(bucket_stripe_sectors, BCH_VERSION(1, 8)) + x(bucket_stripe_sectors, BCH_VERSION(1, 8)) \ + x(disk_accounting_v2, BCH_VERSION(1, 9)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, @@ -1377,7 +1339,9 @@ enum btree_id_flags { x(rebalance_work, 18, BTREE_ID_SNAPSHOT_FIELD, \ BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie)) \ x(subvolume_children, 19, 0, \ - BIT_ULL(KEY_TYPE_set)) + BIT_ULL(KEY_TYPE_set)) \ + x(accounting, 20, BTREE_ID_SNAPSHOT_FIELD, \ + BIT_ULL(KEY_TYPE_accounting)) \ enum btree_id { #define x(name, nr, ...) BTREE_ID_##name = nr, diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index bd32aac05192..5f07cf853d0c 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -7,6 +7,7 @@ #include "btree_types.h" #include "alloc_background.h" #include "dirent.h" +#include "disk_accounting.h" #include "ec.h" #include "error.h" #include "extents.h" diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c new file mode 100644 index 000000000000..db501c6cf3ee --- /dev/null +++ b/fs/bcachefs/disk_accounting.c @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "btree_update.h" +#include "buckets.h" +#include "disk_accounting.h" +#include "replicas.h" + +static const char * const disk_accounting_type_strs[] = { +#define x(t, n, ...) [n] = #t, + BCH_DISK_ACCOUNTING_TYPES() +#undef x + NULL +}; + +int bch2_accounting_invalid(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags, + struct printbuf *err) +{ + return 0; +} + +void bch2_accounting_key_to_text(struct printbuf *out, struct disk_accounting_pos *k) +{ + if (k->type >= BCH_DISK_ACCOUNTING_TYPE_NR) { + prt_printf(out, "unknown type %u", k->type); + return; + } + + prt_str(out, disk_accounting_type_strs[k->type]); + prt_str(out, " "); + + switch (k->type) { + case BCH_DISK_ACCOUNTING_nr_inodes: + break; + case BCH_DISK_ACCOUNTING_persistent_reserved: + prt_printf(out, "replicas=%u", k->persistent_reserved.nr_replicas); + break; + case BCH_DISK_ACCOUNTING_replicas: + bch2_replicas_entry_to_text(out, &k->replicas); + break; + case BCH_DISK_ACCOUNTING_dev_data_type: + prt_printf(out, "dev=%u data_type=", k->dev_data_type.dev); + bch2_prt_data_type(out, k->dev_data_type.data_type); + break; + case BCH_DISK_ACCOUNTING_dev_stripe_buckets: + prt_printf(out, "dev=%u", k->dev_stripe_buckets.dev); + break; + } +} + +void bch2_accounting_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_s_c_accounting acc = bkey_s_c_to_accounting(k); + struct disk_accounting_pos acc_k; + bpos_to_disk_accounting_pos(&acc_k, k.k->p); + + bch2_accounting_key_to_text(out, &acc_k); + + for (unsigned i = 0; i < bch2_accounting_counters(k.k); i++) + prt_printf(out, " %lli", acc.v->d[i]); +} + +void bch2_accounting_swab(struct bkey_s k) +{ + for (u64 *p = (u64 *) k.v; + p < (u64 *) bkey_val_end(k); + p++) + *p = swab64(*p); +} diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h new file mode 100644 index 000000000000..178ec25b7ef4 --- /dev/null +++ b/fs/bcachefs/disk_accounting.h @@ -0,0 +1,52 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_DISK_ACCOUNTING_H +#define _BCACHEFS_DISK_ACCOUNTING_H + +static inline unsigned bch2_accounting_counters(const struct bkey *k) +{ + return bkey_val_u64s(k) - offsetof(struct bch_accounting, d) / sizeof(u64); +} + +static inline void bch2_accounting_accumulate(struct bkey_i_accounting *dst, + struct bkey_s_c_accounting src) +{ + EBUG_ON(dst->k.u64s != src.k->u64s); + + for (unsigned i = 0; i < bch2_accounting_counters(&dst->k); i++) + dst->v.d[i] += src.v->d[i]; + if (bversion_cmp(dst->k.version, src.k->version) < 0) + dst->k.version = src.k->version; +} + +static inline void bpos_to_disk_accounting_pos(struct disk_accounting_pos *acc, struct bpos p) +{ + acc->_pad = p; +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + bch2_bpos_swab(&acc->_pad); +#endif +} + +static inline struct bpos disk_accounting_pos_to_bpos(struct disk_accounting_pos *k) +{ + struct bpos ret = k->_pad; + +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + bch2_bpos_swab(&ret); +#endif + return ret; +} + +int bch2_accounting_invalid(struct bch_fs *, struct bkey_s_c, + enum bch_validate_flags, struct printbuf *); +void bch2_accounting_key_to_text(struct printbuf *, struct disk_accounting_pos *); +void bch2_accounting_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +void bch2_accounting_swab(struct bkey_s); + +#define bch2_bkey_ops_accounting ((struct bkey_ops) { \ + .key_invalid = bch2_accounting_invalid, \ + .val_to_text = bch2_accounting_to_text, \ + .swab = bch2_accounting_swab, \ + .min_val_size = 8, \ +}) + +#endif /* _BCACHEFS_DISK_ACCOUNTING_H */ diff --git a/fs/bcachefs/disk_accounting_format.h b/fs/bcachefs/disk_accounting_format.h new file mode 100644 index 000000000000..4ff42466f2a6 --- /dev/null +++ b/fs/bcachefs/disk_accounting_format.h @@ -0,0 +1,144 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_DISK_ACCOUNTING_FORMAT_H +#define _BCACHEFS_DISK_ACCOUNTING_FORMAT_H + +#include "replicas_format.h" + +/* + * Disk accounting - KEY_TYPE_accounting - on disk format: + * + * Here, the key has considerably more structure than a typical key (bpos); an + * accounting key is 'struct disk_accounting_pos', which is a union of bpos. + * + * More specifically: a key is just a muliword integer (where word endianness + * matches native byte order), so we're treating bpos as an opaque 20 byte + * integer and mapping bch_accounting_key to that. + * + * This is a type-tagged union of all our various subtypes; a disk accounting + * key can be device counters, replicas counters, et cetera - it's extensible. + * + * The value is a list of u64s or s64s; the number of counters is specific to a + * given accounting type. + * + * Unlike with other key types, updates are _deltas_, and the deltas are not + * resolved until the update to the underlying btree, done by btree write buffer + * flush or journal replay. + * + * Journal replay in particular requires special handling. The journal tracks a + * range of entries which may possibly have not yet been applied to the btree + * yet - it does not know definitively whether individual entries are dirty and + * still need to be applied. + * + * To handle this, we use the version field of struct bkey, and give every + * accounting update a unique version number - a total ordering in time; the + * version number is derived from the key's position in the journal. Then + * journal replay can compare the version number of the key from the journal + * with the version number of the key in the btree to determine if a key needs + * to be replayed. + * + * For this to work, we must maintain this strict time ordering of updates as + * they are flushed to the btree, both via write buffer flush and via journal + * replay. This has complications for the write buffer code while journal replay + * is still in progress; the write buffer cannot flush any accounting keys to + * the btree until journal replay has finished replaying its accounting keys, or + * the (newer) version number of the keys from the write buffer will cause + * updates from journal replay to be lost. + */ + +struct bch_accounting { + struct bch_val v; + __u64 d[]; +}; + +#define BCH_ACCOUNTING_MAX_COUNTERS 3 + +#define BCH_DATA_TYPES() \ + x(free, 0) \ + x(sb, 1) \ + x(journal, 2) \ + x(btree, 3) \ + x(user, 4) \ + x(cached, 5) \ + x(parity, 6) \ + x(stripe, 7) \ + x(need_gc_gens, 8) \ + x(need_discard, 9) \ + x(unstriped, 10) + +enum bch_data_type { +#define x(t, n) BCH_DATA_##t, + BCH_DATA_TYPES() +#undef x + BCH_DATA_NR +}; + +static inline bool data_type_is_empty(enum bch_data_type type) +{ + switch (type) { + case BCH_DATA_free: + case BCH_DATA_need_gc_gens: + case BCH_DATA_need_discard: + return true; + default: + return false; + } +} + +static inline bool data_type_is_hidden(enum bch_data_type type) +{ + switch (type) { + case BCH_DATA_sb: + case BCH_DATA_journal: + return true; + default: + return false; + } +} + +#define BCH_DISK_ACCOUNTING_TYPES() \ + x(nr_inodes, 0) \ + x(persistent_reserved, 1) \ + x(replicas, 2) \ + x(dev_data_type, 3) \ + x(dev_stripe_buckets, 4) + +enum disk_accounting_type { +#define x(f, nr) BCH_DISK_ACCOUNTING_##f = nr, + BCH_DISK_ACCOUNTING_TYPES() +#undef x + BCH_DISK_ACCOUNTING_TYPE_NR, +}; + +struct bch_nr_inodes { +}; + +struct bch_persistent_reserved { + __u8 nr_replicas; +}; + +struct bch_dev_data_type { + __u8 dev; + __u8 data_type; +}; + +struct bch_dev_stripe_buckets { + __u8 dev; +}; + +struct disk_accounting_pos { + union { + struct { + __u8 type; + union { + struct bch_nr_inodes nr_inodes; + struct bch_persistent_reserved persistent_reserved; + struct bch_replicas_entry_v1 replicas; + struct bch_dev_data_type dev_data_type; + struct bch_dev_stripe_buckets dev_stripe_buckets; + }; + }; + struct bpos _pad; + }; +}; + +#endif /* _BCACHEFS_DISK_ACCOUNTING_FORMAT_H */ diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 34c4899c4599..d336a7c69edd 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -90,6 +90,7 @@ static void bch2_reconstruct_alloc(struct bch_fs *c) __set_bit_le64(BCH_FSCK_ERR_freespace_hole_missing, ext->errors_silent); __set_bit_le64(BCH_FSCK_ERR_ptr_to_missing_backpointer, ext->errors_silent); __set_bit_le64(BCH_FSCK_ERR_lru_entry_bad, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_accounting_mismatch, ext->errors_silent); c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); bch2_write_super(c); diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index 9596f470e166..dfbbd33c8731 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -54,11 +54,32 @@ BCH_FSCK_ERR_subvol_children_not_set) \ x(mi_btree_bitmap, \ BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ - BCH_FSCK_ERR_btree_bitmap_not_marked) + BCH_FSCK_ERR_btree_bitmap_not_marked) \ + x(disk_accounting_v2, \ + BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ + BCH_FSCK_ERR_bkey_version_in_future, \ + BCH_FSCK_ERR_dev_usage_buckets_wrong, \ + BCH_FSCK_ERR_dev_usage_sectors_wrong, \ + BCH_FSCK_ERR_dev_usage_fragmented_wrong, \ + BCH_FSCK_ERR_accounting_mismatch) #define DOWNGRADE_TABLE() \ x(bucket_stripe_sectors, \ - 0) + 0) \ + x(disk_accounting_v2, \ + BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ + BCH_FSCK_ERR_dev_usage_buckets_wrong, \ + BCH_FSCK_ERR_dev_usage_sectors_wrong, \ + BCH_FSCK_ERR_dev_usage_fragmented_wrong, \ + BCH_FSCK_ERR_fs_usage_hidden_wrong, \ + BCH_FSCK_ERR_fs_usage_btree_wrong, \ + BCH_FSCK_ERR_fs_usage_data_wrong, \ + BCH_FSCK_ERR_fs_usage_cached_wrong, \ + BCH_FSCK_ERR_fs_usage_reserved_wrong, \ + BCH_FSCK_ERR_fs_usage_nr_inodes_wrong, \ + BCH_FSCK_ERR_fs_usage_persistent_reserved_wrong, \ + BCH_FSCK_ERR_fs_usage_replicas_wrong, \ + BCH_FSCK_ERR_bkey_version_in_future) struct upgrade_downgrade_entry { u64 recovery_passes; |