summaryrefslogtreecommitdiff
path: root/fs/btrfs/zoned.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2023-10-30 10:42:06 -1000
committerLinus Torvalds <torvalds@linux-foundation.org>2023-10-30 10:42:06 -1000
commitd5acbc60fafbe0fc94c552ce916dd592cd4c6371 (patch)
treec2d70058845399ebcf894e551e6b0c053dd3e836 /fs/btrfs/zoned.c
parent8829687a4ac1d484639425a691da46f6e361aec1 (diff)
parentc6e8f898f56fae2cb5bc4396bec480f23cd8b066 (diff)
Merge tag 'for-6.7-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
Pull btrfs updates from David Sterba: "New features: - raid-stripe-tree New tree for logical file extent mapping where the physical mapping may not match on multiple devices. This is now used in zoned mode to implement RAID0/RAID1* profiles, but can be used in non-zoned mode as well. The support for RAID56 is in development and will eventually fix the problems with the current implementation. This is a backward incompatible feature and has to be enabled at mkfs time. - simple quota accounting (squota) A simplified mode of qgroup that accounts all space on the initial extent owners (a subvolume), the snapshots are then cheap to create and delete. The deletion of snapshots in fully accounting qgroups is a known CPU/IO performance bottleneck. The squota is not suitable for the general use case but works well for containers where the original subvolume exists for the whole time. This is a backward incompatible feature as it needs extending some structures, but can be enabled on an existing filesystem. - temporary filesystem fsid (temp_fsid) The fsid identifies a filesystem and is hard coded in the structures, which disallows mounting the same fsid found on different devices. For a single device filesystem this is not strictly necessary, a new temporary fsid can be generated on mount e.g. after a device is cloned. This will be used by Steam Deck for root partition A/B testing, or can be used for VM root images. Other user visible changes: - filesystems with partially finished metadata_uuid conversion cannot be mounted anymore and the uuid fixup has to be done by btrfs-progs (btrfstune). Performance improvements: - reduce reservations for checksum deletions (with enabled free space tree by factor of 4), on a sample workload on file with many extents the deletion time decreased by 12% - make extent state merges more efficient during insertions, reduce rb-tree iterations (run time of critical functions reduced by 5%) Core changes: - the integrity check functionality has been removed, this was a debugging feature and removal does not affect other integrity checks like checksums or tree-checker - space reservation changes: - more efficient delayed ref reservations, this avoids building up too much work or overusing or exhausting the global block reserve in some situations - move delayed refs reservation to the transaction start time, this prevents some ENOSPC corner cases related to exhaustion of global reserve - improvements in reducing excessive reservations for block group items - adjust overcommit logic in near full situations, account for one more chunk to eventually allocate metadata chunk, this is mostly relevant for small filesystems (<10GiB) - single device filesystems are scanned but not registered (except seed devices), this allows temp_fsid to work - qgroup iterations do not need GFP_ATOMIC allocations anymore - cleanups, refactoring, reduced data structure size, function parameter simplifications, error handling fixes" * tag 'for-6.7-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (156 commits) btrfs: open code timespec64 in struct btrfs_inode btrfs: remove redundant log root tree index assignment during log sync btrfs: remove redundant initialization of variable dirty in btrfs_update_time() btrfs: sysfs: show temp_fsid feature btrfs: disable the device add feature for temp-fsid btrfs: disable the seed feature for temp-fsid btrfs: update comment for temp-fsid, fsid, and metadata_uuid btrfs: remove pointless empty log context list check when syncing log btrfs: update comment for struct btrfs_inode::lock btrfs: remove pointless barrier from btrfs_sync_file() btrfs: add and use helpers for reading and writing last_trans_committed btrfs: add and use helpers for reading and writing fs_info->generation btrfs: add and use helpers for reading and writing log_transid btrfs: add and use helpers for reading and writing last_log_commit btrfs: support cloned-device mount capability btrfs: add helper function find_fsid_by_disk btrfs: stop reserving excessive space for block group item insertions btrfs: stop reserving excessive space for block group item updates btrfs: reorder btrfs_inode to fill gaps btrfs: open code btrfs_ordered_inode_tree in btrfs_inode ...
Diffstat (limited to 'fs/btrfs/zoned.c')
-rw-r--r--fs/btrfs/zoned.c452
1 files changed, 289 insertions, 163 deletions
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 09bc325d075d..3504ade30cb0 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -1282,21 +1282,284 @@ out:
return ret;
}
+struct zone_info {
+ u64 physical;
+ u64 capacity;
+ u64 alloc_offset;
+};
+
+static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx,
+ struct zone_info *info, unsigned long *active,
+ struct map_lookup *map)
+{
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+ struct btrfs_device *device = map->stripes[zone_idx].dev;
+ int dev_replace_is_ongoing = 0;
+ unsigned int nofs_flag;
+ struct blk_zone zone;
+ int ret;
+
+ info->physical = map->stripes[zone_idx].physical;
+
+ if (!device->bdev) {
+ info->alloc_offset = WP_MISSING_DEV;
+ return 0;
+ }
+
+ /* Consider a zone as active if we can allow any number of active zones. */
+ if (!device->zone_info->max_active_zones)
+ __set_bit(zone_idx, active);
+
+ if (!btrfs_dev_is_sequential(device, info->physical)) {
+ info->alloc_offset = WP_CONVENTIONAL;
+ return 0;
+ }
+
+ /* This zone will be used for allocation, so mark this zone non-empty. */
+ btrfs_dev_clear_zone_empty(device, info->physical);
+
+ down_read(&dev_replace->rwsem);
+ dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
+ if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL)
+ btrfs_dev_clear_zone_empty(dev_replace->tgtdev, info->physical);
+ up_read(&dev_replace->rwsem);
+
+ /*
+ * The group is mapped to a sequential zone. Get the zone write pointer
+ * to determine the allocation offset within the zone.
+ */
+ WARN_ON(!IS_ALIGNED(info->physical, fs_info->zone_size));
+ nofs_flag = memalloc_nofs_save();
+ ret = btrfs_get_dev_zone(device, info->physical, &zone);
+ memalloc_nofs_restore(nofs_flag);
+ if (ret) {
+ if (ret != -EIO && ret != -EOPNOTSUPP)
+ return ret;
+ info->alloc_offset = WP_MISSING_DEV;
+ return 0;
+ }
+
+ if (zone.type == BLK_ZONE_TYPE_CONVENTIONAL) {
+ btrfs_err_in_rcu(fs_info,
+ "zoned: unexpected conventional zone %llu on device %s (devid %llu)",
+ zone.start << SECTOR_SHIFT, rcu_str_deref(device->name),
+ device->devid);
+ return -EIO;
+ }
+
+ info->capacity = (zone.capacity << SECTOR_SHIFT);
+
+ switch (zone.cond) {
+ case BLK_ZONE_COND_OFFLINE:
+ case BLK_ZONE_COND_READONLY:
+ btrfs_err(fs_info,
+ "zoned: offline/readonly zone %llu on device %s (devid %llu)",
+ (info->physical >> device->zone_info->zone_size_shift),
+ rcu_str_deref(device->name), device->devid);
+ info->alloc_offset = WP_MISSING_DEV;
+ break;
+ case BLK_ZONE_COND_EMPTY:
+ info->alloc_offset = 0;
+ break;
+ case BLK_ZONE_COND_FULL:
+ info->alloc_offset = info->capacity;
+ break;
+ default:
+ /* Partially used zone. */
+ info->alloc_offset = ((zone.wp - zone.start) << SECTOR_SHIFT);
+ __set_bit(zone_idx, active);
+ break;
+ }
+
+ return 0;
+}
+
+static int btrfs_load_block_group_single(struct btrfs_block_group *bg,
+ struct zone_info *info,
+ unsigned long *active)
+{
+ if (info->alloc_offset == WP_MISSING_DEV) {
+ btrfs_err(bg->fs_info,
+ "zoned: cannot recover write pointer for zone %llu",
+ info->physical);
+ return -EIO;
+ }
+
+ bg->alloc_offset = info->alloc_offset;
+ bg->zone_capacity = info->capacity;
+ if (test_bit(0, active))
+ set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
+ return 0;
+}
+
+static int btrfs_load_block_group_dup(struct btrfs_block_group *bg,
+ struct map_lookup *map,
+ struct zone_info *zone_info,
+ unsigned long *active)
+{
+ struct btrfs_fs_info *fs_info = bg->fs_info;
+
+ if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) {
+ btrfs_err(fs_info, "zoned: data DUP profile needs raid-stripe-tree");
+ return -EINVAL;
+ }
+
+ if (zone_info[0].alloc_offset == WP_MISSING_DEV) {
+ btrfs_err(bg->fs_info,
+ "zoned: cannot recover write pointer for zone %llu",
+ zone_info[0].physical);
+ return -EIO;
+ }
+ if (zone_info[1].alloc_offset == WP_MISSING_DEV) {
+ btrfs_err(bg->fs_info,
+ "zoned: cannot recover write pointer for zone %llu",
+ zone_info[1].physical);
+ return -EIO;
+ }
+ if (zone_info[0].alloc_offset != zone_info[1].alloc_offset) {
+ btrfs_err(bg->fs_info,
+ "zoned: write pointer offset mismatch of zones in DUP profile");
+ return -EIO;
+ }
+
+ if (test_bit(0, active) != test_bit(1, active)) {
+ if (!btrfs_zone_activate(bg))
+ return -EIO;
+ } else if (test_bit(0, active)) {
+ set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
+ }
+
+ bg->alloc_offset = zone_info[0].alloc_offset;
+ bg->zone_capacity = min(zone_info[0].capacity, zone_info[1].capacity);
+ return 0;
+}
+
+static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg,
+ struct map_lookup *map,
+ struct zone_info *zone_info,
+ unsigned long *active)
+{
+ struct btrfs_fs_info *fs_info = bg->fs_info;
+ int i;
+
+ if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) {
+ btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
+ btrfs_bg_type_to_raid_name(map->type));
+ return -EINVAL;
+ }
+
+ for (i = 0; i < map->num_stripes; i++) {
+ if (zone_info[i].alloc_offset == WP_MISSING_DEV ||
+ zone_info[i].alloc_offset == WP_CONVENTIONAL)
+ continue;
+
+ if ((zone_info[0].alloc_offset != zone_info[i].alloc_offset) &&
+ !btrfs_test_opt(fs_info, DEGRADED)) {
+ btrfs_err(fs_info,
+ "zoned: write pointer offset mismatch of zones in %s profile",
+ btrfs_bg_type_to_raid_name(map->type));
+ return -EIO;
+ }
+ if (test_bit(0, active) != test_bit(i, active)) {
+ if (!btrfs_test_opt(fs_info, DEGRADED) &&
+ !btrfs_zone_activate(bg)) {
+ return -EIO;
+ }
+ } else {
+ if (test_bit(0, active))
+ set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
+ }
+ /* In case a device is missing we have a cap of 0, so don't use it. */
+ bg->zone_capacity = min_not_zero(zone_info[0].capacity,
+ zone_info[1].capacity);
+ }
+
+ if (zone_info[0].alloc_offset != WP_MISSING_DEV)
+ bg->alloc_offset = zone_info[0].alloc_offset;
+ else
+ bg->alloc_offset = zone_info[i - 1].alloc_offset;
+
+ return 0;
+}
+
+static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg,
+ struct map_lookup *map,
+ struct zone_info *zone_info,
+ unsigned long *active)
+{
+ struct btrfs_fs_info *fs_info = bg->fs_info;
+
+ if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) {
+ btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
+ btrfs_bg_type_to_raid_name(map->type));
+ return -EINVAL;
+ }
+
+ for (int i = 0; i < map->num_stripes; i++) {
+ if (zone_info[i].alloc_offset == WP_MISSING_DEV ||
+ zone_info[i].alloc_offset == WP_CONVENTIONAL)
+ continue;
+
+ if (test_bit(0, active) != test_bit(i, active)) {
+ if (!btrfs_zone_activate(bg))
+ return -EIO;
+ } else {
+ if (test_bit(0, active))
+ set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
+ }
+ bg->zone_capacity += zone_info[i].capacity;
+ bg->alloc_offset += zone_info[i].alloc_offset;
+ }
+
+ return 0;
+}
+
+static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg,
+ struct map_lookup *map,
+ struct zone_info *zone_info,
+ unsigned long *active)
+{
+ struct btrfs_fs_info *fs_info = bg->fs_info;
+
+ if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) {
+ btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
+ btrfs_bg_type_to_raid_name(map->type));
+ return -EINVAL;
+ }
+
+ for (int i = 0; i < map->num_stripes; i++) {
+ if (zone_info[i].alloc_offset == WP_MISSING_DEV ||
+ zone_info[i].alloc_offset == WP_CONVENTIONAL)
+ continue;
+
+ if (test_bit(0, active) != test_bit(i, active)) {
+ if (!btrfs_zone_activate(bg))
+ return -EIO;
+ } else {
+ if (test_bit(0, active))
+ set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
+ }
+
+ if ((i % map->sub_stripes) == 0) {
+ bg->zone_capacity += zone_info[i].capacity;
+ bg->alloc_offset += zone_info[i].alloc_offset;
+ }
+ }
+
+ return 0;
+}
+
int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
{
struct btrfs_fs_info *fs_info = cache->fs_info;
struct extent_map_tree *em_tree = &fs_info->mapping_tree;
struct extent_map *em;
struct map_lookup *map;
- struct btrfs_device *device;
u64 logical = cache->start;
u64 length = cache->length;
+ struct zone_info *zone_info = NULL;
int ret;
int i;
- unsigned int nofs_flag;
- u64 *alloc_offsets = NULL;
- u64 *caps = NULL;
- u64 *physical = NULL;
unsigned long *active = NULL;
u64 last_alloc = 0;
u32 num_sequential = 0, num_conventional = 0;
@@ -1328,20 +1591,8 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
goto out;
}
- alloc_offsets = kcalloc(map->num_stripes, sizeof(*alloc_offsets), GFP_NOFS);
- if (!alloc_offsets) {
- ret = -ENOMEM;
- goto out;
- }
-
- caps = kcalloc(map->num_stripes, sizeof(*caps), GFP_NOFS);
- if (!caps) {
- ret = -ENOMEM;
- goto out;
- }
-
- physical = kcalloc(map->num_stripes, sizeof(*physical), GFP_NOFS);
- if (!physical) {
+ zone_info = kcalloc(map->num_stripes, sizeof(*zone_info), GFP_NOFS);
+ if (!zone_info) {
ret = -ENOMEM;
goto out;
}
@@ -1353,98 +1604,14 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
}
for (i = 0; i < map->num_stripes; i++) {
- bool is_sequential;
- struct blk_zone zone;
- struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
- int dev_replace_is_ongoing = 0;
-
- device = map->stripes[i].dev;
- physical[i] = map->stripes[i].physical;
-
- if (device->bdev == NULL) {
- alloc_offsets[i] = WP_MISSING_DEV;
- continue;
- }
-
- is_sequential = btrfs_dev_is_sequential(device, physical[i]);
- if (is_sequential)
- num_sequential++;
- else
- num_conventional++;
-
- /*
- * Consider a zone as active if we can allow any number of
- * active zones.
- */
- if (!device->zone_info->max_active_zones)
- __set_bit(i, active);
-
- if (!is_sequential) {
- alloc_offsets[i] = WP_CONVENTIONAL;
- continue;
- }
-
- /*
- * This zone will be used for allocation, so mark this zone
- * non-empty.
- */
- btrfs_dev_clear_zone_empty(device, physical[i]);
-
- down_read(&dev_replace->rwsem);
- dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
- if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL)
- btrfs_dev_clear_zone_empty(dev_replace->tgtdev, physical[i]);
- up_read(&dev_replace->rwsem);
-
- /*
- * The group is mapped to a sequential zone. Get the zone write
- * pointer to determine the allocation offset within the zone.
- */
- WARN_ON(!IS_ALIGNED(physical[i], fs_info->zone_size));
- nofs_flag = memalloc_nofs_save();
- ret = btrfs_get_dev_zone(device, physical[i], &zone);
- memalloc_nofs_restore(nofs_flag);
- if (ret == -EIO || ret == -EOPNOTSUPP) {
- ret = 0;
- alloc_offsets[i] = WP_MISSING_DEV;
- continue;
- } else if (ret) {
- goto out;
- }
-
- if (zone.type == BLK_ZONE_TYPE_CONVENTIONAL) {
- btrfs_err_in_rcu(fs_info,
- "zoned: unexpected conventional zone %llu on device %s (devid %llu)",
- zone.start << SECTOR_SHIFT,
- rcu_str_deref(device->name), device->devid);
- ret = -EIO;
+ ret = btrfs_load_zone_info(fs_info, i, &zone_info[i], active, map);
+ if (ret)
goto out;
- }
- caps[i] = (zone.capacity << SECTOR_SHIFT);
-
- switch (zone.cond) {
- case BLK_ZONE_COND_OFFLINE:
- case BLK_ZONE_COND_READONLY:
- btrfs_err(fs_info,
- "zoned: offline/readonly zone %llu on device %s (devid %llu)",
- physical[i] >> device->zone_info->zone_size_shift,
- rcu_str_deref(device->name), device->devid);
- alloc_offsets[i] = WP_MISSING_DEV;
- break;
- case BLK_ZONE_COND_EMPTY:
- alloc_offsets[i] = 0;
- break;
- case BLK_ZONE_COND_FULL:
- alloc_offsets[i] = caps[i];
- break;
- default:
- /* Partially used zone */
- alloc_offsets[i] =
- ((zone.wp - zone.start) << SECTOR_SHIFT);
- __set_bit(i, active);
- break;
- }
+ if (zone_info[i].alloc_offset == WP_CONVENTIONAL)
+ num_conventional++;
+ else
+ num_sequential++;
}
if (num_sequential > 0)
@@ -1468,63 +1635,24 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
case 0: /* single */
- if (alloc_offsets[0] == WP_MISSING_DEV) {
- btrfs_err(fs_info,
- "zoned: cannot recover write pointer for zone %llu",
- physical[0]);
- ret = -EIO;
- goto out;
- }
- cache->alloc_offset = alloc_offsets[0];
- cache->zone_capacity = caps[0];
- if (test_bit(0, active))
- set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags);
+ ret = btrfs_load_block_group_single(cache, &zone_info[0], active);
break;
case BTRFS_BLOCK_GROUP_DUP:
- if (map->type & BTRFS_BLOCK_GROUP_DATA) {
- btrfs_err(fs_info, "zoned: profile DUP not yet supported on data bg");
- ret = -EINVAL;
- goto out;
- }
- if (alloc_offsets[0] == WP_MISSING_DEV) {
- btrfs_err(fs_info,
- "zoned: cannot recover write pointer for zone %llu",
- physical[0]);
- ret = -EIO;
- goto out;
- }
- if (alloc_offsets[1] == WP_MISSING_DEV) {
- btrfs_err(fs_info,
- "zoned: cannot recover write pointer for zone %llu",
- physical[1]);
- ret = -EIO;
- goto out;
- }
- if (alloc_offsets[0] != alloc_offsets[1]) {
- btrfs_err(fs_info,
- "zoned: write pointer offset mismatch of zones in DUP profile");
- ret = -EIO;
- goto out;
- }
- if (test_bit(0, active) != test_bit(1, active)) {
- if (!btrfs_zone_activate(cache)) {
- ret = -EIO;
- goto out;
- }
- } else {
- if (test_bit(0, active))
- set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
- &cache->runtime_flags);
- }
- cache->alloc_offset = alloc_offsets[0];
- cache->zone_capacity = min(caps[0], caps[1]);
+ ret = btrfs_load_block_group_dup(cache, map, zone_info, active);
break;
case BTRFS_BLOCK_GROUP_RAID1:
+ case BTRFS_BLOCK_GROUP_RAID1C3:
+ case BTRFS_BLOCK_GROUP_RAID1C4:
+ ret = btrfs_load_block_group_raid1(cache, map, zone_info, active);
+ break;
case BTRFS_BLOCK_GROUP_RAID0:
+ ret = btrfs_load_block_group_raid0(cache, map, zone_info, active);
+ break;
case BTRFS_BLOCK_GROUP_RAID10:
+ ret = btrfs_load_block_group_raid10(cache, map, zone_info, active);
+ break;
case BTRFS_BLOCK_GROUP_RAID5:
case BTRFS_BLOCK_GROUP_RAID6:
- /* non-single profiles are not supported yet */
default:
btrfs_err(fs_info, "zoned: profile %s not yet supported",
btrfs_bg_type_to_raid_name(map->type));
@@ -1570,9 +1698,7 @@ out:
cache->physical_map = NULL;
}
bitmap_free(active);
- kfree(physical);
- kfree(caps);
- kfree(alloc_offsets);
+ kfree(zone_info);
free_extent_map(em);
return ret;
@@ -1609,7 +1735,7 @@ void btrfs_redirty_list_add(struct btrfs_transaction *trans,
set_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags);
set_extent_buffer_dirty(eb);
set_extent_bit(&trans->dirty_pages, eb->start, eb->start + eb->len - 1,
- EXTENT_DIRTY | EXTENT_NOWAIT, NULL);
+ EXTENT_DIRTY, NULL);
}
bool btrfs_use_zone_append(struct btrfs_bio *bbio)
@@ -1887,7 +2013,7 @@ static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical,
int i, ret;
ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
- &mapped_length, &bioc, NULL, NULL, 1);
+ &mapped_length, &bioc, NULL, NULL);
if (ret || !bioc || mapped_length < PAGE_SIZE) {
ret = -EIO;
goto out_put_bioc;