diff options
author | Filipe Manana <fdmanana@suse.com> | 2022-04-13 16:20:43 +0100 |
---|---|---|
committer | David Sterba <dsterba@suse.com> | 2022-05-16 17:03:13 +0200 |
commit | 2306e83e730a86fbf6855d4611438d8632ad2e73 (patch) | |
tree | 8b868c7b95564d58af56645035e2fe190a2ecac6 /fs | |
parent | 8b01f931c140a943e837d86a9b82f0910629492e (diff) |
btrfs: avoid double search for block group during NOCOW writes
When doing a NOCOW write, either through direct IO or buffered IO, we do
two lookups for the block group that contains the target extent: once
when we call btrfs_inc_nocow_writers() and then later again when we call
btrfs_dec_nocow_writers() after creating the ordered extent.
The lookups require taking a lock and navigating the red black tree used
to track all block groups, which can take a non-negligible amount of time
for a large filesystem with thousands of block groups, as well as lock
contention and cache line bouncing.
Improve on this by having a single block group search: making
btrfs_inc_nocow_writers() return the block group to its caller and then
have the caller pass that block group to btrfs_dec_nocow_writers().
This is part of a patchset comprised of the following patches:
btrfs: remove search start argument from first_logical_byte()
btrfs: use rbtree with leftmost node cached for tracking lowest block group
btrfs: use a read/write lock for protecting the block groups tree
btrfs: return block group directly at btrfs_next_block_group()
btrfs: avoid double search for block group during NOCOW writes
The following test was used to test these changes from a performance
perspective:
$ cat test.sh
#!/bin/bash
modprobe null_blk nr_devices=0
NULL_DEV_PATH=/sys/kernel/config/nullb/nullb0
mkdir $NULL_DEV_PATH
if [ $? -ne 0 ]; then
echo "Failed to create nullb0 directory."
exit 1
fi
echo 2 > $NULL_DEV_PATH/submit_queues
echo 16384 > $NULL_DEV_PATH/size # 16G
echo 1 > $NULL_DEV_PATH/memory_backed
echo 1 > $NULL_DEV_PATH/power
DEV=/dev/nullb0
MNT=/mnt/nullb0
LOOP_MNT="$MNT/loop"
MOUNT_OPTIONS="-o ssd -o nodatacow"
MKFS_OPTIONS="-R free-space-tree -O no-holes"
cat <<EOF > /tmp/fio-job.ini
[io_uring_writes]
rw=randwrite
fsync=0
fallocate=posix
group_reporting=1
direct=1
ioengine=io_uring
iodepth=64
bs=64k
filesize=1g
runtime=300
time_based
directory=$LOOP_MNT
numjobs=8
thread
EOF
echo performance | \
tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
echo
echo "Using config:"
echo
cat /tmp/fio-job.ini
echo
umount $MNT &> /dev/null
mkfs.btrfs -f $MKFS_OPTIONS $DEV &> /dev/null
mount $MOUNT_OPTIONS $DEV $MNT
mkdir $LOOP_MNT
truncate -s 4T $MNT/loopfile
mkfs.btrfs -f $MKFS_OPTIONS $MNT/loopfile &> /dev/null
mount $MOUNT_OPTIONS $MNT/loopfile $LOOP_MNT
# Trigger the allocation of about 3500 data block groups, without
# actually consuming space on underlying filesystem, just to make
# the tree of block group large.
fallocate -l 3500G $LOOP_MNT/filler
fio /tmp/fio-job.ini
umount $LOOP_MNT
umount $MNT
echo 0 > $NULL_DEV_PATH/power
rmdir $NULL_DEV_PATH
The test was run on a non-debug kernel (Debian's default kernel config),
the result were the following.
Before patchset:
WRITE: bw=1455MiB/s (1526MB/s), 1455MiB/s-1455MiB/s (1526MB/s-1526MB/s), io=426GiB (458GB), run=300006-300006msec
After patchset:
WRITE: bw=1503MiB/s (1577MB/s), 1503MiB/s-1503MiB/s (1577MB/s-1577MB/s), io=440GiB (473GB), run=300006-300006msec
+3.3% write throughput and +3.3% IO done in the same time period.
The test has somewhat limited coverage scope, as with only NOCOW writes
we get less contention on the red black tree of block groups, since we
don't have the extra contention caused by COW writes, namely when
allocating data extents, pinning and unpinning data extents, but on the
hand there's access to tree in the NOCOW path, when incrementing a block
group's number of NOCOW writers.
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Diffstat (limited to 'fs')
-rw-r--r-- | fs/btrfs/block-group.c | 58 | ||||
-rw-r--r-- | fs/btrfs/block-group.h | 5 | ||||
-rw-r--r-- | fs/btrfs/inode.c | 26 |
3 files changed, 60 insertions, 29 deletions
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index db112a01d711..9739f3e8230a 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -284,42 +284,66 @@ struct btrfs_block_group *btrfs_next_block_group( return cache; } -bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr) +/** + * Check if we can do a NOCOW write for a given extent. + * + * @fs_info: The filesystem information object. + * @bytenr: Logical start address of the extent. + * + * Check if we can do a NOCOW write for the given extent, and increments the + * number of NOCOW writers in the block group that contains the extent, as long + * as the block group exists and it's currently not in read-only mode. + * + * Returns: A non-NULL block group pointer if we can do a NOCOW write, the caller + * is responsible for calling btrfs_dec_nocow_writers() later. + * + * Or NULL if we can not do a NOCOW write + */ +struct btrfs_block_group *btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, + u64 bytenr) { struct btrfs_block_group *bg; - bool ret = true; + bool can_nocow = true; bg = btrfs_lookup_block_group(fs_info, bytenr); if (!bg) - return false; + return NULL; spin_lock(&bg->lock); if (bg->ro) - ret = false; + can_nocow = false; else atomic_inc(&bg->nocow_writers); spin_unlock(&bg->lock); - /* No put on block group, done by btrfs_dec_nocow_writers */ - if (!ret) + if (!can_nocow) { btrfs_put_block_group(bg); + return NULL; + } - return ret; + /* No put on block group, done by btrfs_dec_nocow_writers(). */ + return bg; } -void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr) +/** + * Decrement the number of NOCOW writers in a block group. + * + * @bg: The block group. + * + * This is meant to be called after a previous call to btrfs_inc_nocow_writers(), + * and on the block group returned by that call. Typically this is called after + * creating an ordered extent for a NOCOW write, to prevent races with scrub and + * relocation. + * + * After this call, the caller should not use the block group anymore. It it wants + * to use it, then it should get a reference on it before calling this function. + */ +void btrfs_dec_nocow_writers(struct btrfs_block_group *bg) { - struct btrfs_block_group *bg; - - bg = btrfs_lookup_block_group(fs_info, bytenr); - ASSERT(bg); if (atomic_dec_and_test(&bg->nocow_writers)) wake_up_var(&bg->nocow_writers); - /* - * Once for our lookup and once for the lookup done by a previous call - * to btrfs_inc_nocow_writers() - */ - btrfs_put_block_group(bg); + + /* For the lookup done by a previous call to btrfs_inc_nocow_writers(). */ btrfs_put_block_group(bg); } diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index e8308f2ad07d..c9bf01dd10e8 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -254,8 +254,9 @@ void btrfs_put_block_group(struct btrfs_block_group *cache); void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, const u64 start); void btrfs_wait_block_group_reservations(struct btrfs_block_group *bg); -bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr); -void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr); +struct btrfs_block_group *btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, + u64 bytenr); +void btrfs_dec_nocow_writers(struct btrfs_block_group *bg); void btrfs_wait_nocow_writers(struct btrfs_block_group *bg); void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache, u64 num_bytes); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a1504cdbab7a..44b7c9a7c84d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1773,6 +1773,7 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, int ret; bool check_prev = true; u64 ino = btrfs_ino(inode); + struct btrfs_block_group *bg; bool nocow = false; struct can_nocow_file_extent_args nocow_args = { 0 }; @@ -1901,7 +1902,8 @@ next_slot: } ret = 0; - if (btrfs_inc_nocow_writers(fs_info, nocow_args.disk_bytenr)) + bg = btrfs_inc_nocow_writers(fs_info, nocow_args.disk_bytenr); + if (bg) nocow = true; out_check: /* @@ -1977,9 +1979,10 @@ out_check: goto error; } - if (nocow) - btrfs_dec_nocow_writers(fs_info, nocow_args.disk_bytenr); - nocow = false; + if (nocow) { + btrfs_dec_nocow_writers(bg); + nocow = false; + } if (btrfs_is_data_reloc_root(root)) /* @@ -2023,7 +2026,7 @@ out_check: error: if (nocow) - btrfs_dec_nocow_writers(fs_info, nocow_args.disk_bytenr); + btrfs_dec_nocow_writers(bg); if (ret && cur_offset < end) extent_clear_unlock_delalloc(inode, cur_offset, end, @@ -7417,6 +7420,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, struct extent_map *em = *map; int type; u64 block_start, orig_start, orig_block_len, ram_bytes; + struct btrfs_block_group *bg; bool can_nocow = false; bool space_reserved = false; u64 prev_len; @@ -7442,9 +7446,11 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, block_start = em->block_start + (start - em->start); if (can_nocow_extent(inode, start, &len, &orig_start, - &orig_block_len, &ram_bytes, false) == 1 && - btrfs_inc_nocow_writers(fs_info, block_start)) - can_nocow = true; + &orig_block_len, &ram_bytes, false) == 1) { + bg = btrfs_inc_nocow_writers(fs_info, block_start); + if (bg) + can_nocow = true; + } } prev_len = len; @@ -7458,7 +7464,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, /* Our caller expects us to free the input extent map. */ free_extent_map(em); *map = NULL; - btrfs_dec_nocow_writers(fs_info, block_start); + btrfs_dec_nocow_writers(bg); if (nowait && (ret == -ENOSPC || ret == -EDQUOT)) ret = -EAGAIN; goto out; @@ -7469,7 +7475,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, orig_start, block_start, len, orig_block_len, ram_bytes, type); - btrfs_dec_nocow_writers(fs_info, block_start); + btrfs_dec_nocow_writers(bg); if (type == BTRFS_ORDERED_PREALLOC) { free_extent_map(em); *map = em = em2; |