From e1a6d2648300ef4cfdcfd4838224fe5cefe3caaa Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 20 Jul 2021 16:03:41 +0100 Subject: btrfs: avoid unnecessary log mutex contention when syncing log When syncing the log we acquire the root's log mutex just to update the root's last_log_commit. This is unnecessary because: 1) At this point there can only be one task updating this value, which is the task committing the current log transaction. Any task that enters btrfs_sync_log() has to wait for the previous log transaction to commit and wait for the current log transaction to commit if someone else already started it (in this case it never reaches to the point of updating last_log_commit, as that is done by the committing task); 2) All readers of the root's last_log_commit don't acquire the root's log mutex. This is to avoid blocking the readers, potentially for too long and because getting a stale value of last_log_commit does not cause any functional problem, in the worst case getting a stale value results in logging an inode unnecessarily. Plus it's actually very rare to get a stale value that results in unnecessarily logging the inode. So in order to avoid unnecessary contention on the root's log mutex, which is used for several different purposes, like starting/joining a log transaction and starting writeback of a log transaction, stop acquiring the log mutex for updating the root's last_log_commit. Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index e6430ac9bbe8..a7ce23b9d2ee 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3328,10 +3328,16 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, goto out_wake_log_root; } - mutex_lock(&root->log_mutex); - if (root->last_log_commit < log_transid) - root->last_log_commit = log_transid; - mutex_unlock(&root->log_mutex); + /* + * We know there can only be one task here, since we have not yet set + * root->log_commit[index1] to 0 and any task attempting to sync the + * log must wait for the previous log transaction to commit if it's + * still in progress or wait for the current log transaction commit if + * someone else already started it. We use <= and not < because the + * first log transaction has an ID of 0. + */ + ASSERT(root->last_log_commit <= log_transid); + root->last_log_commit = log_transid; out_wake_log_root: mutex_lock(&log_root_tree->log_mutex); -- cgit v1.2.3-70-g09d2 From e68107e51f8466e1fae40d64b873d0a11398a628 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 20 Jul 2021 16:03:42 +0100 Subject: btrfs: remove unnecessary list head initialization when syncing log One of the last steps of syncing the log is to remove all log contexts from the root's list of contexts, done at btrfs_remove_all_log_ctxs(). There we iterate over all the contexts in the list and delete each one from the list, and after that we call INIT_LIST_HEAD() on the list. That is unnecessary since at that point the list is empty. So just remove the INIT_LIST_HEAD() call. It's not needed, increases code size (bloat-o-meter reported a delta of -122 for btrfs_sync_log() after this change) and increases two critical sections delimited by log mutexes. Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index a7ce23b9d2ee..3e6c8f8a2909 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3039,8 +3039,6 @@ static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root, list_del_init(&ctx->list); ctx->log_ret = error; } - - INIT_LIST_HEAD(&root->log_ctxs[index]); } /* -- cgit v1.2.3-70-g09d2 From 2ac691d8b3b1dd300a48b1763fa3a1434863070b Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 20 Jul 2021 16:03:43 +0100 Subject: btrfs: avoid unnecessary lock and leaf splits when updating inode in the log During a fast fsync, if we have already fsynced the file before and in the current transaction, we can make the inode item update more efficient and avoid acquiring a write lock on the leaf's parent. To update the inode item we are always using btrfs_insert_empty_item() to get a path pointing to the inode item, which calls btrfs_search_slot() with an "ins_len" argument of 'sizeof(struct btrfs_inode_item) + sizeof(struct btrfs_item)', and that always results in the search taking a write lock on the level 1 node that is the parent of the leaf that contains the inode item. This adds unnecessary lock contention on log trees when we have multiple fsyncs in parallel against inodes in the same subvolume, which has a very significant impact due to the fact that log trees are short lived and their height very rarely goes beyond level 2. Also, by using btrfs_insert_empty_item() when we need to update the inode item, we also end up splitting the leaf of the existing inode item when the leaf has an amount of free space smaller than the size of an inode item. Improve this by using btrfs_seach_slot(), with a 0 "ins_len" argument, when we know the inode item already exists in the log. This avoids these two inefficiencies. The following script, using fio, was used to perform the tests: $ cat fio-test.sh #!/bin/bash DEV=/dev/nvme0n1 MNT=/mnt/nvme0n1 MOUNT_OPTIONS="-o ssd" MKFS_OPTIONS="-d single -m single" if [ $# -ne 4 ]; then echo "Use $0 NUM_JOBS FILE_SIZE FSYNC_FREQ BLOCK_SIZE" exit 1 fi NUM_JOBS=$1 FILE_SIZE=$2 FSYNC_FREQ=$3 BLOCK_SIZE=$4 cat < /tmp/fio-job.ini [writers] rw=randwrite fsync=$FSYNC_FREQ fallocate=none group_reporting=1 direct=0 bs=$BLOCK_SIZE ioengine=sync size=$FILE_SIZE directory=$MNT numjobs=$NUM_JOBS EOF echo "performance" | tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor echo echo "Using config:" echo cat /tmp/fio-job.ini echo echo "mount options: $MOUNT_OPTIONS" echo umount $MNT &> /dev/null mkfs.btrfs -f $MKFS_OPTIONS $DEV mount $MOUNT_OPTIONS $DEV $MNT fio /tmp/fio-job.ini umount $MNT The tests were done on a physical machine, with 12 cores, 64G of RAM, using a NVMEe device and using a non-debug kernel config (the default one from Debian). The summary line from fio is provided below for each test run. With 8 jobs, file size 256M, fsync frequency of 4 and a block size of 4K: Before: WRITE: bw=28.3MiB/s (29.7MB/s), 28.3MiB/s-28.3MiB/s (29.7MB/s-29.7MB/s), io=2048MiB (2147MB), run=72297-72297msec After: WRITE: bw=28.7MiB/s (30.1MB/s), 28.7MiB/s-28.7MiB/s (30.1MB/s-30.1MB/s), io=2048MiB (2147MB), run=71411-71411msec +1.4% throughput, -1.2% runtime With 16 jobs, file size 256M, fsync frequency of 4 and a block size of 4K: Before: WRITE: bw=40.0MiB/s (42.0MB/s), 40.0MiB/s-40.0MiB/s (42.0MB/s-42.0MB/s), io=4096MiB (4295MB), run=99980-99980msec After: WRITE: bw=40.9MiB/s (42.9MB/s), 40.9MiB/s-40.9MiB/s (42.9MB/s-42.9MB/s), io=4096MiB (4295MB), run=97933-97933msec +2.2% throughput, -2.1% runtime The changes are small but it's possible to be better on faster hardware as in the test machine used disk utilization was pretty much 100% during the whole time the tests were running (observed with 'iostat -xz 1'). The tests also included the previous patch with the subject of: "btrfs: avoid unnecessary log mutex contention when syncing log". So they compared a branch without that patch and without this patch versus a branch with these two patches applied. Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 39 ++++++++++++++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 5 deletions(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 3e6c8f8a2909..8dde5c08a48f 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3972,14 +3972,41 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, static int log_inode_item(struct btrfs_trans_handle *trans, struct btrfs_root *log, struct btrfs_path *path, - struct btrfs_inode *inode) + struct btrfs_inode *inode, bool inode_item_dropped) { struct btrfs_inode_item *inode_item; int ret; - ret = btrfs_insert_empty_item(trans, log, path, - &inode->location, sizeof(*inode_item)); - if (ret && ret != -EEXIST) + /* + * If we are doing a fast fsync and the inode was logged before in the + * current transaction, then we know the inode was previously logged and + * it exists in the log tree. For performance reasons, in this case use + * btrfs_search_slot() directly with ins_len set to 0 so that we never + * attempt a write lock on the leaf's parent, which adds unnecessary lock + * contention in case there are concurrent fsyncs for other inodes of the + * same subvolume. Using btrfs_insert_empty_item() when the inode item + * already exists can also result in unnecessarily splitting a leaf. + */ + if (!inode_item_dropped && inode->logged_trans == trans->transid) { + ret = btrfs_search_slot(trans, log, &inode->location, path, 0, 1); + ASSERT(ret <= 0); + if (ret > 0) + ret = -ENOENT; + } else { + /* + * This means it is the first fsync in the current transaction, + * so the inode item is not in the log and we need to insert it. + * We can never get -EEXIST because we are only called for a fast + * fsync and in case an inode eviction happens after the inode was + * logged before in the current transaction, when we load again + * the inode, we set BTRFS_INODE_NEEDS_FULL_SYNC on its runtime + * flags and set ->logged_trans to 0. + */ + ret = btrfs_insert_empty_item(trans, log, path, &inode->location, + sizeof(*inode_item)); + ASSERT(ret != -EEXIST); + } + if (ret) return ret; inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_item); @@ -5303,6 +5330,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, bool need_log_inode_item = true; bool xattrs_logged = false; bool recursive_logging = false; + bool inode_item_dropped = true; path = btrfs_alloc_path(); if (!path) @@ -5437,6 +5465,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, } else { if (inode_only == LOG_INODE_ALL) fast_search = true; + inode_item_dropped = false; goto log_extents; } @@ -5470,7 +5499,7 @@ log_extents: btrfs_release_path(path); btrfs_release_path(dst_path); if (need_log_inode_item) { - err = log_inode_item(trans, log, dst_path, inode); + err = log_inode_item(trans, log, dst_path, inode, inode_item_dropped); if (err) goto out_unlock; /* -- cgit v1.2.3-70-g09d2 From 214cc184321743327c84c4a13ad08d088dfb3c4a Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 26 Jul 2021 14:15:26 +0200 Subject: btrfs: constify and cleanup variables in comparators Comparators just read the data and thus get const parameters. This should be also preserved by the local variables, update all comparators passed to sort or bsearch. Cleanups: - unnecessary casts are dropped - btrfs_cmp_device_free_bytes is cleaned up to follow the common pattern and 'inline' is dropped as the function address is taken Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 8 ++++---- fs/btrfs/send.c | 6 +++--- fs/btrfs/super.c | 13 ++++++------- fs/btrfs/tree-log.c | 2 +- fs/btrfs/volumes.c | 2 +- 5 files changed, 15 insertions(+), 16 deletions(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index a40a45a007d4..d8d268ca8aa7 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1636,10 +1636,10 @@ struct btrfs_plug_cb { static int plug_cmp(void *priv, const struct list_head *a, const struct list_head *b) { - struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio, - plug_list); - struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio, - plug_list); + const struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio, + plug_list); + const struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio, + plug_list); u64 a_sector = ra->bio_list.head->bi_iter.bi_sector; u64 b_sector = rb->bio_list.head->bi_iter.bi_sector; diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 6ac37ae6c811..75cff564dedf 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1198,7 +1198,7 @@ struct backref_ctx { static int __clone_root_cmp_bsearch(const void *key, const void *elt) { u64 root = (u64)(uintptr_t)key; - struct clone_root *cr = (struct clone_root *)elt; + const struct clone_root *cr = elt; if (root < cr->root->root_key.objectid) return -1; @@ -1209,8 +1209,8 @@ static int __clone_root_cmp_bsearch(const void *key, const void *elt) static int __clone_root_cmp_sort(const void *e1, const void *e2) { - struct clone_root *cr1 = (struct clone_root *)e1; - struct clone_root *cr2 = (struct clone_root *)e2; + const struct clone_root *cr1 = e1; + const struct clone_root *cr2 = e2; if (cr1->root->root_key.objectid < cr2->root->root_key.objectid) return -1; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index d07b18b2b250..35ff142ad242 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2096,16 +2096,15 @@ restore: } /* Used to sort the devices by max_avail(descending sort) */ -static inline int btrfs_cmp_device_free_bytes(const void *dev_info1, - const void *dev_info2) +static int btrfs_cmp_device_free_bytes(const void *a, const void *b) { - if (((struct btrfs_device_info *)dev_info1)->max_avail > - ((struct btrfs_device_info *)dev_info2)->max_avail) + const struct btrfs_device_info *dev_info1 = a; + const struct btrfs_device_info *dev_info2 = b; + + if (dev_info1->max_avail > dev_info2->max_avail) return -1; - else if (((struct btrfs_device_info *)dev_info1)->max_avail < - ((struct btrfs_device_info *)dev_info2)->max_avail) + else if (dev_info1->max_avail < dev_info2->max_avail) return 1; - else return 0; } diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 8dde5c08a48f..191dea1d2416 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4191,7 +4191,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, static int extent_cmp(void *priv, const struct list_head *a, const struct list_head *b) { - struct extent_map *em1, *em2; + const struct extent_map *em1, *em2; em1 = list_entry(a, struct extent_map, list); em2 = list_entry(b, struct extent_map, list); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index c7339f163bda..d42fb61aadc3 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1216,7 +1216,7 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices, static int devid_cmp(void *priv, const struct list_head *a, const struct list_head *b) { - struct btrfs_device *dev1, *dev2; + const struct btrfs_device *dev1, *dev2; dev1 = list_entry(a, struct btrfs_device, dev_list); dev2 = list_entry(b, struct btrfs_device, dev_list); -- cgit v1.2.3-70-g09d2 From 6e8e777deb5cbff76bcd34b1f45bc747f48e8abe Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 27 Jul 2021 11:24:44 +0100 Subject: btrfs: eliminate some false positives when checking if inode was logged When checking if an inode was previously logged in the current transaction through the helper inode_logged(), we can return some false positives that can be easily eliminated. These correspond to the cases where an inode has a ->logged_trans value that is not zero and its value is smaller then the ID of the current transaction. This means we know exactly that the inode was never logged before in the current transaction, so we can return false and avoid the callers to do extra work: 1) Having btrfs_del_dir_entries_in_log() and btrfs_del_inode_ref_in_log() unnecessarily join a log transaction and do deletion searches in a log tree that will not find anything. This just adds unnecessary contention on extent buffer locks; 2) Having btrfs_log_new_name() unnecessarily log an inode when it is not needed. If the inode was not logged before, we don't need to log it in LOG_INODE_EXISTS mode. So just make sure that any false positive only happens when ->logged_trans has a value of 0. Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 191dea1d2416..d09202e0c9df 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3421,14 +3421,10 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, } /* - * Check if an inode was logged in the current transaction. We can't always rely - * on an inode's logged_trans value, because it's an in-memory only field and - * therefore not persisted. This means that its value is lost if the inode gets - * evicted and loaded again from disk (in which case it has a value of 0, and - * certainly it is smaller then any possible transaction ID), when that happens - * the full_sync flag is set in the inode's runtime flags, so on that case we - * assume eviction happened and ignore the logged_trans value, assuming the - * worst case, that the inode was logged before in the current transaction. + * Check if an inode was logged in the current transaction. This may often + * return some false positives, because logged_trans is an in memory only field, + * not persisted anywhere. This is meant to be used in contexts where a false + * positive has no functional consequences. */ static bool inode_logged(struct btrfs_trans_handle *trans, struct btrfs_inode *inode) @@ -3436,7 +3432,18 @@ static bool inode_logged(struct btrfs_trans_handle *trans, if (inode->logged_trans == trans->transid) return true; - if (inode->last_trans == trans->transid && + /* + * The inode's logged_trans is always 0 when we load it (because it is + * not persisted in the inode item or elsewhere). So if it is 0, the + * inode was last modified in the current transaction and has the + * full_sync flag set, then the inode may have been logged before in + * the current transaction, then evicted and loaded again in the current + * transaction - or may have never been logged in the current transaction, + * but since we can not be sure, we have to assume it was, otherwise our + * callers can leave an inconsistent log. + */ + if (inode->logged_trans == 0 && + inode->last_trans == trans->transid && test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) && !test_bit(BTRFS_FS_LOG_RECOVERING, &trans->fs_info->flags)) return true; -- cgit v1.2.3-70-g09d2 From 77eea05e7851d910b7992c8c237a6b5d462050da Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Wed, 30 Jun 2021 13:01:48 -0700 Subject: btrfs: add ro compat flags to inodes Currently, inode flags are fully backwards incompatible in btrfs. If we introduce a new inode flag, then tree-checker will detect it and fail. This can even cause us to fail to mount entirely. To make it possible to introduce new flags which can be read-only compatible, like VERITY, we add new ro flags to btrfs without treating them quite so harshly in tree-checker. A read-only file system can survive an unexpected flag, and can be mounted. As for the implementation, it unfortunately gets a little complicated. The on-disk representation of the inode, btrfs_inode_item, has an __le64 for flags but the in-memory representation, btrfs_inode, uses a u32. David Sterba had the nice idea that we could reclaim those wasted 32 bits on disk and use them for the new ro_compat flags. It turns out that the tree-checker code which checks for unknown flags is broken, and ignores the upper 32 bits we are hoping to use. The issue is that the flags use the literal 1 rather than 1ULL, so the flags are signed ints, and one of them is specifically (1 << 31). As a result, the mask which ORs the flags is a negative integer on machines where int is 32 bit twos complement. When tree-checker evaluates the expression: btrfs_inode_flags(leaf, iitem) & ~BTRFS_INODE_FLAG_MASK) The mask is something like 0x80000abc, which gets promoted to u64 with sign extension to 0xffffffff80000abc. Negating that 64 bit mask leaves all the upper bits zeroed, and we can't detect unexpected flags. This suggests that we can't use those bits after all. Luckily, we have good reason to believe that they are zero anyway. Inode flags are metadata, which is always checksummed, so any bit flips that would introduce 1s would cause a checksum failure anyway (excluding the improbable case of the checksum getting corrupted exactly badly). Further, unless the 1 << 31 flag is used, the cast to u64 of the 32 bit inode flag should preserve its value and not add leading zeroes (at least for twos complement). The only place that flag (BTRFS_INODE_ROOT_ITEM_INIT) is used is in a special inode embedded in the root item, and indeed for that inode we see 0xffffffff80000000 as the flags on disk. However, that inode is never seen by tree checker, nor is it used in a context where verity might be meaningful. Theoretically, a future ro flag might cause trouble on that inode, so we should proactively clean up that mess before it does. With the introduction of the new ro flags, keep two separate unsigned masks and check them against the appropriate u32. Since we no longer run afoul of sign extension, this also stops writing out 0xffffffff80000000 in root_item inodes going forward. Signed-off-by: Boris Burkov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 20 +++++++++++++++++++- fs/btrfs/ctree.h | 30 ++++++++++++++++-------------- fs/btrfs/delayed-inode.c | 9 +++++++-- fs/btrfs/inode.c | 9 +++++++-- fs/btrfs/ioctl.c | 7 ++++--- fs/btrfs/tree-checker.c | 17 +++++++++++++---- fs/btrfs/tree-log.c | 5 ++++- 7 files changed, 70 insertions(+), 27 deletions(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index c652e19ad74e..1093b00130be 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -189,8 +189,10 @@ struct btrfs_inode { */ u64 csum_bytes; - /* flags field from the on disk inode */ + /* Backwards incompatible flags, lower half of inode_item::flags */ u32 flags; + /* Read-only compatibility flags, upper half of inode_item::flags */ + u32 ro_flags; /* * Counters to keep track of the number of extent item's we may use due @@ -348,6 +350,22 @@ struct btrfs_dio_private { u8 csums[]; }; +/* + * btrfs_inode_item stores flags in a u64, btrfs_inode stores them in two + * separate u32s. These two functions convert between the two representations. + */ +static inline u64 btrfs_inode_combine_flags(u32 flags, u32 ro_flags) +{ + return (flags | ((u64)ro_flags << 32)); +} + +static inline void btrfs_inode_split_flags(u64 inode_item_flags, + u32 *flags, u32 *ro_flags) +{ + *flags = (u32)inode_item_flags; + *ro_flags = (u32)(inode_item_flags >> 32); +} + /* Array of bytes with variable length, hexadecimal format 0x1234 */ #define CSUM_FMT "0x%*phN" #define CSUM_FMT_VALUE(size, bytes) size, bytes diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index fd3084feb4b5..9e3b7a56a78f 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1482,20 +1482,20 @@ do { \ /* * Inode flags */ -#define BTRFS_INODE_NODATASUM (1 << 0) -#define BTRFS_INODE_NODATACOW (1 << 1) -#define BTRFS_INODE_READONLY (1 << 2) -#define BTRFS_INODE_NOCOMPRESS (1 << 3) -#define BTRFS_INODE_PREALLOC (1 << 4) -#define BTRFS_INODE_SYNC (1 << 5) -#define BTRFS_INODE_IMMUTABLE (1 << 6) -#define BTRFS_INODE_APPEND (1 << 7) -#define BTRFS_INODE_NODUMP (1 << 8) -#define BTRFS_INODE_NOATIME (1 << 9) -#define BTRFS_INODE_DIRSYNC (1 << 10) -#define BTRFS_INODE_COMPRESS (1 << 11) - -#define BTRFS_INODE_ROOT_ITEM_INIT (1 << 31) +#define BTRFS_INODE_NODATASUM (1U << 0) +#define BTRFS_INODE_NODATACOW (1U << 1) +#define BTRFS_INODE_READONLY (1U << 2) +#define BTRFS_INODE_NOCOMPRESS (1U << 3) +#define BTRFS_INODE_PREALLOC (1U << 4) +#define BTRFS_INODE_SYNC (1U << 5) +#define BTRFS_INODE_IMMUTABLE (1U << 6) +#define BTRFS_INODE_APPEND (1U << 7) +#define BTRFS_INODE_NODUMP (1U << 8) +#define BTRFS_INODE_NOATIME (1U << 9) +#define BTRFS_INODE_DIRSYNC (1U << 10) +#define BTRFS_INODE_COMPRESS (1U << 11) + +#define BTRFS_INODE_ROOT_ITEM_INIT (1U << 31) #define BTRFS_INODE_FLAG_MASK \ (BTRFS_INODE_NODATASUM | \ @@ -1512,6 +1512,8 @@ do { \ BTRFS_INODE_COMPRESS | \ BTRFS_INODE_ROOT_ITEM_INIT) +#define BTRFS_INODE_RO_FLAG_MASK (0) + struct btrfs_map_token { struct extent_buffer *eb; char *kaddr; diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 61452f04181a..1e08eb2b27f0 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1645,6 +1645,8 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans, struct btrfs_inode_item *inode_item, struct inode *inode) { + u64 flags; + btrfs_set_stack_inode_uid(inode_item, i_uid_read(inode)); btrfs_set_stack_inode_gid(inode_item, i_gid_read(inode)); btrfs_set_stack_inode_size(inode_item, BTRFS_I(inode)->disk_i_size); @@ -1657,7 +1659,9 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans, inode_peek_iversion(inode)); btrfs_set_stack_inode_transid(inode_item, trans->transid); btrfs_set_stack_inode_rdev(inode_item, inode->i_rdev); - btrfs_set_stack_inode_flags(inode_item, BTRFS_I(inode)->flags); + flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags, + BTRFS_I(inode)->ro_flags); + btrfs_set_stack_inode_flags(inode_item, flags); btrfs_set_stack_inode_block_group(inode_item, 0); btrfs_set_stack_timespec_sec(&inode_item->atime, @@ -1715,7 +1719,8 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev) btrfs_stack_inode_sequence(inode_item)); inode->i_rdev = 0; *rdev = btrfs_stack_inode_rdev(inode_item); - BTRFS_I(inode)->flags = btrfs_stack_inode_flags(inode_item); + btrfs_inode_split_flags(btrfs_stack_inode_flags(inode_item), + &BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags); inode->i_atime.tv_sec = btrfs_stack_timespec_sec(&inode_item->atime); inode->i_atime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->atime); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 949f6a6c616a..cd5a67ba7e71 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3764,7 +3764,8 @@ static int btrfs_read_locked_inode(struct inode *inode, rdev = btrfs_inode_rdev(leaf, inode_item); BTRFS_I(inode)->index_cnt = (u64)-1; - BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item); + btrfs_inode_split_flags(btrfs_inode_flags(leaf, inode_item), + &BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags); cache_index: /* @@ -3895,6 +3896,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, struct inode *inode) { struct btrfs_map_token token; + u64 flags; btrfs_init_map_token(&token, leaf); @@ -3930,7 +3932,9 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode)); btrfs_set_token_inode_transid(&token, item, trans->transid); btrfs_set_token_inode_rdev(&token, item, inode->i_rdev); - btrfs_set_token_inode_flags(&token, item, BTRFS_I(inode)->flags); + flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags, + BTRFS_I(inode)->ro_flags); + btrfs_set_token_inode_flags(&token, item, flags); btrfs_set_token_inode_block_group(&token, item, 0); } @@ -9064,6 +9068,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->defrag_bytes = 0; ei->disk_i_size = 0; ei->flags = 0; + ei->ro_flags = 0; ei->csum_bytes = 0; ei->index_cnt = (u64)-1; ei->dir_index = 0; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 4d809899c076..17aefb5f08ea 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -103,9 +103,10 @@ static unsigned int btrfs_mask_fsflags_for_type(struct inode *inode, * Export internal inode flags to the format expected by the FS_IOC_GETFLAGS * ioctl. */ -static unsigned int btrfs_inode_flags_to_fsflags(unsigned int flags) +static unsigned int btrfs_inode_flags_to_fsflags(struct btrfs_inode *binode) { unsigned int iflags = 0; + u32 flags = binode->flags; if (flags & BTRFS_INODE_SYNC) iflags |= FS_SYNC_FL; @@ -200,7 +201,7 @@ int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa) { struct btrfs_inode *binode = BTRFS_I(d_inode(dentry)); - fileattr_fill_flags(fa, btrfs_inode_flags_to_fsflags(binode->flags)); + fileattr_fill_flags(fa, btrfs_inode_flags_to_fsflags(binode)); return 0; } @@ -224,7 +225,7 @@ int btrfs_fileattr_set(struct user_namespace *mnt_userns, return -EOPNOTSUPP; fsflags = btrfs_mask_fsflags_for_type(inode, fa->flags); - old_fsflags = btrfs_inode_flags_to_fsflags(binode->flags); + old_fsflags = btrfs_inode_flags_to_fsflags(binode); ret = check_fsflags(old_fsflags, fsflags); if (ret) return ret; diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 7ba94b683ee3..7733e8ac0a69 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -24,6 +24,7 @@ #include "compression.h" #include "volumes.h" #include "misc.h" +#include "btrfs_inode.h" /* * Error message should follow the following format: @@ -1008,6 +1009,8 @@ static int check_inode_item(struct extent_buffer *leaf, u32 valid_mask = (S_IFMT | S_ISUID | S_ISGID | S_ISVTX | 0777); u32 mode; int ret; + u32 flags; + u32 ro_flags; ret = check_inode_key(leaf, key, slot); if (unlikely(ret < 0)) @@ -1063,11 +1066,17 @@ static int check_inode_item(struct extent_buffer *leaf, btrfs_inode_nlink(leaf, iitem)); return -EUCLEAN; } - if (unlikely(btrfs_inode_flags(leaf, iitem) & ~BTRFS_INODE_FLAG_MASK)) { + btrfs_inode_split_flags(btrfs_inode_flags(leaf, iitem), &flags, &ro_flags); + if (unlikely(flags & ~BTRFS_INODE_FLAG_MASK)) { inode_item_err(leaf, slot, - "unknown flags detected: 0x%llx", - btrfs_inode_flags(leaf, iitem) & - ~BTRFS_INODE_FLAG_MASK); + "unknown incompat flags detected: 0x%x", flags); + return -EUCLEAN; + } + if (unlikely(!sb_rdonly(fs_info->sb) && + (ro_flags & ~BTRFS_INODE_RO_FLAG_MASK))) { + inode_item_err(leaf, slot, + "unknown ro-compat flags detected on writeable mount: 0x%x", + ro_flags); return -EUCLEAN; } return 0; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index d09202e0c9df..567adc3de11a 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3924,6 +3924,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, u64 logged_isize) { struct btrfs_map_token token; + u64 flags; btrfs_init_map_token(&token, leaf); @@ -3973,7 +3974,9 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode)); btrfs_set_token_inode_transid(&token, item, trans->transid); btrfs_set_token_inode_rdev(&token, item, inode->i_rdev); - btrfs_set_token_inode_flags(&token, item, BTRFS_I(inode)->flags); + flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags, + BTRFS_I(inode)->ro_flags); + btrfs_set_token_inode_flags(&token, item, flags); btrfs_set_token_inode_block_group(&token, item, 0); } -- cgit v1.2.3-70-g09d2 From d135a5339611352047462ef5943aee3a1202aa37 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 29 Jul 2021 15:29:01 +0100 Subject: btrfs: remove no longer needed full sync flag check at inode_logged() Now that we are checking if the inode's logged_trans is 0 to detect the possibility of the inode having been evicted and reloaded, the test for the full sync flag (BTRFS_INODE_NEEDS_FULL_SYNC) is no longer needed at tree-log.c:inode_logged(). Its purpose was to detect the possibility of a previous eviction as well, since when an inode is loaded the full sync flag is always set on it (and only cleared after the inode is logged). So just remove the check and update the comment. The check for the inode's logged_trans being 0 was added recently by the patch with the subject "btrfs: eliminate some false positives when checking if inode was logged". Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 567adc3de11a..5debb8c3663c 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3435,16 +3435,14 @@ static bool inode_logged(struct btrfs_trans_handle *trans, /* * The inode's logged_trans is always 0 when we load it (because it is * not persisted in the inode item or elsewhere). So if it is 0, the - * inode was last modified in the current transaction and has the - * full_sync flag set, then the inode may have been logged before in - * the current transaction, then evicted and loaded again in the current - * transaction - or may have never been logged in the current transaction, - * but since we can not be sure, we have to assume it was, otherwise our - * callers can leave an inconsistent log. + * inode was last modified in the current transaction then the inode may + * have been logged before in the current transaction, then evicted and + * loaded again in the current transaction - or may have never been logged + * in the current transaction, but since we can not be sure, we have to + * assume it was, otherwise our callers can leave an inconsistent log. */ if (inode->logged_trans == 0 && inode->last_trans == trans->transid && - test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) && !test_bit(BTRFS_FS_LOG_RECOVERING, &trans->fs_info->flags)) return true; -- cgit v1.2.3-70-g09d2 From 1f295373022e84683bc5768caca46bdba3a376c1 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 29 Jul 2021 15:30:21 +0100 Subject: btrfs: update comment at log_conflicting_inodes() A comment at log_conflicting_inodes() mentions that we check the inode's logged_trans field instead of using btrfs_inode_in_log() because the field last_log_commit is not updated when we log that an inode exists and the inode has the full sync flag (BTRFS_INODE_NEEDS_FULL_SYNC) set. The part about the full sync flag is not true anymore since commit 9acc8103ab594f ("btrfs: fix unpersisted i_size on fsync after expanding truncate"), so update the comment to not mention that part anymore. Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 5debb8c3663c..1ce7e4480256 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -5092,8 +5092,8 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, /* * Check the inode's logged_trans only instead of * btrfs_inode_in_log(). This is because the last_log_commit of - * the inode is not updated when we only log that it exists and - * it has the full sync bit set (see btrfs_log_inode()). + * the inode is not updated when we only log that it exists (see + * btrfs_log_inode()). */ if (BTRFS_I(inode)->logged_trans == trans->transid) { spin_unlock(&BTRFS_I(inode)->lock); -- cgit v1.2.3-70-g09d2 From 8be2ba2e0e11ade6ab96d8887dbb12abbd3540f4 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 29 Jul 2021 18:52:46 +0100 Subject: btrfs: avoid unnecessarily logging directories that had no changes There are several cases where when logging an inode we need to log its parent directories or logging subdirectories when logging a directory. There are cases however where we end up logging a directory even if it was not changed in the current transaction, no dentries added or removed since the last transaction. While this is harmless from a functional point of view, it is a waste time as it brings no advantage. One example where this is triggered is the following: $ mkfs.btrfs -f /dev/sdc $ mount /dev/sdc /mnt $ mkdir /mnt/A $ mkdir /mnt/B $ mkdir /mnt/C $ touch /mnt/A/foo $ ln /mnt/A/foo /mnt/B/bar $ ln /mnt/A/foo /mnt/C/baz $ sync $ rm -f /mnt/A/foo $ xfs_io -c "fsync" /mnt/B/bar This last fsync ends up logging directories A, B and C, however we only need to log directory A, as B and C were not changed since the last transaction commit. So fix this by changing need_log_inode(), to return false in case the given inode is a directory and has a ->last_trans value smaller than the current transaction's ID. Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 1ce7e4480256..a1aaa1b8bd5c 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -5613,6 +5613,13 @@ out_unlock: static bool need_log_inode(struct btrfs_trans_handle *trans, struct btrfs_inode *inode) { + /* + * If a directory was not modified, no dentries added or removed, we can + * and should avoid logging it. + */ + if (S_ISDIR(inode->vfs_inode.i_mode) && inode->last_trans < trans->transid) + return false; + /* * If this inode does not have new/updated/deleted xattrs since the last * time it was logged and is flagged as logged in the current transaction, -- cgit v1.2.3-70-g09d2 From 3736127a3aa805602b7a2ad60ec9cfce68065fbb Mon Sep 17 00:00:00 2001 From: Marcos Paulo de Souza Date: Mon, 2 Aug 2021 09:34:00 -0300 Subject: btrfs: tree-log: check btrfs_lookup_data_extent return value Function btrfs_lookup_data_extent calls btrfs_search_slot to verify if the EXTENT_ITEM exists in the extent tree. btrfs_search_slot can return values bellow zero if an error happened. Function replay_one_extent currently checks if the search found something (0 returned) and increments the reference, and if not, it seems to evaluate as 'not found'. Fix the condition by checking if the value was bellow zero and return early. Reviewed-by: Filipe Manana Signed-off-by: Marcos Paulo de Souza Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index a1aaa1b8bd5c..f7efc26aa82a 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -753,7 +753,9 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, */ ret = btrfs_lookup_data_extent(fs_info, ins.objectid, ins.offset); - if (ret == 0) { + if (ret < 0) { + goto out; + } else if (ret == 0) { btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, ins.objectid, ins.offset, 0); -- cgit v1.2.3-70-g09d2