summaryrefslogtreecommitdiff
path: root/fs/btrfs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs')
-rw-r--r--fs/btrfs/block-group.c80
-rw-r--r--fs/btrfs/block-group.h7
-rw-r--r--fs/btrfs/block-rsv.c2
-rw-r--r--fs/btrfs/block-rsv.h32
-rw-r--r--fs/btrfs/defrag.c2
-rw-r--r--fs/btrfs/delalloc-space.c29
-rw-r--r--fs/btrfs/dev-replace.c38
-rw-r--r--fs/btrfs/disk-io.c29
-rw-r--r--fs/btrfs/disk-io.h2
-rw-r--r--fs/btrfs/extent_io.c186
-rw-r--r--fs/btrfs/inode.c48
-rw-r--r--fs/btrfs/ioctl.c23
-rw-r--r--fs/btrfs/qgroup.c14
-rw-r--r--fs/btrfs/send.c19
-rw-r--r--fs/btrfs/space-info.c26
-rw-r--r--fs/btrfs/transaction.c40
-rw-r--r--fs/btrfs/volumes.c92
-rw-r--r--fs/btrfs/volumes.h4
-rw-r--r--fs/btrfs/zoned.c45
19 files changed, 520 insertions, 198 deletions
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index a9be9ac99222..378d9103a207 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1455,6 +1455,7 @@ out:
*/
void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
{
+ LIST_HEAD(retry_list);
struct btrfs_block_group *block_group;
struct btrfs_space_info *space_info;
struct btrfs_trans_handle *trans;
@@ -1476,6 +1477,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
spin_lock(&fs_info->unused_bgs_lock);
while (!list_empty(&fs_info->unused_bgs)) {
+ u64 used;
int trimming;
block_group = list_first_entry(&fs_info->unused_bgs,
@@ -1511,9 +1513,9 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
goto next;
}
+ spin_lock(&space_info->lock);
spin_lock(&block_group->lock);
- if (block_group->reserved || block_group->pinned ||
- block_group->used || block_group->ro ||
+ if (btrfs_is_block_group_used(block_group) || block_group->ro ||
list_is_singular(&block_group->list)) {
/*
* We want to bail if we made new allocations or have
@@ -1523,10 +1525,49 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
*/
trace_btrfs_skip_unused_block_group(block_group);
spin_unlock(&block_group->lock);
+ spin_unlock(&space_info->lock);
up_write(&space_info->groups_sem);
goto next;
}
+
+ /*
+ * The block group may be unused but there may be space reserved
+ * accounting with the existence of that block group, that is,
+ * space_info->bytes_may_use was incremented by a task but no
+ * space was yet allocated from the block group by the task.
+ * That space may or may not be allocated, as we are generally
+ * pessimistic about space reservation for metadata as well as
+ * for data when using compression (as we reserve space based on
+ * the worst case, when data can't be compressed, and before
+ * actually attempting compression, before starting writeback).
+ *
+ * So check if the total space of the space_info minus the size
+ * of this block group is less than the used space of the
+ * space_info - if that's the case, then it means we have tasks
+ * that might be relying on the block group in order to allocate
+ * extents, and add back the block group to the unused list when
+ * we finish, so that we retry later in case no tasks ended up
+ * needing to allocate extents from the block group.
+ */
+ used = btrfs_space_info_used(space_info, true);
+ if (space_info->total_bytes - block_group->length < used) {
+ /*
+ * Add a reference for the list, compensate for the ref
+ * drop under the "next" label for the
+ * fs_info->unused_bgs list.
+ */
+ btrfs_get_block_group(block_group);
+ list_add_tail(&block_group->bg_list, &retry_list);
+
+ trace_btrfs_skip_unused_block_group(block_group);
+ spin_unlock(&block_group->lock);
+ spin_unlock(&space_info->lock);
+ up_write(&space_info->groups_sem);
+ goto next;
+ }
+
spin_unlock(&block_group->lock);
+ spin_unlock(&space_info->lock);
/* We don't want to force the issue, only flip if it's ok. */
ret = inc_block_group_ro(block_group, 0);
@@ -1650,12 +1691,16 @@ next:
btrfs_put_block_group(block_group);
spin_lock(&fs_info->unused_bgs_lock);
}
+ list_splice_tail(&retry_list, &fs_info->unused_bgs);
spin_unlock(&fs_info->unused_bgs_lock);
mutex_unlock(&fs_info->reclaim_bgs_lock);
return;
flip_async:
btrfs_end_transaction(trans);
+ spin_lock(&fs_info->unused_bgs_lock);
+ list_splice_tail(&retry_list, &fs_info->unused_bgs);
+ spin_unlock(&fs_info->unused_bgs_lock);
mutex_unlock(&fs_info->reclaim_bgs_lock);
btrfs_put_block_group(block_group);
btrfs_discard_punt_unused_bgs_list(fs_info);
@@ -2684,6 +2729,37 @@ next:
btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info);
list_del_init(&block_group->bg_list);
clear_bit(BLOCK_GROUP_FLAG_NEW, &block_group->runtime_flags);
+
+ /*
+ * If the block group is still unused, add it to the list of
+ * unused block groups. The block group may have been created in
+ * order to satisfy a space reservation, in which case the
+ * extent allocation only happens later. But often we don't
+ * actually need to allocate space that we previously reserved,
+ * so the block group may become unused for a long time. For
+ * example for metadata we generally reserve space for a worst
+ * possible scenario, but then don't end up allocating all that
+ * space or none at all (due to no need to COW, extent buffers
+ * were already COWed in the current transaction and still
+ * unwritten, tree heights lower than the maximum possible
+ * height, etc). For data we generally reserve the axact amount
+ * of space we are going to allocate later, the exception is
+ * when using compression, as we must reserve space based on the
+ * uncompressed data size, because the compression is only done
+ * when writeback triggered and we don't know how much space we
+ * are actually going to need, so we reserve the uncompressed
+ * size because the data may be uncompressible in the worst case.
+ */
+ if (ret == 0) {
+ bool used;
+
+ spin_lock(&block_group->lock);
+ used = btrfs_is_block_group_used(block_group);
+ spin_unlock(&block_group->lock);
+
+ if (!used)
+ btrfs_mark_bg_unused(block_group);
+ }
}
btrfs_trans_release_chunk_metadata(trans);
}
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index c4a1f01cc1c2..962b11983901 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -257,6 +257,13 @@ static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group)
return (block_group->start + block_group->length);
}
+static inline bool btrfs_is_block_group_used(const struct btrfs_block_group *bg)
+{
+ lockdep_assert_held(&bg->lock);
+
+ return (bg->used > 0 || bg->reserved > 0 || bg->pinned > 0);
+}
+
static inline bool btrfs_is_block_group_data_only(
struct btrfs_block_group *block_group)
{
diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c
index ceb5f586a2d5..1043a8142351 100644
--- a/fs/btrfs/block-rsv.c
+++ b/fs/btrfs/block-rsv.c
@@ -494,7 +494,7 @@ struct btrfs_block_rsv *btrfs_use_block_rsv(struct btrfs_trans_handle *trans,
block_rsv = get_block_rsv(trans, root);
- if (unlikely(block_rsv->size == 0))
+ if (unlikely(btrfs_block_rsv_size(block_rsv) == 0))
goto try_reserve;
again:
ret = btrfs_block_rsv_use_bytes(block_rsv, blocksize);
diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h
index b0bd12b8652f..43a9a6b5a79f 100644
--- a/fs/btrfs/block-rsv.h
+++ b/fs/btrfs/block-rsv.h
@@ -101,4 +101,36 @@ static inline bool btrfs_block_rsv_full(const struct btrfs_block_rsv *rsv)
return data_race(rsv->full);
}
+/*
+ * Get the reserved mount of a block reserve in a context where getting a stale
+ * value is acceptable, instead of accessing it directly and trigger data race
+ * warning from KCSAN.
+ */
+static inline u64 btrfs_block_rsv_reserved(struct btrfs_block_rsv *rsv)
+{
+ u64 ret;
+
+ spin_lock(&rsv->lock);
+ ret = rsv->reserved;
+ spin_unlock(&rsv->lock);
+
+ return ret;
+}
+
+/*
+ * Get the size of a block reserve in a context where getting a stale value is
+ * acceptable, instead of accessing it directly and trigger data race warning
+ * from KCSAN.
+ */
+static inline u64 btrfs_block_rsv_size(struct btrfs_block_rsv *rsv)
+{
+ u64 ret;
+
+ spin_lock(&rsv->lock);
+ ret = rsv->size;
+ spin_unlock(&rsv->lock);
+
+ return ret;
+}
+
#endif /* BTRFS_BLOCK_RSV_H */
diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c
index c276b136ab63..5b0b64571418 100644
--- a/fs/btrfs/defrag.c
+++ b/fs/btrfs/defrag.c
@@ -1046,7 +1046,7 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
goto add;
/* Skip too large extent */
- if (range_len >= extent_thresh)
+ if (em->len >= extent_thresh)
goto next;
/*
diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c
index 2833e8ef4c09..acf9f4b6c044 100644
--- a/fs/btrfs/delalloc-space.c
+++ b/fs/btrfs/delalloc-space.c
@@ -245,7 +245,6 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
u64 reserve_size = 0;
u64 qgroup_rsv_size = 0;
- u64 csum_leaves;
unsigned outstanding_extents;
lockdep_assert_held(&inode->lock);
@@ -260,10 +259,12 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
outstanding_extents);
reserve_size += btrfs_calc_metadata_size(fs_info, 1);
}
- csum_leaves = btrfs_csum_bytes_to_leaves(fs_info,
- inode->csum_bytes);
- reserve_size += btrfs_calc_insert_metadata_size(fs_info,
- csum_leaves);
+ if (!(inode->flags & BTRFS_INODE_NODATASUM)) {
+ u64 csum_leaves;
+
+ csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, inode->csum_bytes);
+ reserve_size += btrfs_calc_insert_metadata_size(fs_info, csum_leaves);
+ }
/*
* For qgroup rsv, the calculation is very simple:
* account one nodesize for each outstanding extent
@@ -278,14 +279,20 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
spin_unlock(&block_rsv->lock);
}
-static void calc_inode_reservations(struct btrfs_fs_info *fs_info,
+static void calc_inode_reservations(struct btrfs_inode *inode,
u64 num_bytes, u64 disk_num_bytes,
u64 *meta_reserve, u64 *qgroup_reserve)
{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
u64 nr_extents = count_max_extents(fs_info, num_bytes);
- u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, disk_num_bytes);
+ u64 csum_leaves;
u64 inode_update = btrfs_calc_metadata_size(fs_info, 1);
+ if (inode->flags & BTRFS_INODE_NODATASUM)
+ csum_leaves = 0;
+ else
+ csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, disk_num_bytes);
+
*meta_reserve = btrfs_calc_insert_metadata_size(fs_info,
nr_extents + csum_leaves);
@@ -337,7 +344,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
* everything out and try again, which is bad. This way we just
* over-reserve slightly, and clean up the mess when we are done.
*/
- calc_inode_reservations(fs_info, num_bytes, disk_num_bytes,
+ calc_inode_reservations(inode, num_bytes, disk_num_bytes,
&meta_reserve, &qgroup_reserve);
ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true,
noflush);
@@ -359,7 +366,8 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
nr_extents = count_max_extents(fs_info, num_bytes);
spin_lock(&inode->lock);
btrfs_mod_outstanding_extents(inode, nr_extents);
- inode->csum_bytes += disk_num_bytes;
+ if (!(inode->flags & BTRFS_INODE_NODATASUM))
+ inode->csum_bytes += disk_num_bytes;
btrfs_calculate_inode_block_rsv_size(fs_info, inode);
spin_unlock(&inode->lock);
@@ -393,7 +401,8 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
spin_lock(&inode->lock);
- inode->csum_bytes -= num_bytes;
+ if (!(inode->flags & BTRFS_INODE_NODATASUM))
+ inode->csum_bytes -= num_bytes;
btrfs_calculate_inode_block_rsv_size(fs_info, inode);
spin_unlock(&inode->lock);
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 1502d664c892..fb33027e5a4c 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -246,7 +246,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
{
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_device *device;
- struct bdev_handle *bdev_handle;
+ struct file *bdev_file;
struct block_device *bdev;
u64 devid = BTRFS_DEV_REPLACE_DEVID;
int ret = 0;
@@ -257,13 +257,13 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
return -EINVAL;
}
- bdev_handle = bdev_open_by_path(device_path, BLK_OPEN_WRITE,
+ bdev_file = bdev_file_open_by_path(device_path, BLK_OPEN_WRITE,
fs_info->bdev_holder, NULL);
- if (IS_ERR(bdev_handle)) {
+ if (IS_ERR(bdev_file)) {
btrfs_err(fs_info, "target device %s is invalid!", device_path);
- return PTR_ERR(bdev_handle);
+ return PTR_ERR(bdev_file);
}
- bdev = bdev_handle->bdev;
+ bdev = file_bdev(bdev_file);
if (!btrfs_check_device_zone_type(fs_info, bdev)) {
btrfs_err(fs_info,
@@ -314,7 +314,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
device->commit_bytes_used = device->bytes_used;
device->fs_info = fs_info;
device->bdev = bdev;
- device->bdev_handle = bdev_handle;
+ device->bdev_file = bdev_file;
set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
device->dev_stats_valid = 1;
@@ -335,7 +335,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
return 0;
error:
- bdev_release(bdev_handle);
+ fput(bdev_file);
return ret;
}
@@ -725,6 +725,23 @@ leave:
return ret;
}
+static int btrfs_check_replace_dev_names(struct btrfs_ioctl_dev_replace_args *args)
+{
+ if (args->start.srcdevid == 0) {
+ if (memchr(args->start.srcdev_name, 0,
+ sizeof(args->start.srcdev_name)) == NULL)
+ return -ENAMETOOLONG;
+ } else {
+ args->start.srcdev_name[0] = 0;
+ }
+
+ if (memchr(args->start.tgtdev_name, 0,
+ sizeof(args->start.tgtdev_name)) == NULL)
+ return -ENAMETOOLONG;
+
+ return 0;
+}
+
int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_dev_replace_args *args)
{
@@ -737,10 +754,9 @@ int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info,
default:
return -EINVAL;
}
-
- if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') ||
- args->start.tgtdev_name[0] == '\0')
- return -EINVAL;
+ ret = btrfs_check_replace_dev_names(args);
+ if (ret < 0)
+ return ret;
ret = btrfs_dev_replace_start(fs_info, args->start.tgtdev_name,
args->start.srcdevid,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index c6907d533fe8..c843563914ca 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1307,12 +1307,12 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
*
* @objectid: root id
* @anon_dev: preallocated anonymous block device number for new roots,
- * pass 0 for new allocation.
+ * pass NULL for a new allocation.
* @check_ref: whether to check root item references, If true, return -ENOENT
* for orphan roots
*/
static struct btrfs_root *btrfs_get_root_ref(struct btrfs_fs_info *fs_info,
- u64 objectid, dev_t anon_dev,
+ u64 objectid, dev_t *anon_dev,
bool check_ref)
{
struct btrfs_root *root;
@@ -1336,8 +1336,17 @@ static struct btrfs_root *btrfs_get_root_ref(struct btrfs_fs_info *fs_info,
again:
root = btrfs_lookup_fs_root(fs_info, objectid);
if (root) {
- /* Shouldn't get preallocated anon_dev for cached roots */
- ASSERT(!anon_dev);
+ /*
+ * Some other caller may have read out the newly inserted
+ * subvolume already (for things like backref walk etc). Not
+ * that common but still possible. In that case, we just need
+ * to free the anon_dev.
+ */
+ if (unlikely(anon_dev && *anon_dev)) {
+ free_anon_bdev(*anon_dev);
+ *anon_dev = 0;
+ }
+
if (check_ref && btrfs_root_refs(&root->root_item) == 0) {
btrfs_put_root(root);
return ERR_PTR(-ENOENT);
@@ -1357,7 +1366,7 @@ again:
goto fail;
}
- ret = btrfs_init_fs_root(root, anon_dev);
+ ret = btrfs_init_fs_root(root, anon_dev ? *anon_dev : 0);
if (ret)
goto fail;
@@ -1393,7 +1402,7 @@ fail:
* root's anon_dev to 0 to avoid a double free, once by btrfs_put_root()
* and once again by our caller.
*/
- if (anon_dev)
+ if (anon_dev && *anon_dev)
root->anon_dev = 0;
btrfs_put_root(root);
return ERR_PTR(ret);
@@ -1409,7 +1418,7 @@ fail:
struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
u64 objectid, bool check_ref)
{
- return btrfs_get_root_ref(fs_info, objectid, 0, check_ref);
+ return btrfs_get_root_ref(fs_info, objectid, NULL, check_ref);
}
/*
@@ -1417,11 +1426,11 @@ struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
* the anonymous block device id
*
* @objectid: tree objectid
- * @anon_dev: if zero, allocate a new anonymous block device or use the
- * parameter value
+ * @anon_dev: if NULL, allocate a new anonymous block device or use the
+ * parameter value if not NULL
*/
struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info,
- u64 objectid, dev_t anon_dev)
+ u64 objectid, dev_t *anon_dev)
{
return btrfs_get_root_ref(fs_info, objectid, anon_dev, true);
}
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 9413726b329b..eb3473d1c1ac 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -61,7 +61,7 @@ void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info);
struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
u64 objectid, bool check_ref);
struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info,
- u64 objectid, dev_t anon_dev);
+ u64 objectid, dev_t *anon_dev);
struct btrfs_root *btrfs_get_fs_root_commit_root(struct btrfs_fs_info *fs_info,
struct btrfs_path *path,
u64 objectid);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index cfd2967f04a2..8b4bef05e222 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2480,6 +2480,7 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
struct fiemap_cache *cache,
u64 offset, u64 phys, u64 len, u32 flags)
{
+ u64 cache_end;
int ret = 0;
/* Set at the end of extent_fiemap(). */
@@ -2489,15 +2490,102 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
goto assign;
/*
- * Sanity check, extent_fiemap() should have ensured that new
- * fiemap extent won't overlap with cached one.
- * Not recoverable.
+ * When iterating the extents of the inode, at extent_fiemap(), we may
+ * find an extent that starts at an offset behind the end offset of the
+ * previous extent we processed. This happens if fiemap is called
+ * without FIEMAP_FLAG_SYNC and there are ordered extents completing
+ * while we call btrfs_next_leaf() (through fiemap_next_leaf_item()).
*
- * NOTE: Physical address can overlap, due to compression
+ * For example we are in leaf X processing its last item, which is the
+ * file extent item for file range [512K, 1M[, and after
+ * btrfs_next_leaf() releases the path, there's an ordered extent that
+ * completes for the file range [768K, 2M[, and that results in trimming
+ * the file extent item so that it now corresponds to the file range
+ * [512K, 768K[ and a new file extent item is inserted for the file
+ * range [768K, 2M[, which may end up as the last item of leaf X or as
+ * the first item of the next leaf - in either case btrfs_next_leaf()
+ * will leave us with a path pointing to the new extent item, for the
+ * file range [768K, 2M[, since that's the first key that follows the
+ * last one we processed. So in order not to report overlapping extents
+ * to user space, we trim the length of the previously cached extent and
+ * emit it.
+ *
+ * Upon calling btrfs_next_leaf() we may also find an extent with an
+ * offset smaller than or equals to cache->offset, and this happens
+ * when we had a hole or prealloc extent with several delalloc ranges in
+ * it, but after btrfs_next_leaf() released the path, delalloc was
+ * flushed and the resulting ordered extents were completed, so we can
+ * now have found a file extent item for an offset that is smaller than
+ * or equals to what we have in cache->offset. We deal with this as
+ * described below.
*/
- if (cache->offset + cache->len > offset) {
- WARN_ON(1);
- return -EINVAL;
+ cache_end = cache->offset + cache->len;
+ if (cache_end > offset) {
+ if (offset == cache->offset) {
+ /*
+ * We cached a dealloc range (found in the io tree) for
+ * a hole or prealloc extent and we have now found a
+ * file extent item for the same offset. What we have
+ * now is more recent and up to date, so discard what
+ * we had in the cache and use what we have just found.
+ */
+ goto assign;
+ } else if (offset > cache->offset) {
+ /*
+ * The extent range we previously found ends after the
+ * offset of the file extent item we found and that
+ * offset falls somewhere in the middle of that previous
+ * extent range. So adjust the range we previously found
+ * to end at the offset of the file extent item we have
+ * just found, since this extent is more up to date.
+ * Emit that adjusted range and cache the file extent
+ * item we have just found. This corresponds to the case
+ * where a previously found file extent item was split
+ * due to an ordered extent completing.
+ */
+ cache->len = offset - cache->offset;
+ goto emit;
+ } else {
+ const u64 range_end = offset + len;
+
+ /*
+ * The offset of the file extent item we have just found
+ * is behind the cached offset. This means we were
+ * processing a hole or prealloc extent for which we
+ * have found delalloc ranges (in the io tree), so what
+ * we have in the cache is the last delalloc range we
+ * found while the file extent item we found can be
+ * either for a whole delalloc range we previously
+ * emmitted or only a part of that range.
+ *
+ * We have two cases here:
+ *
+ * 1) The file extent item's range ends at or behind the
+ * cached extent's end. In this case just ignore the
+ * current file extent item because we don't want to
+ * overlap with previous ranges that may have been
+ * emmitted already;
+ *
+ * 2) The file extent item starts behind the currently
+ * cached extent but its end offset goes beyond the
+ * end offset of the cached extent. We don't want to
+ * overlap with a previous range that may have been
+ * emmitted already, so we emit the currently cached
+ * extent and then partially store the current file
+ * extent item's range in the cache, for the subrange
+ * going the cached extent's end to the end of the
+ * file extent item.
+ */
+ if (range_end <= cache_end)
+ return 0;
+
+ if (!(flags & (FIEMAP_EXTENT_ENCODED | FIEMAP_EXTENT_DELALLOC)))
+ phys += cache_end - offset;
+
+ offset = cache_end;
+ len = range_end - cache_end;
+ goto emit;
+ }
}
/*
@@ -2517,6 +2605,7 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
return 0;
}
+emit:
/* Not mergeable, need to submit cached one */
ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
cache->len, cache->flags);
@@ -2689,16 +2778,34 @@ static int fiemap_process_hole(struct btrfs_inode *inode,
* it beyond i_size.
*/
while (cur_offset < end && cur_offset < i_size) {
+ struct extent_state *cached_state = NULL;
u64 delalloc_start;
u64 delalloc_end;
u64 prealloc_start;
+ u64 lockstart;
+ u64 lockend;
u64 prealloc_len = 0;
bool delalloc;
+ lockstart = round_down(cur_offset, inode->root->fs_info->sectorsize);
+ lockend = round_up(end, inode->root->fs_info->sectorsize);
+
+ /*
+ * We are only locking for the delalloc range because that's the
+ * only thing that can change here. With fiemap we have a lock
+ * on the inode, so no buffered or direct writes can happen.
+ *
+ * However mmaps and normal page writeback will cause this to
+ * change arbitrarily. We have to lock the extent lock here to
+ * make sure that nobody messes with the tree while we're doing
+ * btrfs_find_delalloc_in_range.
+ */
+ lock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
delalloc = btrfs_find_delalloc_in_range(inode, cur_offset, end,
delalloc_cached_state,
&delalloc_start,
&delalloc_end);
+ unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
if (!delalloc)
break;
@@ -2866,15 +2973,15 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len)
{
const u64 ino = btrfs_ino(inode);
- struct extent_state *cached_state = NULL;
struct extent_state *delalloc_cached_state = NULL;
struct btrfs_path *path;
struct fiemap_cache cache = { 0 };
struct btrfs_backref_share_check_ctx *backref_ctx;
u64 last_extent_end;
u64 prev_extent_end;
- u64 lockstart;
- u64 lockend;
+ u64 range_start;
+ u64 range_end;
+ const u64 sectorsize = inode->root->fs_info->sectorsize;
bool stopped = false;
int ret;
@@ -2885,22 +2992,19 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
goto out;
}
- lockstart = round_down(start, inode->root->fs_info->sectorsize);
- lockend = round_up(start + len, inode->root->fs_info->sectorsize);
- prev_extent_end = lockstart;
-
- btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
- lock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
+ range_start = round_down(start, sectorsize);
+ range_end = round_up(start + len, sectorsize);
+ prev_extent_end = range_start;
ret = fiemap_find_last_extent_offset(inode, path, &last_extent_end);
if (ret < 0)
- goto out_unlock;
+ goto out;
btrfs_release_path(path);
path->reada = READA_FORWARD;
- ret = fiemap_search_slot(inode, path, lockstart);
+ ret = fiemap_search_slot(inode, path, range_start);
if (ret < 0) {
- goto out_unlock;
+ goto out;
} else if (ret > 0) {
/*
* No file extent item found, but we may have delalloc between
@@ -2910,7 +3014,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
goto check_eof_delalloc;
}
- while (prev_extent_end < lockend) {
+ while (prev_extent_end < range_end) {
struct extent_buffer *leaf = path->nodes[0];
struct btrfs_file_extent_item *ei;
struct btrfs_key key;
@@ -2933,21 +3037,21 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
* The first iteration can leave us at an extent item that ends
* before our range's start. Move to the next item.
*/
- if (extent_end <= lockstart)
+ if (extent_end <= range_start)
goto next_item;
backref_ctx->curr_leaf_bytenr = leaf->start;
/* We have in implicit hole (NO_HOLES feature enabled). */
if (prev_extent_end < key.offset) {
- const u64 range_end = min(key.offset, lockend) - 1;
+ const u64 hole_end = min(key.offset, range_end) - 1;
ret = fiemap_process_hole(inode, fieinfo, &cache,
&delalloc_cached_state,
backref_ctx, 0, 0, 0,
- prev_extent_end, range_end);
+ prev_extent_end, hole_end);
if (ret < 0) {
- goto out_unlock;
+ goto out;
} else if (ret > 0) {
/* fiemap_fill_next_extent() told us to stop. */
stopped = true;
@@ -2955,7 +3059,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
}
/* We've reached the end of the fiemap range, stop. */
- if (key.offset >= lockend) {
+ if (key.offset >= range_end) {
stopped = true;
break;
}
@@ -3003,7 +3107,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
extent_gen,
backref_ctx);
if (ret < 0)
- goto out_unlock;
+ goto out;
else if (ret > 0)
flags |= FIEMAP_EXTENT_SHARED;
}
@@ -3014,7 +3118,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
}
if (ret < 0) {
- goto out_unlock;
+ goto out;
} else if (ret > 0) {
/* fiemap_fill_next_extent() told us to stop. */
stopped = true;
@@ -3025,12 +3129,12 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
next_item:
if (fatal_signal_pending(current)) {
ret = -EINTR;
- goto out_unlock;
+ goto out;
}
ret = fiemap_next_leaf_item(inode, path);
if (ret < 0) {
- goto out_unlock;
+ goto out;
} else if (ret > 0) {
/* No more file extent items for this inode. */
break;
@@ -3049,29 +3153,41 @@ check_eof_delalloc:
btrfs_free_path(path);
path = NULL;
- if (!stopped && prev_extent_end < lockend) {
+ if (!stopped && prev_extent_end < range_end) {
ret = fiemap_process_hole(inode, fieinfo, &cache,
&delalloc_cached_state, backref_ctx,
- 0, 0, 0, prev_extent_end, lockend - 1);
+ 0, 0, 0, prev_extent_end, range_end - 1);
if (ret < 0)
- goto out_unlock;
- prev_extent_end = lockend;
+ goto out;
+ prev_extent_end = range_end;
}
if (cache.cached && cache.offset + cache.len >= last_extent_end) {
const u64 i_size = i_size_read(&inode->vfs_inode);
if (prev_extent_end < i_size) {
+ struct extent_state *cached_state = NULL;
u64 delalloc_start;
u64 delalloc_end;
+ u64 lockstart;
+ u64 lockend;
bool delalloc;
+ lockstart = round_down(prev_extent_end, sectorsize);
+ lockend = round_up(i_size, sectorsize);
+
+ /*
+ * See the comment in fiemap_process_hole as to why
+ * we're doing the locking here.
+ */
+ lock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
delalloc = btrfs_find_delalloc_in_range(inode,
prev_extent_end,
i_size - 1,
&delalloc_cached_state,
&delalloc_start,
&delalloc_end);
+ unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
if (!delalloc)
cache.flags |= FIEMAP_EXTENT_LAST;
} else {
@@ -3080,10 +3196,6 @@ check_eof_delalloc:
}
ret = emit_last_fiemap_cache(fieinfo, &cache);
-
-out_unlock:
- unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
- btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
out:
free_extent_state(delalloc_cached_state);
btrfs_free_backref_share_ctx(backref_ctx);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1eb93d3962aa..4795738d5785 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3184,8 +3184,23 @@ out:
unwritten_start += logical_len;
clear_extent_uptodate(io_tree, unwritten_start, end, NULL);
- /* Drop extent maps for the part of the extent we didn't write. */
- btrfs_drop_extent_map_range(inode, unwritten_start, end, false);
+ /*
+ * Drop extent maps for the part of the extent we didn't write.
+ *
+ * We have an exception here for the free_space_inode, this is
+ * because when we do btrfs_get_extent() on the free space inode
+ * we will search the commit root. If this is a new block group
+ * we won't find anything, and we will trip over the assert in
+ * writepage where we do ASSERT(em->block_start !=
+ * EXTENT_MAP_HOLE).
+ *
+ * Theoretically we could also skip this for any NOCOW extent as
+ * we don't mess with the extent map tree in the NOCOW case, but
+ * for now simply skip this if we are the free space inode.
+ */
+ if (!btrfs_is_free_space_inode(inode))
+ btrfs_drop_extent_map_range(inode, unwritten_start,
+ end, false);
/*
* If the ordered extent had an IOERR or something else went
@@ -7820,6 +7835,7 @@ struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter,
static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len)
{
+ struct btrfs_inode *btrfs_inode = BTRFS_I(inode);
int ret;
ret = fiemap_prep(inode, fieinfo, start, &len, 0);
@@ -7845,7 +7861,26 @@ static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
return ret;
}
- return extent_fiemap(BTRFS_I(inode), fieinfo, start, len);
+ btrfs_inode_lock(btrfs_inode, BTRFS_ILOCK_SHARED);
+
+ /*
+ * We did an initial flush to avoid holding the inode's lock while
+ * triggering writeback and waiting for the completion of IO and ordered
+ * extents. Now after we locked the inode we do it again, because it's
+ * possible a new write may have happened in between those two steps.
+ */
+ if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) {
+ ret = btrfs_wait_ordered_range(inode, 0, LLONG_MAX);
+ if (ret) {
+ btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED);
+ return ret;
+ }
+ }
+
+ ret = extent_fiemap(btrfs_inode, fieinfo, start, len);
+ btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED);
+
+ return ret;
}
static int btrfs_writepages(struct address_space *mapping,
@@ -10273,6 +10308,13 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
if (encoded->encryption != BTRFS_ENCODED_IO_ENCRYPTION_NONE)
return -EINVAL;
+ /*
+ * Compressed extents should always have checksums, so error out if we
+ * have a NOCOW file or inode was created while mounted with NODATASUM.
+ */
+ if (inode->flags & BTRFS_INODE_NODATASUM)
+ return -EINVAL;
+
orig_count = iov_iter_count(from);
/* The extent size must be sane. */
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index dfed9dd9c2d7..9876ee27f069 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -721,7 +721,7 @@ static noinline int create_subvol(struct mnt_idmap *idmap,
free_extent_buffer(leaf);
leaf = NULL;
- new_root = btrfs_get_new_fs_root(fs_info, objectid, anon_dev);
+ new_root = btrfs_get_new_fs_root(fs_info, objectid, &anon_dev);
if (IS_ERR(new_root)) {
ret = PTR_ERR(new_root);
btrfs_abort_transaction(trans, ret);
@@ -2698,7 +2698,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ioctl_vol_args_v2 *vol_args;
- struct bdev_handle *bdev_handle = NULL;
+ struct file *bdev_file = NULL;
int ret;
bool cancel = false;
@@ -2735,7 +2735,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
goto err_drop;
/* Exclusive operation is now claimed */
- ret = btrfs_rm_device(fs_info, &args, &bdev_handle);
+ ret = btrfs_rm_device(fs_info, &args, &bdev_file);
btrfs_exclop_finish(fs_info);
@@ -2749,8 +2749,8 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
}
err_drop:
mnt_drop_write_file(file);
- if (bdev_handle)
- bdev_release(bdev_handle);
+ if (bdev_file)
+ fput(bdev_file);
out:
btrfs_put_dev_args_from_path(&args);
kfree(vol_args);
@@ -2763,7 +2763,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ioctl_vol_args *vol_args;
- struct bdev_handle *bdev_handle = NULL;
+ struct file *bdev_file = NULL;
int ret;
bool cancel = false;
@@ -2790,15 +2790,15 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE,
cancel);
if (ret == 0) {
- ret = btrfs_rm_device(fs_info, &args, &bdev_handle);
+ ret = btrfs_rm_device(fs_info, &args, &bdev_file);
if (!ret)
btrfs_info(fs_info, "disk deleted %s", vol_args->name);
btrfs_exclop_finish(fs_info);
}
mnt_drop_write_file(file);
- if (bdev_handle)
- bdev_release(bdev_handle);
+ if (bdev_file)
+ fput(bdev_file);
out:
btrfs_put_dev_args_from_path(&args);
kfree(vol_args);
@@ -3815,6 +3815,11 @@ static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)
goto out;
}
+ if (sa->create && is_fstree(sa->qgroupid)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
trans = btrfs_join_transaction(root);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 63b426cc7798..5470e1cdf10c 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1736,6 +1736,15 @@ out:
return ret;
}
+static bool qgroup_has_usage(struct btrfs_qgroup *qgroup)
+{
+ return (qgroup->rfer > 0 || qgroup->rfer_cmpr > 0 ||
+ qgroup->excl > 0 || qgroup->excl_cmpr > 0 ||
+ qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA] > 0 ||
+ qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] > 0 ||
+ qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS] > 0);
+}
+
int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
@@ -1755,6 +1764,11 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
goto out;
}
+ if (is_fstree(qgroupid) && qgroup_has_usage(qgroup)) {
+ ret = -EBUSY;
+ goto out;
+ }
+
/* Check if there are no children of this qgroup */
if (!list_empty(&qgroup->members)) {
ret = -EBUSY;
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 2d7519a6ce72..e48a063ef085 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -6705,11 +6705,20 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
if (ret)
goto out;
}
- if (sctx->cur_inode_last_extent <
- sctx->cur_inode_size) {
- ret = send_hole(sctx, sctx->cur_inode_size);
- if (ret)
+ if (sctx->cur_inode_last_extent < sctx->cur_inode_size) {
+ ret = range_is_hole_in_parent(sctx,
+ sctx->cur_inode_last_extent,
+ sctx->cur_inode_size);
+ if (ret < 0) {
goto out;
+ } else if (ret == 0) {
+ ret = send_hole(sctx, sctx->cur_inode_size);
+ if (ret < 0)
+ goto out;
+ } else {
+ /* Range is already a hole, skip. */
+ ret = 0;
+ }
}
}
if (need_truncate) {
@@ -8111,7 +8120,7 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg)
}
if (arg->flags & ~BTRFS_SEND_FLAG_MASK) {
- ret = -EINVAL;
+ ret = -EOPNOTSUPP;
goto out;
}
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index 571bb13587d5..3b54eb583474 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -856,7 +856,7 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info)
{
- u64 global_rsv_size = fs_info->global_block_rsv.reserved;
+ const u64 global_rsv_size = btrfs_block_rsv_reserved(&fs_info->global_block_rsv);
u64 ordered, delalloc;
u64 thresh;
u64 used;
@@ -956,8 +956,8 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
ordered = percpu_counter_read_positive(&fs_info->ordered_bytes) >> 1;
delalloc = percpu_counter_read_positive(&fs_info->delalloc_bytes);
if (ordered >= delalloc)
- used += fs_info->delayed_refs_rsv.reserved +
- fs_info->delayed_block_rsv.reserved;
+ used += btrfs_block_rsv_reserved(&fs_info->delayed_refs_rsv) +
+ btrfs_block_rsv_reserved(&fs_info->delayed_block_rsv);
else
used += space_info->bytes_may_use - global_rsv_size;
@@ -1173,7 +1173,7 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
enum btrfs_flush_state flush;
u64 delalloc_size = 0;
u64 to_reclaim, block_rsv_size;
- u64 global_rsv_size = global_rsv->reserved;
+ const u64 global_rsv_size = btrfs_block_rsv_reserved(global_rsv);
loops++;
@@ -1185,9 +1185,9 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
* assume it's tied up in delalloc reservations.
*/
block_rsv_size = global_rsv_size +
- delayed_block_rsv->reserved +
- delayed_refs_rsv->reserved +
- trans_rsv->reserved;
+ btrfs_block_rsv_reserved(delayed_block_rsv) +
+ btrfs_block_rsv_reserved(delayed_refs_rsv) +
+ btrfs_block_rsv_reserved(trans_rsv);
if (block_rsv_size < space_info->bytes_may_use)
delalloc_size = space_info->bytes_may_use - block_rsv_size;
@@ -1207,16 +1207,16 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
to_reclaim = delalloc_size;
flush = FLUSH_DELALLOC;
} else if (space_info->bytes_pinned >
- (delayed_block_rsv->reserved +
- delayed_refs_rsv->reserved)) {
+ (btrfs_block_rsv_reserved(delayed_block_rsv) +
+ btrfs_block_rsv_reserved(delayed_refs_rsv))) {
to_reclaim = space_info->bytes_pinned;
flush = COMMIT_TRANS;
- } else if (delayed_block_rsv->reserved >
- delayed_refs_rsv->reserved) {
- to_reclaim = delayed_block_rsv->reserved;
+ } else if (btrfs_block_rsv_reserved(delayed_block_rsv) >
+ btrfs_block_rsv_reserved(delayed_refs_rsv)) {
+ to_reclaim = btrfs_block_rsv_reserved(delayed_block_rsv);
flush = FLUSH_DELAYED_ITEMS_NR;
} else {
- to_reclaim = delayed_refs_rsv->reserved;
+ to_reclaim = btrfs_block_rsv_reserved(delayed_refs_rsv);
flush = FLUSH_DELAYED_REFS_NR;
}
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 5b3333ceef04..bf8e64c766b6 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -564,56 +564,22 @@ static int btrfs_reserve_trans_metadata(struct btrfs_fs_info *fs_info,
u64 num_bytes,
u64 *delayed_refs_bytes)
{
- struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
struct btrfs_space_info *si = fs_info->trans_block_rsv.space_info;
- u64 extra_delayed_refs_bytes = 0;
- u64 bytes;
+ u64 bytes = num_bytes + *delayed_refs_bytes;
int ret;
/*
- * If there's a gap between the size of the delayed refs reserve and
- * its reserved space, than some tasks have added delayed refs or bumped
- * its size otherwise (due to block group creation or removal, or block
- * group item update). Also try to allocate that gap in order to prevent
- * using (and possibly abusing) the global reserve when committing the
- * transaction.
- */
- if (flush == BTRFS_RESERVE_FLUSH_ALL &&
- !btrfs_block_rsv_full(delayed_refs_rsv)) {
- spin_lock(&delayed_refs_rsv->lock);
- if (delayed_refs_rsv->size > delayed_refs_rsv->reserved)
- extra_delayed_refs_bytes = delayed_refs_rsv->size -
- delayed_refs_rsv->reserved;
- spin_unlock(&delayed_refs_rsv->lock);
- }
-
- bytes = num_bytes + *delayed_refs_bytes + extra_delayed_refs_bytes;
-
- /*
* We want to reserve all the bytes we may need all at once, so we only
* do 1 enospc flushing cycle per transaction start.
*/
ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush);
- if (ret == 0) {
- if (extra_delayed_refs_bytes > 0)
- btrfs_migrate_to_delayed_refs_rsv(fs_info,
- extra_delayed_refs_bytes);
- return 0;
- }
-
- if (extra_delayed_refs_bytes > 0) {
- bytes -= extra_delayed_refs_bytes;
- ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush);
- if (ret == 0)
- return 0;
- }
/*
* If we are an emergency flush, which can steal from the global block
* reserve, then attempt to not reserve space for the delayed refs, as
* we will consume space for them from the global block reserve.
*/
- if (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL) {
+ if (ret && flush == BTRFS_RESERVE_FLUSH_ALL_STEAL) {
bytes -= *delayed_refs_bytes;
*delayed_refs_bytes = 0;
ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush);
@@ -1868,7 +1834,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
}
key.offset = (u64)-1;
- pending->snap = btrfs_get_new_fs_root(fs_info, objectid, pending->anon_dev);
+ pending->snap = btrfs_get_new_fs_root(fs_info, objectid, &pending->anon_dev);
if (IS_ERR(pending->snap)) {
ret = PTR_ERR(pending->snap);
pending->snap = NULL;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index d67785be2c77..e180da4cc227 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -468,39 +468,39 @@ static noinline struct btrfs_fs_devices *find_fsid(
static int
btrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder,
- int flush, struct bdev_handle **bdev_handle,
+ int flush, struct file **bdev_file,
struct btrfs_super_block **disk_super)
{
struct block_device *bdev;
int ret;
- *bdev_handle = bdev_open_by_path(device_path, flags, holder, NULL);
+ *bdev_file = bdev_file_open_by_path(device_path, flags, holder, NULL);
- if (IS_ERR(*bdev_handle)) {
- ret = PTR_ERR(*bdev_handle);
+ if (IS_ERR(*bdev_file)) {
+ ret = PTR_ERR(*bdev_file);
goto error;
}
- bdev = (*bdev_handle)->bdev;
+ bdev = file_bdev(*bdev_file);
if (flush)
sync_blockdev(bdev);
ret = set_blocksize(bdev, BTRFS_BDEV_BLOCKSIZE);
if (ret) {
- bdev_release(*bdev_handle);
+ fput(*bdev_file);
goto error;
}
invalidate_bdev(bdev);
*disk_super = btrfs_read_dev_super(bdev);
if (IS_ERR(*disk_super)) {
ret = PTR_ERR(*disk_super);
- bdev_release(*bdev_handle);
+ fput(*bdev_file);
goto error;
}
return 0;
error:
- *bdev_handle = NULL;
+ *bdev_file = NULL;
return ret;
}
@@ -643,7 +643,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
struct btrfs_device *device, blk_mode_t flags,
void *holder)
{
- struct bdev_handle *bdev_handle;
+ struct file *bdev_file;
struct btrfs_super_block *disk_super;
u64 devid;
int ret;
@@ -654,7 +654,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
return -EINVAL;
ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
- &bdev_handle, &disk_super);
+ &bdev_file, &disk_super);
if (ret)
return ret;
@@ -678,20 +678,20 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
fs_devices->seeding = true;
} else {
- if (bdev_read_only(bdev_handle->bdev))
+ if (bdev_read_only(file_bdev(bdev_file)))
clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
else
set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
}
- if (!bdev_nonrot(bdev_handle->bdev))
+ if (!bdev_nonrot(file_bdev(bdev_file)))
fs_devices->rotating = true;
- if (bdev_max_discard_sectors(bdev_handle->bdev))
+ if (bdev_max_discard_sectors(file_bdev(bdev_file)))
fs_devices->discardable = true;
- device->bdev_handle = bdev_handle;
- device->bdev = bdev_handle->bdev;
+ device->bdev_file = bdev_file;
+ device->bdev = file_bdev(bdev_file);
clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
fs_devices->open_devices++;
@@ -706,7 +706,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
error_free_page:
btrfs_release_disk_super(disk_super);
- bdev_release(bdev_handle);
+ fput(bdev_file);
return -EINVAL;
}
@@ -1015,10 +1015,10 @@ static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
if (device->devid == BTRFS_DEV_REPLACE_DEVID)
continue;
- if (device->bdev_handle) {
- bdev_release(device->bdev_handle);
+ if (device->bdev_file) {
+ fput(device->bdev_file);
device->bdev = NULL;
- device->bdev_handle = NULL;
+ device->bdev_file = NULL;
fs_devices->open_devices--;
}
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
@@ -1063,7 +1063,7 @@ static void btrfs_close_bdev(struct btrfs_device *device)
invalidate_bdev(device->bdev);
}
- bdev_release(device->bdev_handle);
+ fput(device->bdev_file);
}
static void btrfs_close_one_device(struct btrfs_device *device)
@@ -1316,7 +1316,7 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
struct btrfs_super_block *disk_super;
bool new_device_added = false;
struct btrfs_device *device = NULL;
- struct bdev_handle *bdev_handle;
+ struct file *bdev_file;
u64 bytenr, bytenr_orig;
int ret;
@@ -1339,18 +1339,18 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
* values temporarily, as the device paths of the fsid are the only
* required information for assembling the volume.
*/
- bdev_handle = bdev_open_by_path(path, flags, NULL, NULL);
- if (IS_ERR(bdev_handle))
- return ERR_CAST(bdev_handle);
+ bdev_file = bdev_file_open_by_path(path, flags, NULL, NULL);
+ if (IS_ERR(bdev_file))
+ return ERR_CAST(bdev_file);
bytenr_orig = btrfs_sb_offset(0);
- ret = btrfs_sb_log_location_bdev(bdev_handle->bdev, 0, READ, &bytenr);
+ ret = btrfs_sb_log_location_bdev(file_bdev(bdev_file), 0, READ, &bytenr);
if (ret) {
device = ERR_PTR(ret);
goto error_bdev_put;
}
- disk_super = btrfs_read_disk_super(bdev_handle->bdev, bytenr,
+ disk_super = btrfs_read_disk_super(file_bdev(bdev_file), bytenr,
bytenr_orig);
if (IS_ERR(disk_super)) {
device = ERR_CAST(disk_super);
@@ -1381,7 +1381,7 @@ free_disk_super:
btrfs_release_disk_super(disk_super);
error_bdev_put:
- bdev_release(bdev_handle);
+ fput(bdev_file);
return device;
}
@@ -2057,7 +2057,7 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
int btrfs_rm_device(struct btrfs_fs_info *fs_info,
struct btrfs_dev_lookup_args *args,
- struct bdev_handle **bdev_handle)
+ struct file **bdev_file)
{
struct btrfs_trans_handle *trans;
struct btrfs_device *device;
@@ -2166,7 +2166,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
btrfs_assign_next_active_device(device, NULL);
- if (device->bdev_handle) {
+ if (device->bdev_file) {
cur_devices->open_devices--;
/* remove sysfs entry */
btrfs_sysfs_remove_device(device);
@@ -2182,9 +2182,9 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
* free the device.
*
* We cannot call btrfs_close_bdev() here because we're holding the sb
- * write lock, and bdev_release() will pull in the ->open_mutex on
- * the block device and it's dependencies. Instead just flush the
- * device and let the caller do the final bdev_release.
+ * write lock, and fput() on the block device will pull in the
+ * ->open_mutex on the block device and it's dependencies. Instead
+ * just flush the device and let the caller do the final bdev_release.
*/
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
btrfs_scratch_superblocks(fs_info, device->bdev,
@@ -2195,7 +2195,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
}
}
- *bdev_handle = device->bdev_handle;
+ *bdev_file = device->bdev_file;
synchronize_rcu();
btrfs_free_device(device);
@@ -2332,7 +2332,7 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info,
const char *path)
{
struct btrfs_super_block *disk_super;
- struct bdev_handle *bdev_handle;
+ struct file *bdev_file;
int ret;
if (!path || !path[0])
@@ -2350,7 +2350,7 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info,
}
ret = btrfs_get_bdev_and_sb(path, BLK_OPEN_READ, NULL, 0,
- &bdev_handle, &disk_super);
+ &bdev_file, &disk_super);
if (ret) {
btrfs_put_dev_args_from_path(args);
return ret;
@@ -2363,7 +2363,7 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info,
else
memcpy(args->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
btrfs_release_disk_super(disk_super);
- bdev_release(bdev_handle);
+ fput(bdev_file);
return 0;
}
@@ -2583,7 +2583,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
struct btrfs_root *root = fs_info->dev_root;
struct btrfs_trans_handle *trans;
struct btrfs_device *device;
- struct bdev_handle *bdev_handle;
+ struct file *bdev_file;
struct super_block *sb = fs_info->sb;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_fs_devices *seed_devices = NULL;
@@ -2596,12 +2596,12 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
if (sb_rdonly(sb) && !fs_devices->seeding)
return -EROFS;
- bdev_handle = bdev_open_by_path(device_path, BLK_OPEN_WRITE,
+ bdev_file = bdev_file_open_by_path(device_path, BLK_OPEN_WRITE,
fs_info->bdev_holder, NULL);
- if (IS_ERR(bdev_handle))
- return PTR_ERR(bdev_handle);
+ if (IS_ERR(bdev_file))
+ return PTR_ERR(bdev_file);
- if (!btrfs_check_device_zone_type(fs_info, bdev_handle->bdev)) {
+ if (!btrfs_check_device_zone_type(fs_info, file_bdev(bdev_file))) {
ret = -EINVAL;
goto error;
}
@@ -2613,11 +2613,11 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
locked = true;
}
- sync_blockdev(bdev_handle->bdev);
+ sync_blockdev(file_bdev(bdev_file));
rcu_read_lock();
list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
- if (device->bdev == bdev_handle->bdev) {
+ if (device->bdev == file_bdev(bdev_file)) {
ret = -EEXIST;
rcu_read_unlock();
goto error;
@@ -2633,8 +2633,8 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
}
device->fs_info = fs_info;
- device->bdev_handle = bdev_handle;
- device->bdev = bdev_handle->bdev;
+ device->bdev_file = bdev_file;
+ device->bdev = file_bdev(bdev_file);
ret = lookup_bdev(device_path, &device->devt);
if (ret)
goto error_free_device;
@@ -2817,7 +2817,7 @@ error_free_zone:
error_free_device:
btrfs_free_device(device);
error:
- bdev_release(bdev_handle);
+ fput(bdev_file);
if (locked) {
mutex_unlock(&uuid_mutex);
up_write(&sb->s_umount);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 53f87f398da7..a11854912d53 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -90,7 +90,7 @@ struct btrfs_device {
u64 generation;
- struct bdev_handle *bdev_handle;
+ struct file *bdev_file;
struct block_device *bdev;
struct btrfs_zoned_device_info *zone_info;
@@ -661,7 +661,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args);
int btrfs_rm_device(struct btrfs_fs_info *fs_info,
struct btrfs_dev_lookup_args *args,
- struct bdev_handle **bdev_handle);
+ struct file **bdev_file);
void __exit btrfs_cleanup_fs_uuids(void);
int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len);
int btrfs_grow_device(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 168af9d000d1..aea51fd850cd 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -824,11 +824,14 @@ static int sb_log_location(struct block_device *bdev, struct blk_zone *zones,
reset = &zones[1];
if (reset && reset->cond != BLK_ZONE_COND_EMPTY) {
+ unsigned int nofs_flags;
+
ASSERT(sb_zone_is_full(reset));
+ nofs_flags = memalloc_nofs_save();
ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
- reset->start, reset->len,
- GFP_NOFS);
+ reset->start, reset->len);
+ memalloc_nofs_restore(nofs_flags);
if (ret)
return ret;
@@ -974,11 +977,14 @@ int btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
* explicit ZONE_FINISH is not necessary.
*/
if (zone->wp != zone->start + zone->capacity) {
+ unsigned int nofs_flags;
int ret;
+ nofs_flags = memalloc_nofs_save();
ret = blkdev_zone_mgmt(device->bdev,
REQ_OP_ZONE_FINISH, zone->start,
- zone->len, GFP_NOFS);
+ zone->len);
+ memalloc_nofs_restore(nofs_flags);
if (ret)
return ret;
}
@@ -996,11 +1002,13 @@ int btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
{
+ unsigned int nofs_flags;
sector_t zone_sectors;
sector_t nr_sectors;
u8 zone_sectors_shift;
u32 sb_zone;
u32 nr_zones;
+ int ret;
zone_sectors = bdev_zone_sectors(bdev);
zone_sectors_shift = ilog2(zone_sectors);
@@ -1011,9 +1019,12 @@ int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
if (sb_zone + 1 >= nr_zones)
return -ENOENT;
- return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
- zone_start_sector(sb_zone, bdev),
- zone_sectors * BTRFS_NR_SB_LOG_ZONES, GFP_NOFS);
+ nofs_flags = memalloc_nofs_save();
+ ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
+ zone_start_sector(sb_zone, bdev),
+ zone_sectors * BTRFS_NR_SB_LOG_ZONES);
+ memalloc_nofs_restore(nofs_flags);
+ return ret;
}
/*
@@ -1124,12 +1135,14 @@ static void btrfs_dev_clear_active_zone(struct btrfs_device *device, u64 pos)
int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical,
u64 length, u64 *bytes)
{
+ unsigned int nofs_flags;
int ret;
*bytes = 0;
+ nofs_flags = memalloc_nofs_save();
ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_RESET,
- physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT,
- GFP_NOFS);
+ physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT);
+ memalloc_nofs_restore(nofs_flags);
if (ret)
return ret;
@@ -1639,6 +1652,15 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
}
out:
+ /* Reject non SINGLE data profiles without RST */
+ if ((map->type & BTRFS_BLOCK_GROUP_DATA) &&
+ (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) &&
+ !fs_info->stripe_root) {
+ btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
+ btrfs_bg_type_to_raid_name(map->type));
+ return -EINVAL;
+ }
+
if (cache->alloc_offset > cache->zone_capacity) {
btrfs_err(fs_info,
"zoned: invalid write pointer %llu (larger than zone capacity %llu) in block group %llu",
@@ -1670,6 +1692,7 @@ out:
}
bitmap_free(active);
kfree(zone_info);
+ btrfs_free_chunk_map(map);
return ret;
}
@@ -2234,14 +2257,16 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ
struct btrfs_device *device = map->stripes[i].dev;
const u64 physical = map->stripes[i].physical;
struct btrfs_zoned_device_info *zinfo = device->zone_info;
+ unsigned int nofs_flags;
if (zinfo->max_active_zones == 0)
continue;
+ nofs_flags = memalloc_nofs_save();
ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
physical >> SECTOR_SHIFT,
- zinfo->zone_size >> SECTOR_SHIFT,
- GFP_NOFS);
+ zinfo->zone_size >> SECTOR_SHIFT);
+ memalloc_nofs_restore(nofs_flags);
if (ret)
return ret;