Diffstat (limited to 'fs/btrfs/qgroup.c')
-rw-r--r-- | fs/btrfs/qgroup.c | 455 |
1 file changed, 438 insertions, 17 deletions
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index d4917c0cddf5..45868fd76209 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1416,13 +1416,14 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) if (!qgroup) { ret = -ENOENT; goto out; - } else { - /* check if there are no children of this qgroup */ - if (!list_empty(&qgroup->members)) { - ret = -EBUSY; - goto out; - } } + + /* Check if there are no children of this qgroup */ + if (!list_empty(&qgroup->members)) { + ret = -EBUSY; + goto out; + } + ret = del_qgroup_item(trans, qgroupid); if (ret && ret != -ENOENT) goto out; @@ -1712,6 +1713,416 @@ static int adjust_slots_upwards(struct btrfs_path *path, int root_level) return 0; } +/* + * Helper function to trace a subtree tree block swap. + * + * The swap will happen in the highest tree block, but there may be a lot of + * tree blocks involved. + * + * For example: + * OO = Old tree blocks + * NN = New tree blocks allocated during balance + * + * File tree (257) Reloc tree for 257 + * L2 OO NN + * / \ / \ + * L1 OO OO (a) OO NN (a) + * / \ / \ / \ / \ + * L0 OO OO OO OO OO OO NN NN + * (b) (c) (b) (c) + * + * When calling qgroup_trace_extent_swap(), we will pass: + * @src_eb = OO(a) + * @dst_path = [ nodes[1] = NN(a), nodes[0] = NN(c) ] + * @dst_level = 0 + * @root_level = 1 + * + * In that case, qgroup_trace_extent_swap() will search from OO(a) to + * reach OO(c), then mark both OO(c) and NN(c) as qgroup dirty. + * + * The main work of qgroup_trace_extent_swap() can be split into 3 parts: + * + * 1) Tree search from @src_eb + * It should act as a simplified btrfs_search_slot(). + * The key for the search can be extracted from @dst_path->nodes[dst_level] + * (first key). + * + * 2) Mark the final tree blocks in @src_path and @dst_path qgroup dirty + * NOTE: In the above case, OO(a) and NN(a) won't be marked qgroup dirty. + * They should be marked during the previous (@dst_level = 1) iteration. + * + * 3) Mark file extents in leaves dirty + * We don't have a good way to pick out new file extents only. + * So we still follow the old method by scanning all file extents in + * the leaf. + * + * This function can free us from keeping two paths, thus later we only need + * to care about how to iterate all new tree blocks in the reloc tree.
+ */ +static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans, + struct extent_buffer *src_eb, + struct btrfs_path *dst_path, + int dst_level, int root_level, + bool trace_leaf) +{ + struct btrfs_key key; + struct btrfs_path *src_path; + struct btrfs_fs_info *fs_info = trans->fs_info; + u32 nodesize = fs_info->nodesize; + int cur_level = root_level; + int ret; + + BUG_ON(dst_level > root_level); + /* Level mismatch */ + if (btrfs_header_level(src_eb) != root_level) + return -EINVAL; + + src_path = btrfs_alloc_path(); + if (!src_path) { + ret = -ENOMEM; + goto out; + } + + if (dst_level) + btrfs_node_key_to_cpu(dst_path->nodes[dst_level], &key, 0); + else + btrfs_item_key_to_cpu(dst_path->nodes[dst_level], &key, 0); + + /* For src_path */ + extent_buffer_get(src_eb); + src_path->nodes[root_level] = src_eb; + src_path->slots[root_level] = dst_path->slots[root_level]; + src_path->locks[root_level] = 0; + + /* A simplified version of btrfs_search_slot() */ + while (cur_level >= dst_level) { + struct btrfs_key src_key; + struct btrfs_key dst_key; + + if (src_path->nodes[cur_level] == NULL) { + struct btrfs_key first_key; + struct extent_buffer *eb; + int parent_slot; + u64 child_gen; + u64 child_bytenr; + + eb = src_path->nodes[cur_level + 1]; + parent_slot = src_path->slots[cur_level + 1]; + child_bytenr = btrfs_node_blockptr(eb, parent_slot); + child_gen = btrfs_node_ptr_generation(eb, parent_slot); + btrfs_node_key_to_cpu(eb, &first_key, parent_slot); + + eb = read_tree_block(fs_info, child_bytenr, child_gen, + cur_level, &first_key); + if (IS_ERR(eb)) { + ret = PTR_ERR(eb); + goto out; + } else if (!extent_buffer_uptodate(eb)) { + free_extent_buffer(eb); + ret = -EIO; + goto out; + } + + src_path->nodes[cur_level] = eb; + + btrfs_tree_read_lock(eb); + btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + src_path->locks[cur_level] = BTRFS_READ_LOCK_BLOCKING; + } + + src_path->slots[cur_level] = dst_path->slots[cur_level]; + if (cur_level) { + btrfs_node_key_to_cpu(dst_path->nodes[cur_level], + &dst_key, dst_path->slots[cur_level]); + btrfs_node_key_to_cpu(src_path->nodes[cur_level], + &src_key, src_path->slots[cur_level]); + } else { + btrfs_item_key_to_cpu(dst_path->nodes[cur_level], + &dst_key, dst_path->slots[cur_level]); + btrfs_item_key_to_cpu(src_path->nodes[cur_level], + &src_key, src_path->slots[cur_level]); + } + /* Content mismatch, something went wrong */ + if (btrfs_comp_cpu_keys(&dst_key, &src_key)) { + ret = -ENOENT; + goto out; + } + cur_level--; + } + + /* + * Now both @dst_path and @src_path have been populated, record the tree + * blocks for qgroup accounting. + */ + ret = btrfs_qgroup_trace_extent(trans, src_path->nodes[dst_level]->start, + nodesize, GFP_NOFS); + if (ret < 0) + goto out; + ret = btrfs_qgroup_trace_extent(trans, + dst_path->nodes[dst_level]->start, + nodesize, GFP_NOFS); + if (ret < 0) + goto out; + + /* Record leaf file extents */ + if (dst_level == 0 && trace_leaf) { + ret = btrfs_qgroup_trace_leaf_items(trans, src_path->nodes[0]); + if (ret < 0) + goto out; + ret = btrfs_qgroup_trace_leaf_items(trans, dst_path->nodes[0]); + } +out: + btrfs_free_path(src_path); + return ret; +} + +/* + * Helper function to do recursive generation-aware depth-first search, to + * locate all new tree blocks in a subtree of reloc tree. + * + * E.g. 
(OO = Old tree blocks, NN = New tree blocks, whose gen == last_snapshot) + * reloc tree + * L2 NN (a) + * / \ + * L1 OO NN (b) + * / \ / \ + * L0 OO OO OO NN + * (c) (d) + * If we pass: + * @dst_path = [ nodes[1] = NN(b), nodes[0] = NULL ], + * @cur_level = 1 + * @root_level = 1 + * + * We will iterate through tree blocks NN(b), NN(d) and inform qgroup to trace + * the above tree blocks along with their counterparts in the file tree. + * During the search, old tree block OO(c) will be skipped as the tree block swap + * won't affect OO(c). + */ +static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans, + struct extent_buffer *src_eb, + struct btrfs_path *dst_path, + int cur_level, int root_level, + u64 last_snapshot, bool trace_leaf) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct extent_buffer *eb; + bool need_cleanup = false; + int ret = 0; + int i; + + /* Level sanity check */ + if (cur_level < 0 || cur_level >= BTRFS_MAX_LEVEL || + root_level < 0 || root_level >= BTRFS_MAX_LEVEL || + root_level < cur_level) { + btrfs_err_rl(fs_info, + "%s: bad levels, cur_level=%d root_level=%d", + __func__, cur_level, root_level); + return -EUCLEAN; + } + + /* Read the tree block if needed */ + if (dst_path->nodes[cur_level] == NULL) { + struct btrfs_key first_key; + int parent_slot; + u64 child_gen; + u64 child_bytenr; + + /* + * dst_path->nodes[root_level] must be initialized before + * calling this function. + */ + if (cur_level == root_level) { + btrfs_err_rl(fs_info, + "%s: dst_path->nodes[%d] not initialized, root_level=%d cur_level=%d", + __func__, root_level, root_level, cur_level); + return -EUCLEAN; + } + + /* + * We need to get child blockptr/gen from parent before we can + * read it. + */ + eb = dst_path->nodes[cur_level + 1]; + parent_slot = dst_path->slots[cur_level + 1]; + child_bytenr = btrfs_node_blockptr(eb, parent_slot); + child_gen = btrfs_node_ptr_generation(eb, parent_slot); + btrfs_node_key_to_cpu(eb, &first_key, parent_slot); + + /* This node is old, no need to trace */ + if (child_gen < last_snapshot) + goto out; + + eb = read_tree_block(fs_info, child_bytenr, child_gen, + cur_level, &first_key); + if (IS_ERR(eb)) { + ret = PTR_ERR(eb); + goto out; + } else if (!extent_buffer_uptodate(eb)) { + free_extent_buffer(eb); + ret = -EIO; + goto out; + } + + dst_path->nodes[cur_level] = eb; + dst_path->slots[cur_level] = 0; + + btrfs_tree_read_lock(eb); + btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + dst_path->locks[cur_level] = BTRFS_READ_LOCK_BLOCKING; + need_cleanup = true; + } + + /* Now record this tree block and its counterpart for qgroups */ + ret = qgroup_trace_extent_swap(trans, src_eb, dst_path, cur_level, + root_level, trace_leaf); + if (ret < 0) + goto cleanup; + + eb = dst_path->nodes[cur_level]; + + if (cur_level > 0) { + /* Iterate all child tree blocks */ + for (i = 0; i < btrfs_header_nritems(eb); i++) { + /* Skip old tree blocks as they won't be swapped */ + if (btrfs_node_ptr_generation(eb, i) < last_snapshot) + continue; + dst_path->slots[cur_level] = i; + + /* Recursive call (at most 7 times) */ + ret = qgroup_trace_new_subtree_blocks(trans, src_eb, + dst_path, cur_level - 1, root_level, + last_snapshot, trace_leaf); + if (ret < 0) + goto cleanup; + } + } + +cleanup: + if (need_cleanup) { + /* Clean up */ + btrfs_tree_unlock_rw(dst_path->nodes[cur_level], + dst_path->locks[cur_level]); + free_extent_buffer(dst_path->nodes[cur_level]); + dst_path->nodes[cur_level] = NULL; + dst_path->slots[cur_level] = 0; + dst_path->locks[cur_level] = 0;
+ } +out: + return ret; +} + +/* + * Inform qgroup to trace the subtree swap used in balance. + * + * Unlike btrfs_qgroup_trace_subtree(), this function will only trace + * new tree blocks whose generation is equal to (or larger than) @last_snapshot. + * + * Will go down the tree block pointed by @dst_eb (pointed by @dst_parent and + * @dst_slot), and find any tree blocks whose generation is not smaller than + * @last_snapshot, and then go down @src_eb (pointed by @src_parent and + * @src_slot) to find the counterpart of the tree block, then mark both tree + * blocks as qgroup dirty, and skip all tree blocks whose generation is smaller + * than last_snapshot. + * + * This would skip tons of tree blocks traced by the original + * btrfs_qgroup_trace_subtree(), which could be the cause of a very slow balance + * if the file tree is large. + * + * @src_parent, @src_slot: pointer to src (file tree) eb. + * @dst_parent, @dst_slot: pointer to dst (reloc tree) eb. + */ +int btrfs_qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans, + struct btrfs_block_group_cache *bg_cache, + struct extent_buffer *src_parent, int src_slot, + struct extent_buffer *dst_parent, int dst_slot, + u64 last_snapshot) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_path *dst_path = NULL; + struct btrfs_key first_key; + struct extent_buffer *src_eb = NULL; + struct extent_buffer *dst_eb = NULL; + bool trace_leaf = false; + u64 child_gen; + u64 child_bytenr; + int level; + int ret; + + if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + return 0; + + /* Check parameter order */ + if (btrfs_node_ptr_generation(src_parent, src_slot) > + btrfs_node_ptr_generation(dst_parent, dst_slot)) { + btrfs_err_rl(fs_info, + "%s: bad parameter order, src_gen=%llu dst_gen=%llu", __func__, + btrfs_node_ptr_generation(src_parent, src_slot), + btrfs_node_ptr_generation(dst_parent, dst_slot)); + return -EUCLEAN; + } + + /* + * Only trace leaves if we're relocating data block groups; this could + * reduce tons of data extent tracing for meta/sys bg relocation.
+ */ + if (bg_cache->flags & BTRFS_BLOCK_GROUP_DATA) + trace_leaf = true; + /* Read out real @src_eb, pointed by @src_parent and @src_slot */ + child_bytenr = btrfs_node_blockptr(src_parent, src_slot); + child_gen = btrfs_node_ptr_generation(src_parent, src_slot); + btrfs_node_key_to_cpu(src_parent, &first_key, src_slot); + + src_eb = read_tree_block(fs_info, child_bytenr, child_gen, + btrfs_header_level(src_parent) - 1, &first_key); + if (IS_ERR(src_eb)) { + ret = PTR_ERR(src_eb); + goto out; + } + + /* Read out real @dst_eb, pointed by @dst_parent and @dst_slot */ + child_bytenr = btrfs_node_blockptr(dst_parent, dst_slot); + child_gen = btrfs_node_ptr_generation(dst_parent, dst_slot); + btrfs_node_key_to_cpu(dst_parent, &first_key, dst_slot); + + dst_eb = read_tree_block(fs_info, child_bytenr, child_gen, + btrfs_header_level(dst_parent) - 1, &first_key); + if (IS_ERR(dst_eb)) { + ret = PTR_ERR(dst_eb); + goto out; + } + + if (!extent_buffer_uptodate(src_eb) || !extent_buffer_uptodate(dst_eb)) { + ret = -EINVAL; + goto out; + } + + level = btrfs_header_level(dst_eb); + dst_path = btrfs_alloc_path(); + if (!dst_path) { + ret = -ENOMEM; + goto out; + } + + /* For dst_path */ + extent_buffer_get(dst_eb); + dst_path->nodes[level] = dst_eb; + dst_path->slots[level] = 0; + dst_path->locks[level] = 0; + + /* Do the generation-aware depth-first search */ + ret = qgroup_trace_new_subtree_blocks(trans, src_eb, dst_path, level, + level, last_snapshot, trace_leaf); + if (ret < 0) + goto out; + ret = 0; + +out: + free_extent_buffer(src_eb); + free_extent_buffer(dst_eb); + btrfs_free_path(dst_path); + if (ret < 0) + fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + return ret; +} + int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, struct extent_buffer *root_eb, u64 root_gen, int root_level) @@ -2132,6 +2543,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) struct btrfs_delayed_ref_root *delayed_refs; struct ulist *new_roots = NULL; struct rb_node *node; + u64 num_dirty_extents = 0; u64 qgroup_to_skip; int ret = 0; @@ -2141,6 +2553,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) record = rb_entry(node, struct btrfs_qgroup_extent_record, node); + num_dirty_extents++; trace_btrfs_qgroup_account_extents(fs_info, record); if (!ret) { @@ -2186,6 +2599,8 @@ cleanup: kfree(record); } + trace_qgroup_num_dirty_extents(fs_info, trans->transid, + num_dirty_extents); return ret; } @@ -2897,6 +3312,7 @@ qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info) qgroup->rfer_cmpr = 0; qgroup->excl = 0; qgroup->excl_cmpr = 0; + qgroup_dirty(fs_info, qgroup); } spin_unlock(&fs_info->qgroup_lock); } @@ -3004,7 +3420,7 @@ int btrfs_qgroup_reserve_data(struct inode *inode, int ret; if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags) || - !is_fstree(root->objectid) || len == 0) + !is_fstree(root->root_key.objectid) || len == 0) return 0; /* @reserved parameter is mandatory for qgroup */ @@ -3090,7 +3506,7 @@ static int qgroup_free_reserved_data(struct inode *inode, goto out; freed += changeset.bytes_changed; } - btrfs_qgroup_free_refroot(root->fs_info, root->objectid, freed, + btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid, freed, BTRFS_QGROUP_RSV_DATA); ret = freed; out: @@ -3106,6 +3522,10 @@ static int __btrfs_qgroup_release_data(struct inode *inode, int trace_op = QGROUP_RELEASE; int ret; + if (!test_bit(BTRFS_FS_QUOTA_ENABLED, + &BTRFS_I(inode)->root->fs_info->flags)) + return 0; + /* In release case, we shouldn't
have @reserved */ WARN_ON(!free && reserved); if (free && reserved) @@ -3122,7 +3542,7 @@ static int __btrfs_qgroup_release_data(struct inode *inode, changeset.bytes_changed, trace_op); if (free) btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info, - BTRFS_I(inode)->root->objectid, + BTRFS_I(inode)->root->root_key.objectid, changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA); ret = changeset.bytes_changed; out: @@ -3215,7 +3635,7 @@ int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, int ret; if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || - !is_fstree(root->objectid) || num_bytes == 0) + !is_fstree(root->root_key.objectid) || num_bytes == 0) return 0; BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize)); @@ -3240,13 +3660,13 @@ void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root) struct btrfs_fs_info *fs_info = root->fs_info; if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || - !is_fstree(root->objectid)) + !is_fstree(root->root_key.objectid)) return; /* TODO: Update trace point to handle such free */ trace_qgroup_meta_free_all_pertrans(root); /* Special value -1 means to free all reserved space */ - btrfs_qgroup_free_refroot(fs_info, root->objectid, (u64)-1, + btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid, (u64)-1, BTRFS_QGROUP_RSV_META_PERTRANS); } @@ -3256,7 +3676,7 @@ void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes, struct btrfs_fs_info *fs_info = root->fs_info; if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || - !is_fstree(root->objectid)) + !is_fstree(root->root_key.objectid)) return; /* @@ -3267,7 +3687,8 @@ void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes, num_bytes = sub_root_meta_rsv(root, num_bytes, type); BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize)); trace_qgroup_meta_reserve(root, type, -(s64)num_bytes); - btrfs_qgroup_free_refroot(fs_info, root->objectid, num_bytes, type); + btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid, + num_bytes, type); } static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root, @@ -3321,13 +3742,13 @@ void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes) struct btrfs_fs_info *fs_info = root->fs_info; if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || - !is_fstree(root->objectid)) + !is_fstree(root->root_key.objectid)) return; /* Same as btrfs_qgroup_free_meta_prealloc() */ num_bytes = sub_root_meta_rsv(root, num_bytes, BTRFS_QGROUP_RSV_META_PREALLOC); trace_qgroup_meta_convert(root, num_bytes); - qgroup_convert_meta(fs_info, root->objectid, num_bytes); + qgroup_convert_meta(fs_info, root->root_key.objectid, num_bytes); } /* @@ -3354,7 +3775,7 @@ void btrfs_qgroup_check_reserved_leak(struct inode *inode) inode->i_ino, unode->val, unode->aux); } btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info, - BTRFS_I(inode)->root->objectid, + BTRFS_I(inode)->root->root_key.objectid, changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA); } |