Diffstat (limited to 'fs/btrfs/transaction.c')
| -rw-r--r-- | fs/btrfs/transaction.c | 263 |
1 file changed, 160 insertions, 103 deletions
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 1c3a1189c0bd..1f1c25db6f6b 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -162,7 +162,17 @@ static noinline void switch_commit_roots(struct btrfs_trans_handle *trans)
         struct btrfs_root *root, *tmp;
         struct btrfs_caching_control *caching_ctl, *next;
 
+        /*
+         * At this point no one can be using this transaction to modify any tree
+         * and no one can start another transaction to modify any tree either.
+         */
+        ASSERT(cur_trans->state == TRANS_STATE_COMMIT_DOING);
+
         down_write(&fs_info->commit_root_sem);
+
+        if (test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags))
+                fs_info->last_reloc_trans = trans->transid;
+
         list_for_each_entry_safe(root, tmp, &cur_trans->switch_commits,
                                  dirty_list) {
                 list_del_init(&root->dirty_list);
@@ -413,7 +423,6 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans,
 
         if ((test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
             root->last_trans < trans->transid) || force) {
-                WARN_ON(root == fs_info->extent_root);
                 WARN_ON(!force && root->commit_root != root->node);
 
                 /*
@@ -628,7 +637,7 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
                         reloc_reserved = true;
                 }
 
-                ret = btrfs_block_rsv_add(root, rsv, num_bytes, flush);
+                ret = btrfs_block_rsv_add(fs_info, rsv, num_bytes, flush);
                 if (ret)
                         goto reserve_fail;
                 if (delayed_refs_bytes) {
@@ -692,7 +701,6 @@ again:
 
         h->transid = cur_trans->transid;
         h->transaction = cur_trans;
-        h->root = root;
         refcount_set(&h->use_count, 1);
         h->fs_info = root->fs_info;
 
@@ -846,7 +854,37 @@ btrfs_attach_transaction_barrier(struct btrfs_root *root)
 static noinline void wait_for_commit(struct btrfs_transaction *commit,
                                      const enum btrfs_trans_state min_state)
 {
-        wait_event(commit->commit_wait, commit->state >= min_state);
+        struct btrfs_fs_info *fs_info = commit->fs_info;
+        u64 transid = commit->transid;
+        bool put = false;
+
+        while (1) {
+                wait_event(commit->commit_wait, commit->state >= min_state);
+                if (put)
+                        btrfs_put_transaction(commit);
+
+                if (min_state < TRANS_STATE_COMPLETED)
+                        break;
+
+                /*
+                 * A transaction isn't really completed until all of the
+                 * previous transactions are completed, but with fsync we can
+                 * end up with SUPER_COMMITTED transactions before a COMPLETED
+                 * transaction. Wait for those.
+                 */
+
+                spin_lock(&fs_info->trans_lock);
+                commit = list_first_entry_or_null(&fs_info->trans_list,
+                                                  struct btrfs_transaction,
+                                                  list);
+                if (!commit || commit->transid > transid) {
+                        spin_unlock(&fs_info->trans_lock);
+                        break;
+                }
+                refcount_inc(&commit->use_count);
+                put = true;
+                spin_unlock(&fs_info->trans_lock);
+        }
 }
 
 int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid)
@@ -1236,6 +1274,12 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans)
         struct extent_buffer *eb;
         int ret;
 
+        /*
+         * At this point no one can be using this transaction to modify any tree
+         * and no one can start another transaction to modify any tree either.
+         */
+        ASSERT(trans->transaction->state == TRANS_STATE_COMMIT_DOING);
+
         eb = btrfs_lock_root_node(fs_info->tree_root);
         ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL,
                               0, &eb, BTRFS_NESTING_COW);
@@ -1267,9 +1311,8 @@ again:
                 root = list_entry(next, struct btrfs_root, dirty_list);
                 clear_bit(BTRFS_ROOT_DIRTY, &root->state);
 
-                if (root != fs_info->extent_root)
-                        list_add_tail(&root->dirty_list,
-                                      &trans->transaction->switch_commits);
+                list_add_tail(&root->dirty_list,
+                              &trans->transaction->switch_commits);
                 ret = update_cowonly_root(trans, root);
                 if (ret)
                         return ret;
@@ -1299,9 +1342,6 @@ again:
         if (!list_empty(&fs_info->dirty_cowonly_roots))
                 goto again;
 
-        list_add_tail(&fs_info->extent_root->dirty_list,
-                      &trans->transaction->switch_commits);
-
         /* Update dev-replace pointer once everything is committed */
         fs_info->dev_replace.committed_cursor_left =
                 fs_info->dev_replace.cursor_left_last_write_of_item;
@@ -1310,6 +1350,32 @@ again:
 }
 
 /*
+ * If we had a pending drop we need to see if there are any others left in our
+ * dead roots list, and if not clear our bit and wake any waiters.
+ */
+void btrfs_maybe_wake_unfinished_drop(struct btrfs_fs_info *fs_info)
+{
+        /*
+         * We put the drop in progress roots at the front of the list, so if the
+         * first entry doesn't have UNFINISHED_DROP set we can wake everybody
+         * up.
+         */
+        spin_lock(&fs_info->trans_lock);
+        if (!list_empty(&fs_info->dead_roots)) {
+                struct btrfs_root *root = list_first_entry(&fs_info->dead_roots,
+                                                           struct btrfs_root,
+                                                           root_list);
+                if (test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state)) {
+                        spin_unlock(&fs_info->trans_lock);
+                        return;
+                }
+        }
+        spin_unlock(&fs_info->trans_lock);
+
+        btrfs_wake_unfinished_drop(fs_info);
+}
+
+/*
  * dead roots are old snapshots that need to be deleted. This allocates
  * a dirty root struct and adds it into the list of dead roots that need to
  * be deleted
@@ -1321,13 +1387,19 @@ void btrfs_add_dead_root(struct btrfs_root *root)
         spin_lock(&fs_info->trans_lock);
         if (list_empty(&root->root_list)) {
                 btrfs_grab_root(root);
-                list_add_tail(&root->root_list, &fs_info->dead_roots);
+
+                /* We want to process the partially complete drops first. */
+                if (test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state))
+                        list_add(&root->root_list, &fs_info->dead_roots);
+                else
+                        list_add_tail(&root->root_list, &fs_info->dead_roots);
         }
         spin_unlock(&fs_info->trans_lock);
 }
 
 /*
- * update all the cowonly tree roots on disk
+ * Update each subvolume root and its relocation root, if it exists, in the tree
+ * of tree roots. Also free log roots if they exist.
  */
 static noinline int commit_fs_roots(struct btrfs_trans_handle *trans)
 {
@@ -1336,6 +1408,12 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans)
         int i;
         int ret;
 
+        /*
+         * At this point no one can be using this transaction to modify any tree
+         * and no one can start another transaction to modify any tree either.
+         */
+        ASSERT(trans->transaction->state == TRANS_STATE_COMMIT_DOING);
+
         spin_lock(&fs_info->fs_roots_radix_lock);
         while (1) {
                 ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix,
@@ -1348,6 +1426,14 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans)
                         struct btrfs_root *root = gang[i];
                         int ret2;
 
+                        /*
+                         * At this point we can neither have tasks logging inodes
+                         * from a root nor trying to commit a log tree.
+                         */
+                        ASSERT(atomic_read(&root->log_writers) == 0);
+                        ASSERT(atomic_read(&root->log_commit[0]) == 0);
+                        ASSERT(atomic_read(&root->log_commit[1]) == 0);
+
                         radix_tree_tag_clear(&fs_info->fs_roots_radix,
                                         (unsigned long)root->root_key.objectid,
                                         BTRFS_ROOT_TRANS_TAG);
@@ -1472,12 +1558,6 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
                 return ret;
         }
 
-        /*
-         * We are going to commit transaction, see btrfs_commit_transaction()
-         * comment for reason locking tree_log_mutex
-         */
-        mutex_lock(&fs_info->tree_log_mutex);
-
         ret = commit_fs_roots(trans);
         if (ret)
                 goto out;
@@ -1513,8 +1593,6 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
                         "Error while writing out transaction for qgroup");
 
 out:
-        mutex_unlock(&fs_info->tree_log_mutex);
-
         /*
          * Force parent root to be updated, as we recorded it before so its
          * last_trans == cur_transid.
@@ -1578,7 +1656,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
         btrfs_reloc_pre_snapshot(pending, &to_reserve);
 
         if (to_reserve > 0) {
-                pending->error = btrfs_block_rsv_add(root,
+                pending->error = btrfs_block_rsv_add(fs_info,
                                                      &pending->block_rsv,
                                                      to_reserve,
                                                      BTRFS_RESERVE_NO_FLUSH);
@@ -1861,50 +1939,14 @@ int btrfs_transaction_blocked(struct btrfs_fs_info *info)
         return ret;
 }
 
-/*
- * commit transactions asynchronously. once btrfs_commit_transaction_async
- * returns, any subsequent transaction will not be allowed to join.
- */
-struct btrfs_async_commit {
-        struct btrfs_trans_handle *newtrans;
-        struct work_struct work;
-};
-
-static void do_async_commit(struct work_struct *work)
-{
-        struct btrfs_async_commit *ac =
-                container_of(work, struct btrfs_async_commit, work);
-
-        /*
-         * We've got freeze protection passed with the transaction.
-         * Tell lockdep about it.
-         */
-        if (ac->newtrans->type & __TRANS_FREEZABLE)
-                __sb_writers_acquired(ac->newtrans->fs_info->sb, SB_FREEZE_FS);
-
-        current->journal_info = ac->newtrans;
-
-        btrfs_commit_transaction(ac->newtrans);
-        kfree(ac);
-}
-
-int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans)
+void btrfs_commit_transaction_async(struct btrfs_trans_handle *trans)
 {
         struct btrfs_fs_info *fs_info = trans->fs_info;
-        struct btrfs_async_commit *ac;
         struct btrfs_transaction *cur_trans;
 
-        ac = kmalloc(sizeof(*ac), GFP_NOFS);
-        if (!ac)
-                return -ENOMEM;
-
-        INIT_WORK(&ac->work, do_async_commit);
-        ac->newtrans = btrfs_join_transaction(trans->root);
-        if (IS_ERR(ac->newtrans)) {
-                int err = PTR_ERR(ac->newtrans);
-                kfree(ac);
-                return err;
-        }
+        /* Kick the transaction kthread. */
+        set_bit(BTRFS_FS_COMMIT_TRANS, &fs_info->flags);
+        wake_up_process(fs_info->transaction_kthread);
 
         /* take transaction reference */
         cur_trans = trans->transaction;
@@ -1913,28 +1955,15 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans)
         btrfs_end_transaction(trans);
 
         /*
-         * Tell lockdep we've released the freeze rwsem, since the
-         * async commit thread will be the one to unlock it.
-         */
-        if (ac->newtrans->type & __TRANS_FREEZABLE)
-                __sb_writers_release(fs_info->sb, SB_FREEZE_FS);
-
-        schedule_work(&ac->work);
-        /*
          * Wait for the current transaction commit to start and block
          * subsequent transaction joins
          */
         wait_event(fs_info->transaction_blocked_wait,
                    cur_trans->state >= TRANS_STATE_COMMIT_START ||
                    TRANS_ABORTED(cur_trans));
-        if (current->journal_info == trans)
-                current->journal_info = NULL;
-
         btrfs_put_transaction(cur_trans);
-        return 0;
 }
-
 static void cleanup_transaction(struct btrfs_trans_handle *trans, int err)
@@ -1986,7 +2015,7 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, int err)
         btrfs_put_transaction(cur_trans);
         btrfs_put_transaction(cur_trans);
 
-        trace_btrfs_transaction_commit(trans->root);
+        trace_btrfs_transaction_commit(fs_info);
 
         if (current->journal_info == trans)
                 current->journal_info = NULL;
@@ -2013,16 +2042,24 @@ static void btrfs_cleanup_pending_block_groups(struct btrfs_trans_handle *trans)
 static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
 {
         /*
-         * We use writeback_inodes_sb here because if we used
+         * We use try_to_writeback_inodes_sb() here because if we used
          * btrfs_start_delalloc_roots we would deadlock with fs freeze.
          * Currently are holding the fs freeze lock, if we do an async flush
         * we'll do btrfs_join_transaction() and deadlock because we need to
          * wait for the fs freeze lock. Using the direct flushing we benefit
          * from already being in a transaction and our join_transaction doesn't
          * have to re-take the fs freeze lock.
+         *
+         * Note that try_to_writeback_inodes_sb() will only trigger writeback
+         * if it can read lock sb->s_umount. It will always be able to lock it,
+         * except when the filesystem is being unmounted or being frozen, but in
+         * those cases sync_filesystem() is called, which results in calling
+         * writeback_inodes_sb() while holding a write lock on sb->s_umount.
+         * Note that we don't call writeback_inodes_sb() directly, because it
+         * will emit a warning if sb->s_umount is not locked.
          */
         if (btrfs_test_opt(fs_info, FLUSHONCOMMIT))
-                writeback_inodes_sb(fs_info->sb, WB_REASON_SYNC);
+                try_to_writeback_inodes_sb(fs_info->sb, WB_REASON_SYNC);
         return 0;
 }
 
@@ -2032,6 +2069,27 @@ static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
                 btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
 }
 
+/*
+ * Add a pending snapshot associated with the given transaction handle to the
+ * respective handle. This must be called after the transaction commit started
+ * and while holding fs_info->trans_lock.
+ * This serves to guarantee a caller of btrfs_commit_transaction() that it can
+ * safely free the pending snapshot pointer in case btrfs_commit_transaction()
+ * returns an error.
+ */
+static void add_pending_snapshot(struct btrfs_trans_handle *trans)
+{
+        struct btrfs_transaction *cur_trans = trans->transaction;
+
+        if (!trans->pending_snapshot)
+                return;
+
+        lockdep_assert_held(&trans->fs_info->trans_lock);
+        ASSERT(cur_trans->state >= TRANS_STATE_COMMIT_START);
+
+        list_add(&trans->pending_snapshot->list, &cur_trans->pending_snapshots);
+}
+
 int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
 {
         struct btrfs_fs_info *fs_info = trans->fs_info;
@@ -2105,6 +2163,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
         if (cur_trans->state >= TRANS_STATE_COMMIT_START) {
                 enum btrfs_trans_state want_state = TRANS_STATE_COMPLETED;
 
+                add_pending_snapshot(trans);
+
                 spin_unlock(&fs_info->trans_lock);
                 refcount_inc(&cur_trans->use_count);
 
@@ -2195,11 +2255,19 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
          * COMMIT_DOING so make sure to wait for num_writers to == 1 again.
          */
         spin_lock(&fs_info->trans_lock);
+        add_pending_snapshot(trans);
         cur_trans->state = TRANS_STATE_COMMIT_DOING;
         spin_unlock(&fs_info->trans_lock);
         wait_event(cur_trans->writer_wait,
                    atomic_read(&cur_trans->num_writers) == 1);
 
+        /*
+         * We've started the commit, clear the flag in case we were triggered to
+         * do an async commit but somebody else started before the transaction
+         * kthread could do the work.
+         */
+        clear_bit(BTRFS_FS_COMMIT_TRANS, &fs_info->flags);
+
         if (TRANS_ABORTED(cur_trans)) {
                 ret = cur_trans->aborted;
                 goto scrub_continue;
@@ -2246,24 +2314,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
 
         WARN_ON(cur_trans != trans->transaction);
 
-        /* btrfs_commit_tree_roots is responsible for getting the
-         * various roots consistent with each other. Every pointer
-         * in the tree of tree roots has to point to the most up to date
-         * root for every subvolume and other tree. So, we have to keep
-         * the tree logging code from jumping in and changing any
-         * of the trees.
-         *
-         * At this point in the commit, there can't be any tree-log
-         * writers, but a little lower down we drop the trans mutex
-         * and let new people in. By holding the tree_log_mutex
-         * from now until after the super is written, we avoid races
-         * with the tree-log code.
-         */
-        mutex_lock(&fs_info->tree_log_mutex);
-
         ret = commit_fs_roots(trans);
         if (ret)
-                goto unlock_tree_log;
+                goto unlock_reloc;
 
         /*
          * Since the transaction is done, we can apply the pending changes
@@ -2282,11 +2335,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
          */
         ret = btrfs_qgroup_account_extents(trans);
         if (ret < 0)
-                goto unlock_tree_log;
+                goto unlock_reloc;
 
         ret = commit_cowonly_roots(trans);
         if (ret)
-                goto unlock_tree_log;
+                goto unlock_reloc;
 
         /*
          * The tasks which save the space cache and inode cache may also
@@ -2294,7 +2347,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
          */
         if (TRANS_ABORTED(cur_trans)) {
                 ret = cur_trans->aborted;
-                goto unlock_tree_log;
+                goto unlock_reloc;
         }
 
         cur_trans = fs_info->running_transaction;
@@ -2327,6 +2380,16 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
 
         btrfs_trans_release_chunk_metadata(trans);
 
+        /*
+         * Before changing the transaction state to TRANS_STATE_UNBLOCKED and
+         * setting fs_info->running_transaction to NULL, lock tree_log_mutex to
+         * make sure that before we commit our superblock, no other task can
+         * start a new transaction and commit a log tree before we commit our
+         * superblock. Anyone trying to commit a log tree locks this mutex before
+         * writing its superblock.
+         */
+        mutex_lock(&fs_info->tree_log_mutex);
+
         spin_lock(&fs_info->trans_lock);
         cur_trans->state = TRANS_STATE_UNBLOCKED;
         fs_info->running_transaction = NULL;
@@ -2339,10 +2402,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
         if (ret) {
                 btrfs_handle_fs_error(fs_info, ret,
                                       "Error while writing out transaction");
-                /*
-                 * reloc_mutex has been unlocked, tree_log_mutex is still held
-                 * but we can't jump to unlock_tree_log causing double unlock
-                 */
                 mutex_unlock(&fs_info->tree_log_mutex);
                 goto scrub_continue;
         }
@@ -2393,7 +2452,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
         if (trans->type & __TRANS_FREEZABLE)
                 sb_end_intwrite(fs_info->sb);
 
-        trace_btrfs_transaction_commit(trans->root);
+        trace_btrfs_transaction_commit(fs_info);
 
         btrfs_scrub_continue(fs_info);
 
@@ -2404,8 +2463,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
 
         return ret;
 
-unlock_tree_log:
-        mutex_unlock(&fs_info->tree_log_mutex);
 unlock_reloc:
         mutex_unlock(&fs_info->reloc_mutex);
 scrub_continue: