From 827463c49f29111efd22148d24c9ca44d648acfa Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Tue, 14 Jan 2014 20:31:51 +0800 Subject: Btrfs: don't mix the ordered extents of all files together during logging the inodes There was a problem in the old code: If we failed to log the csum, we would free all the ordered extents in the log list including those ordered extents that were logged successfully, it would make the log committer not to wait for the completion of the ordered extents. This patch doesn't insert the ordered extents that is about to be logged into a global list, instead, we insert them into a local list. If we log the ordered extents successfully, we splice them with the global list, or we will throw them away, then do full sync. It can also reduce the lock contention and the traverse time of list. Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/tree-log.c | 41 +++++++++++++++-------------------------- 1 file changed, 15 insertions(+), 26 deletions(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 39d83da03e03..7c449c699bed 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3479,7 +3479,8 @@ static int extent_cmp(void *priv, struct list_head *a, struct list_head *b) static int log_one_extent(struct btrfs_trans_handle *trans, struct inode *inode, struct btrfs_root *root, - struct extent_map *em, struct btrfs_path *path) + struct extent_map *em, struct btrfs_path *path, + struct list_head *logged_list) { struct btrfs_root *log = root->log_root; struct btrfs_file_extent_item *fi; @@ -3495,7 +3496,6 @@ static int log_one_extent(struct btrfs_trans_handle *trans, u64 extent_offset = em->start - em->orig_start; u64 block_len; int ret; - int index = log->log_transid % 2; bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; int extent_inserted = 0; @@ -3579,17 +3579,12 @@ static int log_one_extent(struct btrfs_trans_handle *trans, * First check and see if our csums are on our outstanding ordered * extents. */ -again: - spin_lock_irq(&log->log_extents_lock[index]); - list_for_each_entry(ordered, &log->logged_list[index], log_list) { + list_for_each_entry(ordered, logged_list, log_list) { struct btrfs_ordered_sum *sum; if (!mod_len) break; - if (ordered->inode != inode) - continue; - if (ordered->file_offset + ordered->len <= mod_start || mod_start + mod_len <= ordered->file_offset) continue; @@ -3632,12 +3627,6 @@ again: if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, &ordered->flags)) continue; - atomic_inc(&ordered->refs); - spin_unlock_irq(&log->log_extents_lock[index]); - /* - * we've dropped the lock, we must either break or - * start over after this. - */ if (ordered->csum_bytes_left) { btrfs_start_ordered_extent(inode, ordered, 0); @@ -3647,16 +3636,11 @@ again: list_for_each_entry(sum, &ordered->list, list) { ret = btrfs_csum_file_blocks(trans, log, sum); - if (ret) { - btrfs_put_ordered_extent(ordered); + if (ret) goto unlocked; - } } - btrfs_put_ordered_extent(ordered); - goto again; } - spin_unlock_irq(&log->log_extents_lock[index]); unlocked: if (!mod_len || ret) @@ -3694,7 +3678,8 @@ unlocked: static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, - struct btrfs_path *path) + struct btrfs_path *path, + struct list_head *logged_list) { struct extent_map *em, *n; struct list_head extents; @@ -3752,7 +3737,7 @@ process: write_unlock(&tree->lock); - ret = log_one_extent(trans, inode, root, em, path); + ret = log_one_extent(trans, inode, root, em, path, logged_list); write_lock(&tree->lock); clear_em_logging(tree, em); free_extent_map(em); @@ -3788,6 +3773,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_key max_key; struct btrfs_root *log = root->log_root; struct extent_buffer *src = NULL; + LIST_HEAD(logged_list); u64 last_extent = 0; int err = 0; int ret; @@ -3836,7 +3822,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, mutex_lock(&BTRFS_I(inode)->log_mutex); - btrfs_get_logged_extents(log, inode); + btrfs_get_logged_extents(inode, &logged_list); /* * a brute force approach to making sure we get the most uptodate @@ -3962,7 +3948,8 @@ log_extents: btrfs_release_path(path); btrfs_release_path(dst_path); if (fast_search) { - ret = btrfs_log_changed_extents(trans, root, inode, dst_path); + ret = btrfs_log_changed_extents(trans, root, inode, dst_path, + &logged_list); if (ret) { err = ret; goto out_unlock; @@ -3987,8 +3974,10 @@ log_extents: BTRFS_I(inode)->logged_trans = trans->transid; BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans; out_unlock: - if (err) - btrfs_free_logged_extents(log, log->log_transid); + if (unlikely(err)) + btrfs_put_logged_extents(&logged_list); + else + btrfs_submit_logged_extents(&logged_list, log); mutex_unlock(&BTRFS_I(inode)->log_mutex); btrfs_free_path(path); -- cgit v1.2.3-70-g09d2 From 5c902ba6223f6a6575054226931fafc51314a25f Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 20 Feb 2014 18:08:51 +0800 Subject: Btrfs: use ACCESS_ONCE to prevent the optimize accesses to ->last_trans_log_full_commit ->last_trans_log_full_commit may be changed by the other tasks without lock, so we need prevent the compiler from the optimize access just like tmp = fs_info->last_trans_log_full_commit if (tmp == ...) ... if (tmp == ...) ... In fact, we need get the new value of ->last_trans_log_full_commit during the second access. Fix it by ACCESS_ONCE(). Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/tree-log.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 7c449c699bed..5a4e10b9ac3e 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2375,14 +2375,14 @@ static int wait_log_commit(struct btrfs_trans_handle *trans, &wait, TASK_UNINTERRUPTIBLE); mutex_unlock(&root->log_mutex); - if (root->fs_info->last_trans_log_full_commit != + if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) != trans->transid && root->log_transid < transid + 2 && atomic_read(&root->log_commit[index])) schedule(); finish_wait(&root->log_commit_wait[index], &wait); mutex_lock(&root->log_mutex); - } while (root->fs_info->last_trans_log_full_commit != + } while (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) != trans->transid && root->log_transid < transid + 2 && atomic_read(&root->log_commit[index])); return 0; @@ -2392,12 +2392,12 @@ static void wait_for_writer(struct btrfs_trans_handle *trans, struct btrfs_root *root) { DEFINE_WAIT(wait); - while (root->fs_info->last_trans_log_full_commit != + while (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) != trans->transid && atomic_read(&root->log_writers)) { prepare_to_wait(&root->log_writer_wait, &wait, TASK_UNINTERRUPTIBLE); mutex_unlock(&root->log_mutex); - if (root->fs_info->last_trans_log_full_commit != + if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) != trans->transid && atomic_read(&root->log_writers)) schedule(); mutex_lock(&root->log_mutex); @@ -2456,7 +2456,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, } /* bail out if we need to do a full commit */ - if (root->fs_info->last_trans_log_full_commit == trans->transid) { + if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) == + trans->transid) { ret = -EAGAIN; btrfs_free_logged_extents(log, log_transid); mutex_unlock(&root->log_mutex); @@ -2515,7 +2516,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, mutex_unlock(&log_root_tree->log_mutex); goto out; } - root->fs_info->last_trans_log_full_commit = trans->transid; + ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) = + trans->transid; btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); btrfs_free_logged_extents(log, log_transid); mutex_unlock(&log_root_tree->log_mutex); @@ -2547,7 +2549,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, * now that we've moved on to the tree of log tree roots, * check the full commit flag again */ - if (root->fs_info->last_trans_log_full_commit == trans->transid) { + if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) == + trans->transid) { blk_finish_plug(&plug); btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); btrfs_free_logged_extents(log, log_transid); -- cgit v1.2.3-70-g09d2 From 48cab2e0714913a63155f800a55609a4ff6a36b9 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 20 Feb 2014 18:08:52 +0800 Subject: Btrfs: fix the skipped transaction commit during the file sync We may abort the wait earlier if ->last_trans_log_full_commit was set to the current transaction id, at this case, we need commit the current transaction instead of the log sub-transaction. But the current code didn't tell the caller to do it (return 0, not -EAGAIN). Fix it. Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/tree-log.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 5a4e10b9ac3e..8a03b39648be 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2364,6 +2364,7 @@ static int wait_log_commit(struct btrfs_trans_handle *trans, { DEFINE_WAIT(wait); int index = transid % 2; + int ret = 0; /* * we only allow two pending log transactions at a time, @@ -2371,21 +2372,26 @@ static int wait_log_commit(struct btrfs_trans_handle *trans, * current transaction, we're done */ do { + if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) == + trans->transid) { + ret = -EAGAIN; + break; + } + prepare_to_wait(&root->log_commit_wait[index], &wait, TASK_UNINTERRUPTIBLE); mutex_unlock(&root->log_mutex); - if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) != - trans->transid && root->log_transid < transid + 2 && + if (root->log_transid < transid + 2 && atomic_read(&root->log_commit[index])) schedule(); finish_wait(&root->log_commit_wait[index], &wait); mutex_lock(&root->log_mutex); - } while (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) != - trans->transid && root->log_transid < transid + 2 && + } while (root->log_transid < transid + 2 && atomic_read(&root->log_commit[index])); - return 0; + + return ret; } static void wait_for_writer(struct btrfs_trans_handle *trans, @@ -2433,15 +2439,16 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, log_transid = root->log_transid; index1 = root->log_transid % 2; if (atomic_read(&root->log_commit[index1])) { - wait_log_commit(trans, root, root->log_transid); + ret = wait_log_commit(trans, root, root->log_transid); mutex_unlock(&root->log_mutex); - return 0; + return ret; } atomic_set(&root->log_commit[index1], 1); /* wait for previous tree log sync to complete */ if (atomic_read(&root->log_commit[(index1 + 1) % 2])) wait_log_commit(trans, root, root->log_transid - 1); + while (1) { int batch = atomic_read(&root->log_batch); /* when we're on an ssd, just kick the log commit out */ @@ -2529,11 +2536,10 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, if (atomic_read(&log_root_tree->log_commit[index2])) { blk_finish_plug(&plug); btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); - wait_log_commit(trans, log_root_tree, - log_root_tree->log_transid); + ret = wait_log_commit(trans, log_root_tree, + log_root_tree->log_transid); btrfs_free_logged_extents(log, log_transid); mutex_unlock(&log_root_tree->log_mutex); - ret = 0; goto out; } atomic_set(&log_root_tree->log_commit[index2], 1); -- cgit v1.2.3-70-g09d2 From e87ac1368700af66c295afa47e5c7df0d9d8b919 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 20 Feb 2014 18:08:53 +0800 Subject: Btrfs: don't start the log transaction if the log tree init fails The old code would start the log transaction even the log tree init failed, it was unnecessary. Fix it. Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/tree-log.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 8a03b39648be..ca960ad271fe 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -139,7 +139,6 @@ static int start_log_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root) { int ret; - int err = 0; mutex_lock(&root->log_mutex); if (root->log_root) { @@ -155,24 +154,27 @@ static int start_log_trans(struct btrfs_trans_handle *trans, mutex_unlock(&root->log_mutex); return 0; } - root->log_multiple_pids = false; - root->log_start_pid = current->pid; + + ret = 0; mutex_lock(&root->fs_info->tree_log_mutex); - if (!root->fs_info->log_root_tree) { + if (!root->fs_info->log_root_tree) ret = btrfs_init_log_root_tree(trans, root->fs_info); - if (ret) - err = ret; - } - if (err == 0 && !root->log_root) { + mutex_unlock(&root->fs_info->tree_log_mutex); + if (ret) + goto out; + + if (!root->log_root) { ret = btrfs_add_log_tree(trans, root); if (ret) - err = ret; + goto out; } - mutex_unlock(&root->fs_info->tree_log_mutex); + root->log_multiple_pids = false; + root->log_start_pid = current->pid; atomic_inc(&root->log_batch); atomic_inc(&root->log_writers); +out: mutex_unlock(&root->log_mutex); - return err; + return ret; } /* @@ -4116,7 +4118,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, ret = start_log_trans(trans, root); if (ret) - goto end_trans; + goto end_no_trans; ret = btrfs_log_inode(trans, root, inode, inode_only); if (ret) -- cgit v1.2.3-70-g09d2 From 7483e1a4464999c72b231af0efe39cb31fd73f14 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 20 Feb 2014 18:08:55 +0800 Subject: Btrfs: remove unnecessary memory barrier in btrfs_sync_log() Mutex unlock implies certain memory barriers to make sure all the memory operation completes before the unlock, and the next mutex lock implies memory barriers to make sure the all the memory happens after the lock. So it is a full memory barrier(smp_mb), we needn't add memory barriers. Remove them. Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/tree-log.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index ca960ad271fe..285c168391f3 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2496,7 +2496,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, root->log_transid++; log->log_transid = root->log_transid; root->log_start_pid = 0; - smp_mb(); /* * IO has been started, blocks of the log tree have WRITTEN flag set * in their headers. new modifications of the log will be written to @@ -2589,8 +2588,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, btrfs_header_level(log_root_tree->node)); log_root_tree->log_transid++; - smp_mb(); - mutex_unlock(&log_root_tree->log_mutex); /* -- cgit v1.2.3-70-g09d2 From bb14a59b619d3a9993c3fa04bb10347db35ca550 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 20 Feb 2014 18:08:56 +0800 Subject: Btrfs: use signed integer instead of unsigned long integer for log transid The log trans id is initialized to be 0 every time we create a log tree, and the log tree need be re-created after a new transaction is started, it means the log trans id is unlikely to be a huge number, so we can use signed integer instead of unsigned long integer to save a bit space. Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/btrfs_inode.h | 14 +++++++------- fs/btrfs/ctree.h | 4 ++-- fs/btrfs/tree-log.c | 4 ++-- 3 files changed, 11 insertions(+), 11 deletions(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 8fed2125689e..c9a24444ec9a 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -109,14 +109,17 @@ struct btrfs_inode { u64 last_trans; /* - * log transid when this inode was last modified + * transid that last logged this inode */ - u64 last_sub_trans; + u64 logged_trans; /* - * transid that last logged this inode + * log transid when this inode was last modified */ - u64 logged_trans; + int last_sub_trans; + + /* a local copy of root's last_log_commit */ + int last_log_commit; /* total number of bytes pending delalloc, used by stat to calc the * real block usage of the file @@ -155,9 +158,6 @@ struct btrfs_inode { /* flags field from the on disk inode */ u32 flags; - /* a local copy of root's last_log_commit */ - unsigned long last_log_commit; - /* * Counters to keep track of the number of extent item's we may use due * to delalloc and such. outstanding_extents is the number of extent diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index dac6653d4cce..70c03f5f0953 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1721,8 +1721,8 @@ struct btrfs_root { atomic_t log_writers; atomic_t log_commit[2]; atomic_t log_batch; - unsigned long log_transid; - unsigned long last_log_commit; + int log_transid; + int last_log_commit; pid_t log_start_pid; bool log_multiple_pids; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 285c168391f3..128a904ceac0 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2362,7 +2362,7 @@ static int update_log_root(struct btrfs_trans_handle *trans, } static int wait_log_commit(struct btrfs_trans_handle *trans, - struct btrfs_root *root, unsigned long transid) + struct btrfs_root *root, int transid) { DEFINE_WAIT(wait); int index = transid % 2; @@ -2434,7 +2434,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, int ret; struct btrfs_root *log = root->log_root; struct btrfs_root *log_root_tree = root->fs_info->log_root_tree; - unsigned long log_transid = 0; + int log_transid = 0; struct blk_plug plug; mutex_lock(&root->log_mutex); -- cgit v1.2.3-70-g09d2 From 8b050d350c7846462a21e9e054c9154ede9b43cf Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 20 Feb 2014 18:08:58 +0800 Subject: Btrfs: fix skipped error handle when log sync failed It is possible that many tasks sync the log tree at the same time, but only one task can do the sync work, the others will wait for it. But those wait tasks didn't get the result of the log sync, and returned 0 when they ended the wait. It caused those tasks skipped the error handle, and the serious problem was they told the users the file sync succeeded but in fact they failed. This patch fixes this problem by introducing a log context structure, we insert it into the a global list. When the sync fails, we will set the error number of every log context in the list, then the waiting tasks get the error number of the log context and handle the error if need. Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 1 + fs/btrfs/disk-io.c | 2 + fs/btrfs/file.c | 9 +++-- fs/btrfs/tree-log.c | 114 ++++++++++++++++++++++++++++++++++++++++------------ fs/btrfs/tree-log.h | 16 +++++++- 5 files changed, 111 insertions(+), 31 deletions(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 70c03f5f0953..906410719acb 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1718,6 +1718,7 @@ struct btrfs_root { struct mutex log_mutex; wait_queue_head_t log_writer_wait; wait_queue_head_t log_commit_wait[2]; + struct list_head log_ctxs[2]; atomic_t log_writers; atomic_t log_commit[2]; atomic_t log_batch; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index cc1b4237dc62..44f52d280b7d 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1200,6 +1200,8 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, init_waitqueue_head(&root->log_writer_wait); init_waitqueue_head(&root->log_commit_wait[0]); init_waitqueue_head(&root->log_commit_wait[1]); + INIT_LIST_HEAD(&root->log_ctxs[0]); + INIT_LIST_HEAD(&root->log_ctxs[1]); atomic_set(&root->log_commit[0], 0); atomic_set(&root->log_commit[1], 0); atomic_set(&root->log_writers, 0); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 006af2f4dd98..6acccc4a7f2a 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1864,8 +1864,9 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) struct dentry *dentry = file->f_path.dentry; struct inode *inode = dentry->d_inode; struct btrfs_root *root = BTRFS_I(inode)->root; - int ret = 0; struct btrfs_trans_handle *trans; + struct btrfs_log_ctx ctx; + int ret = 0; bool full_sync = 0; trace_btrfs_sync_file(file, datasync); @@ -1959,7 +1960,9 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) } trans->sync = true; - ret = btrfs_log_dentry_safe(trans, root, dentry); + btrfs_init_log_ctx(&ctx); + + ret = btrfs_log_dentry_safe(trans, root, dentry, &ctx); if (ret < 0) { /* Fallthrough and commit/free transaction. */ ret = 1; @@ -1979,7 +1982,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (ret != BTRFS_NO_LOG_SYNC) { if (!ret) { - ret = btrfs_sync_log(trans, root); + ret = btrfs_sync_log(trans, root, &ctx); if (!ret) { ret = btrfs_end_transaction(trans, root); goto out; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 128a904ceac0..da6da274dce3 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -136,8 +136,10 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, * syncing the tree wait for us to finish */ static int start_log_trans(struct btrfs_trans_handle *trans, - struct btrfs_root *root) + struct btrfs_root *root, + struct btrfs_log_ctx *ctx) { + int index; int ret; mutex_lock(&root->log_mutex); @@ -151,6 +153,10 @@ static int start_log_trans(struct btrfs_trans_handle *trans, atomic_inc(&root->log_batch); atomic_inc(&root->log_writers); + if (ctx) { + index = root->log_transid % 2; + list_add_tail(&ctx->list, &root->log_ctxs[index]); + } mutex_unlock(&root->log_mutex); return 0; } @@ -172,6 +178,10 @@ static int start_log_trans(struct btrfs_trans_handle *trans, root->log_start_pid = current->pid; atomic_inc(&root->log_batch); atomic_inc(&root->log_writers); + if (ctx) { + index = root->log_transid % 2; + list_add_tail(&ctx->list, &root->log_ctxs[index]); + } out: mutex_unlock(&root->log_mutex); return ret; @@ -2361,12 +2371,11 @@ static int update_log_root(struct btrfs_trans_handle *trans, return ret; } -static int wait_log_commit(struct btrfs_trans_handle *trans, - struct btrfs_root *root, int transid) +static void wait_log_commit(struct btrfs_trans_handle *trans, + struct btrfs_root *root, int transid) { DEFINE_WAIT(wait); int index = transid % 2; - int ret = 0; /* * we only allow two pending log transactions at a time, @@ -2374,12 +2383,6 @@ static int wait_log_commit(struct btrfs_trans_handle *trans, * current transaction, we're done */ do { - if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) == - trans->transid) { - ret = -EAGAIN; - break; - } - prepare_to_wait(&root->log_commit_wait[index], &wait, TASK_UNINTERRUPTIBLE); mutex_unlock(&root->log_mutex); @@ -2392,27 +2395,55 @@ static int wait_log_commit(struct btrfs_trans_handle *trans, mutex_lock(&root->log_mutex); } while (root->log_transid < transid + 2 && atomic_read(&root->log_commit[index])); - - return ret; } static void wait_for_writer(struct btrfs_trans_handle *trans, struct btrfs_root *root) { DEFINE_WAIT(wait); - while (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) != - trans->transid && atomic_read(&root->log_writers)) { + + while (atomic_read(&root->log_writers)) { prepare_to_wait(&root->log_writer_wait, &wait, TASK_UNINTERRUPTIBLE); mutex_unlock(&root->log_mutex); - if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) != - trans->transid && atomic_read(&root->log_writers)) + if (atomic_read(&root->log_writers)) schedule(); mutex_lock(&root->log_mutex); finish_wait(&root->log_writer_wait, &wait); } } +static inline void btrfs_remove_log_ctx(struct btrfs_root *root, + struct btrfs_log_ctx *ctx) +{ + if (!ctx) + return; + + mutex_lock(&root->log_mutex); + list_del_init(&ctx->list); + mutex_unlock(&root->log_mutex); +} + +/* + * Invoked in log mutex context, or be sure there is no other task which + * can access the list. + */ +static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root, + int index, int error) +{ + struct btrfs_log_ctx *ctx; + + if (!error) { + INIT_LIST_HEAD(&root->log_ctxs[index]); + return; + } + + list_for_each_entry(ctx, &root->log_ctxs[index], list) + ctx->log_ret = error; + + INIT_LIST_HEAD(&root->log_ctxs[index]); +} + /* * btrfs_sync_log does sends a given tree log down to the disk and * updates the super blocks to record it. When this call is done, @@ -2426,7 +2457,7 @@ static void wait_for_writer(struct btrfs_trans_handle *trans, * that has happened. */ int btrfs_sync_log(struct btrfs_trans_handle *trans, - struct btrfs_root *root) + struct btrfs_root *root, struct btrfs_log_ctx *ctx) { int index1; int index2; @@ -2435,15 +2466,16 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, struct btrfs_root *log = root->log_root; struct btrfs_root *log_root_tree = root->fs_info->log_root_tree; int log_transid = 0; + struct btrfs_log_ctx root_log_ctx; struct blk_plug plug; mutex_lock(&root->log_mutex); log_transid = root->log_transid; index1 = root->log_transid % 2; if (atomic_read(&root->log_commit[index1])) { - ret = wait_log_commit(trans, root, root->log_transid); + wait_log_commit(trans, root, root->log_transid); mutex_unlock(&root->log_mutex); - return ret; + return ctx->log_ret; } atomic_set(&root->log_commit[index1], 1); @@ -2534,13 +2566,18 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, } index2 = log_root_tree->log_transid % 2; + + btrfs_init_log_ctx(&root_log_ctx); + list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]); + if (atomic_read(&log_root_tree->log_commit[index2])) { blk_finish_plug(&plug); btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); - ret = wait_log_commit(trans, log_root_tree, - log_root_tree->log_transid); + wait_log_commit(trans, log_root_tree, + log_root_tree->log_transid); btrfs_free_logged_extents(log, log_transid); mutex_unlock(&log_root_tree->log_mutex); + ret = root_log_ctx.log_ret; goto out; } atomic_set(&log_root_tree->log_commit[index2], 1); @@ -2609,12 +2646,31 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, mutex_unlock(&root->log_mutex); out_wake_log_root: + /* + * We needn't get log_mutex here because we are sure all + * the other tasks are blocked. + */ + btrfs_remove_all_log_ctxs(log_root_tree, index2, ret); + + /* + * It is dangerous if log_commit is changed before we set + * ->log_ret of log ctx. Because the readers may not get + * the return value. + */ + smp_wmb(); + atomic_set(&log_root_tree->log_commit[index2], 0); smp_mb(); if (waitqueue_active(&log_root_tree->log_commit_wait[index2])) wake_up(&log_root_tree->log_commit_wait[index2]); out: + /* See above. */ + btrfs_remove_all_log_ctxs(root, index1, ret); + + /* See above. */ + smp_wmb(); atomic_set(&root->log_commit[index1], 0); + smp_mb(); if (waitqueue_active(&root->log_commit_wait[index1])) wake_up(&root->log_commit_wait[index1]); @@ -4076,7 +4132,8 @@ out: */ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, - struct dentry *parent, int exists_only) + struct dentry *parent, int exists_only, + struct btrfs_log_ctx *ctx) { int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL; struct super_block *sb; @@ -4113,7 +4170,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, goto end_no_trans; } - ret = start_log_trans(trans, root); + ret = start_log_trans(trans, root, ctx); if (ret) goto end_no_trans; @@ -4163,6 +4220,9 @@ end_trans: root->fs_info->last_trans_log_full_commit = trans->transid; ret = 1; } + + if (ret) + btrfs_remove_log_ctx(root, ctx); btrfs_end_log_trans(root); end_no_trans: return ret; @@ -4175,12 +4235,14 @@ end_no_trans: * data on disk. */ int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct dentry *dentry) + struct btrfs_root *root, struct dentry *dentry, + struct btrfs_log_ctx *ctx) { struct dentry *parent = dget_parent(dentry); int ret; - ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent, 0); + ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent, + 0, ctx); dput(parent); return ret; @@ -4417,6 +4479,6 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans, root->fs_info->last_trans_committed)) return 0; - return btrfs_log_inode_parent(trans, root, inode, parent, 1); + return btrfs_log_inode_parent(trans, root, inode, parent, 1, NULL); } diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 1d4ae0d15a70..59c1edb31d19 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -22,14 +22,26 @@ /* return value for btrfs_log_dentry_safe that means we don't need to log it at all */ #define BTRFS_NO_LOG_SYNC 256 +struct btrfs_log_ctx { + int log_ret; + struct list_head list; +}; + +static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx) +{ + ctx->log_ret = 0; + INIT_LIST_HEAD(&ctx->list); +} + int btrfs_sync_log(struct btrfs_trans_handle *trans, - struct btrfs_root *root); + struct btrfs_root *root, struct btrfs_log_ctx *ctx); int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_recover_log_trees(struct btrfs_root *tree_root); int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct dentry *dentry); + struct btrfs_root *root, struct dentry *dentry, + struct btrfs_log_ctx *ctx); int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, int name_len, -- cgit v1.2.3-70-g09d2 From d1433debe7f4346cf9fc0dafc71c3137d2a97bc4 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 20 Feb 2014 18:08:59 +0800 Subject: Btrfs: just wait or commit our own log sub-transaction We might commit the log sub-transaction which didn't contain the metadata we logged. It was because we didn't record the log transid and just select the current log sub-transaction to commit, but the right one might be committed by the other task already. Actually, we needn't do anything and it is safe that we go back directly in this case. This patch improves the log sync by the above idea. We record the transid of the log sub-transaction in which we log the metadata, and the transid of the log sub-transaction we have committed. If the committed transid is >= the transid we record when logging the metadata, we just go back. Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 3 +++ fs/btrfs/disk-io.c | 2 ++ fs/btrfs/tree-log.c | 63 ++++++++++++++++++++++++++++++++++------------------- fs/btrfs/tree-log.h | 2 ++ 4 files changed, 47 insertions(+), 23 deletions(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 906410719acb..b2c0336db691 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1723,6 +1723,9 @@ struct btrfs_root { atomic_t log_commit[2]; atomic_t log_batch; int log_transid; + /* No matter the commit succeeds or not*/ + int log_transid_committed; + /* Just be updated when the commit succeeds. */ int last_log_commit; pid_t log_start_pid; bool log_multiple_pids; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 44f52d280b7d..dd52146035b3 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1209,6 +1209,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, atomic_set(&root->orphan_inodes, 0); atomic_set(&root->refs, 1); root->log_transid = 0; + root->log_transid_committed = -1; root->last_log_commit = 0; if (fs_info) extent_io_tree_init(&root->dirty_log_pages, @@ -1422,6 +1423,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans, WARN_ON(root->log_root); root->log_root = log_root; root->log_transid = 0; + root->log_transid_committed = -1; root->last_log_commit = 0; return 0; } diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index da6da274dce3..57d4ca7fd555 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -156,6 +156,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans, if (ctx) { index = root->log_transid % 2; list_add_tail(&ctx->list, &root->log_ctxs[index]); + ctx->log_transid = root->log_transid; } mutex_unlock(&root->log_mutex); return 0; @@ -181,6 +182,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans, if (ctx) { index = root->log_transid % 2; list_add_tail(&ctx->list, &root->log_ctxs[index]); + ctx->log_transid = root->log_transid; } out: mutex_unlock(&root->log_mutex); @@ -2387,13 +2389,13 @@ static void wait_log_commit(struct btrfs_trans_handle *trans, &wait, TASK_UNINTERRUPTIBLE); mutex_unlock(&root->log_mutex); - if (root->log_transid < transid + 2 && + if (root->log_transid_committed < transid && atomic_read(&root->log_commit[index])) schedule(); finish_wait(&root->log_commit_wait[index], &wait); mutex_lock(&root->log_mutex); - } while (root->log_transid < transid + 2 && + } while (root->log_transid_committed < transid && atomic_read(&root->log_commit[index])); } @@ -2470,18 +2472,24 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, struct blk_plug plug; mutex_lock(&root->log_mutex); - log_transid = root->log_transid; - index1 = root->log_transid % 2; + log_transid = ctx->log_transid; + if (root->log_transid_committed >= log_transid) { + mutex_unlock(&root->log_mutex); + return ctx->log_ret; + } + + index1 = log_transid % 2; if (atomic_read(&root->log_commit[index1])) { - wait_log_commit(trans, root, root->log_transid); + wait_log_commit(trans, root, log_transid); mutex_unlock(&root->log_mutex); return ctx->log_ret; } + ASSERT(log_transid == root->log_transid); atomic_set(&root->log_commit[index1], 1); /* wait for previous tree log sync to complete */ if (atomic_read(&root->log_commit[(index1 + 1) % 2])) - wait_log_commit(trans, root, root->log_transid - 1); + wait_log_commit(trans, root, log_transid - 1); while (1) { int batch = atomic_read(&root->log_batch); @@ -2535,9 +2543,16 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, */ mutex_unlock(&root->log_mutex); + btrfs_init_log_ctx(&root_log_ctx); + mutex_lock(&log_root_tree->log_mutex); atomic_inc(&log_root_tree->log_batch); atomic_inc(&log_root_tree->log_writers); + + index2 = log_root_tree->log_transid % 2; + list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]); + root_log_ctx.log_transid = log_root_tree->log_transid; + mutex_unlock(&log_root_tree->log_mutex); ret = update_log_root(trans, log); @@ -2550,6 +2565,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, } if (ret) { + if (!list_empty(&root_log_ctx.list)) + list_del_init(&root_log_ctx.list); + blk_finish_plug(&plug); if (ret != -ENOSPC) { btrfs_abort_transaction(trans, root, ret); @@ -2565,26 +2583,29 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, goto out; } - index2 = log_root_tree->log_transid % 2; - - btrfs_init_log_ctx(&root_log_ctx); - list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]); + if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) { + mutex_unlock(&log_root_tree->log_mutex); + ret = root_log_ctx.log_ret; + goto out; + } + index2 = root_log_ctx.log_transid % 2; if (atomic_read(&log_root_tree->log_commit[index2])) { blk_finish_plug(&plug); btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); wait_log_commit(trans, log_root_tree, - log_root_tree->log_transid); + root_log_ctx.log_transid); btrfs_free_logged_extents(log, log_transid); mutex_unlock(&log_root_tree->log_mutex); ret = root_log_ctx.log_ret; goto out; } + ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid); atomic_set(&log_root_tree->log_commit[index2], 1); if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) { wait_log_commit(trans, log_root_tree, - log_root_tree->log_transid - 1); + root_log_ctx.log_transid - 1); } wait_for_writer(trans, log_root_tree); @@ -2652,26 +2673,22 @@ out_wake_log_root: */ btrfs_remove_all_log_ctxs(log_root_tree, index2, ret); - /* - * It is dangerous if log_commit is changed before we set - * ->log_ret of log ctx. Because the readers may not get - * the return value. - */ - smp_wmb(); - + mutex_lock(&log_root_tree->log_mutex); + log_root_tree->log_transid_committed++; atomic_set(&log_root_tree->log_commit[index2], 0); - smp_mb(); + mutex_unlock(&log_root_tree->log_mutex); + if (waitqueue_active(&log_root_tree->log_commit_wait[index2])) wake_up(&log_root_tree->log_commit_wait[index2]); out: /* See above. */ btrfs_remove_all_log_ctxs(root, index1, ret); - /* See above. */ - smp_wmb(); + mutex_lock(&root->log_mutex); + root->log_transid_committed++; atomic_set(&root->log_commit[index1], 0); + mutex_unlock(&root->log_mutex); - smp_mb(); if (waitqueue_active(&root->log_commit_wait[index1])) wake_up(&root->log_commit_wait[index1]); return ret; diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 59c1edb31d19..91b145fce333 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -24,12 +24,14 @@ struct btrfs_log_ctx { int log_ret; + int log_transid; struct list_head list; }; static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx) { ctx->log_ret = 0; + ctx->log_transid = 0; INIT_LIST_HEAD(&ctx->list); } -- cgit v1.2.3-70-g09d2 From 50471a388cf011523f3bf91d275ec3f30669f0ee Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 20 Feb 2014 18:08:57 +0800 Subject: Btrfs: stop joining the log transaction if sync log fails If the log sync fails, there is something wrong in the log tree, we should not continue to join the log transaction and log the metadata. What we should do is to do a full commit. This patch fixes this problem by setting ->last_trans_log_full_commit to the current transaction id, it will tell the tasks not to join the log transaction, and do a full commit. Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/tree-log.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 57d4ca7fd555..e2f45fc02610 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -144,6 +144,12 @@ static int start_log_trans(struct btrfs_trans_handle *trans, mutex_lock(&root->log_mutex); if (root->log_root) { + if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) == + trans->transid) { + ret = -EAGAIN; + goto out; + } + if (!root->log_start_pid) { root->log_start_pid = current->pid; root->log_multiple_pids = false; @@ -2527,6 +2533,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, blk_finish_plug(&plug); btrfs_abort_transaction(trans, root, ret); btrfs_free_logged_extents(log, log_transid); + ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) = + trans->transid; mutex_unlock(&root->log_mutex); goto out; } @@ -2569,13 +2577,13 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, list_del_init(&root_log_ctx.list); blk_finish_plug(&plug); + ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) = + trans->transid; if (ret != -ENOSPC) { btrfs_abort_transaction(trans, root, ret); mutex_unlock(&log_root_tree->log_mutex); goto out; } - ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) = - trans->transid; btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); btrfs_free_logged_extents(log, log_transid); mutex_unlock(&log_root_tree->log_mutex); @@ -2629,6 +2637,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, EXTENT_DIRTY | EXTENT_NEW); blk_finish_plug(&plug); if (ret) { + ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) = + trans->transid; btrfs_abort_transaction(trans, root, ret); btrfs_free_logged_extents(log, log_transid); mutex_unlock(&log_root_tree->log_mutex); @@ -2657,6 +2667,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, */ ret = write_ctree_super(trans, root->fs_info->tree_root, 1); if (ret) { + ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) = + trans->transid; btrfs_abort_transaction(trans, root, ret); goto out_wake_log_root; } -- cgit v1.2.3-70-g09d2