btrfs: do not infinite loop in data reclaim if we aborted

Error injection stressing uncovered a busy loop in our data reclaim loop. There are two cases here, one where we loop creating block groups until space_info->full is set, or in the main loop we will skip erroring out any tickets if space_info->full == 0. Unfortunately if we aborted the transaction then we will never allocate chunks or reclaim any space and thus never get ->full, and you'll see stack traces like this: watchdog: BUG: soft lockup - CPU#0 stuck for 26s! [kworker/u4:4:139] CPU: 0 PID: 139 Comm: kworker/u4:4 Tainted: G W 5.13.0-rc1+ #328 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-2.fc32 04/01/2014 Workqueue: events_unbound btrfs_async_reclaim_data_space RIP: 0010:btrfs_join_transaction+0x12/0x20 RSP: 0018:ffffb2b780b77de0 EFLAGS: 00000246 RAX: ffffb2b781863d58 RBX: 0000000000000000 RCX: 0000000000000000 RDX: 0000000000000801 RSI: ffff987952b57400 RDI: ffff987940aa3000 RBP: ffff987954d55000 R08: 0000000000000001 R09: ffff98795539e8f0 R10: 000000000000000f R11: 000000000000000f R12: ffffffffffffffff R13: ffff987952b574c8 R14: ffff987952b57400 R15: 0000000000000008 FS: 0000000000000000(0000) GS:ffff9879bbc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f0703da4000 CR3: 0000000113398004 CR4: 0000000000370ef0 Call Trace: flush_space+0x4a8/0x660 btrfs_async_reclaim_data_space+0x55/0x130 process_one_work+0x1e9/0x380 worker_thread+0x53/0x3e0 ? process_one_work+0x380/0x380 kthread+0x118/0x140 ? __kthread_bind_mask+0x60/0x60 ret_from_fork+0x1f/0x30 Fix this by checking to see if we have a btrfs fs error in either of the reclaim loops, and if so fail the tickets and bail. In addition to this, fix maybe_fail_all_tickets() to not try to grant tickets if we've aborted, simply fail everything. Reviewed-by: Nikolay Borisov <nborisov@suse.com> Reviewed-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: Josef Bacik <josef@toxicpanda.com> Signed-off-by: David Sterba <dsterba@suse.com>
author: Josef Bacik <josef@toxicpanda.com> 2021-10-05 16:35:26 -0400
committer: David Sterba <dsterba@suse.com> 2021-10-26 19:08:05 +0200
commit: 0e24f6d84b4ca50780c0c55dcdfc5bdeda5c7014 (patch)
tree: b734e7a9efdde4890edc8cbc9ed8ebd653ddf697 /fs/btrfs/space-info.c
parent: 849615394515cce02add9c5b931179162e251aab (diff)
1 files changed, 24 insertions, 4 deletions
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index aa5be0b24987..48d77f360a24 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -885,6 +885,7 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
 {
 	struct reserve_ticket *ticket;
 	u64 tickets_id = space_info->tickets_id;
+	const bool aborted = BTRFS_FS_ERROR(fs_info);
 
 	trace_btrfs_fail_all_tickets(fs_info, space_info);
 
@@ -898,16 +899,19 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
 		ticket = list_first_entry(&space_info->tickets,
 					  struct reserve_ticket, list);
 
-		if (ticket->steal &&
+		if (!aborted && ticket->steal &&
 		    steal_from_global_rsv(fs_info, space_info, ticket))
 			return true;
 
-		if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
+		if (!aborted && btrfs_test_opt(fs_info, ENOSPC_DEBUG))
 			btrfs_info(fs_info, "failing ticket with %llu bytes",
 				   ticket->bytes);
 
 		remove_ticket(space_info, ticket);
-		ticket->error = -ENOSPC;
+		if (aborted)
+			ticket->error = -EIO;
+		else
+			ticket->error = -ENOSPC;
 		wake_up(&ticket->wait);
 
 		/*
@@ -916,7 +920,8 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
 		 * here to see if we can make progress with the next ticket in
 		 * the list.
 		 */
-		btrfs_try_granting_tickets(fs_info, space_info);
+		if (!aborted)
+			btrfs_try_granting_tickets(fs_info, space_info);
 	}
 	return (tickets_id != space_info->tickets_id);
 }
@@ -1172,6 +1177,10 @@ static void btrfs_async_reclaim_data_space(struct work_struct *work)
 			spin_unlock(&space_info->lock);
 			return;
 		}
+
+		/* Something happened, fail everything and bail. */
+		if (BTRFS_FS_ERROR(fs_info))
+			goto aborted_fs;
 		last_tickets_id = space_info->tickets_id;
 		spin_unlock(&space_info->lock);
 	}
@@ -1202,9 +1211,20 @@ static void btrfs_async_reclaim_data_space(struct work_struct *work)
 			} else {
 				flush_state = 0;
 			}
+
+			/* Something happened, fail everything and bail. */
+			if (BTRFS_FS_ERROR(fs_info))
+				goto aborted_fs;
+
 		}
 		spin_unlock(&space_info->lock);
 	}
+	return;
+
+aborted_fs:
+	maybe_fail_all_tickets(fs_info, space_info);
+	space_info->flush = 0;
+	spin_unlock(&space_info->lock);
 }
 
 void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info)
author	Josef Bacik <josef@toxicpanda.com>	2021-10-05 16:35:26 -0400
committer	David Sterba <dsterba@suse.com>	2021-10-26 19:08:05 +0200
commit	0e24f6d84b4ca50780c0c55dcdfc5bdeda5c7014 (patch)
tree	b734e7a9efdde4890edc8cbc9ed8ebd653ddf697 /fs/btrfs/space-info.c
parent	849615394515cce02add9c5b931179162e251aab (diff)