From 67c5e7d464bc466471b05e027abe8a6b29687ebd Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 11 Jun 2015 00:58:53 +0100 Subject: Btrfs: fix race between balance and unused block group deletion We have a race between deleting an unused block group and balancing the same block group that leads to an assertion failure/BUG(), producing the following trace: [181631.208236] BTRFS: assertion failed: 0, file: fs/btrfs/volumes.c, line: 2622 [181631.220591] ------------[ cut here ]------------ [181631.222959] kernel BUG at fs/btrfs/ctree.h:4062! [181631.223932] invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC [181631.224566] Modules linked in: btrfs dm_flakey dm_mod crc32c_generic xor raid6_pq nfsd auth_rpcgss oid_registry nfs_acl nfs lockd grace fscache sunrpc loop fuse acpi_cpufreq parpor$ [181631.224566] CPU: 8 PID: 17451 Comm: btrfs Tainted: G W 4.1.0-rc5-btrfs-next-10+ #1 [181631.224566] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.8.1-0-g4adadbd-20150316_085822-nilsson.home.kraxel.org 04/01/2014 [181631.224566] task: ffff880127e09590 ti: ffff8800b5824000 task.ti: ffff8800b5824000 [181631.224566] RIP: 0010:[] [] assfail.constprop.50+0x1e/0x20 [btrfs] [181631.224566] RSP: 0018:ffff8800b5827ae8 EFLAGS: 00010246 [181631.224566] RAX: 0000000000000040 RBX: ffff8800109fc218 RCX: ffffffff81095dce [181631.224566] RDX: 0000000000005124 RSI: ffffffff81464819 RDI: 00000000ffffffff [181631.224566] RBP: ffff8800b5827ae8 R08: 0000000000000001 R09: 0000000000000000 [181631.224566] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8800109fc200 [181631.224566] R13: ffff880020095000 R14: ffff8800b1a13f38 R15: ffff880020095000 [181631.224566] FS: 00007f70ca0b0c80(0000) GS:ffff88013ec00000(0000) knlGS:0000000000000000 [181631.224566] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [181631.224566] CR2: 00007f2872ab6e68 CR3: 00000000a717c000 CR4: 00000000000006e0 [181631.224566] Stack: [181631.224566] ffff8800b5827ba8 ffffffffa03f3916 ffff8800b5827b38 ffffffffa03d080e [181631.224566] ffffffffa03d1423 ffff880020095000 ffff88001233c000 0000000000000001 [181631.224566] ffff880020095000 ffff8800b1a13f38 0000000a69c00000 0000000000000000 [181631.224566] Call Trace: [181631.224566] [] btrfs_remove_chunk+0xa4/0x6bb [btrfs] [181631.224566] [] ? join_transaction.isra.8+0xb9/0x3ba [btrfs] [181631.224566] [] ? wait_current_trans.isra.13+0x22/0xfc [btrfs] [181631.224566] [] btrfs_relocate_chunk.isra.29+0x8f/0xa7 [btrfs] [181631.224566] [] btrfs_balance+0xaa4/0xc52 [btrfs] [181631.224566] [] btrfs_ioctl_balance+0x23f/0x2b0 [btrfs] [181631.224566] [] ? trace_hardirqs_on+0xd/0xf [181631.224566] [] btrfs_ioctl+0xfe2/0x2220 [btrfs] [181631.224566] [] ? __this_cpu_preempt_check+0x13/0x15 [181631.224566] [] ? arch_local_irq_save+0x9/0xc [181631.224566] [] ? handle_mm_fault+0x834/0xcd2 [181631.224566] [] ? handle_mm_fault+0x834/0xcd2 [181631.224566] [] ? __do_page_fault+0x211/0x424 [181631.224566] [] do_vfs_ioctl+0x3c6/0x479 (...) The sequence of steps leading to this are: CPU 0 CPU 1 btrfs_balance() btrfs_relocate_chunk() btrfs_relocate_block_group(bg X) btrfs_lookup_block_group(bg X) cleaner_kthread locks fs_info->cleaner_mutex btrfs_delete_unused_bgs() finds bg X, which became unused in the previous transaction checks bg X ->ro == 0, so it proceeds sets bg X ->ro to 1 (btrfs_set_block_group_ro(bg X)) blocks on fs_info->cleaner_mutex btrfs_remove_chunk(bg X) unlocks fs_info->cleaner_mutex acquires fs_info->cleaner_mutex relocate_block_group() --> does nothing, no extents found in the extent tree from bg X unlocks fs_info->cleaner_mutex btrfs_relocate_block_group(bg X) returns btrfs_remove_chunk(bg X) extent map not found --> ASSERT(0) Fix this by using a new mutex to make sure these 2 operations, block group relocation and removal, are serialized. This issue is reproducible by running fstests generic/038 (which stresses chunk allocation and automatic removal of unused block groups) together with the following balance loop: while true; do btrfs balance start -dusage=0 ; done Signed-off-by: Filipe Manana Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index b977fc8d8201..b59deb2c63f4 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1772,7 +1772,6 @@ static int cleaner_kthread(void *arg) } btrfs_run_delayed_iputs(root); - btrfs_delete_unused_bgs(root->fs_info); again = btrfs_clean_one_deleted_snapshot(root); mutex_unlock(&root->fs_info->cleaner_mutex); @@ -1781,6 +1780,16 @@ static int cleaner_kthread(void *arg) * needn't do anything special here. */ btrfs_run_defrag_inodes(root->fs_info); + + /* + * Acquires fs_info->delete_unused_bgs_mutex to avoid racing + * with relocation (btrfs_relocate_chunk) and relocation + * acquires fs_info->cleaner_mutex (btrfs_relocate_block_group) + * after acquiring fs_info->delete_unused_bgs_mutex. So we + * can't hold, nor need to, fs_info->cleaner_mutex when deleting + * unused block groups. + */ + btrfs_delete_unused_bgs(root->fs_info); sleep: if (!try_to_freeze() && !again) { set_current_state(TASK_INTERRUPTIBLE); @@ -2492,6 +2501,7 @@ int open_ctree(struct super_block *sb, spin_lock_init(&fs_info->unused_bgs_lock); rwlock_init(&fs_info->tree_mod_log_lock); mutex_init(&fs_info->unused_bg_unpin_mutex); + mutex_init(&fs_info->delete_unused_bgs_mutex); mutex_init(&fs_info->reloc_mutex); mutex_init(&fs_info->delalloc_root_mutex); seqlock_init(&fs_info->profiles_lock); -- cgit v1.2.3-70-g09d2 From da288d280d16f4d7e4ada331cb33d381b408b10c Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Sat, 13 Jun 2015 06:55:31 +0100 Subject: Btrfs: fix crash on close_ctree() if cleaner starts new transaction Often when running fstests btrfs/079 I was running into the following trace during umount on one of my qemu/kvm test vms: [ 8245.682441] WARNING: CPU: 8 PID: 25064 at fs/btrfs/extent-tree.c:138 btrfs_put_block_group+0x51/0x69 [btrfs]() [ 8245.685039] Modules linked in: btrfs dm_flakey dm_mod crc32c_generic xor raid6_pq nfsd auth_rpcgss oid_registry nfs_acl nfs lockd grace fscache sunrpc loop fuse parport_pc i2c_piix4 acpi_cpufreq processor psmouse i2c_core thermal_sys parport evdev serio_raw button pcspkr microcode ext4 crc16 jbd2 mbcache sg sr_mod cdrom sd_mod ata_generic virtio_scsi ata_piix libata floppy virtio_pci virtio_ring scsi_mod virtio e1000 [last unloaded: btrfs] [ 8245.693860] CPU: 8 PID: 25064 Comm: umount Tainted: G W 4.1.0-rc5-btrfs-next-10+ #1 [ 8245.695081] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.8.1-0-g4adadbd-20150316_085822-nilsson.home.kraxel.org 04/01/2014 [ 8245.697583] 0000000000000009 ffff88020d047ce8 ffffffff8145eec7 ffffffff81095dce [ 8245.699234] 0000000000000000 ffff88020d047d28 ffffffff8104b399 0000000000000028 [ 8245.700995] ffffffffa04db07b ffff8801c6036c00 ffff8801c6036d68 ffff880202eb40b0 [ 8245.702510] Call Trace: [ 8245.703006] [] dump_stack+0x4f/0x7b [ 8245.705393] [] ? console_unlock+0x356/0x3a2 [ 8245.706569] [] warn_slowpath_common+0xa1/0xbb [ 8245.707747] [] ? btrfs_put_block_group+0x51/0x69 [btrfs] [ 8245.709101] [] warn_slowpath_null+0x1a/0x1c [ 8245.710274] [] btrfs_put_block_group+0x51/0x69 [btrfs] [ 8245.711823] [] btrfs_free_block_groups+0x145/0x322 [btrfs] [ 8245.713251] [] close_ctree+0x1ef/0x325 [btrfs] [ 8245.714448] [] ? evict_inodes+0xdc/0xeb [ 8245.715539] [] btrfs_put_super+0x19/0x1b [btrfs] [ 8245.716835] [] generic_shutdown_super+0x73/0xef [ 8245.718015] [] kill_anon_super+0x13/0x1e [ 8245.719101] [] btrfs_kill_super+0x17/0x23 [btrfs] [ 8245.720316] [] deactivate_locked_super+0x3b/0x68 [ 8245.721517] [] deactivate_super+0x3f/0x43 [ 8245.722581] [] cleanup_mnt+0x59/0x78 [ 8245.723538] [] __cleanup_mnt+0x12/0x14 [ 8245.724572] [] task_work_run+0x8f/0xbc [ 8245.725598] [] do_notify_resume+0x45/0x53 [ 8245.726892] [] int_signal+0x12/0x17 [ 8245.737887] ---[ end trace a01d038397e99b92 ]--- [ 8245.769363] general protection fault: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC [ 8245.770737] Modules linked in: btrfs dm_flakey dm_mod crc32c_generic xor raid6_pq nfsd auth_rpcgss oid_registry nfs_acl nfs lockd grace fscache sunrpc loop fuse parport_pc i2c_piix4 acpi_cpufreq processor psmouse i2c_core thermal_sys parport evdev serio_raw button pcspkr microcode ext4 crc16 jbd2 mbcache sg sr_mod cdrom sd_mod ata_generic virtio_scsi ata_piix libata floppy virtio_pci virtio_ring scsi_mod virtio e1000 [last unloaded: btrfs] [ 8245.772641] CPU: 2 PID: 25064 Comm: umount Tainted: G W 4.1.0-rc5-btrfs-next-10+ #1 [ 8245.772641] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.8.1-0-g4adadbd-20150316_085822-nilsson.home.kraxel.org 04/01/2014 [ 8245.772641] task: ffff880013005810 ti: ffff88020d044000 task.ti: ffff88020d044000 [ 8245.772641] RIP: 0010:[] [] btrfs_queue_work+0x2c/0x14d [btrfs] [ 8245.772641] RSP: 0018:ffff88020d0478b8 EFLAGS: 00010202 [ 8245.772641] RAX: 0000000000000004 RBX: 6b6b6b6b6b6b6b6b RCX: ffffffffa0581488 [ 8245.772641] RDX: 0000000000000000 RSI: ffff880194b7bf48 RDI: ffff880144b6a7a0 [ 8245.772641] RBP: ffff88020d0478d8 R08: 0000000000000000 R09: 000000000000ffff [ 8245.772641] R10: 0000000000000004 R11: 0000000000000005 R12: ffff880194b7bf48 [ 8245.772641] R13: ffff880194b7bf48 R14: 0000000000000410 R15: 0000000000000000 [ 8245.772641] FS: 00007f991e77d840(0000) GS:ffff88023e280000(0000) knlGS:0000000000000000 [ 8245.772641] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [ 8245.772641] CR2: 00007fbbd325ee68 CR3: 000000021de8e000 CR4: 00000000000006e0 [ 8245.772641] Stack: [ 8245.772641] ffff880194b7bf00 ffff880202eb4000 ffff880194b7bf48 0000000000000410 [ 8245.772641] ffff88020d047958 ffffffffa04ec6d5 ffff8801629b2ee8 0000000082987570 [ 8245.772641] 0000000000a5813f 0000000000000001 ffff880013006100 0000000000000002 [ 8245.772641] Call Trace: [ 8245.772641] [] btrfs_wq_submit_bio+0xe1/0x17b [btrfs] [ 8245.772641] [] ? check_irq_usage+0x76/0x87 [ 8245.772641] [] btree_submit_bio_hook+0xb6/0xd9 [btrfs] [ 8245.772641] [] ? btree_csum_one_bio+0xad/0xad [btrfs] [ 8245.772641] [] ? btree_io_failed_hook+0x5e/0x5e [btrfs] [ 8245.772641] [] submit_one_bio+0x8c/0xc7 [btrfs] [ 8245.772641] [] submit_extent_page.isra.18+0x9d/0x186 [btrfs] [ 8245.772641] [] write_one_eb+0x117/0x1ae [btrfs] [ 8245.772641] [] ? end_extent_buffer_writeback+0x21/0x21 [btrfs] [ 8245.772641] [] btree_write_cache_pages+0x2ab/0x385 [btrfs] [ 8245.772641] [] btree_writepages+0x23/0x5c [btrfs] [ 8245.772641] [] do_writepages+0x23/0x2c [ 8245.772641] [] __writeback_single_inode+0xda/0x5bd [ 8245.772641] [] ? writeback_single_inode+0x2b/0x173 [ 8245.772641] [] writeback_single_inode+0xc8/0x173 [ 8245.772641] [] write_inode_now+0x8a/0x95 [ 8245.772641] [] ? _atomic_dec_and_lock+0x30/0x4e [ 8245.772641] [] iput+0x17d/0x26a [ 8245.772641] [] close_ctree+0x22a/0x325 [btrfs] [ 8245.772641] [] ? evict_inodes+0xdc/0xeb [ 8245.772641] [] btrfs_put_super+0x19/0x1b [btrfs] [ 8245.772641] [] generic_shutdown_super+0x73/0xef [ 8245.772641] [] kill_anon_super+0x13/0x1e [ 8245.772641] [] btrfs_kill_super+0x17/0x23 [btrfs] [ 8245.772641] [] deactivate_locked_super+0x3b/0x68 [ 8245.772641] [] deactivate_super+0x3f/0x43 [ 8245.772641] [] cleanup_mnt+0x59/0x78 [ 8245.772641] [] __cleanup_mnt+0x12/0x14 [ 8245.772641] [] task_work_run+0x8f/0xbc [ 8245.772641] [] do_notify_resume+0x45/0x53 [ 8245.772641] [] int_signal+0x12/0x17 [ 8245.772641] Code: 1f 44 00 00 55 48 89 e5 41 56 41 55 41 54 53 49 89 f4 48 8b 46 70 a8 04 74 09 48 8b 5f 08 48 85 db 75 03 48 8b 1f 49 89 5c 24 68 <83> 7b 5c ff 74 04 f0 ff 43 50 49 83 7c 24 08 00 74 2c 4c 8d 6b [ 8245.772641] RIP [] btrfs_queue_work+0x2c/0x14d [btrfs] [ 8245.772641] RSP [ 8245.845040] ---[ end trace a01d038397e99b93 ]--- For logical reasons such as the phase of the moon, this happened more often with "-o inode_cache" than without any mount options. After some debugging it turned out to be simple to understand what was happening: 1) close_ctree() is called; 2) It then stops the transaction kthread, which commits the current transaction; 3) It asks the cleaner kthread to stop, which is currently running btrfs_delete_unused_bgs(); 4) btrfs_delete_unused_bgs() finds an unused block group, starts a new transaction, deletes the block group, which implies COWing some tree nodes and leafs and dirtying their respective pages, and then finally it ends the transaction it started, without committing it; 5) The cleaner kthread stops; 6) close_ctree() releases (from memory) the block group objects, which produces the warning in the trace pasted above; 7) Then it invalidates all pages of the btree inode, by calling invalidate_inode_pages2(), which waits for any pages under writeback, and releases any non-dirty pages; 8) All work queues are destroyed (waiting first for their current tasks to finish execution); 9) A final iput() is called against the btree inode; 10) This iput triggers a writeback of the btree inode because it still has dirty pages; 11) This starts the whole chain of callbacks for the btree inode until it eventually reaches btrfs_wq_submit_bio() where it leads to a NULL pointer dereference because the work queues were already destroyed. Fix this by making the cleaner commit any transaction that it started after the transaction kthread was stopped. Signed-off-by: Filipe Manana Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index b59deb2c63f4..e5aad7f535aa 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1751,6 +1751,7 @@ static int cleaner_kthread(void *arg) { struct btrfs_root *root = arg; int again; + struct btrfs_trans_handle *trans; do { again = 0; @@ -1798,6 +1799,34 @@ sleep: __set_current_state(TASK_RUNNING); } } while (!kthread_should_stop()); + + /* + * Transaction kthread is stopped before us and wakes us up. + * However we might have started a new transaction and COWed some + * tree blocks when deleting unused block groups for example. So + * make sure we commit the transaction we started to have a clean + * shutdown when evicting the btree inode - if it has dirty pages + * when we do the final iput() on it, eviction will trigger a + * writeback for it which will fail with null pointer dereferences + * since work queues and other resources were already released and + * destroyed by the time the iput/eviction/writeback is made. + */ + trans = btrfs_attach_transaction(root); + if (IS_ERR(trans)) { + if (PTR_ERR(trans) != -ENOENT) + btrfs_err(root->fs_info, + "cleaner transaction attach returned %ld", + PTR_ERR(trans)); + } else { + int ret; + + ret = btrfs_commit_transaction(trans, root); + if (ret) + btrfs_err(root->fs_info, + "cleaner open transaction commit returned %d", + ret); + } + return 0; } -- cgit v1.2.3-70-g09d2