From bde6c242027b0f1d697d5333950b3a05761d40e4 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 24 Jul 2015 00:00:19 +0100 Subject: Btrfs: fix stale dir entries after unlink, inode eviction and fsync If we remove a hard link from an inode, the inode gets evicted, then we fsync the inode and then power fail/crash, when the log tree is replayed, the parent directory inode still has entries pointing to the name that no longer exists, while our inode no longer has the BTRFS_INODE_REF_KEY item matching the deleted hard link (as expected), leaving the filesystem in an inconsistent state. The stale directory entries can not be deleted (an attempt to delete them causes -ESTALE errors), which makes it impossible to delete the parent directory. This happens because we track the id of the transaction where the last unlink operation for the inode happened (last_unlink_trans) in an in-memory only field of the inode, that is, a value that is never persisted in the inode item stored on the fs/subvol btree. So if an inode is evicted and loaded again, the value for last_unlink_trans is set to 0, which prevents the fsync from logging the parent directory at btrfs_log_inode_parent(). So fix this by setting last_unlink_trans to the id of the transaction that last modified the inode when we load the inode. This is a pessimistic approach but it always ensures correctness with the trade off of ocassional full transaction commits when an fsync is done against the inode in the same transaction where it was evicted and reloaded when our inode is a directory and often logging its parent unnecessarily when our inode is not a directory. The following test case for fstests triggers the problem: seq=`basename $0` seqres=$RESULT_DIR/$seq echo "QA output created by $seq" tmp=/tmp/$$ status=1 # failure is the default! trap "_cleanup; exit \$status" 0 1 2 3 15 _cleanup() { _cleanup_flakey rm -f $tmp.* } # get standard environment, filters and checks . ./common/rc . ./common/filter . ./common/dmflakey # real QA test starts here _need_to_be_root _supported_fs generic _supported_os Linux _require_scratch _require_dm_flakey _require_metadata_journaling $SCRATCH_DEV rm -f $seqres.full _scratch_mkfs >>$seqres.full 2>&1 _init_flakey _mount_flakey # Create our test file with 2 hard links. mkdir $SCRATCH_MNT/testdir touch $SCRATCH_MNT/testdir/foo ln $SCRATCH_MNT/testdir/foo $SCRATCH_MNT/testdir/bar # Make sure everything done so far is durably persisted. sync # Now remove one of the links, trigger inode eviction and then fsync # our inode. unlink $SCRATCH_MNT/testdir/bar echo 2 > /proc/sys/vm/drop_caches $XFS_IO_PROG -c "fsync" $SCRATCH_MNT/testdir/foo # Silently drop all writes on our scratch device to simulate a power failure. _load_flakey_table $FLAKEY_DROP_WRITES _unmount_flakey # Allow writes again and mount the fs to trigger log/journal replay. _load_flakey_table $FLAKEY_ALLOW_WRITES _mount_flakey # Now verify our directory entries. echo "Entries in testdir:" ls -1 $SCRATCH_MNT/testdir # If we remove our inode, its parent should become empty and therefore we should # be able to remove the parent. rm -f $SCRATCH_MNT/testdir/* rmdir $SCRATCH_MNT/testdir _unmount_flakey # The fstests framework will call fsck against our filesystem which will verify # that all metadata is in a consistent state. status=0 exit The test failed on btrfs with: generic/098 4s ... - output mismatch (see /home/fdmanana/git/hub/xfstests/results//generic/098.out.bad) --- tests/generic/098.out 2015-07-23 18:01:12.616175932 +0100 +++ /home/fdmanana/git/hub/xfstests/results//generic/098.out.bad 2015-07-23 18:04:58.924138308 +0100 @@ -1,3 +1,6 @@ QA output created by 098 Entries in testdir: +bar foo +rm: cannot remove '/home/fdmanana/btrfs-tests/scratch_1/testdir/foo': Stale file handle +rmdir: failed to remove '/home/fdmanana/btrfs-tests/scratch_1/testdir': Directory not empty ... (Run 'diff -u tests/generic/098.out /home/fdmanana/git/hub/xfstests/results//generic/098.out.bad' to see the entire diff) _check_btrfs_filesystem: filesystem on /dev/sdc is inconsistent (see /home/fdmanana/git/hub/xfstests/results//generic/098.full) $ cat /home/fdmanana/git/hub/xfstests/results//generic/098.full (...) checking fs roots root 5 inode 258 errors 2001, no inode item, link count wrong unresolved ref dir 257 index 0 namelen 3 name foo filetype 1 errors 6, no dir index, no inode ref unresolved ref dir 257 index 3 namelen 3 name bar filetype 1 errors 5, no dir item, no inode ref Checking filesystem on /dev/sdc (...) Signed-off-by: Filipe Manana Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e33dff356460..79a73645346e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3654,6 +3654,35 @@ cache_index: set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); + /* + * We don't persist the id of the transaction where an unlink operation + * against the inode was last made. So here we assume the inode might + * have been evicted, and therefore the exact value of last_unlink_trans + * lost, and set it to last_trans to avoid metadata inconsistencies + * between the inode and its parent if the inode is fsync'ed and the log + * replayed. For example, in the scenario: + * + * touch mydir/foo + * ln mydir/foo mydir/bar + * sync + * unlink mydir/bar + * echo 2 > /proc/sys/vm/drop_caches # evicts inode + * xfs_io -c fsync mydir/foo + * + * mount fs, triggers fsync log replay + * + * We must make sure that when we fsync our inode foo we also log its + * parent inode, otherwise after log replay the parent still has the + * dentry with the "bar" name but our inode foo has a link count of 1 + * and doesn't have an inode ref with the name "bar" anymore. + * + * Setting last_unlink_trans to last_trans is a pessimistic approach, + * but it guarantees correctness at the expense of ocassional full + * transaction commits on fsync if our inode is a directory, or if our + * inode is not a directory, logging its parent unnecessarily. + */ + BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans; + path->slots[0]++; if (inode->i_nlink != 1 || path->slots[0] >= btrfs_header_nritems(leaf)) -- cgit v1.2.3-70-g09d2 From da2f0f74cf7d074e5a8918c8efdf6aba4a989b4a Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 2 Jul 2015 13:57:22 -0700 Subject: Btrfs: add support for blkio controllers This attaches accounting information to bios as we submit them so the new blkio controllers can throttle on btrfs filesystems. Not much is required, we're just associating bios with blkcgs during clone, calling wbc_init_bio()/wbc_account_io() during writepages submission, and attaching the bios to the current context during direct IO. Finally if we are splitting bios during btrfs_map_bio, this attaches accounting information to the split. The end result is able to throttle nicely on single disk filesystems. A little more work is required for multi-device filesystems. Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 1 + fs/btrfs/extent_io.c | 16 +++++++++++++--- fs/btrfs/inode.c | 6 +++++- fs/btrfs/super.c | 1 + fs/btrfs/volumes.c | 8 ++++++++ 5 files changed, 28 insertions(+), 4 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index f7536bcf7cee..230546b45474 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1724,6 +1724,7 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi) bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE; bdi->congested_fn = btrfs_congested_fn; bdi->congested_data = info; + bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK; return 0; } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 02d05817cbdf..b9755ce98218 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2730,6 +2730,9 @@ struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask) btrfs_bio->csum = NULL; btrfs_bio->csum_allocated = NULL; btrfs_bio->end_io = NULL; + /* FIXME, put this into bio_clone_bioset */ + if (bio->bi_css) + bio_associate_blkcg(new, bio->bi_css); } return new; } @@ -2790,6 +2793,7 @@ static int merge_bio(int rw, struct extent_io_tree *tree, struct page *page, } static int submit_extent_page(int rw, struct extent_io_tree *tree, + struct writeback_control *wbc, struct page *page, sector_t sector, size_t size, unsigned long offset, struct block_device *bdev, @@ -2826,6 +2830,8 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, } bio = NULL; } else { + if (wbc) + wbc_account_io(wbc, page, page_size); return 0; } } @@ -2841,6 +2847,10 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, bio_add_page(bio, page, page_size, offset); bio->bi_end_io = end_io_func; bio->bi_private = tree; + if (wbc) { + wbc_init_bio(wbc, bio); + wbc_account_io(wbc, page, page_size); + } if (bio_ret) *bio_ret = bio; @@ -3051,7 +3061,7 @@ static int __do_readpage(struct extent_io_tree *tree, } pnr -= page->index; - ret = submit_extent_page(rw, tree, page, + ret = submit_extent_page(rw, tree, NULL, page, sector, disk_io_size, pg_offset, bdev, bio, pnr, end_bio_extent_readpage, mirror_num, @@ -3446,7 +3456,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, page->index, cur, end); } - ret = submit_extent_page(write_flags, tree, page, + ret = submit_extent_page(write_flags, tree, wbc, page, sector, iosize, pg_offset, bdev, &epd->bio, max_nr, end_bio_extent_writepage, @@ -3749,7 +3759,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, clear_page_dirty_for_io(p); set_page_writeback(p); - ret = submit_extent_page(rw, tree, p, offset >> 9, + ret = submit_extent_page(rw, tree, wbc, p, offset >> 9, PAGE_CACHE_SIZE, 0, bdev, &epd->bio, -1, end_bio_extent_buffer_writepage, 0, epd->bio_flags, bio_flags); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 79a73645346e..bda3c41dc9d5 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7987,7 +7987,11 @@ static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev, u64 first_sector, gfp_t gfp_flags) { int nr_vecs = bio_get_nr_vecs(bdev); - return btrfs_bio_alloc(bdev, first_sector, nr_vecs, gfp_flags); + struct bio *bio; + bio = btrfs_bio_alloc(bdev, first_sector, nr_vecs, gfp_flags); + if (bio) + bio_associate_current(bio); + return bio; } static inline int btrfs_lookup_and_bind_dio_csum(struct btrfs_root *root, diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index cd7ef34d2dce..d366dd4664d0 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1033,6 +1033,7 @@ static int btrfs_fill_super(struct super_block *sb, sb->s_flags |= MS_POSIXACL; #endif sb->s_flags |= MS_I_VERSION; + sb->s_iflags |= SB_I_CGROUPWB; err = open_ctree(sb, fs_devices, (char *)data); if (err) { printk(KERN_ERR "BTRFS: open_ctree failed\n"); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index fb9abf1678d0..88e2fe931bde 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5942,6 +5942,14 @@ again: if (!bio) return -ENOMEM; + if (first_bio->bi_ioc) { + get_io_context_active(first_bio->bi_ioc); + bio->bi_ioc = first_bio->bi_ioc; + } + if (first_bio->bi_css) { + css_get(first_bio->bi_css); + bio->bi_css = first_bio->bi_css; + } while (bvec <= (first_bio->bi_io_vec + first_bio->bi_vcnt - 1)) { if (bio_add_page(bio, bvec->bv_page, bvec->bv_len, bvec->bv_offset) < bvec->bv_len) { -- cgit v1.2.3-70-g09d2