diff options
-rw-r--r-- | fs/btrfs/block-group.c | 39 | ||||
-rw-r--r-- | fs/btrfs/ctree.c | 8 | ||||
-rw-r--r-- | fs/btrfs/ctree.h | 6 | ||||
-rw-r--r-- | fs/btrfs/delayed-ref.c | 8 | ||||
-rw-r--r-- | fs/btrfs/disk-io.c | 1 | ||||
-rw-r--r-- | fs/btrfs/extent_io.c | 49 | ||||
-rw-r--r-- | fs/btrfs/inode.c | 121 | ||||
-rw-r--r-- | fs/btrfs/send.c | 3 | ||||
-rw-r--r-- | fs/btrfs/space-info.c | 18 | ||||
-rw-r--r-- | fs/btrfs/space-info.h | 3 | ||||
-rw-r--r-- | fs/btrfs/super.c | 10 | ||||
-rw-r--r-- | fs/btrfs/tests/btrfs-tests.c | 1 | ||||
-rw-r--r-- | fs/btrfs/tests/extent-io-tests.c | 9 |
13 files changed, 193 insertions, 83 deletions
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 14851584e245..404e050ce8ee 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1191,7 +1191,6 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force) { struct btrfs_space_info *sinfo = cache->space_info; u64 num_bytes; - u64 sinfo_used; int ret = -ENOSPC; spin_lock(&sinfo->lock); @@ -1205,19 +1204,38 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force) num_bytes = cache->length - cache->reserved - cache->pinned - cache->bytes_super - cache->used; - sinfo_used = btrfs_space_info_used(sinfo, true); /* - * sinfo_used + num_bytes should always <= sinfo->total_bytes. - * - * Here we make sure if we mark this bg RO, we still have enough - * free space as buffer. + * Data never overcommits, even in mixed mode, so do just the straight + * check of left over space in how much we have allocated. */ - if (sinfo_used + num_bytes <= sinfo->total_bytes) { + if (force) { + ret = 0; + } else if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA) { + u64 sinfo_used = btrfs_space_info_used(sinfo, true); + + /* + * Here we make sure if we mark this bg RO, we still have enough + * free space as buffer. + */ + if (sinfo_used + num_bytes <= sinfo->total_bytes) + ret = 0; + } else { + /* + * We overcommit metadata, so we need to do the + * btrfs_can_overcommit check here, and we need to pass in + * BTRFS_RESERVE_NO_FLUSH to give ourselves the most amount of + * leeway to allow us to mark this block group as read only. + */ + if (btrfs_can_overcommit(cache->fs_info, sinfo, num_bytes, + BTRFS_RESERVE_NO_FLUSH)) + ret = 0; + } + + if (!ret) { sinfo->bytes_readonly += num_bytes; cache->ro++; list_add_tail(&cache->ro_list, &sinfo->ro_bgs); - ret = 0; } out: spin_unlock(&cache->lock); @@ -1225,9 +1243,6 @@ out: if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) { btrfs_info(cache->fs_info, "unable to make block group %llu ro", cache->start); - btrfs_info(cache->fs_info, - "sinfo_used=%llu bg_num_bytes=%llu", - sinfo_used, num_bytes); btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0); } return ret; @@ -2225,7 +2240,7 @@ again: } } - ret = inc_block_group_ro(cache, !do_chunk_alloc); + ret = inc_block_group_ro(cache, 0); if (!do_chunk_alloc) goto unlock_out; if (!ret) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 24658b5a5787..f2ec1a9bae28 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -326,12 +326,10 @@ u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, struct seq_list *elem) { write_lock(&fs_info->tree_mod_log_lock); - spin_lock(&fs_info->tree_mod_seq_lock); if (!elem->seq) { elem->seq = btrfs_inc_tree_mod_seq(fs_info); list_add_tail(&elem->list, &fs_info->tree_mod_seq_list); } - spin_unlock(&fs_info->tree_mod_seq_lock); write_unlock(&fs_info->tree_mod_log_lock); return elem->seq; @@ -351,7 +349,7 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, if (!seq_putting) return; - spin_lock(&fs_info->tree_mod_seq_lock); + write_lock(&fs_info->tree_mod_log_lock); list_del(&elem->list); elem->seq = 0; @@ -362,19 +360,17 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, * blocker with lower sequence number exists, we * cannot remove anything from the log */ - spin_unlock(&fs_info->tree_mod_seq_lock); + write_unlock(&fs_info->tree_mod_log_lock); return; } min_seq = cur_elem->seq; } } - spin_unlock(&fs_info->tree_mod_seq_lock); /* * anything that's lower than the lowest existing (read: blocked) * sequence number can be removed from the tree. */ - write_lock(&fs_info->tree_mod_log_lock); tm_root = &fs_info->tree_mod_log; for (node = rb_first(tm_root); node; node = next) { next = rb_next(node); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index f90b82050d2d..36df977b64d9 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -714,14 +714,12 @@ struct btrfs_fs_info { atomic_t nr_delayed_iputs; wait_queue_head_t delayed_iputs_wait; - /* this protects tree_mod_seq_list */ - spinlock_t tree_mod_seq_lock; atomic64_t tree_mod_seq; - struct list_head tree_mod_seq_list; - /* this protects tree_mod_log */ + /* this protects tree_mod_log and tree_mod_seq_list */ rwlock_t tree_mod_log_lock; struct rb_root tree_mod_log; + struct list_head tree_mod_seq_list; atomic_t async_delalloc_pages; diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index df3bd880061d..dfdb7d4f8406 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -492,7 +492,7 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, if (head->is_data) return; - spin_lock(&fs_info->tree_mod_seq_lock); + read_lock(&fs_info->tree_mod_log_lock); if (!list_empty(&fs_info->tree_mod_seq_list)) { struct seq_list *elem; @@ -500,7 +500,7 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, struct seq_list, list); seq = elem->seq; } - spin_unlock(&fs_info->tree_mod_seq_lock); + read_unlock(&fs_info->tree_mod_log_lock); again: for (node = rb_first_cached(&head->ref_tree); node; @@ -518,7 +518,7 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq) struct seq_list *elem; int ret = 0; - spin_lock(&fs_info->tree_mod_seq_lock); + read_lock(&fs_info->tree_mod_log_lock); if (!list_empty(&fs_info->tree_mod_seq_list)) { elem = list_first_entry(&fs_info->tree_mod_seq_list, struct seq_list, list); @@ -531,7 +531,7 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq) } } - spin_unlock(&fs_info->tree_mod_seq_lock); + read_unlock(&fs_info->tree_mod_log_lock); return ret; } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index aea48d6ddc0c..7fa9bb79ad08 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2697,7 +2697,6 @@ int __cold open_ctree(struct super_block *sb, spin_lock_init(&fs_info->fs_roots_radix_lock); spin_lock_init(&fs_info->delayed_iput_lock); spin_lock_init(&fs_info->defrag_inodes_lock); - spin_lock_init(&fs_info->tree_mod_seq_lock); spin_lock_init(&fs_info->super_lock); spin_lock_init(&fs_info->buffer_lock); spin_lock_init(&fs_info->unused_bgs_lock); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index e2d30287e2d5..c0f202741e09 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1593,21 +1593,25 @@ void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start, /* Find first extent with bits cleared */ while (1) { node = __etree_search(tree, start, &next, &prev, NULL, NULL); - if (!node) { + if (!node && !next && !prev) { + /* + * Tree is completely empty, send full range and let + * caller deal with it + */ + *start_ret = 0; + *end_ret = -1; + goto out; + } else if (!node && !next) { + /* + * We are past the last allocated chunk, set start at + * the end of the last extent. + */ + state = rb_entry(prev, struct extent_state, rb_node); + *start_ret = state->end + 1; + *end_ret = -1; + goto out; + } else if (!node) { node = next; - if (!node) { - /* - * We are past the last allocated chunk, - * set start at the end of the last extent. The - * device alloc tree should never be empty so - * prev is always set. - */ - ASSERT(prev); - state = rb_entry(prev, struct extent_state, rb_node); - *start_ret = state->end + 1; - *end_ret = -1; - goto out; - } } /* * At this point 'node' either contains 'start' or start is @@ -3438,11 +3442,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, ret = btrfs_writepage_cow_fixup(page, start, page_end); if (ret) { /* Fixup worker will requeue */ - if (ret == -EBUSY) - wbc->pages_skipped++; - else - redirty_page_for_writepage(wbc, page); - + redirty_page_for_writepage(wbc, page); update_nr_written(wbc, nr_written); unlock_page(page); return 1; @@ -4166,7 +4166,16 @@ retry: */ scanned = 1; index = 0; - goto retry; + + /* + * If we're looping we could run into a page that is locked by a + * writer and that writer could be waiting on writeback for a + * page in our current bio, and thus deadlock, so flush the + * write bio here. + */ + ret = flush_write_bio(epd); + if (!ret) + goto retry; } if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole)) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 6d2bb58d277a..5b3ec93ff911 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2189,6 +2189,7 @@ int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, /* see btrfs_writepage_start_hook for details on why this is required */ struct btrfs_writepage_fixup { struct page *page; + struct inode *inode; struct btrfs_work work; }; @@ -2202,27 +2203,71 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work) struct inode *inode; u64 page_start; u64 page_end; - int ret; + int ret = 0; + bool free_delalloc_space = true; fixup = container_of(work, struct btrfs_writepage_fixup, work); page = fixup->page; + inode = fixup->inode; + page_start = page_offset(page); + page_end = page_offset(page) + PAGE_SIZE - 1; + + /* + * This is similar to page_mkwrite, we need to reserve the space before + * we take the page lock. + */ + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start, + PAGE_SIZE); again: lock_page(page); + + /* + * Before we queued this fixup, we took a reference on the page. + * page->mapping may go NULL, but it shouldn't be moved to a different + * address space. + */ if (!page->mapping || !PageDirty(page) || !PageChecked(page)) { - ClearPageChecked(page); + /* + * Unfortunately this is a little tricky, either + * + * 1) We got here and our page had already been dealt with and + * we reserved our space, thus ret == 0, so we need to just + * drop our space reservation and bail. This can happen the + * first time we come into the fixup worker, or could happen + * while waiting for the ordered extent. + * 2) Our page was already dealt with, but we happened to get an + * ENOSPC above from the btrfs_delalloc_reserve_space. In + * this case we obviously don't have anything to release, but + * because the page was already dealt with we don't want to + * mark the page with an error, so make sure we're resetting + * ret to 0. This is why we have this check _before_ the ret + * check, because we do not want to have a surprise ENOSPC + * when the page was already properly dealt with. + */ + if (!ret) { + btrfs_delalloc_release_extents(BTRFS_I(inode), + PAGE_SIZE); + btrfs_delalloc_release_space(inode, data_reserved, + page_start, PAGE_SIZE, + true); + } + ret = 0; goto out_page; } - inode = page->mapping->host; - page_start = page_offset(page); - page_end = page_offset(page) + PAGE_SIZE - 1; + /* + * We can't mess with the page state unless it is locked, so now that + * it is locked bail if we failed to make our space reservation. + */ + if (ret) + goto out_page; lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, &cached_state); /* already ordered? We're done */ if (PagePrivate2(page)) - goto out; + goto out_reserved; ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start, PAGE_SIZE); @@ -2235,39 +2280,49 @@ again: goto again; } - ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start, - PAGE_SIZE); - if (ret) { - mapping_set_error(page->mapping, ret); - end_extent_writepage(page, ret, page_start, page_end); - ClearPageChecked(page); - goto out; - } - ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0, &cached_state); - if (ret) { - mapping_set_error(page->mapping, ret); - end_extent_writepage(page, ret, page_start, page_end); - ClearPageChecked(page); + if (ret) goto out_reserved; - } - ClearPageChecked(page); - set_page_dirty(page); + /* + * Everything went as planned, we're now the owner of a dirty page with + * delayed allocation bits set and space reserved for our COW + * destination. + * + * The page was dirty when we started, nothing should have cleaned it. + */ + BUG_ON(!PageDirty(page)); + free_delalloc_space = false; out_reserved: btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); - if (ret) + if (free_delalloc_space) btrfs_delalloc_release_space(inode, data_reserved, page_start, PAGE_SIZE, true); -out: unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end, &cached_state); out_page: + if (ret) { + /* + * We hit ENOSPC or other errors. Update the mapping and page + * to reflect the errors and clean the page. + */ + mapping_set_error(page->mapping, ret); + end_extent_writepage(page, ret, page_start, page_end); + clear_page_dirty_for_io(page); + SetPageError(page); + } + ClearPageChecked(page); unlock_page(page); put_page(page); kfree(fixup); extent_changeset_free(data_reserved); + /* + * As a precaution, do a delayed iput in case it would be the last iput + * that could need flushing space. Recursing back to fixup worker would + * deadlock. + */ + btrfs_add_delayed_iput(inode); } /* @@ -2291,6 +2346,13 @@ int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end) if (TestClearPagePrivate2(page)) return 0; + /* + * PageChecked is set below when we create a fixup worker for this page, + * don't try to create another one if we're already PageChecked() + * + * The extent_io writepage code will redirty the page if we send back + * EAGAIN. + */ if (PageChecked(page)) return -EAGAIN; @@ -2298,12 +2360,21 @@ int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end) if (!fixup) return -EAGAIN; + /* + * We are already holding a reference to this inode from + * write_cache_pages. We need to hold it because the space reservation + * takes place outside of the page lock, and we can't trust + * page->mapping outside of the page lock. + */ + ihold(inode); SetPageChecked(page); get_page(page); btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL); fixup->page = page; + fixup->inode = inode; btrfs_queue_work(fs_info->fixup_workers, &fixup->work); - return -EBUSY; + + return -EAGAIN; } static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 091e5bc8c7ea..a055b657cb85 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1269,7 +1269,8 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_) * destination of the stream. */ if (ino == bctx->cur_objectid && - offset >= bctx->sctx->cur_inode_next_write_offset) + offset + bctx->extent_len > + bctx->sctx->cur_inode_next_write_offset) return 0; } diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 537bc310a673..01297c5b2666 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -159,9 +159,9 @@ static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global) return (global->size << 1); } -static int can_overcommit(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, u64 bytes, - enum btrfs_reserve_flush_enum flush) +int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, u64 bytes, + enum btrfs_reserve_flush_enum flush) { u64 profile; u64 avail; @@ -226,7 +226,8 @@ again: /* Check and see if our ticket can be satisified now. */ if ((used + ticket->bytes <= space_info->total_bytes) || - can_overcommit(fs_info, space_info, ticket->bytes, flush)) { + btrfs_can_overcommit(fs_info, space_info, ticket->bytes, + flush)) { btrfs_space_info_update_bytes_may_use(fs_info, space_info, ticket->bytes); @@ -639,13 +640,14 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, return to_reclaim; to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M); - if (can_overcommit(fs_info, space_info, to_reclaim, - BTRFS_RESERVE_FLUSH_ALL)) + if (btrfs_can_overcommit(fs_info, space_info, to_reclaim, + BTRFS_RESERVE_FLUSH_ALL)) return 0; used = btrfs_space_info_used(space_info, true); - if (can_overcommit(fs_info, space_info, SZ_1M, BTRFS_RESERVE_FLUSH_ALL)) + if (btrfs_can_overcommit(fs_info, space_info, SZ_1M, + BTRFS_RESERVE_FLUSH_ALL)) expected = div_factor_fine(space_info->total_bytes, 95); else expected = div_factor_fine(space_info->total_bytes, 90); @@ -1004,7 +1006,7 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, */ if (!pending_tickets && ((used + orig_bytes <= space_info->total_bytes) || - can_overcommit(fs_info, space_info, orig_bytes, flush))) { + btrfs_can_overcommit(fs_info, space_info, orig_bytes, flush))) { btrfs_space_info_update_bytes_may_use(fs_info, space_info, orig_bytes); ret = 0; diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index 1a349e3f9cc1..24514cd2c6c1 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -127,6 +127,9 @@ int btrfs_reserve_metadata_bytes(struct btrfs_root *root, enum btrfs_reserve_flush_enum flush); void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info); +int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, u64 bytes, + enum btrfs_reserve_flush_enum flush); static inline void btrfs_space_info_free_bytes_may_use( struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index a906315efd19..0616a5434793 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2135,7 +2135,15 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) */ thresh = SZ_4M; - if (!mixed && total_free_meta - thresh < block_rsv->size) + /* + * We only want to claim there's no available space if we can no longer + * allocate chunks for our metadata profile and our global reserve will + * not fit in the free metadata space. If we aren't ->full then we + * still can allocate chunks and thus are fine using the currently + * calculated f_bavail. + */ + if (!mixed && block_rsv->space_info->full && + total_free_meta - thresh < block_rsv->size) buf->f_bavail = 0; buf->f_type = BTRFS_SUPER_MAGIC; diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index c12b91ff5f56..84fb3fa940a6 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -142,7 +142,6 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize) spin_lock_init(&fs_info->qgroup_lock); spin_lock_init(&fs_info->super_lock); spin_lock_init(&fs_info->fs_roots_radix_lock); - spin_lock_init(&fs_info->tree_mod_seq_lock); mutex_init(&fs_info->qgroup_ioctl_lock); mutex_init(&fs_info->qgroup_rescan_lock); rwlock_init(&fs_info->tree_mod_log_lock); diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index 123d9a614357..df7ce874a74b 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -441,8 +441,17 @@ static int test_find_first_clear_extent_bit(void) int ret = -EINVAL; test_msg("running find_first_clear_extent_bit test"); + extent_io_tree_init(NULL, &tree, IO_TREE_SELFTEST, NULL); + /* Test correct handling of empty tree */ + find_first_clear_extent_bit(&tree, 0, &start, &end, CHUNK_TRIMMED); + if (start != 0 || end != -1) { + test_err( + "error getting a range from completely empty tree: start %llu end %llu", + start, end); + goto out; + } /* * Set 1M-4M alloc/discard and 32M-64M thus leaving a hole between * 4M-32M |