Diffstat (limited to 'fs/btrfs/extent-tree.c')
| -rw-r--r-- | fs/btrfs/extent-tree.c | 293 | 
1 file changed, 169 insertions(+), 124 deletions(-)
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 911908ea5f6f..f356f08b55cb 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -69,27 +69,6 @@ static int block_group_bits(struct btrfs_block_group *cache, u64 bits)
 	return (cache->flags & bits) == bits;
 }
 
-int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info,
-			      u64 start, u64 num_bytes)
-{
-	u64 end = start + num_bytes - 1;
-	set_extent_bit(&fs_info->excluded_extents, start, end,
-		       EXTENT_UPTODATE, NULL);
-	return 0;
-}
-
-void btrfs_free_excluded_extents(struct btrfs_block_group *cache)
-{
-	struct btrfs_fs_info *fs_info = cache->fs_info;
-	u64 start, end;
-
-	start = cache->start;
-	end = start + cache->length - 1;
-
-	clear_extent_bits(&fs_info->excluded_extents, start, end,
-			  EXTENT_UPTODATE);
-}
-
 /* simple helper to search for an existing data extent at a given offset */
 int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
 {
@@ -187,8 +166,10 @@ search_again:
 			num_refs = btrfs_extent_refs(leaf, ei);
 			extent_flags = btrfs_extent_flags(leaf, ei);
 		} else {
-			ret = -EINVAL;
-			btrfs_print_v0_err(fs_info);
+			ret = -EUCLEAN;
+			btrfs_err(fs_info,
+			"unexpected extent item size, has %u expect >= %zu",
+				  item_size, sizeof(*ei));
 			if (trans)
 				btrfs_abort_transaction(trans, ret);
 			else
@@ -402,11 +383,11 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
 		}
 	}
 
+	WARN_ON(1);
 	btrfs_print_leaf(eb);
 	btrfs_err(eb->fs_info,
 		  "eb %llu iref 0x%lx invalid extent inline ref type %d",
 		  eb->start, (unsigned long)iref, type);
-	WARN_ON(1);
 	return BTRFS_REF_TYPE_INVALID;
 }
 
@@ -624,12 +605,12 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
 		ref2 = btrfs_item_ptr(leaf, path->slots[0],
 				      struct btrfs_shared_data_ref);
 		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
-	} else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) {
-		btrfs_print_v0_err(trans->fs_info);
-		btrfs_abort_transaction(trans, -EINVAL);
-		return -EINVAL;
 	} else {
-		BUG();
+		btrfs_err(trans->fs_info,
+			  "unrecognized backref key (%llu %u %llu)",
+			  key.objectid, key.type, key.offset);
+		btrfs_abort_transaction(trans, -EUCLEAN);
+		return -EUCLEAN;
 	}
 
 	BUG_ON(num_refs < refs_to_drop);
@@ -660,7 +641,6 @@ static noinline u32 extent_data_ref_count(struct btrfs_path *path,
 
 	leaf = path->nodes[0];
 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
-	BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
 	if (iref) {
 		/*
 		 * If type is invalid, we should have bailed out earlier than
@@ -869,6 +849,11 @@ again:
 		err = -ENOENT;
 		goto out;
 	} else if (WARN_ON(ret)) {
+		btrfs_print_leaf(path->nodes[0]);
+		btrfs_err(fs_info,
+"extent item not found for insert, bytenr %llu num_bytes %llu parent %llu root_objectid %llu owner %llu offset %llu",
+			  bytenr, num_bytes, parent, root_objectid, owner,
+			  offset);
 		err = -EIO;
 		goto out;
 	}
@@ -876,8 +861,10 @@ again:
 	leaf = path->nodes[0];
 	item_size = btrfs_item_size(leaf, path->slots[0]);
 	if (unlikely(item_size < sizeof(*ei))) {
-		err = -EINVAL;
-		btrfs_print_v0_err(fs_info);
+		err = -EUCLEAN;
+		btrfs_err(fs_info,
+			  "unexpected extent item size, has %llu expect >= %zu",
+			  item_size, sizeof(*ei));
 		btrfs_abort_transaction(trans, err);
 		goto out;
 	}
@@ -1079,13 +1066,13 @@ static int lookup_extent_backref(struct btrfs_trans_handle *trans,
 /*
  * helper to update/remove inline back ref
  */
-static noinline_for_stack
-void update_inline_extent_backref(struct btrfs_path *path,
+static noinline_for_stack int update_inline_extent_backref(struct btrfs_path *path,
 				  struct btrfs_extent_inline_ref *iref,
 				  int refs_to_mod,
 				  struct btrfs_delayed_extent_op *extent_op)
 {
 	struct extent_buffer *leaf = path->nodes[0];
+	struct btrfs_fs_info *fs_info = leaf->fs_info;
 	struct btrfs_extent_item *ei;
 	struct btrfs_extent_data_ref *dref = NULL;
 	struct btrfs_shared_data_ref *sref = NULL;
@@ -1098,18 +1085,33 @@ void update_inline_extent_backref(struct btrfs_path *path,
 
 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
 	refs = btrfs_extent_refs(leaf, ei);
-	WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
+	if (unlikely(refs_to_mod < 0 && refs + refs_to_mod <= 0)) {
+		struct btrfs_key key;
+		u32 extent_size;
+
+		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+		if (key.type == BTRFS_METADATA_ITEM_KEY)
+			extent_size = fs_info->nodesize;
+		else
+			extent_size = key.offset;
+		btrfs_print_leaf(leaf);
+		btrfs_err(fs_info,
+	"invalid refs_to_mod for extent %llu num_bytes %u, has %d expect >= -%llu",
+			  key.objectid, extent_size, refs_to_mod, refs);
+		return -EUCLEAN;
+	}
 	refs += refs_to_mod;
 	btrfs_set_extent_refs(leaf, ei, refs);
 	if (extent_op)
 		__run_delayed_extent_op(extent_op, leaf, ei);
 
+	type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY);
 	/*
-	 * If type is invalid, we should have bailed out after
-	 * lookup_inline_extent_backref().
+	 * Function btrfs_get_extent_inline_ref_type() has already printed
+	 * error messages.
 	 */
-	type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY);
-	ASSERT(type != BTRFS_REF_TYPE_INVALID);
+	if (unlikely(type == BTRFS_REF_TYPE_INVALID))
+		return -EUCLEAN;
 
 	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
 		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
@@ -1119,10 +1121,43 @@ void update_inline_extent_backref(struct btrfs_path *path,
 		refs = btrfs_shared_data_ref_count(leaf, sref);
 	} else {
 		refs = 1;
-		BUG_ON(refs_to_mod != -1);
+		/*
+		 * For tree blocks we can only drop one ref for it, and tree
+		 * blocks should not have refs > 1.
+		 *
+		 * Furthermore if we're inserting a new inline backref, we
+		 * won't reach this path either. That would be
+		 * setup_inline_extent_backref().
+		 */
+		if (unlikely(refs_to_mod != -1)) {
+			struct btrfs_key key;
+
+			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+
+			btrfs_print_leaf(leaf);
+			btrfs_err(fs_info,
+			"invalid refs_to_mod for tree block %llu, has %d expect -1",
+				  key.objectid, refs_to_mod);
+			return -EUCLEAN;
+		}
 	}
 
-	BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
+	if (unlikely(refs_to_mod < 0 && refs < -refs_to_mod)) {
+		struct btrfs_key key;
+		u32 extent_size;
+
+		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+		if (key.type == BTRFS_METADATA_ITEM_KEY)
+			extent_size = fs_info->nodesize;
+		else
+			extent_size = key.offset;
+		btrfs_print_leaf(leaf);
+		btrfs_err(fs_info,
+"invalid refs_to_mod for backref entry, iref %lu extent %llu num_bytes %u, has %d expect >= -%llu",
+			  (unsigned long)iref, key.objectid, extent_size,
+			  refs_to_mod, refs);
+		return -EUCLEAN;
+	}
 	refs += refs_to_mod;
 
 	if (refs > 0) {
@@ -1142,6 +1177,7 @@ void update_inline_extent_backref(struct btrfs_path *path,
 		btrfs_truncate_item(path, item_size, 1);
 	}
 	btrfs_mark_buffer_dirty(leaf);
+	return 0;
 }
 
 static noinline_for_stack
@@ -1170,7 +1206,7 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
 				   bytenr, num_bytes, root_objectid, path->slots[0]);
 			return -EUCLEAN;
 		}
-		update_inline_extent_backref(path, iref, refs_to_add, extent_op);
+		ret = update_inline_extent_backref(path, iref, refs_to_add, extent_op);
 	} else if (ret == -ENOENT) {
 		setup_inline_extent_backref(trans->fs_info, path, iref, parent,
 					    root_objectid, owner, offset,
@@ -1190,7 +1226,7 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
 	BUG_ON(!is_data && refs_to_drop != 1);
 
 	if (iref)
-		update_inline_extent_backref(path, iref, -refs_to_drop, NULL);
+		ret = update_inline_extent_backref(path, iref, -refs_to_drop, NULL);
 	else if (is_data)
 		ret = remove_extent_data_ref(trans, root, path, refs_to_drop);
 	else
@@ -1629,8 +1665,10 @@ again:
 	item_size = btrfs_item_size(leaf, path->slots[0]);
 
 	if (unlikely(item_size < sizeof(*ei))) {
-		err = -EINVAL;
-		btrfs_print_v0_err(fs_info);
+		err = -EUCLEAN;
+		btrfs_err(fs_info,
+			  "unexpected extent item size, has %u expect >= %zu",
+			  item_size, sizeof(*ei));
 		btrfs_abort_transaction(trans, err);
 		goto out;
 	}
@@ -2751,9 +2789,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
 		struct extent_state *cached_state = NULL;
 
 		mutex_lock(&fs_info->unused_bg_unpin_mutex);
-		ret = find_first_extent_bit(unpin, 0, &start, &end,
-					    EXTENT_DIRTY, &cached_state);
-		if (ret) {
+		if (!find_first_extent_bit(unpin, 0, &start, &end,
+					   EXTENT_DIRTY, &cached_state)) {
 			mutex_unlock(&fs_info->unused_bg_unpin_mutex);
 			break;
 		}
@@ -3059,8 +3096,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 	leaf = path->nodes[0];
 	item_size = btrfs_item_size(leaf, extent_slot);
 	if (unlikely(item_size < sizeof(*ei))) {
-		ret = -EINVAL;
-		btrfs_print_v0_err(info);
+		ret = -EUCLEAN;
+		btrfs_err(trans->fs_info,
+			  "unexpected extent item size, has %u expect >= %zu",
+			  item_size, sizeof(*ei));
 		btrfs_abort_transaction(trans, ret);
 		goto out;
 	}
@@ -3351,11 +3390,38 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
 }
 
 enum btrfs_loop_type {
+	/*
+	 * Start caching block groups but do not wait for progress or for them
+	 * to be done.
+	 */
 	LOOP_CACHING_NOWAIT,
+
+	/*
+	 * Wait for the block group free_space >= the space we're waiting for if
+	 * the block group isn't cached.
+	 */
 	LOOP_CACHING_WAIT,
+
+	/*
+	 * Allow allocations to happen from block groups that do not yet have a
+	 * size classification.
+	 */
	LOOP_UNSET_SIZE_CLASS,
+
+	/*
+	 * Allocate a chunk and then retry the allocation.
+	 */
 	LOOP_ALLOC_CHUNK,
+
+	/*
+	 * Ignore the size class restrictions for this allocation.
+	 */
 	LOOP_WRONG_SIZE_CLASS,
+
+	/*
+	 * Ignore the empty size, only try to allocate the number of bytes
+	 * needed for this allocation.
+	 */
 	LOOP_NO_EMPTY_SIZE,
 };
 
@@ -3427,7 +3493,6 @@ btrfs_release_block_group(struct btrfs_block_group *cache,
  * Helper function for find_free_extent().
  *
  * Return -ENOENT to inform caller that we need fallback to unclustered mode.
- * Return -EAGAIN to inform caller that we need to re-search this block group
  * Return >0 to inform caller that we find nothing
  * Return 0 means we have found a location and set ffe_ctl->found_offset.
  */
@@ -3508,14 +3573,6 @@ refill_cluster:
 			trace_btrfs_reserve_extent_cluster(bg, ffe_ctl);
 			return 0;
 		}
-	} else if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT &&
-		   !ffe_ctl->retry_clustered) {
-		spin_unlock(&last_ptr->refill_lock);
-
-		ffe_ctl->retry_clustered = true;
-		btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
-				ffe_ctl->empty_cluster + ffe_ctl->empty_size);
-		return -EAGAIN;
 	}
 	/*
 	 * At this point we either didn't find a cluster or we weren't able to
@@ -3530,7 +3587,6 @@ refill_cluster:
 /*
  * Return >0 to inform caller that we find nothing
  * Return 0 when we found an free extent and set ffe_ctrl->found_offset
- * Return -EAGAIN to inform caller that we need to re-search this block group
  */
 static int find_free_extent_unclustered(struct btrfs_block_group *bg,
 					struct find_free_extent_ctl *ffe_ctl)
@@ -3568,25 +3624,8 @@ static int find_free_extent_unclustered(struct btrfs_block_group *bg,
 	offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start,
 			ffe_ctl->num_bytes, ffe_ctl->empty_size,
 			&ffe_ctl->max_extent_size);
-
-	/*
-	 * If we didn't find a chunk, and we haven't failed on this block group
-	 * before, and this block group is in the middle of caching and we are
-	 * ok with waiting, then go ahead and wait for progress to be made, and
-	 * set @retry_unclustered to true.
-	 *
-	 * If @retry_unclustered is true then we've already waited on this
-	 * block group once and should move on to the next block group.
-	 */
-	if (!offset && !ffe_ctl->retry_unclustered && !ffe_ctl->cached &&
-	    ffe_ctl->loop > LOOP_CACHING_NOWAIT) {
-		btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
-						      ffe_ctl->empty_size);
-		ffe_ctl->retry_unclustered = true;
-		return -EAGAIN;
-	} else if (!offset) {
+	if (!offset)
 		return 1;
-	}
 	ffe_ctl->found_offset = offset;
 	return 0;
 }
@@ -3600,7 +3639,7 @@ static int do_allocation_clustered(struct btrfs_block_group *block_group,
 	/* We want to try and use the cluster allocator, so lets look there */
 	if (ffe_ctl->last_ptr && ffe_ctl->use_cluster) {
 		ret = find_free_extent_clustered(block_group, ffe_ctl, bg_ret);
-		if (ret >= 0 || ret == -EAGAIN)
+		if (ret >= 0)
 			return ret;
 		/* ret == -ENOENT case falls through */
 	}
@@ -3685,7 +3724,9 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
 	}
 	spin_unlock(&block_group->lock);
 
-	if (!ret && !btrfs_zone_activate(block_group)) {
+	/* Metadata block group is activated at write time. */
+	if (!ret && (block_group->flags & BTRFS_BLOCK_GROUP_DATA) &&
+	    !btrfs_zone_activate(block_group)) {
 		ret = 1;
 		/*
 		 * May need to clear fs_info->{treelog,data_reloc}_bg.
@@ -3709,7 +3750,8 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
 	       fs_info->data_reloc_bg == 0);
 
 	if (block_group->ro ||
-	    test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) {
+	    (!ffe_ctl->for_data_reloc &&
+	     test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags))) {
 		ret = 1;
 		goto out;
 	}
@@ -3752,8 +3794,26 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
 	if (ffe_ctl->for_treelog && !fs_info->treelog_bg)
 		fs_info->treelog_bg = block_group->start;
 
-	if (ffe_ctl->for_data_reloc && !fs_info->data_reloc_bg)
-		fs_info->data_reloc_bg = block_group->start;
+	if (ffe_ctl->for_data_reloc) {
+		if (!fs_info->data_reloc_bg)
+			fs_info->data_reloc_bg = block_group->start;
+		/*
+		 * Do not allow allocations from this block group, unless it is
+		 * for data relocation. Compared to increasing the ->ro, setting
+		 * the ->zoned_data_reloc_ongoing flag still allows nocow
+		 * writers to come in. See btrfs_inc_nocow_writers().
+		 *
+		 * We need to disable an allocation to avoid an allocation of
+		 * regular (non-relocation data) extent. With mix of relocation
+		 * extents and regular extents, we can dispatch WRITE commands
+		 * (for relocation extents) and ZONE APPEND commands (for
+		 * regular extents) at the same time to the same zone, which
+		 * easily break the write pointer.
+		 *
+		 * Also, this flag avoids this block group to be zone finished.
+		 */
+		set_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags);
+	}
 
 	ffe_ctl->found_offset = start + block_group->alloc_offset;
 	block_group->alloc_offset += num_bytes;
@@ -3771,24 +3831,8 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
 out:
 	if (ret && ffe_ctl->for_treelog)
 		fs_info->treelog_bg = 0;
-	if (ret && ffe_ctl->for_data_reloc &&
-	    fs_info->data_reloc_bg == block_group->start) {
-		/*
-		 * Do not allow further allocations from this block group.
-		 * Compared to increasing the ->ro, setting the
-		 * ->zoned_data_reloc_ongoing flag still allows nocow
-		 *  writers to come in. See btrfs_inc_nocow_writers().
-		 *
-		 * We need to disable an allocation to avoid an allocation of
-		 * regular (non-relocation data) extent. With mix of relocation
-		 * extents and regular extents, we can dispatch WRITE commands
-		 * (for relocation extents) and ZONE APPEND commands (for
-		 * regular extents) at the same time to the same zone, which
-		 * easily break the write pointer.
-		 */
-		set_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags);
+	if (ret && ffe_ctl->for_data_reloc)
 		fs_info->data_reloc_bg = 0;
-	}
 	spin_unlock(&fs_info->relocation_bg_lock);
 	spin_unlock(&fs_info->treelog_bg_lock);
 	spin_unlock(&block_group->lock);
@@ -3816,8 +3860,7 @@ static void release_block_group(struct btrfs_block_group *block_group,
 {
 	switch (ffe_ctl->policy) {
 	case BTRFS_EXTENT_ALLOC_CLUSTERED:
-		ffe_ctl->retry_clustered = false;
-		ffe_ctl->retry_unclustered = false;
+		ffe_ctl->retry_uncached = false;
 		break;
 	case BTRFS_EXTENT_ALLOC_ZONED:
 		/* Nothing to do */
@@ -3861,6 +3904,10 @@ static void found_extent(struct find_free_extent_ctl *ffe_ctl,
 static int can_allocate_chunk_zoned(struct btrfs_fs_info *fs_info,
 				    struct find_free_extent_ctl *ffe_ctl)
 {
+	/* Block group's activeness is not a requirement for METADATA block groups. */
+	if (!(ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA))
+		return 0;
+
 	/* If we can activate new zone, just allocate a chunk and use it */
 	if (btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->flags))
 		return 0;
@@ -3949,15 +3996,7 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
 	if (ffe_ctl->index < BTRFS_NR_RAID_TYPES)
 		return 1;
 
-	/*
-	 * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
-	 *			caching kthreads as we move along
-	 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
-	 * LOOP_UNSET_SIZE_CLASS, allow unset size class
-	 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
-	 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
-	 *		       again
-	 */
+	/* See the comments for btrfs_loop_type for an explanation of the phases. */
 	if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) {
 		ffe_ctl->index = 0;
 		/*
@@ -4168,9 +4207,7 @@ static noinline int find_free_extent(struct btrfs_root *root,
 	ffe_ctl->orig_have_caching_bg = false;
 	ffe_ctl->index = btrfs_bg_flags_to_raid_index(ffe_ctl->flags);
 	ffe_ctl->loop = 0;
-	/* For clustered allocation */
-	ffe_ctl->retry_clustered = false;
-	ffe_ctl->retry_unclustered = false;
+	ffe_ctl->retry_uncached = false;
 	ffe_ctl->cached = 0;
 	ffe_ctl->max_extent_size = 0;
 	ffe_ctl->total_free_space = 0;
@@ -4310,24 +4347,23 @@ have_block_group:
 			ret = 0;
 		}
 
-		if (unlikely(block_group->cached == BTRFS_CACHE_ERROR))
+		if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) {
+			if (!cache_block_group_error)
+				cache_block_group_error = -EIO;
 			goto loop;
+		}
 
 		if (!find_free_extent_check_size_class(ffe_ctl, block_group))
 			goto loop;
 
 		bg_ret = NULL;
 		ret = do_allocation(block_group, ffe_ctl, &bg_ret);
-		if (ret == 0) {
-			if (bg_ret && bg_ret != block_group) {
-				btrfs_release_block_group(block_group,
-							  ffe_ctl->delalloc);
-				block_group = bg_ret;
-			}
-		} else if (ret == -EAGAIN) {
-			goto have_block_group;
-		} else if (ret > 0) {
+		if (ret > 0)
 			goto loop;
+
+		if (bg_ret && bg_ret != block_group) {
+			btrfs_release_block_group(block_group, ffe_ctl->delalloc);
+			block_group = bg_ret;
 		}
 
 		/* Checks */
@@ -4368,6 +4404,15 @@ have_block_group:
 		btrfs_release_block_group(block_group, ffe_ctl->delalloc);
 		break;
 loop:
+		if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT &&
+		    !ffe_ctl->retry_uncached) {
+			ffe_ctl->retry_uncached = true;
+			btrfs_wait_block_group_cache_progress(block_group,
+						ffe_ctl->num_bytes +
+						ffe_ctl->empty_cluster +
+						ffe_ctl->empty_size);
+			goto have_block_group;
+		}
 		release_block_group(block_group, ffe_ctl, ffe_ctl->delalloc);
 		cond_resched();
 	}
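A large part of the backref changes above replaces BUG_ON()/WARN_ON()-style assertions in the inline backref helpers with -EUCLEAN returns plus a btrfs_err() dump of the offending leaf, and update_inline_extent_backref() now returns an int that its callers propagate (typically ending in btrfs_abort_transaction()). Below is a minimal user-space sketch of that contract, not the kernel code itself: fake_extent_item and update_backref() are simplified stand-ins for the real btrfs types and helpers, and the checks are reduced to the two invariants the patch enforces.

/*
 * Minimal sketch of the "return -EUCLEAN instead of BUG_ON()" pattern.
 * EUCLEAN ("Structure needs cleaning") comes from <errno.h> on Linux.
 */
#include <errno.h>
#include <stdio.h>

struct fake_extent_item {
	long long refs;		/* current reference count on the extent */
	int is_tree_block;	/* tree blocks may only ever drop one ref */
};

/* Mirrors the new contract of update_inline_extent_backref(): 0 or -EUCLEAN. */
static int update_backref(struct fake_extent_item *ei, int refs_to_mod)
{
	/* Dropping more references than exist means the metadata is corrupt. */
	if (refs_to_mod < 0 && ei->refs < -(long long)refs_to_mod) {
		fprintf(stderr, "invalid refs_to_mod, has %d expect >= -%lld\n",
			refs_to_mod, ei->refs);
		return -EUCLEAN;
	}
	/* Tree blocks only ever lose a single reference at a time. */
	if (ei->is_tree_block && refs_to_mod != -1) {
		fprintf(stderr,
			"invalid refs_to_mod for tree block, has %d expect -1\n",
			refs_to_mod);
		return -EUCLEAN;
	}
	ei->refs += refs_to_mod;
	return 0;
}

int main(void)
{
	struct fake_extent_item ei = { .refs = 1, .is_tree_block = 1 };
	int ret;

	/* A corrupt request is now rejected and reported, not BUG()ed on. */
	ret = update_backref(&ei, -2);
	if (ret)
		fprintf(stderr, "caller aborts the transaction: %d\n", ret);

	ret = update_backref(&ei, -1);	/* valid single drop */
	printf("refs now %lld, ret %d\n", ei.refs, ret);
	return 0;
}

The point of the pattern is that a corrupted on-disk refcount becomes a recoverable error path (transaction abort, filesystem flips read-only) instead of a kernel crash.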
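The allocator side of the diff drops the retry_clustered/retry_unclustered flags and the -EAGAIN plumbing, and instead retries an uncached block group at most once from the main find_free_extent() loop after waiting for caching progress. The sketch below is a rough user-space model of that control flow only; fake_block_group, try_allocate() and wait_cache_progress() are invented stand-ins for the real ffe_ctl structure, do_allocation() and btrfs_wait_block_group_cache_progress().

/* Rough model of the reworked retry logic in find_free_extent(). */
#include <stdbool.h>
#include <stdio.h>

struct fake_block_group {
	const char *name;
	bool cached;		/* free-space caching finished?               */
	int free_after_wait;	/* space that appears once caching catches up */
	int free_now;
};

/* Stand-in for do_allocation(): >0 means "found nothing in this group". */
static int try_allocate(struct fake_block_group *bg, int bytes)
{
	return (bg->free_now >= bytes) ? 0 : 1;
}

/* Stand-in for btrfs_wait_block_group_cache_progress(). */
static void wait_cache_progress(struct fake_block_group *bg)
{
	bg->free_now = bg->free_after_wait;
	bg->cached = true;
}

static int find_free_extent_model(struct fake_block_group *groups, int nr, int bytes)
{
	bool retry_uncached = false;	/* replaces retry_clustered/retry_unclustered */
	int i;

	for (i = 0; i < nr; i++) {
have_block_group:
		if (try_allocate(&groups[i], bytes) == 0) {
			printf("allocated %d bytes from %s\n", bytes, groups[i].name);
			return 0;
		}
		/* "loop:" path: retry an uncached group once after waiting. */
		if (!groups[i].cached && !retry_uncached) {
			retry_uncached = true;
			wait_cache_progress(&groups[i]);
			goto have_block_group;
		}
		retry_uncached = false;	/* reset when the group is released */
	}
	return 1;	/* caller would bump ffe_ctl->loop and search again */
}

int main(void)
{
	struct fake_block_group groups[] = {
		{ "bg0", false, 4096, 0 },	/* uncached, fills up after waiting */
		{ "bg1", true, 0, 0 },
	};

	return find_free_extent_model(groups, 2, 4096);
}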
