Diffstat (limited to 'arch/arm64/kvm/mmu.c')
-rw-r--r--  arch/arm64/kvm/mmu.c | 207
1 file changed, 173 insertions(+), 34 deletions(-)
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 3b9d4d24c361..6db9ef288ec3 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -31,14 +31,21 @@ static phys_addr_t __ro_after_init hyp_idmap_vector;
 
 static unsigned long __ro_after_init io_map_base;
 
-static phys_addr_t stage2_range_addr_end(phys_addr_t addr, phys_addr_t end)
+static phys_addr_t __stage2_range_addr_end(phys_addr_t addr, phys_addr_t end,
+					   phys_addr_t size)
 {
-	phys_addr_t size = kvm_granule_size(KVM_PGTABLE_MIN_BLOCK_LEVEL);
 	phys_addr_t boundary = ALIGN_DOWN(addr + size, size);
 
 	return (boundary - 1 < end - 1) ? boundary : end;
 }
 
+static phys_addr_t stage2_range_addr_end(phys_addr_t addr, phys_addr_t end)
+{
+	phys_addr_t size = kvm_granule_size(KVM_PGTABLE_MIN_BLOCK_LEVEL);
+
+	return __stage2_range_addr_end(addr, end, size);
+}
+
 /*
  * Release kvm_mmu_lock periodically if the memory region is large. Otherwise,
  * we may see kernel panics with CONFIG_DETECT_HUNG_TASK,
@@ -75,6 +82,79 @@ static int stage2_apply_range(struct kvm_s2_mmu *mmu, phys_addr_t addr,
 #define stage2_apply_range_resched(mmu, addr, end, fn)			\
 	stage2_apply_range(mmu, addr, end, fn, true)
 
+/*
+ * Get the maximum number of page-tables pages needed to split a range
+ * of blocks into PAGE_SIZE PTEs. It assumes the range is already
+ * mapped at level 2, or at level 1 if allowed.
+ */
+static int kvm_mmu_split_nr_page_tables(u64 range)
+{
+	int n = 0;
+
+	if (KVM_PGTABLE_MIN_BLOCK_LEVEL < 2)
+		n += DIV_ROUND_UP(range, PUD_SIZE);
+	n += DIV_ROUND_UP(range, PMD_SIZE);
+	return n;
+}
+
+static bool need_split_memcache_topup_or_resched(struct kvm *kvm)
+{
+	struct kvm_mmu_memory_cache *cache;
+	u64 chunk_size, min;
+
+	if (need_resched() || rwlock_needbreak(&kvm->mmu_lock))
+		return true;
+
+	chunk_size = kvm->arch.mmu.split_page_chunk_size;
+	min = kvm_mmu_split_nr_page_tables(chunk_size);
+	cache = &kvm->arch.mmu.split_page_cache;
+	return kvm_mmu_memory_cache_nr_free_objects(cache) < min;
+}
+
+static int kvm_mmu_split_huge_pages(struct kvm *kvm, phys_addr_t addr,
+				    phys_addr_t end)
+{
+	struct kvm_mmu_memory_cache *cache;
+	struct kvm_pgtable *pgt;
+	int ret, cache_capacity;
+	u64 next, chunk_size;
+
+	lockdep_assert_held_write(&kvm->mmu_lock);
+
+	chunk_size = kvm->arch.mmu.split_page_chunk_size;
+	cache_capacity = kvm_mmu_split_nr_page_tables(chunk_size);
+
+	if (chunk_size == 0)
+		return 0;
+
+	cache = &kvm->arch.mmu.split_page_cache;
+
+	do {
+		if (need_split_memcache_topup_or_resched(kvm)) {
+			write_unlock(&kvm->mmu_lock);
+			cond_resched();
+			/* Eager page splitting is best-effort. */
+			ret = __kvm_mmu_topup_memory_cache(cache,
+							   cache_capacity,
+							   cache_capacity);
+			write_lock(&kvm->mmu_lock);
+			if (ret)
+				break;
+		}
+
+		pgt = kvm->arch.mmu.pgt;
+		if (!pgt)
+			return -EINVAL;
+
+		next = __stage2_range_addr_end(addr, end, chunk_size);
+		ret = kvm_pgtable_stage2_split(pgt, addr, next - addr, cache);
+		if (ret)
+			break;
+	} while (addr = next, addr != end);
+
+	return ret;
+}
+
 static bool memslot_is_logging(struct kvm_memory_slot *memslot)
 {
 	return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY);
@@ -131,21 +211,21 @@ static void kvm_s2_free_pages_exact(void *virt, size_t size)
 
 static struct kvm_pgtable_mm_ops kvm_s2_mm_ops;
 
-static void stage2_free_removed_table_rcu_cb(struct rcu_head *head)
+static void stage2_free_unlinked_table_rcu_cb(struct rcu_head *head)
 {
 	struct page *page = container_of(head, struct page, rcu_head);
 	void *pgtable = page_to_virt(page);
 	u32 level = page_private(page);
 
-	kvm_pgtable_stage2_free_removed(&kvm_s2_mm_ops, pgtable, level);
+	kvm_pgtable_stage2_free_unlinked(&kvm_s2_mm_ops, pgtable, level);
 }
 
-static void stage2_free_removed_table(void *addr, u32 level)
+static void stage2_free_unlinked_table(void *addr, u32 level)
 {
 	struct page *page = virt_to_page(addr);
 
 	set_page_private(page, (unsigned long)level);
-	call_rcu(&page->rcu_head, stage2_free_removed_table_rcu_cb);
+	call_rcu(&page->rcu_head, stage2_free_unlinked_table_rcu_cb);
 }
 
 static void kvm_host_get_page(void *addr)
@@ -701,7 +781,7 @@ static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = {
 	.zalloc_page		= stage2_memcache_zalloc_page,
 	.zalloc_pages_exact	= kvm_s2_zalloc_pages_exact,
 	.free_pages_exact	= kvm_s2_free_pages_exact,
-	.free_removed_table	= stage2_free_removed_table,
+	.free_unlinked_table	= stage2_free_unlinked_table,
 	.get_page		= kvm_host_get_page,
 	.put_page		= kvm_s2_put_page,
 	.page_count		= kvm_host_page_count,
@@ -775,6 +855,10 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t
 	for_each_possible_cpu(cpu)
 		*per_cpu_ptr(mmu->last_vcpu_ran, cpu) = -1;
 
+	/* The eager page splitting is disabled by default */
+	mmu->split_page_chunk_size = KVM_ARM_EAGER_SPLIT_CHUNK_SIZE_DEFAULT;
+	mmu->split_page_cache.gfp_zero = __GFP_ZERO;
+
 	mmu->pgt = pgt;
 	mmu->pgd_phys = __pa(pgt->pgd);
 	return 0;
@@ -786,6 +870,12 @@ out_free_pgtable:
 	return err;
 }
 
+void kvm_uninit_stage2_mmu(struct kvm *kvm)
+{
+	kvm_free_stage2_pgd(&kvm->arch.mmu);
+	kvm_mmu_free_memory_cache(&kvm->arch.mmu.split_page_cache);
+}
+
 static void stage2_unmap_memslot(struct kvm *kvm,
 				 struct kvm_memory_slot *memslot)
 {
@@ -989,39 +1079,66 @@ static void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
 }
 
 /**
- * kvm_mmu_write_protect_pt_masked() - write protect dirty pages
+ * kvm_mmu_split_memory_region() - split the stage 2 blocks into PAGE_SIZE
+ *				   pages for memory slot
  * @kvm:	The KVM pointer
- * @slot:	The memory slot associated with mask
- * @gfn_offset:	The gfn offset in memory slot
- * @mask:	The mask of dirty pages at offset 'gfn_offset' in this memory
- *		slot to be write protected
+ * @slot:	The memory slot to split
  *
- * Walks bits set in mask write protects the associated pte's. Caller must
- * acquire kvm_mmu_lock.
+ * Acquires kvm->mmu_lock. Called with kvm->slots_lock mutex acquired,
+ * serializing operations for VM memory regions.
  */
-static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
-		struct kvm_memory_slot *slot,
-		gfn_t gfn_offset, unsigned long mask)
+static void kvm_mmu_split_memory_region(struct kvm *kvm, int slot)
 {
-	phys_addr_t base_gfn = slot->base_gfn + gfn_offset;
-	phys_addr_t start = (base_gfn +  __ffs(mask)) << PAGE_SHIFT;
-	phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;
+	struct kvm_memslots *slots;
+	struct kvm_memory_slot *memslot;
+	phys_addr_t start, end;
 
-	stage2_wp_range(&kvm->arch.mmu, start, end);
+	lockdep_assert_held(&kvm->slots_lock);
+
+	slots = kvm_memslots(kvm);
+	memslot = id_to_memslot(slots, slot);
+
+	start = memslot->base_gfn << PAGE_SHIFT;
+	end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
+
+	write_lock(&kvm->mmu_lock);
+	kvm_mmu_split_huge_pages(kvm, start, end);
+	write_unlock(&kvm->mmu_lock);
 }
 
 /*
- * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
- * dirty pages.
+ * kvm_arch_mmu_enable_log_dirty_pt_masked() - enable dirty logging for selected pages.
+ * @kvm:	The KVM pointer
+ * @slot:	The memory slot associated with mask
+ * @gfn_offset:	The gfn offset in memory slot
+ * @mask:	The mask of pages at offset 'gfn_offset' in this memory
+ *		slot to enable dirty logging on
  *
- * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
- * enable dirty logging for them.
+ * Writes protect selected pages to enable dirty logging, and then
+ * splits them to PAGE_SIZE. Caller must acquire kvm->mmu_lock.
  */
 void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
 		struct kvm_memory_slot *slot,
 		gfn_t gfn_offset, unsigned long mask)
 {
-	kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
+	phys_addr_t base_gfn = slot->base_gfn + gfn_offset;
+	phys_addr_t start = (base_gfn +  __ffs(mask)) << PAGE_SHIFT;
+	phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;
+
+	lockdep_assert_held_write(&kvm->mmu_lock);
+
+	stage2_wp_range(&kvm->arch.mmu, start, end);
+
+	/*
+	 * Eager-splitting is done when manual-protect is set.  We
+	 * also check for initially-all-set because we can avoid
+	 * eager-splitting if initially-all-set is false.
+	 * Initially-all-set equal false implies that huge-pages were
+	 * already split when enabling dirty logging: no need to do it
+	 * again.
+	 */
+	if (kvm_dirty_log_manual_protect_and_init_set(kvm))
+		kvm_mmu_split_huge_pages(kvm, start, end);
 }
 
 static void kvm_send_hwpoison_signal(unsigned long address, short lsb)
@@ -1790,20 +1907,42 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
 				   const struct kvm_memory_slot *new,
 				   enum kvm_mr_change change)
 {
+	bool log_dirty_pages = new && new->flags & KVM_MEM_LOG_DIRTY_PAGES;
+
 	/*
 	 * At this point memslot has been committed and there is an
 	 * allocated dirty_bitmap[], dirty pages will be tracked while the
 	 * memory slot is write protected.
 	 */
-	if (change != KVM_MR_DELETE && new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
+	if (log_dirty_pages) {
+
+		if (change == KVM_MR_DELETE)
+			return;
+
 		/*
-		 * If we're with initial-all-set, we don't need to write
-		 * protect any pages because they're all reported as dirty.
-		 * Huge pages and normal pages will be write protect gradually.
+		 * Huge and normal pages are write-protected and split
+		 * on either of these two cases:
+		 *
+		 * 1. with initial-all-set: gradually with CLEAR ioctls,
 		 */
-		if (!kvm_dirty_log_manual_protect_and_init_set(kvm)) {
-			kvm_mmu_wp_memory_region(kvm, new->id);
-		}
+		if (kvm_dirty_log_manual_protect_and_init_set(kvm))
+			return;
+		/*
+		 * or
+		 * 2. without initial-all-set: all in one shot when
+		 *    enabling dirty logging.
+		 */
+		kvm_mmu_wp_memory_region(kvm, new->id);
+		kvm_mmu_split_memory_region(kvm, new->id);
+	} else {
+		/*
+		 * Free any leftovers from the eager page splitting cache. Do
+		 * this when deleting, moving, disabling dirty logging, or
+		 * creating the memslot (a nop). Doing it for deletes makes
+		 * sure we don't leak memory, and there's no need to keep the
		 * cache around for any of the other cases.
+		 */
+		kvm_mmu_free_memory_cache(&kvm->arch.mmu.split_page_cache);
 	}
 }
 
@@ -1877,7 +2016,7 @@ void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
 
 void kvm_arch_flush_shadow_all(struct kvm *kvm)
 {
-	kvm_free_stage2_pgd(&kvm->arch.mmu);
+	kvm_uninit_stage2_mmu(kvm);
 }
 
 void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
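The worst-case arithmetic behind kvm_mmu_split_nr_page_tables() above is easy to check outside the kernel. The sketch below is a standalone illustration, not kernel code: it assumes a 4KiB granule, where PUD_SIZE is 1GiB and PMD_SIZE is 2MiB, and that level-1 (PUD) blocks are allowed (KVM_PGTABLE_MIN_BLOCK_LEVEL < 2); DIV_ROUND_UP() is open-coded.

/*
 * Standalone sketch of the split-cache budget: splitting a chunk needs at
 * most one new PMD table per PUD block plus one new PTE table per PMD block.
 */
#include <stdint.h>
#include <stdio.h>

#define SZ_2M			(2ULL << 20)	/* PMD_SIZE with a 4KiB granule */
#define SZ_1G			(1ULL << 30)	/* PUD_SIZE with a 4KiB granule */
#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

static uint64_t split_nr_page_tables(uint64_t range)
{
	uint64_t n = 0;

	n += DIV_ROUND_UP(range, SZ_1G);	/* PMD tables replacing PUD blocks */
	n += DIV_ROUND_UP(range, SZ_2M);	/* PTE tables replacing PMD blocks */
	return n;
}

int main(void)
{
	/* A 2MiB chunk needs at most 1 + 1 = 2 pages; a 1GiB chunk 1 + 512 = 513. */
	printf("2MiB chunk: %llu pages\n", (unsigned long long)split_nr_page_tables(SZ_2M));
	printf("1GiB chunk: %llu pages\n", (unsigned long long)split_nr_page_tables(SZ_1G));
	return 0;
}

This is the capacity need_split_memcache_topup_or_resched() checks for and __kvm_mmu_topup_memory_cache() refills before each chunk is split.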

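For context on when the splitting paths above actually run, here is a hedged userspace sketch. KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 and its two flag bits are existing dirty-log UAPI; KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE is introduced elsewhere in this series rather than in this file, so its use below is an assumption about the rest of the series, not something shown in this diff.

/*
 * Sketch only: assumes an existing VM fd, omits error handling, and guards
 * the chunk-size capability (an assumption about the rest of the series)
 * behind #ifdef so the sketch still compiles against older headers.
 */
#include <linux/kvm.h>
#include <sys/ioctl.h>

static void configure_eager_split(int vm_fd)
{
	struct kvm_enable_cap cap = { 0 };

	/*
	 * With MANUAL_PROTECT + INITIALLY_SET, kvm_arch_commit_memory_region()
	 * skips the memslot-wide write-protect/split pass, and huge pages are
	 * instead write-protected and split gradually from
	 * kvm_arch_mmu_enable_log_dirty_pt_masked() as KVM_CLEAR_DIRTY_LOG runs.
	 */
	cap.cap = KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2;
	cap.args[0] = KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE |
		      KVM_DIRTY_LOG_INITIALLY_SET;
	ioctl(vm_fd, KVM_ENABLE_CAP, &cap);

#ifdef KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE
	/*
	 * A non-zero chunk size opts in to eager splitting; the default
	 * (KVM_ARM_EAGER_SPLIT_CHUNK_SIZE_DEFAULT) leaves it disabled and
	 * kvm_mmu_split_huge_pages() returns immediately.
	 */
	cap.cap = KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE;
	cap.args[0] = 2UL << 20;	/* split in 2MiB chunks (4KiB granule assumed) */
	ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
#endif
}

Without initially-all-set, the diff instead does the whole memslot in one shot at commit time via kvm_mmu_wp_memory_region() followed by kvm_mmu_split_memory_region().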