Diffstat (limited to 'arch/x86')
44 files changed, 446 insertions, 278 deletions
diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 7854685c5f25..bafbd905e6e7 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -286,10 +286,6 @@ vdso_install:  archprepare: checkbin  checkbin: -ifndef CONFIG_CC_HAS_ASM_GOTO -	@echo Compiler lacks asm-goto support. -	@exit 1 -endif  ifdef CONFIG_RETPOLINE  ifeq ($(RETPOLINE_CFLAGS),)  	@echo "You are building kernel with non-retpoline compiler." >&2 diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 4910bf230d7b..62208ec04ca4 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -132,7 +132,17 @@ void snp_set_page_private(unsigned long paddr);  void snp_set_page_shared(unsigned long paddr);  void sev_prep_identity_maps(unsigned long top_level_pgt);  #else -static inline void sev_enable(struct boot_params *bp) { } +static inline void sev_enable(struct boot_params *bp) +{ +	/* +	 * bp->cc_blob_address should only be set by boot/compressed kernel. +	 * Initialize it to 0 unconditionally (thus here in this stub too) to +	 * ensure that uninitialized values from buggy bootloaders aren't +	 * propagated. +	 */ +	if (bp) +		bp->cc_blob_address = 0; +}  static inline void sev_es_shutdown_ghcb(void) { }  static inline bool sev_es_check_ghcb_fault(unsigned long address)  { diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index 52f989f6acc2..c93930d5ccbd 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -277,6 +277,14 @@ void sev_enable(struct boot_params *bp)  	bool snp;  	/* +	 * bp->cc_blob_address should only be set by boot/compressed kernel. +	 * Initialize it to 0 to ensure that uninitialized values from +	 * buggy bootloaders aren't propagated. +	 */ +	if (bp) +		bp->cc_blob_address = 0; + +	/*  	 * Setup/preliminary detection of SNP. This will be sanity-checked  	 * against CPUID/MSR values later.  	 */ diff --git a/arch/x86/configs/xen.config b/arch/x86/configs/xen.config index d9fc7139fd46..581296255b39 100644 --- a/arch/x86/configs/xen.config +++ b/arch/x86/configs/xen.config @@ -14,7 +14,6 @@ CONFIG_CPU_FREQ=y  # x86 xen specific config options  CONFIG_XEN_PVH=y -CONFIG_XEN_MAX_DOMAIN_MEMORY=500  CONFIG_XEN_SAVE_RESTORE=y  # CONFIG_XEN_DEBUG_FS is not set  CONFIG_XEN_MCE_LOG=y diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 682338e7e2a3..4dd19819053a 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -311,7 +311,7 @@ SYM_CODE_START(entry_INT80_compat)  	 * Interrupts are off on entry.  	 */  	ASM_CLAC			/* Do this early to minimize exposure */ -	SWAPGS +	ALTERNATIVE "swapgs", "", X86_FEATURE_XENPV  	/*  	 * User tracing code (ptrace or signal handlers) might assume that diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 2db93498ff71..c601939a74b1 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -4052,8 +4052,9 @@ static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr, void *data)  		/* Disable guest PEBS if host PEBS is enabled. */  		arr[pebs_enable].guest = 0;  	} else { -		/* Disable guest PEBS for cross-mapped PEBS counters. */ +		/* Disable guest PEBS thoroughly for cross-mapped PEBS counters. 
*/  		arr[pebs_enable].guest &= ~kvm_pmu->host_cross_mapped_mask; +		arr[global_ctrl].guest &= ~kvm_pmu->host_cross_mapped_mask;  		/* Set hw GLOBAL_CTRL bits for PEBS counter when it runs for guest */  		arr[global_ctrl].guest |= arr[pebs_enable].guest;  	} @@ -6291,10 +6292,8 @@ __init int intel_pmu_init(void)  		x86_pmu.pebs_aliases = NULL;  		x86_pmu.pebs_prec_dist = true;  		x86_pmu.pebs_block = true; -		x86_pmu.pebs_capable = ~0ULL;  		x86_pmu.flags |= PMU_FL_HAS_RSP_1;  		x86_pmu.flags |= PMU_FL_NO_HT_SHARING; -		x86_pmu.flags |= PMU_FL_PEBS_ALL;  		x86_pmu.flags |= PMU_FL_INSTR_LATENCY;  		x86_pmu.flags |= PMU_FL_MEM_LOADS_AUX; @@ -6337,10 +6336,8 @@ __init int intel_pmu_init(void)  		x86_pmu.pebs_aliases = NULL;  		x86_pmu.pebs_prec_dist = true;  		x86_pmu.pebs_block = true; -		x86_pmu.pebs_capable = ~0ULL;  		x86_pmu.flags |= PMU_FL_HAS_RSP_1;  		x86_pmu.flags |= PMU_FL_NO_HT_SHARING; -		x86_pmu.flags |= PMU_FL_PEBS_ALL;  		x86_pmu.flags |= PMU_FL_INSTR_LATENCY;  		x86_pmu.flags |= PMU_FL_MEM_LOADS_AUX;  		x86_pmu.lbr_pt_coexist = true; diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index ba60427caa6d..de1f55d51784 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -291,6 +291,7 @@ static u64 load_latency_data(struct perf_event *event, u64 status)  static u64 store_latency_data(struct perf_event *event, u64 status)  {  	union intel_x86_pebs_dse dse; +	union perf_mem_data_src src;  	u64 val;  	dse.val = status; @@ -304,7 +305,14 @@ static u64 store_latency_data(struct perf_event *event, u64 status)  	val |= P(BLK, NA); -	return val; +	/* +	 * the pebs_data_source table is only for loads +	 * so override the mem_op to say STORE instead +	 */ +	src.val = val; +	src.mem_op = P(OP,STORE); + +	return src.val;  }  struct pebs_record_core { @@ -822,7 +830,7 @@ struct event_constraint intel_glm_pebs_event_constraints[] = {  struct event_constraint intel_grt_pebs_event_constraints[] = {  	/* Allow all events as PEBS with no flags */ -	INTEL_HYBRID_LAT_CONSTRAINT(0x5d0, 0xf), +	INTEL_HYBRID_LAT_CONSTRAINT(0x5d0, 0x3),  	INTEL_HYBRID_LAT_CONSTRAINT(0x6d0, 0xf),  	EVENT_CONSTRAINT_END  }; @@ -2262,6 +2270,7 @@ void __init intel_ds_init(void)  					PERF_SAMPLE_BRANCH_STACK |  					PERF_SAMPLE_TIME;  				x86_pmu.flags |= PMU_FL_PEBS_ALL; +				x86_pmu.pebs_capable = ~0ULL;  				pebs_qual = "-baseline";  				x86_get_pmu(smp_processor_id())->capabilities |= PERF_PMU_CAP_EXTENDED_REGS;  			} else { diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 4f70fb6c2c1e..47fca6a7a8bc 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -1097,6 +1097,14 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event)  	if (static_cpu_has(X86_FEATURE_ARCH_LBR)) {  		reg->config = mask; + +		/* +		 * The Arch LBR HW can retrieve the common branch types +		 * from the LBR_INFO. It doesn't require the high overhead +		 * SW disassemble. +		 * Enable the branch type by default for the Arch LBR. 
+		 */ +		reg->reg |= X86_BR_TYPE_SAVE;  		return 0;  	} diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c index ce440011cc4e..1ef4f7861e2e 100644 --- a/arch/x86/events/intel/uncore_snb.c +++ b/arch/x86/events/intel/uncore_snb.c @@ -841,6 +841,22 @@ int snb_pci2phy_map_init(int devid)  	return 0;  } +static u64 snb_uncore_imc_read_counter(struct intel_uncore_box *box, struct perf_event *event) +{ +	struct hw_perf_event *hwc = &event->hw; + +	/* +	 * SNB IMC counters are 32-bit and are laid out back to back +	 * in MMIO space. Therefore we must use a 32-bit accessor function +	 * using readq() from uncore_mmio_read_counter() causes problems +	 * because it is reading 64-bit at a time. This is okay for the +	 * uncore_perf_event_update() function because it drops the upper +	 * 32-bits but not okay for plain uncore_read_counter() as invoked +	 * in uncore_pmu_event_start(). +	 */ +	return (u64)readl(box->io_addr + hwc->event_base); +} +  static struct pmu snb_uncore_imc_pmu = {  	.task_ctx_nr	= perf_invalid_context,  	.event_init	= snb_uncore_imc_event_init, @@ -860,7 +876,7 @@ static struct intel_uncore_ops snb_uncore_imc_ops = {  	.disable_event	= snb_uncore_imc_disable_event,  	.enable_event	= snb_uncore_imc_enable_event,  	.hw_config	= snb_uncore_imc_hw_config, -	.read_counter	= uncore_mmio_read_counter, +	.read_counter	= snb_uncore_imc_read_counter,  };  static struct intel_uncore_type snb_uncore_imc = { diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index 973c6bd17f98..0fe9de58af31 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -207,6 +207,20 @@ static __always_inline bool constant_test_bit(long nr, const volatile unsigned l  		(addr[nr >> _BITOPS_LONG_SHIFT])) != 0;  } +static __always_inline bool constant_test_bit_acquire(long nr, const volatile unsigned long *addr) +{ +	bool oldbit; + +	asm volatile("testb %2,%1" +		     CC_SET(nz) +		     : CC_OUT(nz) (oldbit) +		     : "m" (((unsigned char *)addr)[nr >> 3]), +		       "i" (1 << (nr & 7)) +		     :"memory"); + +	return oldbit; +} +  static __always_inline bool variable_test_bit(long nr, volatile const unsigned long *addr)  {  	bool oldbit; @@ -226,6 +240,13 @@ arch_test_bit(unsigned long nr, const volatile unsigned long *addr)  					  variable_test_bit(nr, addr);  } +static __always_inline bool +arch_test_bit_acquire(unsigned long nr, const volatile unsigned long *addr) +{ +	return __builtin_constant_p(nr) ? constant_test_bit_acquire(nr, addr) : +					  variable_test_bit(nr, addr); +} +  /**   * __ffs - find first set bit in word   * @word: The word to search diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index ea34cc31b047..1a85e1fb0922 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -155,20 +155,6 @@ extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit);  #define setup_force_cpu_bug(bit) setup_force_cpu_cap(bit) -#if defined(__clang__) && !defined(CONFIG_CC_HAS_ASM_GOTO) - -/* - * Workaround for the sake of BPF compilation which utilizes kernel - * headers, but clang does not support ASM GOTO and fails the build. - */ -#ifndef __BPF_TRACING__ -#warning "Compiler lacks ASM_GOTO support. Add -D __BPF_TRACING__ to your compiler arguments" -#endif - -#define static_cpu_has(bit)            boot_cpu_has(bit) - -#else -  /*   * Static testing of CPU features. Used the same as boot_cpu_has(). 
It   * statically patches the target code for additional performance. Use @@ -208,7 +194,6 @@ t_no:  		boot_cpu_has(bit) :				\  		_static_cpu_has(bit)				\  ) -#endif  #define cpu_has_bug(c, bit)		cpu_has(c, (bit))  #define set_cpu_bug(c, bit)		set_cpu_cap(c, (bit)) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 235dc85c91c3..ef4775c6db01 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -457,7 +457,8 @@  #define X86_BUG_ITLB_MULTIHIT		X86_BUG(23) /* CPU may incur MCE during certain page attribute changes */  #define X86_BUG_SRBDS			X86_BUG(24) /* CPU may leak RNG bits if not mitigated */  #define X86_BUG_MMIO_STALE_DATA		X86_BUG(25) /* CPU is affected by Processor MMIO Stale Data vulnerabilities */ -#define X86_BUG_RETBLEED		X86_BUG(26) /* CPU is affected by RETBleed */ -#define X86_BUG_EIBRS_PBRSB		X86_BUG(27) /* EIBRS is vulnerable to Post Barrier RSB Predictions */ +#define X86_BUG_MMIO_UNKNOWN		X86_BUG(26) /* CPU is too old and its MMIO Stale Data status is unknown */ +#define X86_BUG_RETBLEED		X86_BUG(27) /* CPU is affected by RETBleed */ +#define X86_BUG_EIBRS_PBRSB		X86_BUG(28) /* EIBRS is vulnerable to Post Barrier RSB Predictions */  #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/include/asm/extable_fixup_types.h b/arch/x86/include/asm/extable_fixup_types.h index 503622627400..991e31cfde94 100644 --- a/arch/x86/include/asm/extable_fixup_types.h +++ b/arch/x86/include/asm/extable_fixup_types.h @@ -64,4 +64,6 @@  #define	EX_TYPE_UCOPY_LEN4		(EX_TYPE_UCOPY_LEN | EX_DATA_IMM(4))  #define	EX_TYPE_UCOPY_LEN8		(EX_TYPE_UCOPY_LEN | EX_DATA_IMM(8)) +#define EX_TYPE_ZEROPAD			20 /* longword load with zeropad on fault */ +  #endif diff --git a/arch/x86/include/asm/ibt.h b/arch/x86/include/asm/ibt.h index 689880eca9ba..9b08082a5d9f 100644 --- a/arch/x86/include/asm/ibt.h +++ b/arch/x86/include/asm/ibt.h @@ -31,6 +31,16 @@  #define __noendbr	__attribute__((nocf_check)) +/* + * Create a dummy function pointer reference to prevent objtool from marking + * the function as needing to be "sealed" (i.e. ENDBR converted to NOP by + * apply_ibt_endbr()). 
+ */ +#define IBT_NOSEAL(fname)				\ +	".pushsection .discard.ibt_endbr_noseal\n\t"	\ +	_ASM_PTR fname "\n\t"				\ +	".popsection\n\t" +  static inline __attribute_const__ u32 gen_endbr(void)  {  	u32 endbr; @@ -84,6 +94,7 @@ extern __noendbr void ibt_restore(u64 save);  #ifndef __ASSEMBLY__  #define ASM_ENDBR +#define IBT_NOSEAL(name)  #define __noendbr diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index def6ca121111..5d75fe229342 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -27,6 +27,7 @@   *		_X	- regular server parts   *		_D	- micro server parts   *		_N,_P	- other mobile parts + *		_S	- other client parts   *   *		Historical OPTDIFFs:   * @@ -112,6 +113,10 @@  #define INTEL_FAM6_RAPTORLAKE		0xB7  #define INTEL_FAM6_RAPTORLAKE_P		0xBA +#define INTEL_FAM6_RAPTORLAKE_S		0xBF + +#define INTEL_FAM6_METEORLAKE		0xAC +#define INTEL_FAM6_METEORLAKE_L		0xAA  /* "Small Core" Processors (Atom) */ diff --git a/arch/x86/include/asm/irq_stack.h b/arch/x86/include/asm/irq_stack.h index 63f818aedf77..147cb8fdda92 100644 --- a/arch/x86/include/asm/irq_stack.h +++ b/arch/x86/include/asm/irq_stack.h @@ -203,7 +203,7 @@  			      IRQ_CONSTRAINTS, regs, vector);		\  } -#ifndef CONFIG_PREEMPT_RT +#ifdef CONFIG_SOFTIRQ_ON_OWN_STACK  /*   * Macro to invoke __do_softirq on the irq stack. This is only called from   * task context when bottom halves are about to be reenabled and soft diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 5ffa578cafe1..aa381ab69a19 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -53,7 +53,7 @@  #define KVM_MAX_VCPU_IDS (KVM_MAX_VCPUS * KVM_VCPU_ID_RATIO)  /* memory slots that are not exposed to userspace */ -#define KVM_PRIVATE_MEM_SLOTS 3 +#define KVM_INTERNAL_MEM_SLOTS 3  #define KVM_HALT_POLL_NS_DEFAULT 200000 @@ -729,6 +729,7 @@ struct kvm_vcpu_arch {  	struct fpu_guest guest_fpu;  	u64 xcr0; +	u64 guest_supported_xcr0;  	struct kvm_pio_request pio;  	void *pio_data; diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index e64fd20778b6..c936ce9f0c47 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -35,33 +35,56 @@  #define RSB_CLEAR_LOOPS		32	/* To forcibly overwrite all entries */  /* + * Common helper for __FILL_RETURN_BUFFER and __FILL_ONE_RETURN. + */ +#define __FILL_RETURN_SLOT			\ +	ANNOTATE_INTRA_FUNCTION_CALL;		\ +	call	772f;				\ +	int3;					\ +772: + +/* + * Stuff the entire RSB. + *   * Google experimented with loop-unrolling and this turned out to be   * the optimal version - two calls, each with their own speculation   * trap should their return address end up getting used, in a loop.   
*/ -#define __FILL_RETURN_BUFFER(reg, nr, sp)	\ -	mov	$(nr/2), reg;			\ -771:						\ -	ANNOTATE_INTRA_FUNCTION_CALL;		\ -	call	772f;				\ -773:	/* speculation trap */			\ -	UNWIND_HINT_EMPTY;			\ -	pause;					\ -	lfence;					\ -	jmp	773b;				\ -772:						\ -	ANNOTATE_INTRA_FUNCTION_CALL;		\ -	call	774f;				\ -775:	/* speculation trap */			\ -	UNWIND_HINT_EMPTY;			\ -	pause;					\ -	lfence;					\ -	jmp	775b;				\ -774:						\ -	add	$(BITS_PER_LONG/8) * 2, sp;	\ -	dec	reg;				\ -	jnz	771b;				\ -	/* barrier for jnz misprediction */	\ +#ifdef CONFIG_X86_64 +#define __FILL_RETURN_BUFFER(reg, nr)			\ +	mov	$(nr/2), reg;				\ +771:							\ +	__FILL_RETURN_SLOT				\ +	__FILL_RETURN_SLOT				\ +	add	$(BITS_PER_LONG/8) * 2, %_ASM_SP;	\ +	dec	reg;					\ +	jnz	771b;					\ +	/* barrier for jnz misprediction */		\ +	lfence; +#else +/* + * i386 doesn't unconditionally have LFENCE, as such it can't + * do a loop. + */ +#define __FILL_RETURN_BUFFER(reg, nr)			\ +	.rept nr;					\ +	__FILL_RETURN_SLOT;				\ +	.endr;						\ +	add	$(BITS_PER_LONG/8) * nr, %_ASM_SP; +#endif + +/* + * Stuff a single RSB slot. + * + * To mitigate Post-Barrier RSB speculation, one CALL instruction must be + * forced to retire before letting a RET instruction execute. + * + * On PBRSB-vulnerable CPUs, it is not safe for a RET to be executed + * before this point. + */ +#define __FILL_ONE_RETURN				\ +	__FILL_RETURN_SLOT				\ +	add	$(BITS_PER_LONG/8), %_ASM_SP;		\  	lfence;  #ifdef __ASSEMBLY__ @@ -132,28 +155,15 @@  #endif  .endm -.macro ISSUE_UNBALANCED_RET_GUARD -	ANNOTATE_INTRA_FUNCTION_CALL -	call .Lunbalanced_ret_guard_\@ -	int3 -.Lunbalanced_ret_guard_\@: -	add $(BITS_PER_LONG/8), %_ASM_SP -	lfence -.endm -   /*    * A simpler FILL_RETURN_BUFFER macro. Don't make people use the CPP    * monstrosity above, manually.    */ -.macro FILL_RETURN_BUFFER reg:req nr:req ftr:req ftr2 -.ifb \ftr2 -	ALTERNATIVE "jmp .Lskip_rsb_\@", "", \ftr -.else -	ALTERNATIVE_2 "jmp .Lskip_rsb_\@", "", \ftr, "jmp .Lunbalanced_\@", \ftr2 -.endif -	__FILL_RETURN_BUFFER(\reg,\nr,%_ASM_SP) -.Lunbalanced_\@: -	ISSUE_UNBALANCED_RET_GUARD +.macro FILL_RETURN_BUFFER reg:req nr:req ftr:req ftr2=ALT_NOT(X86_FEATURE_ALWAYS) +	ALTERNATIVE_2 "jmp .Lskip_rsb_\@", \ +		__stringify(__FILL_RETURN_BUFFER(\reg,\nr)), \ftr, \ +		__stringify(__FILL_ONE_RETURN), \ftr2 +  .Lskip_rsb_\@:  .endm diff --git a/arch/x86/include/asm/rmwcc.h b/arch/x86/include/asm/rmwcc.h index 8a9eba191516..7fa611216417 100644 --- a/arch/x86/include/asm/rmwcc.h +++ b/arch/x86/include/asm/rmwcc.h @@ -11,7 +11,7 @@  #define __CLOBBERS_MEM(clb...)	
"memory", ## clb -#if !defined(__GCC_ASM_FLAG_OUTPUTS__) && defined(CONFIG_CC_HAS_ASM_GOTO) +#ifndef __GCC_ASM_FLAG_OUTPUTS__  /* Use asm goto */ @@ -27,7 +27,7 @@ cc_label:	c = true;						\  	c;								\  }) -#else /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CONFIG_CC_HAS_ASM_GOTO) */ +#else /* defined(__GCC_ASM_FLAG_OUTPUTS__) */  /* Use flags output or a set instruction */ @@ -40,7 +40,7 @@ cc_label:	c = true;						\  	c;								\  }) -#endif /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CONFIG_CC_HAS_ASM_GOTO) */ +#endif /* defined(__GCC_ASM_FLAG_OUTPUTS__) */  #define GEN_UNARY_RMWcc_4(op, var, cc, arg0)				\  	__GEN_RMWcc(op " " arg0, var, cc, __CLOBBERS_MEM()) diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index 4a23e52fe0ee..ebc271bb6d8e 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -195,7 +195,7 @@ void snp_set_memory_shared(unsigned long vaddr, unsigned int npages);  void snp_set_memory_private(unsigned long vaddr, unsigned int npages);  void snp_set_wakeup_secondary_cpu(void);  bool snp_init(struct boot_params *bp); -void snp_abort(void); +void __init __noreturn snp_abort(void);  int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, unsigned long *fw_err);  #else  static inline void sev_es_ist_enter(struct pt_regs *regs) { } diff --git a/arch/x86/include/asm/word-at-a-time.h b/arch/x86/include/asm/word-at-a-time.h index 8338b0432b50..46b4f1f7f354 100644 --- a/arch/x86/include/asm/word-at-a-time.h +++ b/arch/x86/include/asm/word-at-a-time.h @@ -77,58 +77,18 @@ static inline unsigned long find_zero(unsigned long mask)   * and the next page not being mapped, take the exception and   * return zeroes in the non-existing part.   */ -#ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT -  static inline unsigned long load_unaligned_zeropad(const void *addr)  { -	unsigned long offset, data;  	unsigned long ret; -	asm_volatile_goto( +	asm volatile(  		"1:	mov %[mem], %[ret]\n" - -		_ASM_EXTABLE(1b, %l[do_exception]) - -		: [ret] "=r" (ret) -		: [mem] "m" (*(unsigned long *)addr) -		: : do_exception); - -	return ret; - -do_exception: -	offset = (unsigned long)addr & (sizeof(long) - 1); -	addr = (void *)((unsigned long)addr & ~(sizeof(long) - 1)); -	data = *(unsigned long *)addr; -	ret = data >> offset * 8; - -	return ret; -} - -#else /* !CONFIG_CC_HAS_ASM_GOTO_OUTPUT */ - -static inline unsigned long load_unaligned_zeropad(const void *addr) -{ -	unsigned long offset, data; -	unsigned long ret, err = 0; - -	asm(	"1:	mov %[mem], %[ret]\n"  		"2:\n" - -		_ASM_EXTABLE_FAULT(1b, 2b) - -		: [ret] "=&r" (ret), "+a" (err) +		_ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_ZEROPAD) +		: [ret] "=r" (ret)  		: [mem] "m" (*(unsigned long *)addr)); -	if (unlikely(err)) { -		offset = (unsigned long)addr & (sizeof(long) - 1); -		addr = (void *)((unsigned long)addr & ~(sizeof(long) - 1)); -		data = *(unsigned long *)addr; -		ret = data >> offset * 8; -	} -  	return ret;  } -#endif /* CONFIG_CC_HAS_ASM_GOTO_OUTPUT */ -  #endif /* _ASM_WORD_AT_A_TIME_H */ diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 510d85261132..da7c361f47e0 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -433,7 +433,8 @@ static void __init mmio_select_mitigation(void)  	u64 ia32_cap;  	if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA) || -	    cpu_mitigations_off()) { +	     boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN) || +	     cpu_mitigations_off()) {  		mmio_mitigation = MMIO_MITIGATION_OFF;  		return;  	} @@ -538,6 +539,8 @@ out:  		
pr_info("TAA: %s\n", taa_strings[taa_mitigation]);  	if (boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA))  		pr_info("MMIO Stale Data: %s\n", mmio_strings[mmio_mitigation]); +	else if (boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN)) +		pr_info("MMIO Stale Data: Unknown: No mitigations\n");  }  static void __init md_clear_select_mitigation(void) @@ -2275,6 +2278,9 @@ static ssize_t tsx_async_abort_show_state(char *buf)  static ssize_t mmio_stale_data_show_state(char *buf)  { +	if (boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN)) +		return sysfs_emit(buf, "Unknown: No mitigations\n"); +  	if (mmio_mitigation == MMIO_MITIGATION_OFF)  		return sysfs_emit(buf, "%s\n", mmio_strings[mmio_mitigation]); @@ -2421,6 +2427,7 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr  		return srbds_show_state(buf);  	case X86_BUG_MMIO_STALE_DATA: +	case X86_BUG_MMIO_UNKNOWN:  		return mmio_stale_data_show_state(buf);  	case X86_BUG_RETBLEED: @@ -2480,7 +2487,10 @@ ssize_t cpu_show_srbds(struct device *dev, struct device_attribute *attr, char *  ssize_t cpu_show_mmio_stale_data(struct device *dev, struct device_attribute *attr, char *buf)  { -	return cpu_show_common(dev, attr, buf, X86_BUG_MMIO_STALE_DATA); +	if (boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN)) +		return cpu_show_common(dev, attr, buf, X86_BUG_MMIO_UNKNOWN); +	else +		return cpu_show_common(dev, attr, buf, X86_BUG_MMIO_STALE_DATA);  }  ssize_t cpu_show_retbleed(struct device *dev, struct device_attribute *attr, char *buf) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 64a73f415f03..3e508f239098 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1135,7 +1135,8 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c)  #define NO_SWAPGS		BIT(6)  #define NO_ITLB_MULTIHIT	BIT(7)  #define NO_SPECTRE_V2		BIT(8) -#define NO_EIBRS_PBRSB		BIT(9) +#define NO_MMIO			BIT(9) +#define NO_EIBRS_PBRSB		BIT(10)  #define VULNWL(vendor, family, model, whitelist)	\  	X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, whitelist) @@ -1158,6 +1159,11 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {  	VULNWL(VORTEX,	6, X86_MODEL_ANY,	NO_SPECULATION),  	/* Intel Family 6 */ +	VULNWL_INTEL(TIGERLAKE,			NO_MMIO), +	VULNWL_INTEL(TIGERLAKE_L,		NO_MMIO), +	VULNWL_INTEL(ALDERLAKE,			NO_MMIO), +	VULNWL_INTEL(ALDERLAKE_L,		NO_MMIO), +  	VULNWL_INTEL(ATOM_SALTWELL,		NO_SPECULATION | NO_ITLB_MULTIHIT),  	VULNWL_INTEL(ATOM_SALTWELL_TABLET,	NO_SPECULATION | NO_ITLB_MULTIHIT),  	VULNWL_INTEL(ATOM_SALTWELL_MID,		NO_SPECULATION | NO_ITLB_MULTIHIT), @@ -1176,9 +1182,9 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {  	VULNWL_INTEL(ATOM_AIRMONT_MID,		NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),  	VULNWL_INTEL(ATOM_AIRMONT_NP,		NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), -	VULNWL_INTEL(ATOM_GOLDMONT,		NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), -	VULNWL_INTEL(ATOM_GOLDMONT_D,		NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), -	VULNWL_INTEL(ATOM_GOLDMONT_PLUS,	NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_EIBRS_PBRSB), +	VULNWL_INTEL(ATOM_GOLDMONT,		NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), +	VULNWL_INTEL(ATOM_GOLDMONT_D,		NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), +	VULNWL_INTEL(ATOM_GOLDMONT_PLUS,	NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB),  	/*  	 * Technically, swapgs isn't serializing on AMD (despite it previously @@ -1193,18 +1199,18 @@ static const __initconst struct x86_cpu_id 
cpu_vuln_whitelist[] = {  	VULNWL_INTEL(ATOM_TREMONT_D,		NO_ITLB_MULTIHIT | NO_EIBRS_PBRSB),  	/* AMD Family 0xf - 0x12 */ -	VULNWL_AMD(0x0f,	NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), -	VULNWL_AMD(0x10,	NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), -	VULNWL_AMD(0x11,	NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), -	VULNWL_AMD(0x12,	NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), +	VULNWL_AMD(0x0f,	NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), +	VULNWL_AMD(0x10,	NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), +	VULNWL_AMD(0x11,	NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), +	VULNWL_AMD(0x12,	NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO),  	/* FAMILY_ANY must be last, otherwise 0x0f - 0x12 matches won't work */ -	VULNWL_AMD(X86_FAMILY_ANY,	NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), -	VULNWL_HYGON(X86_FAMILY_ANY,	NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), +	VULNWL_AMD(X86_FAMILY_ANY,	NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), +	VULNWL_HYGON(X86_FAMILY_ANY,	NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO),  	/* Zhaoxin Family 7 */ -	VULNWL(CENTAUR,	7, X86_MODEL_ANY,	NO_SPECTRE_V2 | NO_SWAPGS), -	VULNWL(ZHAOXIN,	7, X86_MODEL_ANY,	NO_SPECTRE_V2 | NO_SWAPGS), +	VULNWL(CENTAUR,	7, X86_MODEL_ANY,	NO_SPECTRE_V2 | NO_SWAPGS | NO_MMIO), +	VULNWL(ZHAOXIN,	7, X86_MODEL_ANY,	NO_SPECTRE_V2 | NO_SWAPGS | NO_MMIO),  	{}  }; @@ -1358,10 +1364,16 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)  	 * Affected CPU list is generally enough to enumerate the vulnerability,  	 * but for virtualization case check for ARCH_CAP MSR bits also, VMM may  	 * not want the guest to enumerate the bug. +	 * +	 * Set X86_BUG_MMIO_UNKNOWN for CPUs that are neither in the blacklist, +	 * nor in the whitelist and also don't enumerate MSR ARCH_CAP MMIO bits.  	 */ -	if (cpu_matches(cpu_vuln_blacklist, MMIO) && -	    !arch_cap_mmio_immune(ia32_cap)) -		setup_force_cpu_bug(X86_BUG_MMIO_STALE_DATA); +	if (!arch_cap_mmio_immune(ia32_cap)) { +		if (cpu_matches(cpu_vuln_blacklist, MMIO)) +			setup_force_cpu_bug(X86_BUG_MMIO_STALE_DATA); +		else if (!cpu_matches(cpu_vuln_whitelist, NO_MMIO)) +			setup_force_cpu_bug(X86_BUG_MMIO_UNKNOWN); +	}  	if (!cpu_has(c, X86_FEATURE_BTC_NO)) {  		if (cpu_matches(cpu_vuln_blacklist, RETBLEED) || (ia32_cap & ARCH_CAP_RSBA)) diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index 24c1bb8eb196..8bdeae2fc309 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -344,8 +344,11 @@ static vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma,  	}  	va_page = sgx_encl_grow(encl, false); -	if (IS_ERR(va_page)) +	if (IS_ERR(va_page)) { +		if (PTR_ERR(va_page) == -EBUSY) +			vmret = VM_FAULT_NOPAGE;  		goto err_out_epc; +	}  	if (va_page)  		list_add(&va_page->list, &encl->va_pages); diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index 515e2a5f25bb..0aad028f04d4 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -49,9 +49,13 @@ static LIST_HEAD(sgx_dirty_page_list);   * Reset post-kexec EPC pages to the uninitialized state. The pages are removed   * from the input list, and made available for the page allocator. 
SECS pages   * prepending their children in the input list are left intact. + * + * Return 0 when sanitization was successful or kthread was stopped, and the + * number of unsanitized pages otherwise.   */ -static void __sgx_sanitize_pages(struct list_head *dirty_page_list) +static unsigned long __sgx_sanitize_pages(struct list_head *dirty_page_list)  { +	unsigned long left_dirty = 0;  	struct sgx_epc_page *page;  	LIST_HEAD(dirty);  	int ret; @@ -59,7 +63,7 @@ static void __sgx_sanitize_pages(struct list_head *dirty_page_list)  	/* dirty_page_list is thread-local, no need for a lock: */  	while (!list_empty(dirty_page_list)) {  		if (kthread_should_stop()) -			return; +			return 0;  		page = list_first_entry(dirty_page_list, struct sgx_epc_page, list); @@ -92,12 +96,14 @@ static void __sgx_sanitize_pages(struct list_head *dirty_page_list)  		} else {  			/* The page is not yet clean - move to the dirty list. */  			list_move_tail(&page->list, &dirty); +			left_dirty++;  		}  		cond_resched();  	}  	list_splice(&dirty, dirty_page_list); +	return left_dirty;  }  static bool sgx_reclaimer_age(struct sgx_epc_page *epc_page) @@ -395,10 +401,7 @@ static int ksgxd(void *p)  	 * required for SECS pages, whose child pages blocked EREMOVE.  	 */  	__sgx_sanitize_pages(&sgx_dirty_page_list); -	__sgx_sanitize_pages(&sgx_dirty_page_list); - -	/* sanity check: */ -	WARN_ON(!list_empty(&sgx_dirty_page_list)); +	WARN_ON(__sgx_sanitize_pages(&sgx_dirty_page_list));  	while (!kthread_should_stop()) {  		if (try_to_freeze()) diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index e5dd6da78713..01833ebf5e8e 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -132,7 +132,7 @@ int irq_init_percpu_irqstack(unsigned int cpu)  	return 0;  } -#ifndef CONFIG_PREEMPT_RT +#ifdef CONFIG_SOFTIRQ_ON_OWN_STACK  void do_softirq_own_stack(void)  {  	struct irq_stack *irqstk; diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 74167dc5f55e..4c3c27b6aea3 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -505,7 +505,7 @@ static void kprobe_emulate_jcc(struct kprobe *p, struct pt_regs *regs)  		match = ((regs->flags & X86_EFLAGS_SF) >> X86_EFLAGS_SF_BIT) ^  			((regs->flags & X86_EFLAGS_OF) >> X86_EFLAGS_OF_BIT);  		if (p->ainsn.jcc.type >= 0xe) -			match = match && (regs->flags & X86_EFLAGS_ZF); +			match = match || (regs->flags & X86_EFLAGS_ZF);  	}  	__kprobe_emulate_jmp(p, regs, (match && !invert) || (!match && invert));  } diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index 63dc626627a0..a428c62330d3 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -701,7 +701,13 @@ e_term:  void __init early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr,  					 unsigned int npages)  { -	if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) +	/* +	 * This can be invoked in early boot while running identity mapped, so +	 * use an open coded check for SNP instead of using cc_platform_has(). +	 * This eliminates worries about jump tables or checking boot_cpu_data +	 * in the cc_platform_has() function. 
+	 */ +	if (!(sev_status & MSR_AMD64_SEV_SNP_ENABLED))  		return;  	 /* @@ -717,7 +723,13 @@ void __init early_snp_set_memory_private(unsigned long vaddr, unsigned long padd  void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr,  					unsigned int npages)  { -	if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) +	/* +	 * This can be invoked in early boot while running identity mapped, so +	 * use an open coded check for SNP instead of using cc_platform_has(). +	 * This eliminates worries about jump tables or checking boot_cpu_data +	 * in the cc_platform_has() function. +	 */ +	if (!(sev_status & MSR_AMD64_SEV_SNP_ENABLED))  		return;  	/* Invalidate the memory pages before they are marked shared in the RMP table. */ @@ -2100,7 +2112,7 @@ bool __init snp_init(struct boot_params *bp)  	return true;  } -void __init snp_abort(void) +void __init __noreturn snp_abort(void)  {  	sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED);  } diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index 38185aedf7d1..0ea57da92940 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -93,22 +93,27 @@ static struct orc_entry *orc_find(unsigned long ip);  static struct orc_entry *orc_ftrace_find(unsigned long ip)  {  	struct ftrace_ops *ops; -	unsigned long caller; +	unsigned long tramp_addr, offset;  	ops = ftrace_ops_trampoline(ip);  	if (!ops)  		return NULL; +	/* Set tramp_addr to the start of the code copied by the trampoline */  	if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) -		caller = (unsigned long)ftrace_regs_call; +		tramp_addr = (unsigned long)ftrace_regs_caller;  	else -		caller = (unsigned long)ftrace_call; +		tramp_addr = (unsigned long)ftrace_caller; + +	/* Now place tramp_addr to the location within the trampoline ip is at */ +	offset = ip - ops->trampoline; +	tramp_addr += offset;  	/* Prevent unlikely recursion */ -	if (ip == caller) +	if (ip == tramp_addr)  		return NULL; -	return orc_find(caller); +	return orc_find(tramp_addr);  }  #else  static struct orc_entry *orc_ftrace_find(unsigned long ip) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 75dcf7a72605..2796dde06302 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -315,7 +315,6 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)  {  	struct kvm_lapic *apic = vcpu->arch.apic;  	struct kvm_cpuid_entry2 *best; -	u64 guest_supported_xcr0;  	best = kvm_find_cpuid_entry(vcpu, 1);  	if (best && apic) { @@ -327,10 +326,16 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)  		kvm_apic_set_version(vcpu);  	} -	guest_supported_xcr0 = +	vcpu->arch.guest_supported_xcr0 =  		cpuid_get_supported_xcr0(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent); -	vcpu->arch.guest_fpu.fpstate->user_xfeatures = guest_supported_xcr0; +	/* +	 * FP+SSE can always be saved/restored via KVM_{G,S}ET_XSAVE, even if +	 * XSAVE/XCRO are not exposed to the guest, and even if XSAVE isn't +	 * supported by the host. 
+	 */ +	vcpu->arch.guest_fpu.fpstate->user_xfeatures = vcpu->arch.guest_supported_xcr0 | +						       XFEATURE_MASK_FPSSE;  	kvm_update_pv_runtime(vcpu); @@ -897,8 +902,6 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)  			entry->edx = 0;  		}  		break; -	case 9: -		break;  	case 0xa: { /* Architectural Performance Monitoring */  		union cpuid10_eax eax;  		union cpuid10_edx edx; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index b4eeb7c75dfa..aacb28c83e43 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -326,7 +326,8 @@ static int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop);  	".align " __stringify(FASTOP_SIZE) " \n\t" \  	".type " name ", @function \n\t" \  	name ":\n\t" \ -	ASM_ENDBR +	ASM_ENDBR \ +	IBT_NOSEAL(name)  #define FOP_FUNC(name) \  	__FOP_FUNC(#name) @@ -446,27 +447,12 @@ static int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop);  	FOP_END  /* Special case for SETcc - 1 instruction per cc */ - -/* - * Depending on .config the SETcc functions look like: - * - * ENDBR			[4 bytes; CONFIG_X86_KERNEL_IBT] - * SETcc %al			[3 bytes] - * RET | JMP __x86_return_thunk	[1,5 bytes; CONFIG_RETHUNK] - * INT3				[1 byte; CONFIG_SLS] - */ -#define SETCC_ALIGN	16 -  #define FOP_SETCC(op) \ -	".align " __stringify(SETCC_ALIGN) " \n\t" \ -	".type " #op ", @function \n\t" \ -	#op ": \n\t" \ -	ASM_ENDBR \ +	FOP_FUNC(op) \  	#op " %al \n\t" \ -	__FOP_RET(#op) \ -	".skip " __stringify(SETCC_ALIGN) " - (.-" #op "), 0xcc \n\t" +	FOP_RET(op) -__FOP_START(setcc, SETCC_ALIGN) +FOP_START(setcc)  FOP_SETCC(seto)  FOP_SETCC(setno)  FOP_SETCC(setc) @@ -493,7 +479,7 @@ FOP_END;  /*   * XXX: inoutclob user must know where the argument is being expanded. - *      Relying on CONFIG_CC_HAS_ASM_GOTO would allow us to remove _fault. + *      Using asm goto would allow us to remove _fault.   */  #define asm_safe(insn, inoutclob...) \  ({ \ @@ -1079,7 +1065,7 @@ static int em_bsr_c(struct x86_emulate_ctxt *ctxt)  static __always_inline u8 test_cc(unsigned int condition, unsigned long flags)  {  	u8 rc; -	void (*fop)(void) = (void *)em_setcc + SETCC_ALIGN * (condition & 0xf); +	void (*fop)(void) = (void *)em_setcc + FASTOP_SIZE * (condition & 0xf);  	flags = (flags & EFLAGS_MASK) | X86_EFLAGS_IF;  	asm("push %[flags]; popf; " CALL_NOSPEC @@ -4146,6 +4132,9 @@ static int em_xsetbv(struct x86_emulate_ctxt *ctxt)  {  	u32 eax, ecx, edx; +	if (!(ctxt->ops->get_cr(ctxt, 4) & X86_CR4_OSXSAVE)) +		return emulate_ud(ctxt); +  	eax = reg_read(ctxt, VCPU_REGS_RAX);  	edx = reg_read(ctxt, VCPU_REGS_RDX);  	ecx = reg_read(ctxt, VCPU_REGS_RCX); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index eccddb136954..3552e6af3684 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1596,6 +1596,8 @@ static void __rmap_add(struct kvm *kvm,  	rmap_head = gfn_to_rmap(gfn, sp->role.level, slot);  	rmap_count = pte_list_add(cache, spte, rmap_head); +	if (rmap_count > kvm->stat.max_mmu_rmap_size) +		kvm->stat.max_mmu_rmap_size = rmap_count;  	if (rmap_count > RMAP_RECYCLE_THRESHOLD) {  		kvm_zap_all_rmap_sptes(kvm, rmap_head);  		kvm_flush_remote_tlbs_with_address( @@ -2914,7 +2916,7 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)  	 * If addresses are being invalidated, skip prefetching to avoid  	 * accidentally prefetching those addresses.  	 
*/ -	if (unlikely(vcpu->kvm->mmu_notifier_count)) +	if (unlikely(vcpu->kvm->mmu_invalidate_in_progress))  		return;  	__direct_pte_prefetch(vcpu, sp, sptep); @@ -2928,7 +2930,7 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)   *   * There are several ways to safely use this helper:   * - * - Check mmu_notifier_retry_hva() after grabbing the mapping level, before + * - Check mmu_invalidate_retry_hva() after grabbing the mapping level, before   *   consuming it.  In this case, mmu_lock doesn't need to be held during the   *   lookup, but it does need to be held while checking the MMU notifier.   * @@ -3056,7 +3058,7 @@ void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault  		return;  	/* -	 * mmu_notifier_retry() was successful and mmu_lock is held, so +	 * mmu_invalidate_retry() was successful and mmu_lock is held, so  	 * the pmd can't be split from under us.  	 */  	fault->goal_level = fault->req_level; @@ -4203,7 +4205,7 @@ static bool is_page_fault_stale(struct kvm_vcpu *vcpu,  		return true;  	return fault->slot && -	       mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, fault->hva); +	       mmu_invalidate_retry_hva(vcpu->kvm, mmu_seq, fault->hva);  }  static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) @@ -4227,7 +4229,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault  	if (r)  		return r; -	mmu_seq = vcpu->kvm->mmu_notifier_seq; +	mmu_seq = vcpu->kvm->mmu_invalidate_seq;  	smp_rmb();  	r = kvm_faultin_pfn(vcpu, fault); @@ -5361,19 +5363,6 @@ void kvm_mmu_free_obsolete_roots(struct kvm_vcpu *vcpu)  	__kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.guest_mmu);  } -static bool need_remote_flush(u64 old, u64 new) -{ -	if (!is_shadow_present_pte(old)) -		return false; -	if (!is_shadow_present_pte(new)) -		return true; -	if ((old ^ new) & SPTE_BASE_ADDR_MASK) -		return true; -	old ^= shadow_nx_mask; -	new ^= shadow_nx_mask; -	return (old & ~new & SPTE_PERM_MASK) != 0; -} -  static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,  				    int *bytes)  { @@ -5519,7 +5508,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,  			mmu_page_zap_pte(vcpu->kvm, sp, spte, NULL);  			if (gentry && sp->role.level != PG_LEVEL_4K)  				++vcpu->kvm->stat.mmu_pde_zapped; -			if (need_remote_flush(entry, *spte)) +			if (is_shadow_present_pte(entry))  				flush = true;  			++spte;  		} @@ -6055,7 +6044,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)  	write_lock(&kvm->mmu_lock); -	kvm_inc_notifier_count(kvm, gfn_start, gfn_end); +	kvm_mmu_invalidate_begin(kvm, gfn_start, gfn_end);  	flush = kvm_rmap_zap_gfn_range(kvm, gfn_start, gfn_end); @@ -6069,7 +6058,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)  		kvm_flush_remote_tlbs_with_address(kvm, gfn_start,  						   gfn_end - gfn_start); -	kvm_dec_notifier_count(kvm, gfn_start, gfn_end); +	kvm_mmu_invalidate_end(kvm, gfn_start, gfn_end);  	write_unlock(&kvm->mmu_lock);  } @@ -6085,47 +6074,18 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,  				      const struct kvm_memory_slot *memslot,  				      int start_level)  { -	bool flush = false; -  	if (kvm_memslots_have_rmaps(kvm)) {  		write_lock(&kvm->mmu_lock); -		flush = slot_handle_level(kvm, memslot, slot_rmap_write_protect, -					  start_level, KVM_MAX_HUGEPAGE_LEVEL, -					  false); +		slot_handle_level(kvm, memslot, slot_rmap_write_protect, +				  start_level, KVM_MAX_HUGEPAGE_LEVEL, false);  	
	write_unlock(&kvm->mmu_lock);  	}  	if (is_tdp_mmu_enabled(kvm)) {  		read_lock(&kvm->mmu_lock); -		flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, start_level); +		kvm_tdp_mmu_wrprot_slot(kvm, memslot, start_level);  		read_unlock(&kvm->mmu_lock);  	} - -	/* -	 * Flush TLBs if any SPTEs had to be write-protected to ensure that -	 * guest writes are reflected in the dirty bitmap before the memslot -	 * update completes, i.e. before enabling dirty logging is visible to -	 * userspace. -	 * -	 * Perform the TLB flush outside the mmu_lock to reduce the amount of -	 * time the lock is held. However, this does mean that another CPU can -	 * now grab mmu_lock and encounter a write-protected SPTE while CPUs -	 * still have a writable mapping for the associated GFN in their TLB. -	 * -	 * This is safe but requires KVM to be careful when making decisions -	 * based on the write-protection status of an SPTE. Specifically, KVM -	 * also write-protects SPTEs to monitor changes to guest page tables -	 * during shadow paging, and must guarantee no CPUs can write to those -	 * page before the lock is dropped. As mentioned in the previous -	 * paragraph, a write-protected SPTE is no guarantee that CPU cannot -	 * perform writes. So to determine if a TLB flush is truly required, KVM -	 * will clear a separate software-only bit (MMU-writable) and skip the -	 * flush if-and-only-if this bit was already clear. -	 * -	 * See is_writable_pte() for more details. -	 */ -	if (flush) -		kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);  }  static inline bool need_topup(struct kvm_mmu_memory_cache *cache, int min) @@ -6493,32 +6453,30 @@ void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,  void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,  				   const struct kvm_memory_slot *memslot)  { -	bool flush = false; -  	if (kvm_memslots_have_rmaps(kvm)) {  		write_lock(&kvm->mmu_lock);  		/*  		 * Clear dirty bits only on 4k SPTEs since the legacy MMU only  		 * support dirty logging at a 4k granularity.  		 */ -		flush = slot_handle_level_4k(kvm, memslot, __rmap_clear_dirty, false); +		slot_handle_level_4k(kvm, memslot, __rmap_clear_dirty, false);  		write_unlock(&kvm->mmu_lock);  	}  	if (is_tdp_mmu_enabled(kvm)) {  		read_lock(&kvm->mmu_lock); -		flush |= kvm_tdp_mmu_clear_dirty_slot(kvm, memslot); +		kvm_tdp_mmu_clear_dirty_slot(kvm, memslot);  		read_unlock(&kvm->mmu_lock);  	}  	/* +	 * The caller will flush the TLBs after this function returns. +	 *  	 * It's also safe to flush TLBs out of mmu lock here as currently this  	 * function is only used for dirty logging, in which case flushing TLB  	 * out of mmu lock also guarantees no dirty pages will be lost in  	 * dirty_bitmap.  	 */ -	if (flush) -		kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);  }  void kvm_mmu_zap_all(struct kvm *kvm) diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index f5958071220c..39e0205e7300 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -589,7 +589,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,  	 * If addresses are being invalidated, skip prefetching to avoid  	 * accidentally prefetching those addresses.  	 
*/ -	if (unlikely(vcpu->kvm->mmu_notifier_count)) +	if (unlikely(vcpu->kvm->mmu_invalidate_in_progress))  		return;  	if (sp->role.direct) @@ -838,7 +838,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault  	else  		fault->max_level = walker.level; -	mmu_seq = vcpu->kvm->mmu_notifier_seq; +	mmu_seq = vcpu->kvm->mmu_invalidate_seq;  	smp_rmb();  	r = kvm_faultin_pfn(vcpu, fault); diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index f3744eea45f5..7670c13ce251 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -343,7 +343,7 @@ static __always_inline bool is_rsvd_spte(struct rsvd_bits_validate *rsvd_check,  }  /* - * An shadow-present leaf SPTE may be non-writable for 3 possible reasons: + * A shadow-present leaf SPTE may be non-writable for 4 possible reasons:   *   *  1. To intercept writes for dirty logging. KVM write-protects huge pages   *     so that they can be split be split down into the dirty logging @@ -361,8 +361,13 @@ static __always_inline bool is_rsvd_spte(struct rsvd_bits_validate *rsvd_check,   *     read-only memslot or guest memory backed by a read-only VMA. Writes to   *     such pages are disallowed entirely.   * - * To keep track of why a given SPTE is write-protected, KVM uses 2 - * software-only bits in the SPTE: + *  4. To emulate the Accessed bit for SPTEs without A/D bits.  Note, in this + *     case, the SPTE is access-protected, not just write-protected! + * + * For cases #1 and #4, KVM can safely make such SPTEs writable without taking + * mmu_lock as capturing the Accessed/Dirty state doesn't require taking it. + * To differentiate #1 and #4 from #2 and #3, KVM uses two software-only bits + * in the SPTE:   *   *  shadow_mmu_writable_mask, aka MMU-writable -   *    Cleared on SPTEs that KVM is currently write-protecting for shadow paging @@ -391,7 +396,8 @@ static __always_inline bool is_rsvd_spte(struct rsvd_bits_validate *rsvd_check,   * shadow page tables between vCPUs. Write-protecting an SPTE for dirty logging   * (which does not clear the MMU-writable bit), does not flush TLBs before   * dropping the lock, as it only needs to synchronize guest writes with the - * dirty bitmap. + * dirty bitmap. Similarly, making the SPTE inaccessible (and non-writable) for + * access-tracking via the clear_young() MMU notifier also does not flush TLBs.   
*   * So, there is the problem: clearing the MMU-writable bit can encounter a   * write-protected SPTE while CPUs still have writable mappings for that SPTE diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index d7f8331d6f7e..c9b49a09e6b5 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -843,8 +843,7 @@ static bool msr_write_intercepted(struct vcpu_vmx *vmx, u32 msr)  	if (!(exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS))  		return true; -	return vmx_test_msr_bitmap_write(vmx->loaded_vmcs->msr_bitmap, -					 MSR_IA32_SPEC_CTRL); +	return vmx_test_msr_bitmap_write(vmx->loaded_vmcs->msr_bitmap, msr);  }  unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 205ebdc2b11b..b0c47b41c264 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1011,15 +1011,10 @@ void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu)  }  EXPORT_SYMBOL_GPL(kvm_load_host_xsave_state); -static inline u64 kvm_guest_supported_xcr0(struct kvm_vcpu *vcpu) -{ -	return vcpu->arch.guest_fpu.fpstate->user_xfeatures; -} -  #ifdef CONFIG_X86_64  static inline u64 kvm_guest_supported_xfd(struct kvm_vcpu *vcpu)  { -	return kvm_guest_supported_xcr0(vcpu) & XFEATURE_MASK_USER_DYNAMIC; +	return vcpu->arch.guest_supported_xcr0 & XFEATURE_MASK_USER_DYNAMIC;  }  #endif @@ -1042,7 +1037,7 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)  	 * saving.  However, xcr0 bit 0 is always set, even if the  	 * emulated CPU does not support XSAVE (see kvm_vcpu_reset()).  	 */ -	valid_bits = kvm_guest_supported_xcr0(vcpu) | XFEATURE_MASK_FP; +	valid_bits = vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FP;  	if (xcr0 & ~valid_bits)  		return 1; @@ -1070,6 +1065,7 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)  int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu)  { +	/* Note, #UD due to CR4.OSXSAVE=0 has priority over the intercept. */  	if (static_call(kvm_x86_get_cpl)(vcpu) != 0 ||  	    __kvm_set_xcr(vcpu, kvm_rcx_read(vcpu), kvm_read_edx_eax(vcpu))) {  		kvm_inject_gp(vcpu, 0); @@ -1557,12 +1553,32 @@ static const u32 msr_based_features_all[] = {  static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all)];  static unsigned int num_msr_based_features; +/* + * Some IA32_ARCH_CAPABILITIES bits have dependencies on MSRs that KVM + * does not yet virtualize. 
These include: + *   10 - MISC_PACKAGE_CTRLS + *   11 - ENERGY_FILTERING_CTL + *   12 - DOITM + *   18 - FB_CLEAR_CTRL + *   21 - XAPIC_DISABLE_STATUS + *   23 - OVERCLOCKING_STATUS + */ + +#define KVM_SUPPORTED_ARCH_CAP \ +	(ARCH_CAP_RDCL_NO | ARCH_CAP_IBRS_ALL | ARCH_CAP_RSBA | \ +	 ARCH_CAP_SKIP_VMENTRY_L1DFLUSH | ARCH_CAP_SSB_NO | ARCH_CAP_MDS_NO | \ +	 ARCH_CAP_PSCHANGE_MC_NO | ARCH_CAP_TSX_CTRL_MSR | ARCH_CAP_TAA_NO | \ +	 ARCH_CAP_SBDR_SSDP_NO | ARCH_CAP_FBSDP_NO | ARCH_CAP_PSDP_NO | \ +	 ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO) +  static u64 kvm_get_arch_capabilities(void)  {  	u64 data = 0; -	if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) +	if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {  		rdmsrl(MSR_IA32_ARCH_CAPABILITIES, data); +		data &= KVM_SUPPORTED_ARCH_CAP; +	}  	/*  	 * If nx_huge_pages is enabled, KVM's shadow paging will ensure that @@ -1610,9 +1626,6 @@ static u64 kvm_get_arch_capabilities(void)  		 */  	} -	/* Guests don't need to know "Fill buffer clear control" exists */ -	data &= ~ARCH_CAP_FB_CLEAR_CTRL; -  	return data;  } @@ -10652,7 +10665,8 @@ static inline int vcpu_block(struct kvm_vcpu *vcpu)  	case KVM_MP_STATE_INIT_RECEIVED:  		break;  	default: -		return -EINTR; +		WARN_ON_ONCE(1); +		break;  	}  	return 1;  } @@ -11093,9 +11107,22 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,  	vcpu_load(vcpu); -	if (!lapic_in_kernel(vcpu) && -	    mp_state->mp_state != KVM_MP_STATE_RUNNABLE) +	switch (mp_state->mp_state) { +	case KVM_MP_STATE_UNINITIALIZED: +	case KVM_MP_STATE_HALTED: +	case KVM_MP_STATE_AP_RESET_HOLD: +	case KVM_MP_STATE_INIT_RECEIVED: +	case KVM_MP_STATE_SIPI_RECEIVED: +		if (!lapic_in_kernel(vcpu)) +			goto out; +		break; + +	case KVM_MP_STATE_RUNNABLE: +		break; + +	default:  		goto out; +	}  	/*  	 * KVM_MP_STATE_INIT_RECEIVED means the processor is in @@ -11563,7 +11590,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)  	vcpu->arch.mci_ctl2_banks = kcalloc(KVM_MAX_MCE_BANKS, sizeof(u64),  					    GFP_KERNEL_ACCOUNT);  	if (!vcpu->arch.mce_banks || !vcpu->arch.mci_ctl2_banks) -		goto fail_free_pio_data; +		goto fail_free_mce_banks;  	vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;  	if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, @@ -11617,7 +11644,6 @@ free_wbinvd_dirty_mask:  fail_free_mce_banks:  	kfree(vcpu->arch.mce_banks);  	kfree(vcpu->arch.mci_ctl2_banks); -fail_free_pio_data:  	free_page((unsigned long)vcpu->arch.pio_data);  fail_free_lapic:  	kvm_free_lapic(vcpu); @@ -12473,6 +12499,50 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,  		} else {  			kvm_mmu_slot_remove_write_access(kvm, new, PG_LEVEL_4K);  		} + +		/* +		 * Unconditionally flush the TLBs after enabling dirty logging. +		 * A flush is almost always going to be necessary (see below), +		 * and unconditionally flushing allows the helpers to omit +		 * the subtly complex checks when removing write access. +		 * +		 * Do the flush outside of mmu_lock to reduce the amount of +		 * time mmu_lock is held.  Flushing after dropping mmu_lock is +		 * safe as KVM only needs to guarantee the slot is fully +		 * write-protected before returning to userspace, i.e. before +		 * userspace can consume the dirty status. +		 * +		 * Flushing outside of mmu_lock requires KVM to be careful when +		 * making decisions based on writable status of an SPTE, e.g. a +		 * !writable SPTE doesn't guarantee a CPU can't perform writes. 
+		 * +		 * Specifically, KVM also write-protects guest page tables to +		 * monitor changes when using shadow paging, and must guarantee +		 * no CPUs can write to those page before mmu_lock is dropped. +		 * Because CPUs may have stale TLB entries at this point, a +		 * !writable SPTE doesn't guarantee CPUs can't perform writes. +		 * +		 * KVM also allows making SPTES writable outside of mmu_lock, +		 * e.g. to allow dirty logging without taking mmu_lock. +		 * +		 * To handle these scenarios, KVM uses a separate software-only +		 * bit (MMU-writable) to track if a SPTE is !writable due to +		 * a guest page table being write-protected (KVM clears the +		 * MMU-writable flag when write-protecting for shadow paging). +		 * +		 * The use of MMU-writable is also the primary motivation for +		 * the unconditional flush.  Because KVM must guarantee that a +		 * CPU doesn't contain stale, writable TLB entries for a +		 * !MMU-writable SPTE, KVM must flush if it encounters any +		 * MMU-writable SPTE regardless of whether the actual hardware +		 * writable bit was set.  I.e. KVM is almost guaranteed to need +		 * to flush, while unconditionally flushing allows the "remove +		 * write access" helpers to ignore MMU-writable entirely. +		 * +		 * See is_writable_pte() for more details (the case involving +		 * access-tracked SPTEs is particularly relevant). +		 */ +		kvm_arch_flush_remote_tlbs_memslot(kvm, new);  	}  } diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c index ad0139d25401..f1bb18617156 100644 --- a/arch/x86/lib/usercopy.c +++ b/arch/x86/lib/usercopy.c @@ -44,7 +44,7 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n)  	 * called from other contexts.  	 */  	pagefault_disable(); -	ret = __copy_from_user_inatomic(to, from, n); +	ret = raw_copy_from_user(to, from, n);  	pagefault_enable();  	return ret; diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index f8220fd2c169..829c1409ffbd 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -4,10 +4,12 @@ KCOV_INSTRUMENT_tlb.o			:= n  KCOV_INSTRUMENT_mem_encrypt.o		:= n  KCOV_INSTRUMENT_mem_encrypt_amd.o	:= n  KCOV_INSTRUMENT_mem_encrypt_identity.o	:= n +KCOV_INSTRUMENT_pgprot.o		:= n  KASAN_SANITIZE_mem_encrypt.o		:= n  KASAN_SANITIZE_mem_encrypt_amd.o	:= n  KASAN_SANITIZE_mem_encrypt_identity.o	:= n +KASAN_SANITIZE_pgprot.o		:= n  # Disable KCSAN entirely, because otherwise we get warnings that some functions  # reference __initdata sections. @@ -17,6 +19,7 @@ ifdef CONFIG_FUNCTION_TRACER  CFLAGS_REMOVE_mem_encrypt.o		= -pg  CFLAGS_REMOVE_mem_encrypt_amd.o		= -pg  CFLAGS_REMOVE_mem_encrypt_identity.o	= -pg +CFLAGS_REMOVE_pgprot.o			= -pg  endif  obj-y				:=  init.o init_$(BITS).o fault.o ioremap.o extable.o mmap.o \ diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 331310c29349..60814e110a54 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -41,6 +41,59 @@ static bool ex_handler_default(const struct exception_table_entry *e,  	return true;  } +/* + * This is the *very* rare case where we do a "load_unaligned_zeropad()" + * and it's a page crosser into a non-existent page. + * + * This happens when we optimistically load a pathname a word-at-a-time + * and the name is less than the full word and the  next page is not + * mapped. Typically that only happens for CONFIG_DEBUG_PAGEALLOC. + * + * NOTE! 
The faulting address is always a 'mov mem,reg' type instruction + * of size 'long', and the exception fixup must always point to right + * after the instruction. + */ +static bool ex_handler_zeropad(const struct exception_table_entry *e, +			       struct pt_regs *regs, +			       unsigned long fault_addr) +{ +	struct insn insn; +	const unsigned long mask = sizeof(long) - 1; +	unsigned long offset, addr, next_ip, len; +	unsigned long *reg; + +	next_ip = ex_fixup_addr(e); +	len = next_ip - regs->ip; +	if (len > MAX_INSN_SIZE) +		return false; + +	if (insn_decode(&insn, (void *) regs->ip, len, INSN_MODE_KERN)) +		return false; +	if (insn.length != len) +		return false; + +	if (insn.opcode.bytes[0] != 0x8b) +		return false; +	if (insn.opnd_bytes != sizeof(long)) +		return false; + +	addr = (unsigned long) insn_get_addr_ref(&insn, regs); +	if (addr == ~0ul) +		return false; + +	offset = addr & mask; +	addr = addr & ~mask; +	if (fault_addr != addr + sizeof(long)) +		return false; + +	reg = insn_get_modrm_reg_ptr(&insn, regs); +	if (!reg) +		return false; + +	*reg = *(unsigned long *)addr >> (offset * 8); +	return ex_handler_default(e, regs); +} +  static bool ex_handler_fault(const struct exception_table_entry *fixup,  			     struct pt_regs *regs, int trapnr)  { @@ -217,6 +270,8 @@ int fixup_exception(struct pt_regs *regs, int trapnr, unsigned long error_code,  		return ex_handler_sgx(e, regs, trapnr);  	case EX_TYPE_UCOPY_LEN:  		return ex_handler_ucopy_len(e, regs, trapnr, reg, imm); +	case EX_TYPE_ZEROPAD: +		return ex_handler_zeropad(e, regs, fault_addr);  	}  	BUG();  } diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 39c5246964a9..0fe690ebc269 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -645,7 +645,7 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end,  			pages++;  			spin_lock(&init_mm.page_table_lock); -			prot = __pgprot(pgprot_val(prot) | __PAGE_KERNEL_LARGE); +			prot = __pgprot(pgprot_val(prot) | _PAGE_PSE);  			set_pte_init((pte_t *)pud,  				     pfn_pte((paddr & PUD_MASK) >> PAGE_SHIFT, diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c index d5ef64ddd35e..66a209f7eb86 100644 --- a/arch/x86/mm/pat/memtype.c +++ b/arch/x86/mm/pat/memtype.c @@ -62,6 +62,7 @@  static bool __read_mostly pat_bp_initialized;  static bool __read_mostly pat_disabled = !IS_ENABLED(CONFIG_X86_PAT); +static bool __initdata pat_force_disabled = !IS_ENABLED(CONFIG_X86_PAT);  static bool __read_mostly pat_bp_enabled;  static bool __read_mostly pat_cm_initialized; @@ -86,6 +87,7 @@ void pat_disable(const char *msg_reason)  static int __init nopat(char *str)  {  	pat_disable("PAT support disabled via boot option."); +	pat_force_disabled = true;  	return 0;  }  early_param("nopat", nopat); @@ -272,7 +274,7 @@ static void pat_ap_init(u64 pat)  	wrmsrl(MSR_IA32_CR_PAT, pat);  } -void init_cache_modes(void) +void __init init_cache_modes(void)  {  	u64 pat = 0; @@ -313,6 +315,12 @@ void init_cache_modes(void)  		 */  		pat = PAT(0, WB) | PAT(1, WT) | PAT(2, UC_MINUS) | PAT(3, UC) |  		      PAT(4, WB) | PAT(5, WT) | PAT(6, UC_MINUS) | PAT(7, UC); +	} else if (!pat_force_disabled && cpu_feature_enabled(X86_FEATURE_HYPERVISOR)) { +		/* +		 * Clearly PAT is enabled underneath. Allow pat_enabled() to +		 * reflect this. 
+		 */ +		pat_bp_enabled = true;  	}  	__init_cache_modes(pat); diff --git a/arch/x86/um/shared/sysdep/syscalls_32.h b/arch/x86/um/shared/sysdep/syscalls_32.h index 68fd2cf526fd..f6e9f84397e7 100644 --- a/arch/x86/um/shared/sysdep/syscalls_32.h +++ b/arch/x86/um/shared/sysdep/syscalls_32.h @@ -6,10 +6,9 @@  #include <asm/unistd.h>  #include <sysdep/ptrace.h> -typedef long syscall_handler_t(struct pt_regs); +typedef long syscall_handler_t(struct syscall_args);  extern syscall_handler_t *sys_call_table[];  #define EXECUTE_SYSCALL(syscall, regs) \ -	((long (*)(struct syscall_args)) \ -	 (*sys_call_table[syscall]))(SYSCALL_ARGS(®s->regs)) +	((*sys_call_table[syscall]))(SYSCALL_ARGS(®s->regs)) diff --git a/arch/x86/um/tls_32.c b/arch/x86/um/tls_32.c index ac8eee093f9c..66162eafd8e8 100644 --- a/arch/x86/um/tls_32.c +++ b/arch/x86/um/tls_32.c @@ -65,9 +65,6 @@ static int get_free_idx(struct task_struct* task)  	struct thread_struct *t = &task->thread;  	int idx; -	if (!t->arch.tls_array) -		return GDT_ENTRY_TLS_MIN; -  	for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++)  		if (!t->arch.tls_array[idx].present)  			return idx + GDT_ENTRY_TLS_MIN; @@ -240,9 +237,6 @@ static int get_tls_entry(struct task_struct *task, struct user_desc *info,  {  	struct thread_struct *t = &task->thread; -	if (!t->arch.tls_array) -		goto clear; -  	if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)  		return -EINVAL; diff --git a/arch/x86/um/vdso/Makefile b/arch/x86/um/vdso/Makefile index 8c0396fd0e6f..6fbe97c52c99 100644 --- a/arch/x86/um/vdso/Makefile +++ b/arch/x86/um/vdso/Makefile @@ -65,7 +65,7 @@ quiet_cmd_vdso = VDSO    $@  		       -Wl,-T,$(filter %.lds,$^) $(filter %.o,$^) && \  		 sh $(srctree)/$(src)/checkundef.sh '$(NM)' '$@' -VDSO_LDFLAGS = -fPIC -shared -Wl,--hash-style=sysv +VDSO_LDFLAGS = -fPIC -shared -Wl,--hash-style=sysv -z noexecstack  GCOV_PROFILE := n  #  | 
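
A user-space illustration of the EX_TYPE_ZEROPAD recovery added in arch/x86/mm/extable.c above: when a load_unaligned_zeropad() word load crosses into an unmapped page, the fixup re-reads the aligned word that contains the address (which is mapped) and shifts it down so the missing high bytes read back as zeroes. The sketch below only reproduces that arithmetic in ordinary C; the function and buffer names are illustrative, and the real handler additionally decodes the faulting instruction and writes the destination register directly.

	#include <stdio.h>
	#include <string.h>

	/*
	 * Emulate the zero-pad recovery: re-read the aligned word that
	 * contains 'addr' and shift the wanted bytes down, so bytes that
	 * would have come from the unmapped next page become zero.
	 */
	static unsigned long zeropad_fixup(const void *addr)
	{
		const unsigned long mask = sizeof(long) - 1;
		unsigned long offset = (unsigned long)addr & mask;
		unsigned long data;

		/* The kernel dereferences the aligned pointer directly. */
		memcpy(&data, (const void *)((unsigned long)addr & ~mask), sizeof(data));
		return data >> (offset * 8);
	}

	int main(void)
	{
		/* 8-byte aligned buffer standing in for the tail of a mapped page */
		union { unsigned long w[2]; unsigned char b[16]; } buf = { .b = "abcdefgh" };

		/* Unaligned "load" at b+5 yields 'f', 'g', 'h' plus zero padding */
		printf("%#lx\n", zeropad_fixup(buf.b + 5));
		return 0;
	}

On a little-endian x86-64 build this prints 0x686766, i.e. 'f', 'g', 'h' in the low bytes with the rest zero-padded, matching what the in-kernel fixup produces for a page-crossing load.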
