diff options
Diffstat (limited to 'arch/x86')
43 files changed, 486 insertions, 276 deletions
diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index ef17683484e9..48304b89b601 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -1109,6 +1109,8 @@ struct boot_params *make_boot_params(struct efi_config *c) if (!cmdline_ptr) goto fail; hdr->cmd_line_ptr = (unsigned long)cmdline_ptr; + /* Fill in upper bits of command line address, NOP on 32 bit */ + boot_params->ext_cmd_line_ptr = (u64)(unsigned long)cmdline_ptr >> 32; hdr->ramdisk_image = 0; hdr->ramdisk_size = 0; diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 89dd0d78013a..805d25ca5f1d 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -2,15 +2,14 @@ #define BOOT_COMPRESSED_MISC_H /* - * we have to be careful, because no indirections are allowed here, and - * paravirt_ops is a kind of one. As it will only run in baremetal anyway, - * we just keep it from happening + * Special hack: we have to be careful, because no indirections are allowed here, + * and paravirt_ops is a kind of one. As it will only run in baremetal anyway, + * we just keep it from happening. (This list needs to be extended when new + * paravirt and debugging variants are added.) */ #undef CONFIG_PARAVIRT +#undef CONFIG_PARAVIRT_SPINLOCKS #undef CONFIG_KASAN -#ifdef CONFIG_X86_32 -#define _ASM_X86_DESC_H 1 -#endif #include <linux/linkage.h> #include <linux/screen_info.h> diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h index e42f758a0fbd..055ea9941dd5 100644 --- a/arch/x86/include/asm/hypervisor.h +++ b/arch/x86/include/asm/hypervisor.h @@ -50,7 +50,7 @@ extern const struct hypervisor_x86 *x86_hyper; /* Recognized hypervisors */ extern const struct hypervisor_x86 x86_hyper_vmware; extern const struct hypervisor_x86 x86_hyper_ms_hyperv; -extern const struct hypervisor_x86 x86_hyper_xen_hvm; +extern const struct hypervisor_x86 x86_hyper_xen; extern const struct hypervisor_x86 x86_hyper_kvm; extern void init_hypervisor(struct cpuinfo_x86 *c); diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index dea2e7e962e3..f4a555beef19 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -207,6 +207,7 @@ union kvm_mmu_page_role { unsigned nxe:1; unsigned cr0_wp:1; unsigned smep_andnot_wp:1; + unsigned smap_andnot_wp:1; }; }; @@ -400,6 +401,7 @@ struct kvm_vcpu_arch { struct kvm_mmu_memory_cache mmu_page_header_cache; struct fpu guest_fpu; + bool eager_fpu; u64 xcr0; u64 guest_supported_xcr0; u32 guest_xstate_size; @@ -743,6 +745,7 @@ struct kvm_x86_ops { void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg); unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); + void (*fpu_activate)(struct kvm_vcpu *vcpu); void (*fpu_deactivate)(struct kvm_vcpu *vcpu); void (*tlb_flush)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 19507ffa5d28..5fabf1362942 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -107,7 +107,7 @@ static inline unsigned long regs_return_value(struct pt_regs *regs) static inline int user_mode(struct pt_regs *regs) { #ifdef CONFIG_X86_32 - return (regs->cs & SEGMENT_RPL_MASK) == USER_RPL; + return ((regs->cs & SEGMENT_RPL_MASK) | (regs->flags & X86_VM_MASK)) >= USER_RPL; #else return !!(regs->cs & 3); #endif diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h index 25b1cc07d496..d6b078e9fa28 100644 --- a/arch/x86/include/asm/pvclock.h +++ b/arch/x86/include/asm/pvclock.h @@ -95,7 +95,6 @@ unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src, struct pvclock_vsyscall_time_info { struct pvclock_vcpu_time_info pvti; - u32 migrate_count; } __attribute__((__aligned__(SMP_CACHE_BYTES))); #define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info) diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index 5a9856eb12ba..7d5a1929d76b 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h @@ -231,11 +231,21 @@ #define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES* 8) #ifdef __KERNEL__ + +/* + * early_idt_handler_array is an array of entry points referenced in the + * early IDT. For simplicity, it's a real array with one entry point + * every nine bytes. That leaves room for an optional 'push $0' if the + * vector has no error code (two bytes), a 'push $vector_number' (two + * bytes), and a jump to the common entry code (up to five bytes). + */ +#define EARLY_IDT_HANDLER_SIZE 9 + #ifndef __ASSEMBLY__ -extern const char early_idt_handlers[NUM_EXCEPTION_VECTORS][2+2+5]; +extern const char early_idt_handler_array[NUM_EXCEPTION_VECTORS][EARLY_IDT_HANDLER_SIZE]; #ifdef CONFIG_TRACING -# define trace_early_idt_handlers early_idt_handlers +# define trace_early_idt_handler_array early_idt_handler_array #endif /* diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index cf87de3fc390..64b611782ef0 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -169,7 +169,7 @@ static inline int arch_spin_is_contended(arch_spinlock_t *lock) struct __raw_tickets tmp = READ_ONCE(lock->tickets); tmp.head &= ~TICKET_SLOWPATH_FLAG; - return (tmp.tail - tmp.head) > TICKET_LOCK_INC; + return (__ticket_t)(tmp.tail - tmp.head) > TICKET_LOCK_INC; } #define arch_spin_is_contended arch_spin_is_contended diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index 358dcd338915..c44a5d53e464 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -269,4 +269,9 @@ static inline bool xen_arch_need_swiotlb(struct device *dev, return false; } +static inline unsigned long xen_get_swiotlb_free_pages(unsigned int order) +{ + return __get_free_pages(__GFP_NOWARN, order); +} + #endif /* _ASM_X86_XEN_PAGE_H */ diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h index c469490db4a8..3c6bb342a48f 100644 --- a/arch/x86/include/uapi/asm/msr-index.h +++ b/arch/x86/include/uapi/asm/msr-index.h @@ -140,6 +140,7 @@ #define MSR_CORE_C3_RESIDENCY 0x000003fc #define MSR_CORE_C6_RESIDENCY 0x000003fd #define MSR_CORE_C7_RESIDENCY 0x000003fe +#define MSR_KNL_CORE_C6_RESIDENCY 0x000003ff #define MSR_PKG_C2_RESIDENCY 0x0000060d #define MSR_PKG_C8_RESIDENCY 0x00000630 #define MSR_PKG_C9_RESIDENCY 0x00000631 diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index 36ce402a3fa5..d820d8eae96b 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -27,8 +27,8 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] = { -#ifdef CONFIG_XEN_PVHVM - &x86_hyper_xen_hvm, +#ifdef CONFIG_XEN + &x86_hyper_xen, #endif &x86_hyper_vmware, &x86_hyper_ms_hyperv, diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index e535533d5ab8..20190bdac9d5 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -708,6 +708,7 @@ static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp, struct pt_regs *regs) { int i, ret = 0; + char *tmp; for (i = 0; i < mca_cfg.banks; i++) { m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); @@ -716,9 +717,11 @@ static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp, if (quirk_no_way_out) quirk_no_way_out(i, m, regs); } - if (mce_severity(m, mca_cfg.tolerant, msg, true) >= - MCE_PANIC_SEVERITY) + + if (mce_severity(m, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) { + *msg = tmp; ret = 1; + } } return ret; } diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 87848ebe2bb7..4f7001f28936 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -190,6 +190,7 @@ static bool check_hw_exists(void) u64 val, val_fail, val_new= ~0; int i, reg, reg_fail, ret = 0; int bios_fail = 0; + int reg_safe = -1; /* * Check to see if the BIOS enabled any of the counters, if so @@ -204,6 +205,8 @@ static bool check_hw_exists(void) bios_fail = 1; val_fail = val; reg_fail = reg; + } else { + reg_safe = i; } } @@ -222,11 +225,22 @@ static bool check_hw_exists(void) } /* + * If all the counters are enabled, the below test will always + * fail. The tools will also become useless in this scenario. + * Just fail and disable the hardware counters. + */ + + if (reg_safe == -1) { + reg = reg_safe; + goto msr_fail; + } + + /* * Read the current value, change it and read it back to see if it * matches, this is needed to detect certain hardware emulators * (qemu/kvm) that don't trap on the MSR access and always return 0s. */ - reg = x86_pmu_event_addr(0); + reg = x86_pmu_event_addr(reg_safe); if (rdmsrl_safe(reg, &val)) goto msr_fail; val ^= 0xffffUL; @@ -611,6 +625,7 @@ struct sched_state { int event; /* event index */ int counter; /* counter index */ int unassigned; /* number of events to be assigned left */ + int nr_gp; /* number of GP counters used */ unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; }; @@ -620,27 +635,29 @@ struct sched_state { struct perf_sched { int max_weight; int max_events; - struct perf_event **events; - struct sched_state state; + int max_gp; int saved_states; + struct event_constraint **constraints; + struct sched_state state; struct sched_state saved[SCHED_STATES_MAX]; }; /* * Initialize interator that runs through all events and counters. */ -static void perf_sched_init(struct perf_sched *sched, struct perf_event **events, - int num, int wmin, int wmax) +static void perf_sched_init(struct perf_sched *sched, struct event_constraint **constraints, + int num, int wmin, int wmax, int gpmax) { int idx; memset(sched, 0, sizeof(*sched)); sched->max_events = num; sched->max_weight = wmax; - sched->events = events; + sched->max_gp = gpmax; + sched->constraints = constraints; for (idx = 0; idx < num; idx++) { - if (events[idx]->hw.constraint->weight == wmin) + if (constraints[idx]->weight == wmin) break; } @@ -687,7 +704,7 @@ static bool __perf_sched_find_counter(struct perf_sched *sched) if (sched->state.event >= sched->max_events) return false; - c = sched->events[sched->state.event]->hw.constraint; + c = sched->constraints[sched->state.event]; /* Prefer fixed purpose counters */ if (c->idxmsk64 & (~0ULL << INTEL_PMC_IDX_FIXED)) { idx = INTEL_PMC_IDX_FIXED; @@ -696,11 +713,16 @@ static bool __perf_sched_find_counter(struct perf_sched *sched) goto done; } } + /* Grab the first unused counter starting with idx */ idx = sched->state.counter; for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) { - if (!__test_and_set_bit(idx, sched->state.used)) + if (!__test_and_set_bit(idx, sched->state.used)) { + if (sched->state.nr_gp++ >= sched->max_gp) + return false; + goto done; + } } return false; @@ -745,7 +767,7 @@ static bool perf_sched_next_event(struct perf_sched *sched) if (sched->state.weight > sched->max_weight) return false; } - c = sched->events[sched->state.event]->hw.constraint; + c = sched->constraints[sched->state.event]; } while (c->weight != sched->state.weight); sched->state.counter = 0; /* start with first counter */ @@ -756,12 +778,12 @@ static bool perf_sched_next_event(struct perf_sched *sched) /* * Assign a counter for each event. */ -int perf_assign_events(struct perf_event **events, int n, - int wmin, int wmax, int *assign) +int perf_assign_events(struct event_constraint **constraints, int n, + int wmin, int wmax, int gpmax, int *assign) { struct perf_sched sched; - perf_sched_init(&sched, events, n, wmin, wmax); + perf_sched_init(&sched, constraints, n, wmin, wmax, gpmax); do { if (!perf_sched_find_counter(&sched)) @@ -788,9 +810,9 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) x86_pmu.start_scheduling(cpuc); for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) { - hwc = &cpuc->event_list[i]->hw; + cpuc->event_constraint[i] = NULL; c = x86_pmu.get_event_constraints(cpuc, i, cpuc->event_list[i]); - hwc->constraint = c; + cpuc->event_constraint[i] = c; wmin = min(wmin, c->weight); wmax = max(wmax, c->weight); @@ -801,7 +823,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) */ for (i = 0; i < n; i++) { hwc = &cpuc->event_list[i]->hw; - c = hwc->constraint; + c = cpuc->event_constraint[i]; /* never assigned */ if (hwc->idx == -1) @@ -821,9 +843,26 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) } /* slow path */ - if (i != n) - unsched = perf_assign_events(cpuc->event_list, n, wmin, - wmax, assign); + if (i != n) { + int gpmax = x86_pmu.num_counters; + + /* + * Do not allow scheduling of more than half the available + * generic counters. + * + * This helps avoid counter starvation of sibling thread by + * ensuring at most half the counters cannot be in exclusive + * mode. There is no designated counters for the limits. Any + * N/2 counters can be used. This helps with events with + * specific counter constraints. + */ + if (is_ht_workaround_enabled() && !cpuc->is_fake && + READ_ONCE(cpuc->excl_cntrs->exclusive_present)) + gpmax /= 2; + + unsched = perf_assign_events(cpuc->event_constraint, n, wmin, + wmax, gpmax, assign); + } /* * In case of success (unsched = 0), mark events as committed, @@ -840,7 +879,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) e = cpuc->event_list[i]; e->hw.flags |= PERF_X86_EVENT_COMMITTED; if (x86_pmu.commit_scheduling) - x86_pmu.commit_scheduling(cpuc, e, assign[i]); + x86_pmu.commit_scheduling(cpuc, i, assign[i]); } } @@ -1292,8 +1331,10 @@ static void x86_pmu_del(struct perf_event *event, int flags) x86_pmu.put_event_constraints(cpuc, event); /* Delete the array entry. */ - while (++i < cpuc->n_events) + while (++i < cpuc->n_events) { cpuc->event_list[i-1] = cpuc->event_list[i]; + cpuc->event_constraint[i-1] = cpuc->event_constraint[i]; + } --cpuc->n_events; perf_event_update_userpage(event); diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 6ac5cb7a9e14..ef78516850fb 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -74,6 +74,7 @@ struct event_constraint { #define PERF_X86_EVENT_EXCL 0x0040 /* HT exclusivity on counter */ #define PERF_X86_EVENT_DYNAMIC 0x0080 /* dynamic alloc'd constraint */ #define PERF_X86_EVENT_RDPMC_ALLOWED 0x0100 /* grant rdpmc permission */ +#define PERF_X86_EVENT_EXCL_ACCT 0x0200 /* accounted EXCL event */ struct amd_nb { @@ -134,8 +135,6 @@ enum intel_excl_state_type { struct intel_excl_states { enum intel_excl_state_type init_state[X86_PMC_IDX_MAX]; enum intel_excl_state_type state[X86_PMC_IDX_MAX]; - int num_alloc_cntrs;/* #counters allocated */ - int max_alloc_cntrs;/* max #counters allowed */ bool sched_started; /* true if scheduling has started */ }; @@ -144,6 +143,11 @@ struct intel_excl_cntrs { struct intel_excl_states states[2]; + union { + u16 has_exclusive[2]; + u32 exclusive_present; + }; + int refcnt; /* per-core: #HT threads */ unsigned core_id; /* per-core: core id */ }; @@ -172,7 +176,11 @@ struct cpu_hw_events { added in the current transaction */ int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */ u64 tags[X86_PMC_IDX_MAX]; + struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */ + struct event_constraint *event_constraint[X86_PMC_IDX_MAX]; + + int n_excl; /* the number of exclusive events */ unsigned int group_flag; int is_fake; @@ -519,9 +527,7 @@ struct x86_pmu { void (*put_event_constraints)(struct cpu_hw_events *cpuc, struct perf_event *event); - void (*commit_scheduling)(struct cpu_hw_events *cpuc, - struct perf_event *event, - int cntr); + void (*commit_scheduling)(struct cpu_hw_events *cpuc, int idx, int cntr); void (*start_scheduling)(struct cpu_hw_events *cpuc); @@ -717,8 +723,8 @@ static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc, void x86_pmu_enable_all(int added); -int perf_assign_events(struct perf_event **events, int n, - int wmin, int wmax, int *assign); +int perf_assign_events(struct event_constraint **constraints, int n, + int wmin, int wmax, int gpmax, int *assign); int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign); void x86_pmu_stop(struct perf_event *event, int flags); @@ -929,4 +935,8 @@ static inline struct intel_shared_regs *allocate_shared_regs(int cpu) return NULL; } +static inline int is_ht_workaround_enabled(void) +{ + return 0; +} #endif /* CONFIG_CPU_SUP_INTEL */ diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 219d3fb423a1..a1e35c9f06b9 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1134,7 +1134,7 @@ static __initconst const u64 slm_hw_cache_extra_regs [ C(LL ) ] = { [ C(OP_READ) ] = { [ C(RESULT_ACCESS) ] = SLM_DMND_READ|SLM_LLC_ACCESS, - [ C(RESULT_MISS) ] = SLM_DMND_READ|SLM_LLC_MISS, + [ C(RESULT_MISS) ] = 0, }, [ C(OP_WRITE) ] = { [ C(RESULT_ACCESS) ] = SLM_DMND_WRITE|SLM_LLC_ACCESS, @@ -1184,8 +1184,7 @@ static __initconst const u64 slm_hw_cache_event_ids [ C(OP_READ) ] = { /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */ [ C(RESULT_ACCESS) ] = 0x01b7, - /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */ - [ C(RESULT_MISS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0, }, [ C(OP_WRITE) ] = { /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */ @@ -1217,7 +1216,7 @@ static __initconst const u64 slm_hw_cache_event_ids [ C(ITLB) ] = { [ C(OP_READ) ] = { [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */ - [ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */ + [ C(RESULT_MISS) ] = 0x40205, /* PAGE_WALKS.I_SIDE_WALKS */ }, [ C(OP_WRITE) ] = { [ C(RESULT_ACCESS) ] = -1, @@ -1924,7 +1923,6 @@ intel_start_scheduling(struct cpu_hw_events *cpuc) xl = &excl_cntrs->states[tid]; xl->sched_started = true; - xl->num_alloc_cntrs = 0; /* * lock shared state until we are done scheduling * in stop_event_scheduling() @@ -2001,6 +1999,11 @@ intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event, * across HT threads */ is_excl = c->flags & PERF_X86_EVENT_EXCL; + if (is_excl && !(event->hw.flags & PERF_X86_EVENT_EXCL_ACCT)) { + event->hw.flags |= PERF_X86_EVENT_EXCL_ACCT; + if (!cpuc->n_excl++) + WRITE_ONCE(excl_cntrs->has_exclusive[tid], 1); + } /* * xl = state of current HT @@ -2009,18 +2012,6 @@ intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event, xl = &excl_cntrs->states[tid]; xlo = &excl_cntrs->states[o_tid]; - /* - * do not allow scheduling of more than max_alloc_cntrs - * which is set to half the available generic counters. - * this helps avoid counter starvation of sibling thread - * by ensuring at most half the counters cannot be in - * exclusive mode. There is not designated counters for the - * limits. Any N/2 counters can be used. This helps with - * events with specifix counter constraints - */ - if (xl->num_alloc_cntrs++ == xl->max_alloc_cntrs) - return &emptyconstraint; - cx = c; /* @@ -2107,7 +2098,7 @@ static struct event_constraint * intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx, struct perf_event *event) { - struct event_constraint *c1 = event->hw.constraint; + struct event_constraint *c1 = cpuc->event_constraint[idx]; struct event_constraint *c2; /* @@ -2151,6 +2142,11 @@ static void intel_put_excl_constraints(struct cpu_hw_events *cpuc, xl = &excl_cntrs->states[tid]; xlo = &excl_cntrs->states[o_tid]; + if (hwc->flags & PERF_X86_EVENT_EXCL_ACCT) { + hwc->flags &= ~PERF_X86_EVENT_EXCL_ACCT; + if (!--cpuc->n_excl) + WRITE_ONCE(excl_cntrs->has_exclusive[tid], 0); + } /* * put_constraint may be called from x86_schedule_events() @@ -2189,8 +2185,6 @@ intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc, static void intel_put_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) { - struct event_constraint *c = event->hw.constraint; - intel_put_shared_regs_event_constraints(cpuc, event); /* @@ -2198,19 +2192,14 @@ static void intel_put_event_constraints(struct cpu_hw_events *cpuc, * all events are subject to and must call the * put_excl_constraints() routine */ - if (c && cpuc->excl_cntrs) + if (cpuc->excl_cntrs) intel_put_excl_constraints(cpuc, event); - - /* cleanup dynamic constraint */ - if (c && (c->flags & PERF_X86_EVENT_DYNAMIC)) - event->hw.constraint = NULL; } -static void intel_commit_scheduling(struct cpu_hw_events *cpuc, - struct perf_event *event, int cntr) +static void intel_commit_scheduling(struct cpu_hw_events *cpuc, int idx, int cntr) { struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs; - struct event_constraint *c = event->hw.constraint; + struct event_constraint *c = cpuc->event_constraint[idx]; struct intel_excl_states *xlo, *xl; int tid = cpuc->excl_thread_id; int o_tid = 1 - tid; @@ -2533,34 +2522,6 @@ ssize_t intel_event_sysfs_show(char *page, u64 config) return x86_event_sysfs_show(page, config, event); } -static __initconst const struct x86_pmu core_pmu = { - .name = "core", - .handle_irq = x86_pmu_handle_irq, - .disable_all = x86_pmu_disable_all, - .enable_all = core_pmu_enable_all, - .enable = core_pmu_enable_event, - .disable = x86_pmu_disable_event, - .hw_config = x86_pmu_hw_config, - .schedule_events = x86_schedule_events, - .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, - .perfctr = MSR_ARCH_PERFMON_PERFCTR0, - .event_map = intel_pmu_event_map, - .max_events = ARRAY_SIZE(intel_perfmon_event_map), - .apic = 1, - /* - * Intel PMCs cannot be accessed sanely above 32 bit width, - * so we install an artificial 1<<31 period regardless of - * the generic event period: - */ - .max_period = (1ULL << 31) - 1, - .get_event_constraints = intel_get_event_constraints, - .put_event_constraints = intel_put_event_constraints, - .event_constraints = intel_core_event_constraints, - .guest_get_msrs = core_guest_get_msrs, - .format_attrs = intel_arch_formats_attr, - .events_sysfs_show = intel_event_sysfs_show, -}; - struct intel_shared_regs *allocate_shared_regs(int cpu) { struct intel_shared_regs *regs; @@ -2668,8 +2629,6 @@ static void intel_pmu_cpu_starting(int cpu) cpuc->lbr_sel = &cpuc->shared_regs->regs[EXTRA_REG_LBR]; if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) { - int h = x86_pmu.num_counters >> 1; - for_each_cpu(i, topology_thread_cpumask(cpu)) { struct intel_excl_cntrs *c; @@ -2683,11 +2642,6 @@ static void intel_pmu_cpu_starting(int cpu) } cpuc->excl_cntrs->core_id = core_id; cpuc->excl_cntrs->refcnt++; - /* - * set hard limit to half the number of generic counters - */ - cpuc->excl_cntrs->states[0].max_alloc_cntrs = h; - cpuc->excl_cntrs->states[1].max_alloc_cntrs = h; } } @@ -2743,6 +2697,44 @@ static struct attribute *intel_arch3_formats_attr[] = { NULL, }; +static __initconst const struct x86_pmu core_pmu = { + .name = "core", + .handle_irq = x86_pmu_handle_irq, + .disable_all = x86_pmu_disable_all, + .enable_all = core_pmu_enable_all, + .enable = core_pmu_enable_event, + .disable = x86_pmu_disable_event, + .hw_config = x86_pmu_hw_config, + .schedule_events = x86_schedule_events, + .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, + .perfctr = MSR_ARCH_PERFMON_PERFCTR0, + .event_map = intel_pmu_event_map, + .max_events = ARRAY_SIZE(intel_perfmon_event_map), + .apic = 1, + /* + * Intel PMCs cannot be accessed sanely above 32-bit width, + * so we install an artificial 1<<31 period regardless of + * the generic event period: + */ + .max_period = (1ULL<<31) - 1, + .get_event_constraints = intel_get_event_constraints, + .put_event_constraints = intel_put_event_constraints, + .event_constraints = intel_core_event_constraints, + .guest_get_msrs = core_guest_get_msrs, + .format_attrs = intel_arch_formats_attr, + .events_sysfs_show = intel_event_sysfs_show, + + /* + * Virtual (or funny metal) CPU can define x86_pmu.extra_regs + * together with PMU version 1 and thus be using core_pmu with + * shared_regs. We need following callbacks here to allocate + * it properly. + */ + .cpu_prepare = intel_pmu_cpu_prepare, + .cpu_starting = intel_pmu_cpu_starting, + .cpu_dying = intel_pmu_cpu_dying, +}; + static __initconst const struct x86_pmu intel_pmu = { .name = "Intel", .handle_irq = intel_pmu_handle_irq, diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 813f75d71175..7f73b3553e2e 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -706,9 +706,9 @@ void intel_pmu_pebs_disable(struct perf_event *event) cpuc->pebs_enabled &= ~(1ULL << hwc->idx); - if (event->hw.constraint->flags & PERF_X86_EVENT_PEBS_LDLAT) + if (event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT) cpuc->pebs_enabled &= ~(1ULL << (hwc->idx + 32)); - else if (event->hw.constraint->flags & PERF_X86_EVENT_PEBS_ST) + else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST) cpuc->pebs_enabled &= ~(1ULL << 63); if (cpuc->enabled) diff --git a/arch/x86/kernel/cpu/perf_event_intel_pt.c b/arch/x86/kernel/cpu/perf_event_intel_pt.c index ffe666c2c6b5..123ff1bb2f60 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_pt.c +++ b/arch/x86/kernel/cpu/perf_event_intel_pt.c @@ -151,7 +151,7 @@ static int __init pt_pmu_hw_init(void) de_attr->attr.attr.name = pt_caps[i].name; - sysfs_attr_init(&de_attrs->attr.attr); + sysfs_attr_init(&de_attr->attr.attr); de_attr->attr.attr.mode = S_IRUGO; de_attr->attr.show = pt_cap_show; @@ -615,7 +615,8 @@ static int pt_buffer_reset_markers(struct pt_buffer *buf, struct perf_output_handle *handle) { - unsigned long idx, npages, end; + unsigned long head = local64_read(&buf->head); + unsigned long idx, npages, wakeup; if (buf->snapshot) return 0; @@ -634,17 +635,26 @@ static int pt_buffer_reset_markers(struct pt_buffer *buf, buf->topa_index[buf->stop_pos]->stop = 0; buf->topa_index[buf->intr_pos]->intr = 0; - if (pt_cap_get(PT_CAP_topa_multiple_entries)) { - npages = (handle->size + 1) >> PAGE_SHIFT; - end = (local64_read(&buf->head) >> PAGE_SHIFT) + npages; - /*if (end > handle->wakeup >> PAGE_SHIFT) - end = handle->wakeup >> PAGE_SHIFT;*/ - idx = end & (buf->nr_pages - 1); - buf->stop_pos = idx; - idx = (local64_read(&buf->head) >> PAGE_SHIFT) + npages - 1; - idx &= buf->nr_pages - 1; - buf->intr_pos = idx; - } + /* how many pages till the STOP marker */ + npages = handle->size >> PAGE_SHIFT; + + /* if it's on a page boundary, fill up one more page */ + if (!offset_in_page(head + handle->size + 1)) + npages++; + + idx = (head >> PAGE_SHIFT) + npages; + idx &= buf->nr_pages - 1; + buf->stop_pos = idx; + + wakeup = handle->wakeup >> PAGE_SHIFT; + + /* in the worst case, wake up the consumer one page before hard stop */ + idx = (head >> PAGE_SHIFT) + npages - 1; + if (idx > wakeup) + idx = wakeup; + + idx &= buf->nr_pages - 1; + buf->intr_pos = idx; buf->topa_index[buf->stop_pos]->stop = 1; buf->topa_index[buf->intr_pos]->intr = 1; diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c index 999289b94025..358c54ad20d4 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_rapl.c +++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c @@ -722,6 +722,7 @@ static int __init rapl_pmu_init(void) break; case 60: /* Haswell */ case 69: /* Haswell-Celeron */ + case 61: /* Broadwell */ rapl_cntr_mask = RAPL_IDX_HSW; rapl_pmu_events_group.attrs = rapl_events_hsw_attr; break; diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index c635b8b49e93..90b7c501c95b 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -365,9 +365,8 @@ static int uncore_assign_events(struct intel_uncore_box *box, int assign[], int bitmap_zero(used_mask, UNCORE_PMC_IDX_MAX); for (i = 0, wmin = UNCORE_PMC_IDX_MAX, wmax = 0; i < n; i++) { - hwc = &box->event_list[i]->hw; c = uncore_get_event_constraint(box, box->event_list[i]); - hwc->constraint = c; + box->event_constraint[i] = c; wmin = min(wmin, c->weight); wmax = max(wmax, c->weight); } @@ -375,7 +374,7 @@ static int uncore_assign_events(struct intel_uncore_box *box, int assign[], int /* fastpath, try to reuse previous register */ for (i = 0; i < n; i++) { hwc = &box->event_list[i]->hw; - c = hwc->constraint; + c = box->event_constraint[i]; /* never assigned */ if (hwc->idx == -1) @@ -395,8 +394,8 @@ static int uncore_assign_events(struct intel_uncore_box *box, int assign[], int } /* slow path */ if (i != n) - ret = perf_assign_events(box->event_list, n, - wmin, wmax, assign); + ret = perf_assign_events(box->event_constraint, n, + wmin, wmax, n, assign); if (!assign || ret) { for (i = 0; i < n; i++) @@ -840,6 +839,7 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id box->phys_id = phys_id; box->pci_dev = pdev; box->pmu = pmu; + uncore_box_init(box); pci_set_drvdata(pdev, box); raw_spin_lock(&uncore_box_lock); @@ -1003,8 +1003,10 @@ static int uncore_cpu_starting(int cpu) pmu = &type->pmus[j]; box = *per_cpu_ptr(pmu->box, cpu); /* called by uncore_cpu_init? */ - if (box && box->phys_id >= 0) + if (box && box->phys_id >= 0) { + uncore_box_init(box); continue; + } for_each_online_cpu(k) { exist = *per_cpu_ptr(pmu->box, k); @@ -1020,8 +1022,10 @@ static int uncore_cpu_starting(int cpu) } } - if (box) + if (box) { box->phys_id = phys_id; + uncore_box_init(box); + } } } return 0; diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h index 6c8c1e7e69d8..ceac8f5dc018 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h @@ -97,6 +97,7 @@ struct intel_uncore_box { atomic_t refcnt; struct perf_event *events[UNCORE_PMC_IDX_MAX]; struct perf_event *event_list[UNCORE_PMC_IDX_MAX]; + struct event_constraint *event_constraint[UNCORE_PMC_IDX_MAX]; unsigned long active_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)]; u64 tags[UNCORE_PMC_IDX_MAX]; struct pci_dev *pci_dev; @@ -257,14 +258,6 @@ static inline int uncore_num_counters(struct intel_uncore_box *box) return box->pmu->type->num_counters; } -static inline void uncore_box_init(struct intel_uncore_box *box) -{ - if (!test_and_set_bit(UNCORE_BOX_FLAG_INITIATED, &box->flags)) { - if (box->pmu->type->ops->init_box) - box->pmu->type->ops->init_box(box); - } -} - static inline void uncore_disable_box(struct intel_uncore_box *box) { if (box->pmu->type->ops->disable_box) @@ -273,8 +266,6 @@ static inline void uncore_disable_box(struct intel_uncore_box *box) static inline void uncore_enable_box(struct intel_uncore_box *box) { - uncore_box_init(box); - if (box->pmu->type->ops->enable_box) box->pmu->type->ops->enable_box(box); } @@ -297,6 +288,14 @@ static inline u64 uncore_read_counter(struct intel_uncore_box *box, return box->pmu->type->ops->read_counter(box, event); } +static inline void uncore_box_init(struct intel_uncore_box *box) +{ + if (!test_and_set_bit(UNCORE_BOX_FLAG_INITIATED, &box->flags)) { + if (box->pmu->type->ops->init_box) + box->pmu->type->ops->init_box(box); + } +} + static inline bool uncore_box_is_fake(struct intel_uncore_box *box) { return (box->phys_id < 0); diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c index 3001015b755c..4562e9e22c60 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c @@ -1,6 +1,13 @@ /* Nehalem/SandBridge/Haswell uncore support */ #include "perf_event_intel_uncore.h" +/* Uncore IMC PCI IDs */ +#define PCI_DEVICE_ID_INTEL_SNB_IMC 0x0100 +#define PCI_DEVICE_ID_INTEL_IVB_IMC 0x0154 +#define PCI_DEVICE_ID_INTEL_IVB_E3_IMC 0x0150 +#define PCI_DEVICE_ID_INTEL_HSW_IMC 0x0c00 +#define PCI_DEVICE_ID_INTEL_HSW_U_IMC 0x0a04 + /* SNB event control */ #define SNB_UNC_CTL_EV_SEL_MASK 0x000000ff #define SNB_UNC_CTL_UMASK_MASK 0x0000ff00 @@ -472,6 +479,10 @@ static const struct pci_device_id hsw_uncore_pci_ids[] = { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_IMC), .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_U_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, { /* end: all zeroes */ }, }; @@ -502,6 +513,7 @@ static const struct imc_uncore_pci_dev desktop_imc_pci_ids[] = { IMC_DEV(IVB_IMC, &ivb_uncore_pci_driver), /* 3rd Gen Core processor */ IMC_DEV(IVB_E3_IMC, &ivb_uncore_pci_driver), /* Xeon E3-1200 v2/3rd Gen Core processor */ IMC_DEV(HSW_IMC, &hsw_uncore_pci_driver), /* 4th Gen Core Processor */ + IMC_DEV(HSW_U_IMC, &hsw_uncore_pci_driver), /* 4th Gen Core ULT Mobile Processor */ { /* end marker */ } }; diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c index 12d9548457e7..6d6e85dd5849 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c @@ -164,8 +164,8 @@ ((1ULL << (n)) - 1))) /* Haswell-EP Ubox */ -#define HSWEP_U_MSR_PMON_CTR0 0x705 -#define HSWEP_U_MSR_PMON_CTL0 0x709 +#define HSWEP_U_MSR_PMON_CTR0 0x709 +#define HSWEP_U_MSR_PMON_CTL0 0x705 #define HSWEP_U_MSR_PMON_FILTER 0x707 #define HSWEP_U_MSR_PMON_UCLK_FIXED_CTL 0x703 @@ -1914,7 +1914,7 @@ static struct intel_uncore_type hswep_uncore_cbox = { .name = "cbox", .num_counters = 4, .num_boxes = 18, - .perf_ctr_bits = 44, + .perf_ctr_bits = 48, .event_ctl = HSWEP_C0_MSR_PMON_CTL0, .perf_ctr = HSWEP_C0_MSR_PMON_CTR0, .event_mask = SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK, diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 2b55ee6db053..5a4668136e98 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -167,7 +167,7 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) clear_bss(); for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) - set_intr_gate(i, early_idt_handlers[i]); + set_intr_gate(i, early_idt_handler_array[i]); load_idt((const struct desc_ptr *)&idt_descr); copy_bootdata(__va(real_mode_data)); diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index d031bad9e07e..53eeb226657c 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -478,21 +478,22 @@ is486: __INIT setup_once: /* - * Set up a idt with 256 entries pointing to ignore_int, - * interrupt gates. It doesn't actually load idt - that needs - * to be done on each CPU. Interrupts are enabled elsewhere, - * when we can be relatively sure everything is ok. + * Set up a idt with 256 interrupt gates that push zero if there + * is no error code and then jump to early_idt_handler_common. + * It doesn't actually load the idt - that needs to be done on + * each CPU. Interrupts are enabled elsewhere, when we can be + * relatively sure everything is ok. */ movl $idt_table,%edi - movl $early_idt_handlers,%eax + movl $early_idt_handler_array,%eax movl $NUM_EXCEPTION_VECTORS,%ecx 1: movl %eax,(%edi) movl %eax,4(%edi) /* interrupt gate, dpl=0, present */ movl $(0x8E000000 + __KERNEL_CS),2(%edi) - addl $9,%eax + addl $EARLY_IDT_HANDLER_SIZE,%eax addl $8,%edi loop 1b @@ -524,26 +525,28 @@ setup_once: andl $0,setup_once_ref /* Once is enough, thanks */ ret -ENTRY(early_idt_handlers) +ENTRY(early_idt_handler_array) # 36(%esp) %eflags # 32(%esp) %cs # 28(%esp) %eip # 24(%rsp) error code i = 0 .rept NUM_EXCEPTION_VECTORS - .if (EXCEPTION_ERRCODE_MASK >> i) & 1 - ASM_NOP2 - .else + .ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1 pushl $0 # Dummy error code, to make stack frame uniform .endif pushl $i # 20(%esp) Vector number - jmp early_idt_handler + jmp early_idt_handler_common i = i + 1 + .fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc .endr -ENDPROC(early_idt_handlers) +ENDPROC(early_idt_handler_array) - /* This is global to keep gas from relaxing the jumps */ -ENTRY(early_idt_handler) +early_idt_handler_common: + /* + * The stack is the hardware frame, an error code or zero, and the + * vector number. + */ cld cmpl $2,(%esp) # X86_TRAP_NMI @@ -603,7 +606,7 @@ ex_entry: is_nmi: addl $8,%esp /* drop vector number and error code */ iret -ENDPROC(early_idt_handler) +ENDPROC(early_idt_handler_common) /* This is the default interrupt "handler" :-) */ ALIGN diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index ae6588b301c2..df7e78057ae0 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -321,26 +321,28 @@ bad_address: jmp bad_address __INIT - .globl early_idt_handlers -early_idt_handlers: +ENTRY(early_idt_handler_array) # 104(%rsp) %rflags # 96(%rsp) %cs # 88(%rsp) %rip # 80(%rsp) error code i = 0 .rept NUM_EXCEPTION_VECTORS - .if (EXCEPTION_ERRCODE_MASK >> i) & 1 - ASM_NOP2 - .else + .ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1 pushq $0 # Dummy error code, to make stack frame uniform .endif pushq $i # 72(%rsp) Vector number - jmp early_idt_handler + jmp early_idt_handler_common i = i + 1 + .fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc .endr +ENDPROC(early_idt_handler_array) -/* This is global to keep gas from relaxing the jumps */ -ENTRY(early_idt_handler) +early_idt_handler_common: + /* + * The stack is the hardware frame, an error code or zero, and the + * vector number. + */ cld cmpl $2,(%rsp) # X86_TRAP_NMI @@ -412,7 +414,7 @@ ENTRY(early_idt_handler) is_nmi: addq $16,%rsp # drop vector number and error code INTERRUPT_RETURN -ENDPROC(early_idt_handler) +ENDPROC(early_idt_handler_common) __INITDATA diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 009183276bb7..6185d3141219 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -173,6 +173,21 @@ static void init_thread_xstate(void) xstate_size = sizeof(struct i387_fxsave_struct); else xstate_size = sizeof(struct i387_fsave_struct); + + /* + * Quirk: we don't yet handle the XSAVES* instructions + * correctly, as we don't correctly convert between + * standard and compacted format when interfacing + * with user-space - so disable it for now. + * + * The difference is small: with recent CPUs the + * compacted format is only marginally smaller than + * the standard FPU state format. + * + * ( This is easy to backport while we are fixing + * XSAVES* support. ) + */ + setup_clear_cpu_cap(X86_FEATURE_XSAVES); } /* diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 8213da62b1b7..6e338e3b1dc0 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -57,7 +57,7 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, #endif }; -EXPORT_PER_CPU_SYMBOL_GPL(cpu_tss); +EXPORT_PER_CPU_SYMBOL(cpu_tss); #ifdef CONFIG_X86_64 static DEFINE_PER_CPU(unsigned char, is_idle); @@ -156,11 +156,13 @@ void flush_thread(void) /* FPU state will be reallocated lazily at the first use. */ drop_fpu(tsk); free_thread_xstate(tsk); - } else if (!used_math()) { - /* kthread execs. TODO: cleanup this horror. */ - if (WARN_ON(init_fpu(tsk))) - force_sig(SIGKILL, tsk); - user_fpu_begin(); + } else { + if (!tsk_used_math(tsk)) { + /* kthread execs. TODO: cleanup this horror. */ + if (WARN_ON(init_fpu(tsk))) + force_sig(SIGKILL, tsk); + user_fpu_begin(); + } restore_init_xstate(); } } diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index e5ecd20e72dd..2f355d229a58 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -141,46 +141,7 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock, set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); } -static struct pvclock_vsyscall_time_info *pvclock_vdso_info; - -static struct pvclock_vsyscall_time_info * -pvclock_get_vsyscall_user_time_info(int cpu) -{ - if (!pvclock_vdso_info) { - BUG(); - return NULL; - } - - return &pvclock_vdso_info[cpu]; -} - -struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu) -{ - return &pvclock_get_vsyscall_user_time_info(cpu)->pvti; -} - #ifdef CONFIG_X86_64 -static int pvclock_task_migrate(struct notifier_block *nb, unsigned long l, - void *v) -{ - struct task_migration_notifier *mn = v; - struct pvclock_vsyscall_time_info *pvti; - - pvti = pvclock_get_vsyscall_user_time_info(mn->from_cpu); - - /* this is NULL when pvclock vsyscall is not initialized */ - if (unlikely(pvti == NULL)) - return NOTIFY_DONE; - - pvti->migrate_count++; - - return NOTIFY_DONE; -} - -static struct notifier_block pvclock_migrate = { - .notifier_call = pvclock_task_migrate, -}; - /* * Initialize the generic pvclock vsyscall state. This will allocate * a/some page(s) for the per-vcpu pvclock information, set up a @@ -194,17 +155,12 @@ int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i, WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE); - pvclock_vdso_info = i; - for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) { __set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx, __pa(i) + (idx*PAGE_SIZE), PAGE_KERNEL_VVAR); } - - register_task_migration_notifier(&pvclock_migrate); - return 0; } #endif diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 59b69f6a2844..1d08ad3582d0 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -16,6 +16,8 @@ #include <linux/module.h> #include <linux/vmalloc.h> #include <linux/uaccess.h> +#include <asm/i387.h> /* For use_eager_fpu. Ugh! */ +#include <asm/fpu-internal.h> /* For use_eager_fpu. Ugh! */ #include <asm/user.h> #include <asm/xsave.h> #include "cpuid.h" @@ -95,6 +97,8 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) if (best && (best->eax & (F(XSAVES) | F(XSAVEC)))) best->ebx = xstate_required_size(vcpu->arch.xcr0, true); + vcpu->arch.eager_fpu = guest_cpuid_has_mpx(vcpu); + /* * The existing code assumes virtual address is 48-bit in the canonical * address checks; exit if it is ever changed. diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index c3b1ad9fca81..496b3695d3d3 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -117,4 +117,12 @@ static inline bool guest_cpuid_has_rtm(struct kvm_vcpu *vcpu) best = kvm_find_cpuid_entry(vcpu, 7, 0); return best && (best->ebx & bit(X86_FEATURE_RTM)); } + +static inline bool guest_cpuid_has_mpx(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 7, 0); + return best && (best->ebx & bit(X86_FEATURE_MPX)); +} #endif diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index d43867c33bc4..b73337634214 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3736,8 +3736,8 @@ static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu, } } -void update_permission_bitmask(struct kvm_vcpu *vcpu, - struct kvm_mmu *mmu, bool ept) +static void update_permission_bitmask(struct kvm_vcpu *vcpu, + struct kvm_mmu *mmu, bool ept) { unsigned bit, byte, pfec; u8 map; @@ -3918,6 +3918,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu) { bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP); + bool smap = kvm_read_cr4_bits(vcpu, X86_CR4_SMAP); struct kvm_mmu *context = &vcpu->arch.mmu; MMU_WARN_ON(VALID_PAGE(context->root_hpa)); @@ -3936,6 +3937,8 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu) context->base_role.cr0_wp = is_write_protection(vcpu); context->base_role.smep_andnot_wp = smep && !is_write_protection(vcpu); + context->base_role.smap_andnot_wp + = smap && !is_write_protection(vcpu); } EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu); @@ -4207,12 +4210,18 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, int bytes) { gfn_t gfn = gpa >> PAGE_SHIFT; - union kvm_mmu_page_role mask = { .word = 0 }; struct kvm_mmu_page *sp; LIST_HEAD(invalid_list); u64 entry, gentry, *spte; int npte; bool remote_flush, local_flush, zap_page; + union kvm_mmu_page_role mask = { }; + + mask.cr0_wp = 1; + mask.cr4_pae = 1; + mask.nxe = 1; + mask.smep_andnot_wp = 1; + mask.smap_andnot_wp = 1; /* * If we don't have indirect shadow pages, it means no page is @@ -4238,7 +4247,6 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, ++vcpu->kvm->stat.mmu_pte_write; kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE); - mask.cr0_wp = mask.cr4_pae = mask.nxe = 1; for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) { if (detect_write_misaligned(sp, gpa, bytes) || detect_write_flooding(sp)) { diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index c7d65637c851..0ada65ecddcf 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -71,8 +71,6 @@ enum { int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct); void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu); void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly); -void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, - bool ept); static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) { @@ -166,6 +164,8 @@ static inline bool permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, int index = (pfec >> 1) + (smap >> (X86_EFLAGS_AC_BIT - PFERR_RSVD_BIT + 1)); + WARN_ON(pfec & PFERR_RSVD_MASK); + return (mmu->permissions[index] >> pte_access) & 1; } diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index fd49c867b25a..6e6d115fe9b5 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -718,6 +718,13 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, mmu_is_nested(vcpu)); if (likely(r != RET_MMIO_PF_INVALID)) return r; + + /* + * page fault with PFEC.RSVD = 1 is caused by shadow + * page fault, should not be used to walk guest page + * table. + */ + error_code &= ~PFERR_RSVD_MASK; }; r = mmu_topup_memory_caches(vcpu); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index ce741b8650f6..9afa233b5482 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -4381,6 +4381,7 @@ static struct kvm_x86_ops svm_x86_ops = { .cache_reg = svm_cache_reg, .get_rflags = svm_get_rflags, .set_rflags = svm_set_rflags, + .fpu_activate = svm_fpu_activate, .fpu_deactivate = svm_fpu_deactivate, .tlb_flush = svm_flush_tlb, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f7b61687bd79..2d73807f0d31 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -10185,6 +10185,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .cache_reg = vmx_cache_reg, .get_rflags = vmx_get_rflags, .set_rflags = vmx_set_rflags, + .fpu_activate = vmx_fpu_activate, .fpu_deactivate = vmx_fpu_deactivate, .tlb_flush = vmx_flush_tlb, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ed31c31b2485..ea306adbbc13 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -702,8 +702,9 @@ EXPORT_SYMBOL_GPL(kvm_set_xcr); int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { unsigned long old_cr4 = kvm_read_cr4(vcpu); - unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | - X86_CR4_PAE | X86_CR4_SMEP; + unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE | + X86_CR4_SMEP | X86_CR4_SMAP; + if (cr4 & CR4_RESERVED_BITS) return 1; @@ -744,9 +745,6 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE))) kvm_mmu_reset_context(vcpu); - if ((cr4 ^ old_cr4) & X86_CR4_SMAP) - update_permission_bitmask(vcpu, vcpu->arch.walk_mmu, false); - if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE) kvm_update_cpuid(vcpu); @@ -1669,12 +1667,28 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) &guest_hv_clock, sizeof(guest_hv_clock)))) return 0; - /* - * The interface expects us to write an even number signaling that the - * update is finished. Since the guest won't see the intermediate - * state, we just increase by 2 at the end. + /* This VCPU is paused, but it's legal for a guest to read another + * VCPU's kvmclock, so we really have to follow the specification where + * it says that version is odd if data is being modified, and even after + * it is consistent. + * + * Version field updates must be kept separate. This is because + * kvm_write_guest_cached might use a "rep movs" instruction, and + * writes within a string instruction are weakly ordered. So there + * are three writes overall. + * + * As a small optimization, only write the version field in the first + * and third write. The vcpu->pv_time cache is still valid, because the + * version field is the first in the struct. */ - vcpu->hv_clock.version = guest_hv_clock.version + 2; + BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0); + + vcpu->hv_clock.version = guest_hv_clock.version + 1; + kvm_write_guest_cached(v->kvm, &vcpu->pv_time, + &vcpu->hv_clock, + sizeof(vcpu->hv_clock.version)); + + smp_wmb(); /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */ pvclock_flags = (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED); @@ -1695,6 +1709,13 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) kvm_write_guest_cached(v->kvm, &vcpu->pv_time, &vcpu->hv_clock, sizeof(vcpu->hv_clock)); + + smp_wmb(); + + vcpu->hv_clock.version++; + kvm_write_guest_cached(v->kvm, &vcpu->pv_time, + &vcpu->hv_clock, + sizeof(vcpu->hv_clock.version)); return 0; } @@ -6174,6 +6195,8 @@ void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) return; page = gfn_to_page(vcpu->kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); + if (is_error_page(page)) + return; kvm_x86_ops->set_apic_access_page_addr(vcpu, page_to_phys(page)); /* @@ -7037,7 +7060,9 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) fpu_save_init(&vcpu->arch.guest_fpu); __kernel_fpu_end(); ++vcpu->stat.fpu_reload; - kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu); + if (!vcpu->arch.eager_fpu) + kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu); + trace_kvm_fpu(0); } @@ -7053,11 +7078,21 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) { + struct kvm_vcpu *vcpu; + if (check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0) printk_once(KERN_WARNING "kvm: SMP vm created on host with unstable TSC; " "guest TSC will not be reliable\n"); - return kvm_x86_ops->vcpu_create(kvm, id); + + vcpu = kvm_x86_ops->vcpu_create(kvm, id); + + /* + * Activate fpu unconditionally in case the guest needs eager FPU. It will be + * deactivated soon if it doesn't. + */ + kvm_x86_ops->fpu_activate(vcpu); + return vcpu; } int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 5ead4d6cf3a7..70e7444c6835 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -351,18 +351,20 @@ int arch_ioremap_pmd_supported(void) */ void *xlate_dev_mem_ptr(phys_addr_t phys) { - void *addr; - unsigned long start = phys & PAGE_MASK; + unsigned long start = phys & PAGE_MASK; + unsigned long offset = phys & ~PAGE_MASK; + unsigned long vaddr; /* If page is RAM, we can use __va. Otherwise ioremap and unmap. */ if (page_is_ram(start >> PAGE_SHIFT)) return __va(phys); - addr = (void __force *)ioremap_cache(start, PAGE_SIZE); - if (addr) - addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK)); + vaddr = (unsigned long)ioremap_cache(start, PAGE_SIZE); + /* Only add the offset on success and return NULL if the ioremap() failed: */ + if (vaddr) + vaddr += offset; - return addr; + return (void *)vaddr; } void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 987514396c1e..ddeff4844a10 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -559,6 +559,13 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, if (is_ereg(dst_reg)) EMIT1(0x41); EMIT3(0xC1, add_1reg(0xC8, dst_reg), 8); + + /* emit 'movzwl eax, ax' */ + if (is_ereg(dst_reg)) + EMIT3(0x45, 0x0F, 0xB7); + else + EMIT2(0x0F, 0xB7); + EMIT1(add_2reg(0xC0, dst_reg, dst_reg)); break; case 32: /* emit 'bswap eax' to swap lower 4 bytes */ @@ -577,6 +584,27 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, break; case BPF_ALU | BPF_END | BPF_FROM_LE: + switch (imm32) { + case 16: + /* emit 'movzwl eax, ax' to zero extend 16-bit + * into 64 bit + */ + if (is_ereg(dst_reg)) + EMIT3(0x45, 0x0F, 0xB7); + else + EMIT2(0x0F, 0xB7); + EMIT1(add_2reg(0xC0, dst_reg, dst_reg)); + break; + case 32: + /* emit 'mov eax, eax' to clear upper 32-bits */ + if (is_ereg(dst_reg)) + EMIT1(0x45); + EMIT2(0x89, add_2reg(0xC0, dst_reg, dst_reg)); + break; + case 64: + /* nop */ + break; + } break; /* ST: *(u8*)(dst_reg + off) = imm */ @@ -938,7 +966,12 @@ void bpf_int_jit_compile(struct bpf_prog *prog) } ctx.cleanup_addr = proglen; - for (pass = 0; pass < 10; pass++) { + /* JITed image shrinks with every pass and the loop iterates + * until the image stops shrinking. Very large bpf programs + * may converge on the last pass. In such case do one more + * pass to emit the final image + */ + for (pass = 0; pass < 10 || image; pass++) { proglen = do_jit(prog, addrs, image, oldproglen, &ctx); if (proglen <= 0) { image = NULL; diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index e4695985f9de..14a63ed6fe09 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -325,6 +325,26 @@ static void release_pci_root_info(struct pci_host_bridge *bridge) kfree(info); } +/* + * An IO port or MMIO resource assigned to a PCI host bridge may be + * consumed by the host bridge itself or available to its child + * bus/devices. The ACPI specification defines a bit (Producer/Consumer) + * to tell whether the resource is consumed by the host bridge itself, + * but firmware hasn't used that bit consistently, so we can't rely on it. + * + * On x86 and IA64 platforms, all IO port and MMIO resources are assumed + * to be available to child bus/devices except one special case: + * IO port [0xCF8-0xCFF] is consumed by the host bridge itself + * to access PCI configuration space. + * + * So explicitly filter out PCI CFG IO ports[0xCF8-0xCFF]. + */ +static bool resource_is_pcicfg_ioport(struct resource *res) +{ + return (res->flags & IORESOURCE_IO) && + res->start == 0xCF8 && res->end == 0xCFF; +} + static void probe_pci_root_info(struct pci_root_info *info, struct acpi_device *device, int busnum, int domain, @@ -346,8 +366,8 @@ static void probe_pci_root_info(struct pci_root_info *info, "no IO and memory resources present in _CRS\n"); else resource_list_for_each_entry_safe(entry, tmp, list) { - if ((entry->res->flags & IORESOURCE_WINDOW) == 0 || - (entry->res->flags & IORESOURCE_DISABLED)) + if ((entry->res->flags & IORESOURCE_DISABLED) || + resource_is_pcicfg_ioport(entry->res)) resource_list_destroy_entry(entry); else entry->res->name = info->name; @@ -462,9 +482,16 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root) int pcibios_root_bridge_prepare(struct pci_host_bridge *bridge) { - struct pci_sysdata *sd = bridge->bus->sysdata; - - ACPI_COMPANION_SET(&bridge->dev, sd->companion); + /* + * We pass NULL as parent to pci_create_root_bus(), so if it is not NULL + * here, pci_create_root_bus() has been called by someone else and + * sysdata is likely to be different from what we expect. Let it go in + * that case. + */ + if (!bridge->dev.parent) { + struct pci_sysdata *sd = bridge->bus->sysdata; + ACPI_COMPANION_SET(&bridge->dev, sd->companion); + } return 0; } diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile index 275a3a8b78af..e97032069f88 100644 --- a/arch/x86/vdso/Makefile +++ b/arch/x86/vdso/Makefile @@ -51,7 +51,7 @@ VDSO_LDFLAGS_vdso.lds = -m64 -Wl,-soname=linux-vdso.so.1 \ $(obj)/vdso64.so.dbg: $(src)/vdso.lds $(vobjs) FORCE $(call if_changed,vdso) -HOST_EXTRACFLAGS += -I$(srctree)/tools/include -I$(srctree)/include/uapi +HOST_EXTRACFLAGS += -I$(srctree)/tools/include -I$(srctree)/include/uapi -I$(srctree)/arch/x86/include/uapi hostprogs-y += vdso2c quiet_cmd_vdso2c = VDSO2C $@ diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index 40d2473836c9..9793322751e0 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c @@ -82,15 +82,18 @@ static notrace cycle_t vread_pvclock(int *mode) cycle_t ret; u64 last; u32 version; - u32 migrate_count; u8 flags; unsigned cpu, cpu1; /* - * When looping to get a consistent (time-info, tsc) pair, we - * also need to deal with the possibility we can switch vcpus, - * so make sure we always re-fetch time-info for the current vcpu. + * Note: hypervisor must guarantee that: + * 1. cpu ID number maps 1:1 to per-CPU pvclock time info. + * 2. that per-CPU pvclock time info is updated if the + * underlying CPU changes. + * 3. that version is increased whenever underlying CPU + * changes. + * */ do { cpu = __getcpu() & VGETCPU_CPU_MASK; @@ -99,27 +102,20 @@ static notrace cycle_t vread_pvclock(int *mode) * __getcpu() calls (Gleb). */ - /* Make sure migrate_count will change if we leave the VCPU. */ - do { - pvti = get_pvti(cpu); - migrate_count = pvti->migrate_count; - - cpu1 = cpu; - cpu = __getcpu() & VGETCPU_CPU_MASK; - } while (unlikely(cpu != cpu1)); + pvti = get_pvti(cpu); version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags); /* * Test we're still on the cpu as well as the version. - * - We must read TSC of pvti's VCPU. - * - KVM doesn't follow the versioning protocol, so data could - * change before version if we left the VCPU. + * We could have been migrated just after the first + * vgetcpu but before fetching the version, so we + * wouldn't notice a version change. */ - smp_rmb(); - } while (unlikely((pvti->pvti.version & 1) || - pvti->pvti.version != version || - pvti->migrate_count != migrate_count)); + cpu1 = __getcpu() & VGETCPU_CPU_MASK; + } while (unlikely(cpu != cpu1 || + (pvti->pvti.version & 1) || + pvti->pvti.version != version)); if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT))) *mode = VCLOCK_NONE; diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 94578efd3067..46957ead3060 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1760,6 +1760,9 @@ static struct notifier_block xen_hvm_cpu_notifier = { static void __init xen_hvm_guest_init(void) { + if (xen_pv_domain()) + return; + init_hvm_pv_info(); xen_hvm_init_shared_info(); @@ -1775,6 +1778,7 @@ static void __init xen_hvm_guest_init(void) xen_hvm_init_time_ops(); xen_hvm_init_mmu_ops(); } +#endif static bool xen_nopv = false; static __init int xen_parse_nopv(char *arg) @@ -1784,14 +1788,11 @@ static __init int xen_parse_nopv(char *arg) } early_param("xen_nopv", xen_parse_nopv); -static uint32_t __init xen_hvm_platform(void) +static uint32_t __init xen_platform(void) { if (xen_nopv) return 0; - if (xen_pv_domain()) - return 0; - return xen_cpuid_base(); } @@ -1809,11 +1810,19 @@ bool xen_hvm_need_lapic(void) } EXPORT_SYMBOL_GPL(xen_hvm_need_lapic); -const struct hypervisor_x86 x86_hyper_xen_hvm __refconst = { - .name = "Xen HVM", - .detect = xen_hvm_platform, +static void xen_set_cpu_features(struct cpuinfo_x86 *c) +{ + if (xen_pv_domain()) + clear_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS); +} + +const struct hypervisor_x86 x86_hyper_xen = { + .name = "Xen", + .detect = xen_platform, +#ifdef CONFIG_XEN_PVHVM .init_platform = xen_hvm_guest_init, +#endif .x2apic_available = xen_x2apic_para_available, + .set_cpu_features = xen_set_cpu_features, }; -EXPORT_SYMBOL(x86_hyper_xen_hvm); -#endif +EXPORT_SYMBOL(x86_hyper_xen); diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index d9497698645a..53b4c0811f4f 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -88,7 +88,17 @@ static void xen_vcpu_notify_restore(void *data) tick_resume_local(); } +static void xen_vcpu_notify_suspend(void *data) +{ + tick_suspend_local(); +} + void xen_arch_resume(void) { on_each_cpu(xen_vcpu_notify_restore, NULL, 1); } + +void xen_arch_suspend(void) +{ + on_each_cpu(xen_vcpu_notify_suspend, NULL, 1); +} |