From b8f05c8803fce899d79ca66f8d7f348cf15fb40e Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 27 Feb 2015 15:45:29 +0100 Subject: x86/xen: correct bug in p2m list initialization Commit 054954eb051f35e74b75a566a96fe756015352c8 ("xen: switch to linear virtual mapped sparse p2m list") introduced an error. During initialization of the p2m list a p2m identity area mapped by a complete identity pmd entry has to be split up into smaller chunks sometimes, if a non-identity pfn is introduced in this area. If this non-identity pfn is not at index 0 of a p2m page the new p2m page needed is initialized with wrong identity entries, as the identity pfns don't start with the value corresponding to index 0, but with the initial non-identity pfn. This results in weird wrong mappings. Correct the wrong initialization by starting with the correct pfn. Cc: stable@vger.kernel.org # 3.19 Reported-by: Stefan Bader Signed-off-by: Juergen Gross Tested-by: Stefan Bader Signed-off-by: David Vrabel --- arch/x86/xen/p2m.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 740ae3026a14..9f93af56a5fc 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -563,7 +563,7 @@ static bool alloc_p2m(unsigned long pfn) if (p2m_pfn == PFN_DOWN(__pa(p2m_missing))) p2m_init(p2m); else - p2m_init_identity(p2m, pfn); + p2m_init_identity(p2m, pfn & ~(P2M_PER_PAGE - 1)); spin_lock_irqsave(&p2m_update_lock, flags); -- cgit v1.2.3-70-g09d2 From e893286918d2cde3a94850d8f7101cd1039e0c62 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Thu, 5 Mar 2015 09:13:31 +0100 Subject: x86/vdso: Fix the build on GCC5 On gcc5 the kernel does not link: ld: .eh_frame_hdr table[4] FDE at 0000000000000648 overlaps table[5] FDE at 0000000000000670. Because prior GCC versions always emitted NOPs on ALIGN directives, but gcc5 started omitting them. .LSTARTFDEDLSI1 says: /* HACK: The dwarf2 unwind routines will subtract 1 from the return address to get an address in the middle of the presumed call instruction. Since we didn't get here via a call, we need to include the nop before the real start to make up for it. */ .long .LSTART_sigreturn-1-. /* PC-relative start address */ But commit 69d0627a7f6e ("x86 vDSO: reorder vdso32 code") from 2.6.25 replaced .org __kernel_vsyscall+32,0x90 by ALIGN right before __kernel_sigreturn. Of course, ALIGN need not generate any NOP in there. Esp. gcc5 collapses vclock_gettime.o and int80.o together with no generated NOPs as "ALIGN". So fix this by adding to that point at least a single NOP and make the function ALIGN possibly with more NOPs then. Kudos for reporting and diagnosing should go to Richard. Reported-by: Richard Biener Signed-off-by: Jiri Slaby Acked-by: Andy Lutomirski Cc: Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1425543211-12542-1-git-send-email-jslaby@suse.cz Signed-off-by: Ingo Molnar --- arch/x86/vdso/vdso32/sigreturn.S | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/vdso/vdso32/sigreturn.S b/arch/x86/vdso/vdso32/sigreturn.S index 31776d0efc8c..d7ec4e251c0a 100644 --- a/arch/x86/vdso/vdso32/sigreturn.S +++ b/arch/x86/vdso/vdso32/sigreturn.S @@ -17,6 +17,7 @@ .text .globl __kernel_sigreturn .type __kernel_sigreturn,@function + nop /* this guy is needed for .LSTARTFDEDLSI1 below (watch for HACK) */ ALIGN __kernel_sigreturn: .LSTART_sigreturn: -- cgit v1.2.3-70-g09d2 From 394838c96013ba414a24ffe7a2a593a9154daadf Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 9 Mar 2015 17:42:31 -0700 Subject: x86/asm/entry/32: Fix user_mode() misuses The one in do_debug() is probably harmless, but better safe than sorry. Signed-off-by: Andy Lutomirski Cc: Cc: Borislav Petkov Cc: Dave Hansen Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/d67deaa9df5458363623001f252d1aee3215d014.1425948056.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 9d2073e2ecc9..4ff5d162ff9f 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -384,7 +384,7 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code) goto exit; conditional_sti(regs); - if (!user_mode(regs)) + if (!user_mode_vm(regs)) die("bounds", regs, error_code); if (!cpu_feature_enabled(X86_FEATURE_MPX)) { @@ -637,7 +637,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) * then it's very likely the result of an icebp/int01 trap. * User wants a sigtrap for that. */ - if (!dr6 && user_mode(regs)) + if (!dr6 && user_mode_vm(regs)) user_icebp = 1; /* Catch kmemcheck conditions first of all! */ -- cgit v1.2.3-70-g09d2 From dc9be0fac70a2ad86e31a81372bb0bdfb6945353 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 5 Mar 2015 11:54:46 +0100 Subject: kvm: move advertising of KVM_CAP_IRQFD to common code POWER supports irqfds but forgot to advertise them. Some userspace does not check for the capability, but others check it---thus they work on x86 and s390 but not POWER. To avoid that other architectures in the future make the same mistake, let common code handle KVM_CAP_IRQFD the same way as KVM_CAP_IRQFD_RESAMPLE. Reported-and-tested-by: Greg Kurz Cc: stable@vger.kernel.org Fixes: 297e21053a52f060944e9f0de4c64fad9bcd72fc Signed-off-by: Paolo Bonzini Signed-off-by: Marcelo Tosatti --- arch/s390/kvm/kvm-s390.c | 1 - arch/x86/kvm/x86.c | 1 - virt/kvm/kvm_main.c | 1 + 3 files changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index f6579cfde2df..19e17bd7aec0 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -165,7 +165,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_ONE_REG: case KVM_CAP_ENABLE_CAP: case KVM_CAP_S390_CSS_SUPPORT: - case KVM_CAP_IRQFD: case KVM_CAP_IOEVENTFD: case KVM_CAP_DEVICE_CTRL: case KVM_CAP_ENABLE_CAP_VM: diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bd7a70be41b3..32bf19ef3115 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2744,7 +2744,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_USER_NMI: case KVM_CAP_REINJECT_CONTROL: case KVM_CAP_IRQ_INJECT_STATUS: - case KVM_CAP_IRQFD: case KVM_CAP_IOEVENTFD: case KVM_CAP_IOEVENTFD_NO_LENGTH: case KVM_CAP_PIT2: diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index a1093700f3a4..a2214d9609bd 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2492,6 +2492,7 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) case KVM_CAP_SIGNAL_MSI: #endif #ifdef CONFIG_HAVE_KVM_IRQFD + case KVM_CAP_IRQFD: case KVM_CAP_IRQFD_RESAMPLE: #endif case KVM_CAP_CHECK_EXTENSION_VM: -- cgit v1.2.3-70-g09d2 From 7486341a98f26857f383aec88ffa10950087c3a1 Mon Sep 17 00:00:00 2001 From: "Li, Aubrey" Date: Wed, 11 Mar 2015 16:09:00 +0800 Subject: x86/platform, acpi: Bypass legacy PIC and PIT in ACPI hardware reduced mode On a platform in ACPI Hardware-reduced mode, the legacy PIC and PIT may not be initialized even though they may be present in silicon. Touching these legacy components causes unexpected results on the system. On the Bay Trail-T(ASUS-T100) platform, touching these legacy components blocks platform hardware low idle power state(S0ix) during system suspend. So we should bypass them in ACPI hardware reduced mode. Suggested-by: Arjan van de Ven Signed-off-by: Li Aubrey Cc: Cc: Alan Cox Cc: H. Peter Anvin Cc: Len Brown Cc: Rafael J. Wysocki Cc: Rafael J. Wysocki Link: http://lkml.kernel.org/r/54FFF81C.20703@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/acpi/boot.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 3d525c6124f6..803b684676ff 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1337,6 +1337,26 @@ static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d) return 0; } +/* + * ACPI offers an alternative platform interface model that removes + * ACPI hardware requirements for platforms that do not implement + * the PC Architecture. + * + * We initialize the Hardware-reduced ACPI model here: + */ +static void __init acpi_reduced_hw_init(void) +{ + if (acpi_gbl_reduced_hardware) { + /* + * Override x86_init functions and bypass legacy pic + * in Hardware-reduced ACPI mode + */ + x86_init.timers.timer_init = x86_init_noop; + x86_init.irqs.pre_vector_init = x86_init_noop; + legacy_pic = &null_legacy_pic; + } +} + /* * If your system is blacklisted here, but you find that acpi=force * works for you, please contact linux-acpi@vger.kernel.org @@ -1536,6 +1556,11 @@ int __init early_acpi_boot_init(void) */ early_acpi_process_madt(); + /* + * Hardware-reduced ACPI mode initialization: + */ + acpi_reduced_hw_init(); + return 0; } -- cgit v1.2.3-70-g09d2 From c8a470cab030bae8f9e6e5cfff72b047b7c627a7 Mon Sep 17 00:00:00 2001 From: Daniel J Blueman Date: Thu, 12 Mar 2015 16:55:13 +0100 Subject: x86/apic/numachip: Fix sibling map with NumaChip On NumaChip systems, the physical processor ID assignment wasn't accounting for the number of nodes in AMD multi-module processors, giving an incorrect sibling map: $ cd /sys/devices/system/cpu/cpu29/topology $ grep . * core_id:5 core_siblings:00000000,ff000000 core_siblings_list:24-31 physical_package_id:3 thread_siblings:00000000,30000000 thread_siblings_list:28-29 This fixes it: $ cd /sys/devices/system/cpu/cpu29/topology $ grep . * core_id:5 core_siblings:00000000,ffff0000 core_siblings_list:16-31 physical_package_id:1 thread_siblings:00000000,30000000 thread_siblings_list:28-29 Signed-off-by: Daniel J Blueman Signed-off-by: Borislav Petkov Cc: Cc: H. Peter Anvin Cc: Steffen Persvold Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1426135950-10110-1-git-send-email-daniel@numascale.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic_numachip.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index c2fd21fed002..017149cded07 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -37,10 +37,12 @@ static const struct apic apic_numachip; static unsigned int get_apic_id(unsigned long x) { unsigned long value; - unsigned int id; + unsigned int id = (x >> 24) & 0xff; - rdmsrl(MSR_FAM10H_NODE_ID, value); - id = ((x >> 24) & 0xffU) | ((value << 2) & 0xff00U); + if (static_cpu_has_safe(X86_FEATURE_NODEID_MSR)) { + rdmsrl(MSR_FAM10H_NODE_ID, value); + id |= (value << 2) & 0xff00; + } return id; } @@ -155,10 +157,18 @@ static int __init numachip_probe(void) static void fixup_cpu_id(struct cpuinfo_x86 *c, int node) { - if (c->phys_proc_id != node) { - c->phys_proc_id = node; - per_cpu(cpu_llc_id, smp_processor_id()) = node; + u64 val; + u32 nodes = 1; + + this_cpu_write(cpu_llc_id, node); + + /* Account for nodes per socket in multi-core-module processors */ + if (static_cpu_has_safe(X86_FEATURE_NODEID_MSR)) { + rdmsrl(MSR_FAM10H_NODE_ID, val); + nodes = ((val >> 3) & 7) + 1; } + + c->phys_proc_id = node / nodes; } static int __init numachip_system_init(void) -- cgit v1.2.3-70-g09d2 From c1a6bff28cbff796bf6e7db5cf42ec9244911be2 Mon Sep 17 00:00:00 2001 From: Petr Matousek Date: Wed, 11 Mar 2015 12:16:09 +0100 Subject: kvm: x86: i8259: return initialized data on invalid-size read If data is read from PIC with invalid access size, the return data stays uninitialized even though success is returned. Fix this by always initializing the data. Signed-off-by: Petr Matousek Reported-by: Nadav Amit Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/i8259.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index cc31f7c06d3d..9541ba34126b 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -507,6 +507,7 @@ static int picdev_read(struct kvm_pic *s, return -EOPNOTSUPP; if (len != 1) { + memset(val, 0, len); pr_pic_unimpl("non byte read\n"); return 0; } -- cgit v1.2.3-70-g09d2 From ccfe8c3f7e52ae83155cb038753f4c75b774ca8a Mon Sep 17 00:00:00 2001 From: Stephan Mueller Date: Thu, 12 Mar 2015 09:17:51 +0100 Subject: crypto: aesni - fix memory usage in GCM decryption The kernel crypto API logic requires the caller to provide the length of (ciphertext || authentication tag) as cryptlen for the AEAD decryption operation. Thus, the cipher implementation must calculate the size of the plaintext output itself and cannot simply use cryptlen. The RFC4106 GCM decryption operation tries to overwrite cryptlen memory in req->dst. As the destination buffer for decryption only needs to hold the plaintext memory but cryptlen references the input buffer holding (ciphertext || authentication tag), the assumption of the destination buffer length in RFC4106 GCM operation leads to a too large size. This patch simply uses the already calculated plaintext size. In addition, this patch fixes the offset calculation of the AAD buffer pointer: as mentioned before, cryptlen already includes the size of the tag. Thus, the tag does not need to be added. With the addition, the AAD will be written beyond the already allocated buffer. Note, this fixes a kernel crash that can be triggered from user space via AF_ALG(aead) -- simply use the libkcapi test application from [1] and update it to use rfc4106-gcm-aes. Using [1], the changes were tested using CAVS vectors to demonstrate that the crypto operation still delivers the right results. [1] http://www.chronox.de/libkcapi.html CC: Tadeusz Struk Cc: stable@vger.kernel.org Signed-off-by: Stephan Mueller Signed-off-by: Herbert Xu --- arch/x86/crypto/aesni-intel_glue.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 947c6bf52c33..54f60ab41c63 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -1155,7 +1155,7 @@ static int __driver_rfc4106_decrypt(struct aead_request *req) src = kmalloc(req->cryptlen + req->assoclen, GFP_ATOMIC); if (!src) return -ENOMEM; - assoc = (src + req->cryptlen + auth_tag_len); + assoc = (src + req->cryptlen); scatterwalk_map_and_copy(src, req->src, 0, req->cryptlen, 0); scatterwalk_map_and_copy(assoc, req->assoc, 0, req->assoclen, 0); @@ -1180,7 +1180,7 @@ static int __driver_rfc4106_decrypt(struct aead_request *req) scatterwalk_done(&src_sg_walk, 0, 0); scatterwalk_done(&assoc_sg_walk, 0, 0); } else { - scatterwalk_map_and_copy(dst, req->dst, 0, req->cryptlen, 1); + scatterwalk_map_and_copy(dst, req->dst, 0, tempCipherLen, 1); kfree(src); } return retval; -- cgit v1.2.3-70-g09d2 From a7c80ebcac3068b1c3cb27d538d29558c30010c8 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 13 Mar 2015 09:53:09 +0100 Subject: x86/fpu: Avoid math_state_restore() without used_math() in __restore_xstate_sig() math_state_restore() assumes it is called with irqs disabled, but this is not true if the caller is __restore_xstate_sig(). This means that if ia32_fxstate == T and __copy_from_user() fails, __restore_xstate_sig() returns with irqs disabled too. This triggers: BUG: sleeping function called from invalid context at kernel/locking/rwsem.c:41 dump_stack ___might_sleep ? _raw_spin_unlock_irqrestore __might_sleep down_read ? _raw_spin_unlock_irqrestore print_vma_addr signal_fault sys32_rt_sigreturn Change __restore_xstate_sig() to call set_used_math() unconditionally. This avoids enabling and disabling interrupts in math_state_restore(). If copy_from_user() fails, we can simply do fpu_finit() by hand. [ Note: this is only the first step. math_state_restore() should not check used_math(), it should set this flag. While init_fpu() should simply die. ] Signed-off-by: Oleg Nesterov Signed-off-by: Borislav Petkov Cc: Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Pekka Riikonen Cc: Quentin Casasnovas Cc: Rik van Riel Cc: Suresh Siddha Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150307153844.GB25954@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/xsave.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 34f66e58a896..cdc6cf903078 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -379,7 +379,7 @@ int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size) * thread's fpu state, reconstruct fxstate from the fsave * header. Sanitize the copied state etc. */ - struct xsave_struct *xsave = &tsk->thread.fpu.state->xsave; + struct fpu *fpu = &tsk->thread.fpu; struct user_i387_ia32_struct env; int err = 0; @@ -393,14 +393,15 @@ int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size) */ drop_fpu(tsk); - if (__copy_from_user(xsave, buf_fx, state_size) || + if (__copy_from_user(&fpu->state->xsave, buf_fx, state_size) || __copy_from_user(&env, buf, sizeof(env))) { + fpu_finit(fpu); err = -1; } else { sanitize_restored_xstate(tsk, &env, xstate_bv, fx_only); - set_used_math(); } + set_used_math(); if (use_eager_fpu()) { preempt_disable(); math_state_restore(); -- cgit v1.2.3-70-g09d2 From f4c3686386393c120710dd34df2a74183ab805fd Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 13 Mar 2015 09:53:10 +0100 Subject: x86/fpu: Drop_fpu() should not assume that tsk equals current drop_fpu() does clear_used_math() and usually this is correct because tsk == current. However switch_fpu_finish()->restore_fpu_checking() is called before __switch_to() updates the "current_task" variable. If it fails, we will wrongly clear the PF_USED_MATH flag of the previous task. So use clear_stopped_child_used_math() instead. Signed-off-by: Oleg Nesterov Signed-off-by: Borislav Petkov Reviewed-by: Rik van Riel Cc: Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Pekka Riikonen Cc: Quentin Casasnovas Cc: Suresh Siddha Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150309171041.GB11388@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu-internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h index 0dbc08282291..72ba21a8b5fc 100644 --- a/arch/x86/include/asm/fpu-internal.h +++ b/arch/x86/include/asm/fpu-internal.h @@ -370,7 +370,7 @@ static inline void drop_fpu(struct task_struct *tsk) preempt_disable(); tsk->thread.fpu_counter = 0; __drop_fpu(tsk); - clear_used_math(); + clear_stopped_child_used_math(tsk); preempt_enable(); } -- cgit v1.2.3-70-g09d2 From 670125bda1d86edfadf81dc56a87582ac7fbd47b Mon Sep 17 00:00:00 2001 From: Wincy Van Date: Wed, 4 Mar 2015 14:31:56 +0800 Subject: KVM: VMX: Set msr bitmap correctly if vcpu is in guest mode In commit 3af18d9c5fe9 ("KVM: nVMX: Prepare for using hardware MSR bitmap"), we are setting MSR_BITMAP in prepare_vmcs02 if we should use hardware. This is not enough since the field will be modified by following vmx_set_efer. Fix this by setting vmx_msr_bitmap_nested in vmx_set_msr_bitmap if vcpu is in guest mode. Signed-off-by: Wincy Van Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f7b20b417a3a..10a481b7674d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2168,7 +2168,10 @@ static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu) { unsigned long *msr_bitmap; - if (irqchip_in_kernel(vcpu->kvm) && apic_x2apic_mode(vcpu->arch.apic)) { + if (is_guest_mode(vcpu)) + msr_bitmap = vmx_msr_bitmap_nested; + else if (irqchip_in_kernel(vcpu->kvm) && + apic_x2apic_mode(vcpu->arch.apic)) { if (is_long_mode(vcpu)) msr_bitmap = vmx_msr_bitmap_longmode_x2apic; else @@ -9218,9 +9221,9 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) } if (cpu_has_vmx_msr_bitmap() && - exec_control & CPU_BASED_USE_MSR_BITMAPS && - nested_vmx_merge_msr_bitmap(vcpu, vmcs12)) { - vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_nested)); + exec_control & CPU_BASED_USE_MSR_BITMAPS) { + nested_vmx_merge_msr_bitmap(vcpu, vmcs12); + /* MSR_BITMAP will be set by following vmx_set_efer. */ } else exec_control &= ~CPU_BASED_USE_MSR_BITMAPS; -- cgit v1.2.3-70-g09d2 From 69797dafe35541bfff1989c0b37c66ed785faf0e Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 16 Mar 2015 11:06:28 +0100 Subject: Revert "x86/mm/ASLR: Propagate base load address calculation" This reverts commit: f47233c2d34f ("x86/mm/ASLR: Propagate base load address calculation") The main reason for the revert is that the new boot flag does not work at all currently, and in order to make this work, we need non-trivial changes to the x86 boot code which we didn't manage to get done in time for merging. And even if we did, they would've been too risky so instead of rushing things and break booting 4.1 on boxes left and right, we will be very strict and conservative and will take our time with this to fix and test it properly. Reported-by: Yinghai Lu Signed-off-by: Borislav Petkov Cc: Ard Biesheuvel Cc: Baoquan He Cc: H. Peter Anvin Cc: Josh Triplett Cc: Junjie Mao Cc: Kees Cook Cc: Linus Torvalds Cc: Matt Fleming Link: http://lkml.kernel.org/r/20150316100628.GD22995@pd.tnic Signed-off-by: Ingo Molnar --- arch/x86/boot/compressed/aslr.c | 34 +--------------------------------- arch/x86/boot/compressed/misc.c | 3 +-- arch/x86/boot/compressed/misc.h | 6 ++---- arch/x86/include/asm/page_types.h | 2 -- arch/x86/include/uapi/asm/bootparam.h | 1 - arch/x86/kernel/module.c | 10 +++++++++- arch/x86/kernel/setup.c | 22 ++++------------------ 7 files changed, 17 insertions(+), 61 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c index 7083c16cccba..bb1376381985 100644 --- a/arch/x86/boot/compressed/aslr.c +++ b/arch/x86/boot/compressed/aslr.c @@ -14,13 +14,6 @@ static const char build_str[] = UTS_RELEASE " (" LINUX_COMPILE_BY "@" LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION; -struct kaslr_setup_data { - __u64 next; - __u32 type; - __u32 len; - __u8 data[1]; -} kaslr_setup_data; - #define I8254_PORT_CONTROL 0x43 #define I8254_PORT_COUNTER0 0x40 #define I8254_CMD_READBACK 0xC0 @@ -302,29 +295,7 @@ static unsigned long find_random_addr(unsigned long minimum, return slots_fetch_random(); } -static void add_kaslr_setup_data(struct boot_params *params, __u8 enabled) -{ - struct setup_data *data; - - kaslr_setup_data.type = SETUP_KASLR; - kaslr_setup_data.len = 1; - kaslr_setup_data.next = 0; - kaslr_setup_data.data[0] = enabled; - - data = (struct setup_data *)(unsigned long)params->hdr.setup_data; - - while (data && data->next) - data = (struct setup_data *)(unsigned long)data->next; - - if (data) - data->next = (unsigned long)&kaslr_setup_data; - else - params->hdr.setup_data = (unsigned long)&kaslr_setup_data; - -} - -unsigned char *choose_kernel_location(struct boot_params *params, - unsigned char *input, +unsigned char *choose_kernel_location(unsigned char *input, unsigned long input_size, unsigned char *output, unsigned long output_size) @@ -335,17 +306,14 @@ unsigned char *choose_kernel_location(struct boot_params *params, #ifdef CONFIG_HIBERNATION if (!cmdline_find_option_bool("kaslr")) { debug_putstr("KASLR disabled by default...\n"); - add_kaslr_setup_data(params, 0); goto out; } #else if (cmdline_find_option_bool("nokaslr")) { debug_putstr("KASLR disabled by cmdline...\n"); - add_kaslr_setup_data(params, 0); goto out; } #endif - add_kaslr_setup_data(params, 1); /* Record the various known unsafe memory ranges. */ mem_avoid_init((unsigned long)input, input_size, diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 5903089c818f..a950864a64da 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -401,8 +401,7 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap, * the entire decompressed kernel plus relocation table, or the * entire decompressed kernel plus .bss and .brk sections. */ - output = choose_kernel_location(real_mode, input_data, input_len, - output, + output = choose_kernel_location(input_data, input_len, output, output_len > run_size ? output_len : run_size); diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index ee3576b2666b..04477d68403f 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -57,8 +57,7 @@ int cmdline_find_option_bool(const char *option); #if CONFIG_RANDOMIZE_BASE /* aslr.c */ -unsigned char *choose_kernel_location(struct boot_params *params, - unsigned char *input, +unsigned char *choose_kernel_location(unsigned char *input, unsigned long input_size, unsigned char *output, unsigned long output_size); @@ -66,8 +65,7 @@ unsigned char *choose_kernel_location(struct boot_params *params, bool has_cpuflag(int flag); #else static inline -unsigned char *choose_kernel_location(struct boot_params *params, - unsigned char *input, +unsigned char *choose_kernel_location(unsigned char *input, unsigned long input_size, unsigned char *output, unsigned long output_size) diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index 95e11f79f123..f97fbe3abb67 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -51,8 +51,6 @@ extern int devmem_is_allowed(unsigned long pagenr); extern unsigned long max_low_pfn_mapped; extern unsigned long max_pfn_mapped; -extern bool kaslr_enabled; - static inline phys_addr_t get_max_mapped(void) { return (phys_addr_t)max_pfn_mapped << PAGE_SHIFT; diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h index 44e6dd7e36a2..225b0988043a 100644 --- a/arch/x86/include/uapi/asm/bootparam.h +++ b/arch/x86/include/uapi/asm/bootparam.h @@ -7,7 +7,6 @@ #define SETUP_DTB 2 #define SETUP_PCI 3 #define SETUP_EFI 4 -#define SETUP_KASLR 5 /* ram_size flags */ #define RAMDISK_IMAGE_START_MASK 0x07FF diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 9bbb9b35c144..d1ac80b72c72 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -47,13 +47,21 @@ do { \ #ifdef CONFIG_RANDOMIZE_BASE static unsigned long module_load_offset; +static int randomize_modules = 1; /* Mutex protects the module_load_offset. */ static DEFINE_MUTEX(module_kaslr_mutex); +static int __init parse_nokaslr(char *p) +{ + randomize_modules = 0; + return 0; +} +early_param("nokaslr", parse_nokaslr); + static unsigned long int get_module_load_offset(void) { - if (kaslr_enabled) { + if (randomize_modules) { mutex_lock(&module_kaslr_mutex); /* * Calculate the module_load_offset the first time this diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 98dc9317286e..0a2421cca01f 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -122,8 +122,6 @@ unsigned long max_low_pfn_mapped; unsigned long max_pfn_mapped; -bool __read_mostly kaslr_enabled = false; - #ifdef CONFIG_DMI RESERVE_BRK(dmi_alloc, 65536); #endif @@ -427,11 +425,6 @@ static void __init reserve_initrd(void) } #endif /* CONFIG_BLK_DEV_INITRD */ -static void __init parse_kaslr_setup(u64 pa_data, u32 data_len) -{ - kaslr_enabled = (bool)(pa_data + sizeof(struct setup_data)); -} - static void __init parse_setup_data(void) { struct setup_data *data; @@ -457,9 +450,6 @@ static void __init parse_setup_data(void) case SETUP_EFI: parse_efi_setup(pa_data, data_len); break; - case SETUP_KASLR: - parse_kaslr_setup(pa_data, data_len); - break; default: break; } @@ -842,14 +832,10 @@ static void __init trim_low_memory_range(void) static int dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p) { - if (kaslr_enabled) - pr_emerg("Kernel Offset: 0x%lx from 0x%lx (relocation range: 0x%lx-0x%lx)\n", - (unsigned long)&_text - __START_KERNEL, - __START_KERNEL, - __START_KERNEL_map, - MODULES_VADDR-1); - else - pr_emerg("Kernel Offset: disabled\n"); + pr_emerg("Kernel Offset: 0x%lx from 0x%lx " + "(relocation range: 0x%lx-0x%lx)\n", + (unsigned long)&_text - __START_KERNEL, __START_KERNEL, + __START_KERNEL_map, MODULES_VADDR-1); return 0; } -- cgit v1.2.3-70-g09d2 From 0790ec172de1bd2e23f1dbd4925426b6cc3c1b72 Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Tue, 17 Mar 2015 14:02:32 +0100 Subject: KVM: nVMX: mask unrestricted_guest if disabled on L0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If EPT was enabled, unrestricted_guest was allowed in L1 regardless of L0. L1 triple faulted when running L2 guest that required emulation. Another side effect was 'WARN_ON_ONCE(vmx->nested.nested_run_pending)' in L0's dmesg: WARNING: CPU: 0 PID: 0 at arch/x86/kvm/vmx.c:9190 nested_vmx_vmexit+0x96e/0xb00 [kvm_intel] () Prevent this scenario by masking SECONDARY_EXEC_UNRESTRICTED_GUEST when the host doesn't have it enabled. Fixes: 78051e3b7e35 ("KVM: nVMX: Disable unrestricted mode if ept=0") Cc: stable@vger.kernel.org Tested-By: Kashyap Chamarthy Signed-off-by: Radim Krčmář Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 10a481b7674d..ae4f6d35d19c 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2479,8 +2479,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) if (enable_ept) { /* nested EPT: emulate EPT also to L1 */ vmx->nested.nested_vmx_secondary_ctls_high |= - SECONDARY_EXEC_ENABLE_EPT | - SECONDARY_EXEC_UNRESTRICTED_GUEST; + SECONDARY_EXEC_ENABLE_EPT; vmx->nested.nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT | VMX_EPTP_WB_BIT | VMX_EPT_2MB_PAGE_BIT | VMX_EPT_INVEPT_BIT; @@ -2494,6 +2493,10 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) } else vmx->nested.nested_vmx_ept_caps = 0; + if (enable_unrestricted_guest) + vmx->nested.nested_vmx_secondary_ctls_high |= + SECONDARY_EXEC_UNRESTRICTED_GUEST; + /* miscellaneous data */ rdmsr(MSR_IA32_VMX_MISC, vmx->nested.nested_vmx_misc_low, -- cgit v1.2.3-70-g09d2 From 9e8ce4b96b781b003e3174fbbc62e1d4388c8b8f Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 20 Mar 2015 14:56:19 +0100 Subject: Revert "x86/PCI: Refine the way to release PCI IRQ resources" Commit b4b55cda5874 (Refine the way to release PCI IRQ resources) introduced a regression in the PCI IRQ resource management by causing the IRQ resource of a device, established when pci_enabled_device() is called on a fully disabled device, to be released when the driver is unbound from the device, regardless of the enable_cnt. This leads to the situation that an ill-behaved driver can now make a device unusable to subsequent drivers by an imbalance in their use of pci_enable/disable_device(). That is a serious problem for secondary drivers like vfio-pci, which are innocent of the transgressions of the previous driver. Since the solution of this problem is not immediate and requires further discussion, revert commit b4b55cda5874 and the issue it was supposed to address (a bug related to xen-pciback) will be taken care of in a different way going forward. Reported-by: Alex Williamson Signed-off-by: Rafael J. Wysocki --- arch/x86/include/asm/pci_x86.h | 2 ++ arch/x86/pci/common.c | 34 ++++++---------------------------- arch/x86/pci/intel_mid_pci.c | 4 ++-- arch/x86/pci/irq.c | 15 ++++++++++++++- drivers/acpi/pci_irq.c | 9 ++++++++- 5 files changed, 32 insertions(+), 32 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index fa1195dae425..164e3f8d3c3d 100644 --- a/arch/x86/include/asm/pci_x86.h +++ b/arch/x86/include/asm/pci_x86.h @@ -93,6 +93,8 @@ extern raw_spinlock_t pci_config_lock; extern int (*pcibios_enable_irq)(struct pci_dev *dev); extern void (*pcibios_disable_irq)(struct pci_dev *dev); +extern bool mp_should_keep_irq(struct device *dev); + struct pci_raw_ops { int (*read)(unsigned int domain, unsigned int bus, unsigned int devfn, int reg, int len, u32 *val); diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 3d2612b68694..2fb384724ebb 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -513,31 +513,6 @@ void __init pcibios_set_cache_line_size(void) } } -/* - * Some device drivers assume dev->irq won't change after calling - * pci_disable_device(). So delay releasing of IRQ resource to driver - * unbinding time. Otherwise it will break PM subsystem and drivers - * like xen-pciback etc. - */ -static int pci_irq_notifier(struct notifier_block *nb, unsigned long action, - void *data) -{ - struct pci_dev *dev = to_pci_dev(data); - - if (action != BUS_NOTIFY_UNBOUND_DRIVER) - return NOTIFY_DONE; - - if (pcibios_disable_irq) - pcibios_disable_irq(dev); - - return NOTIFY_OK; -} - -static struct notifier_block pci_irq_nb = { - .notifier_call = pci_irq_notifier, - .priority = INT_MIN, -}; - int __init pcibios_init(void) { if (!raw_pci_ops) { @@ -550,9 +525,6 @@ int __init pcibios_init(void) if (pci_bf_sort >= pci_force_bf) pci_sort_breadthfirst(); - - bus_register_notifier(&pci_bus_type, &pci_irq_nb); - return 0; } @@ -711,6 +683,12 @@ int pcibios_enable_device(struct pci_dev *dev, int mask) return 0; } +void pcibios_disable_device (struct pci_dev *dev) +{ + if (!pci_dev_msi_enabled(dev) && pcibios_disable_irq) + pcibios_disable_irq(dev); +} + int pci_ext_cfg_avail(void) { if (raw_pci_ext_ops) diff --git a/arch/x86/pci/intel_mid_pci.c b/arch/x86/pci/intel_mid_pci.c index efb849323c74..852aa4c92da0 100644 --- a/arch/x86/pci/intel_mid_pci.c +++ b/arch/x86/pci/intel_mid_pci.c @@ -234,10 +234,10 @@ static int intel_mid_pci_irq_enable(struct pci_dev *dev) static void intel_mid_pci_irq_disable(struct pci_dev *dev) { - if (dev->irq_managed && dev->irq > 0) { + if (!mp_should_keep_irq(&dev->dev) && dev->irq_managed && + dev->irq > 0) { mp_unmap_irq(dev->irq); dev->irq_managed = 0; - dev->irq = 0; } } diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index e71b3dbd87b8..5dc6ca5e1741 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -1256,9 +1256,22 @@ static int pirq_enable_irq(struct pci_dev *dev) return 0; } +bool mp_should_keep_irq(struct device *dev) +{ + if (dev->power.is_prepared) + return true; +#ifdef CONFIG_PM + if (dev->power.runtime_status == RPM_SUSPENDING) + return true; +#endif + + return false; +} + static void pirq_disable_irq(struct pci_dev *dev) { - if (io_apic_assign_pci_irqs && dev->irq_managed && dev->irq) { + if (io_apic_assign_pci_irqs && !mp_should_keep_irq(&dev->dev) && + dev->irq_managed && dev->irq) { mp_unmap_irq(dev->irq); dev->irq = 0; dev->irq_managed = 0; diff --git a/drivers/acpi/pci_irq.c b/drivers/acpi/pci_irq.c index e7f718d6918a..b1def411c0b8 100644 --- a/drivers/acpi/pci_irq.c +++ b/drivers/acpi/pci_irq.c @@ -485,6 +485,14 @@ void acpi_pci_irq_disable(struct pci_dev *dev) if (!pin || !dev->irq_managed || dev->irq <= 0) return; + /* Keep IOAPIC pin configuration when suspending */ + if (dev->dev.power.is_prepared) + return; +#ifdef CONFIG_PM + if (dev->dev.power.runtime_status == RPM_SUSPENDING) + return; +#endif + entry = acpi_pci_irq_lookup(dev, pin); if (!entry) return; @@ -505,6 +513,5 @@ void acpi_pci_irq_disable(struct pci_dev *dev) if (gsi >= 0) { acpi_unregister_gsi(gsi); dev->irq_managed = 0; - dev->irq = 0; } } -- cgit v1.2.3-70-g09d2 From c806a6ad35bfa6c92249cd0ca4772d5ac3f8cb68 Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Wed, 18 Mar 2015 19:38:22 +0100 Subject: KVM: x86: call irq notifiers with directed EOI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit kvm_ioapic_update_eoi() wasn't called if directed EOI was enabled. We need to do that for irq notifiers. (Like with edge interrupts.) Fix it by skipping EOI broadcast only. Bug: https://bugzilla.kernel.org/show_bug.cgi?id=82211 Signed-off-by: Radim Krčmář Reviewed-by: Paolo Bonzini Tested-by: Bandan Das Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/ioapic.c | 4 +++- arch/x86/kvm/lapic.c | 3 +-- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index b1947e0f3e10..46d4449772bc 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -422,6 +422,7 @@ static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, struct kvm_ioapic *ioapic, int vector, int trigger_mode) { int i; + struct kvm_lapic *apic = vcpu->arch.apic; for (i = 0; i < IOAPIC_NUM_PINS; i++) { union kvm_ioapic_redirect_entry *ent = &ioapic->redirtbl[i]; @@ -443,7 +444,8 @@ static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, kvm_notify_acked_irq(ioapic->kvm, KVM_IRQCHIP_IOAPIC, i); spin_lock(&ioapic->lock); - if (trigger_mode != IOAPIC_LEVEL_TRIG) + if (trigger_mode != IOAPIC_LEVEL_TRIG || + kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) continue; ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index bd4e34de24c7..4ee827d7bf36 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -833,8 +833,7 @@ int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2) static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector) { - if (!(kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) && - kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) { + if (kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) { int trigger_mode; if (apic_test_vector(vector, apic->regs + APIC_TMR)) trigger_mode = IOAPIC_LEVEL_TRIG; -- cgit v1.2.3-70-g09d2 From b3494a4ab20f6bdf74cdf2badf7918bb65ee8a00 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 23 Mar 2015 12:32:54 -0700 Subject: x86/asm/entry: Check for syscall exit work with IRQs disabled We currently have a race: if we're preempted during syscall exit, we can fail to process syscall return work that is queued up while we're preempted in ret_from_sys_call after checking ti.flags. Fix it by disabling interrupts before checking ti.flags. Reported-by: Stefan Seyfried Reported-by: Takashi Iwai Signed-off-by: Andy Lutomirski Acked-by: Denys Vlasenko Cc: Jiri Kosina Cc: Tejun Heo Fixes: 96b6352c1271 ("x86_64, entry: Remove the syscall exit audit") Link: http://lkml.kernel.org/r/189320d42b4d671df78c10555976bb10af1ffc75.1427137498.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 1d74d161687c..2babb393915e 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -364,12 +364,21 @@ system_call_fastpath: * Has incomplete stack frame and undefined top of stack. */ ret_from_sys_call: - testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) - jnz int_ret_from_sys_call_fixup /* Go the the slow path */ - LOCKDEP_SYS_EXIT DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF + + /* + * We must check ti flags with interrupts (or at least preemption) + * off because we must *never* return to userspace without + * processing exit work that is enqueued if we're preempted here. + * In particular, returning to userspace with any of the one-shot + * flags (TIF_NOTIFY_RESUME, TIF_USER_RETURN_NOTIFY, etc) set is + * very bad. + */ + testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) + jnz int_ret_from_sys_call_fixup /* Go the the slow path */ + CFI_REMEMBER_STATE /* * sysretq will re-enable interrupts: @@ -386,7 +395,7 @@ ret_from_sys_call: int_ret_from_sys_call_fixup: FIXUP_TOP_OF_STACK %r11, -ARGOFFSET - jmp int_ret_from_sys_call + jmp int_ret_from_sys_call_irqs_off /* Do syscall tracing */ tracesys: @@ -432,6 +441,7 @@ tracesys_phase2: GLOBAL(int_ret_from_sys_call) DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF +int_ret_from_sys_call_irqs_off: movl $_TIF_ALLWORK_MASK,%edi /* edi: mask to check */ GLOBAL(int_with_check) -- cgit v1.2.3-70-g09d2