diff options
Diffstat (limited to 'arch/x86/mm/fault.c')
-rw-r--r-- | arch/x86/mm/fault.c | 289 |
1 files changed, 87 insertions, 202 deletions
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index a51df516b87b..66be9bd60307 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -30,6 +30,7 @@ #include <asm/desc.h> /* store_idt(), ... */ #include <asm/cpu_entry_area.h> /* exception stack */ #include <asm/pgtable_areas.h> /* VMALLOC_START, ... */ +#include <asm/kvm_para.h> /* kvm_handle_async_pf */ #define CREATE_TRACE_POINTS #include <asm/trace/exceptions.h> @@ -190,16 +191,13 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) return pmd_k; } -static void vmalloc_sync(void) +void arch_sync_kernel_mappings(unsigned long start, unsigned long end) { - unsigned long address; - - if (SHARED_KERNEL_PMD) - return; + unsigned long addr; - for (address = VMALLOC_START & PMD_MASK; - address >= TASK_SIZE_MAX && address < VMALLOC_END; - address += PMD_SIZE) { + for (addr = start & PMD_MASK; + addr >= TASK_SIZE_MAX && addr < VMALLOC_END; + addr += PMD_SIZE) { struct page *page; spin_lock(&pgd_lock); @@ -210,61 +208,13 @@ static void vmalloc_sync(void) pgt_lock = &pgd_page_get_mm(page)->page_table_lock; spin_lock(pgt_lock); - vmalloc_sync_one(page_address(page), address); + vmalloc_sync_one(page_address(page), addr); spin_unlock(pgt_lock); } spin_unlock(&pgd_lock); } } -void vmalloc_sync_mappings(void) -{ - vmalloc_sync(); -} - -void vmalloc_sync_unmappings(void) -{ - vmalloc_sync(); -} - -/* - * 32-bit: - * - * Handle a fault on the vmalloc or module mapping area - */ -static noinline int vmalloc_fault(unsigned long address) -{ - unsigned long pgd_paddr; - pmd_t *pmd_k; - pte_t *pte_k; - - /* Make sure we are in vmalloc area: */ - if (!(address >= VMALLOC_START && address < VMALLOC_END)) - return -1; - - /* - * Synchronize this task's top level page-table - * with the 'reference' page table. - * - * Do _not_ use "current" here. We might be inside - * an interrupt in the middle of a task switch.. - */ - pgd_paddr = read_cr3_pa(); - pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); - if (!pmd_k) - return -1; - - if (pmd_large(*pmd_k)) - return 0; - - pte_k = pte_offset_kernel(pmd_k, address); - if (!pte_present(*pte_k)) - return -1; - - return 0; -} -NOKPROBE_SYMBOL(vmalloc_fault); - /* * Did it hit the DOS screen memory VA from vm86 mode? */ @@ -329,96 +279,6 @@ out: #else /* CONFIG_X86_64: */ -void vmalloc_sync_mappings(void) -{ - /* - * 64-bit mappings might allocate new p4d/pud pages - * that need to be propagated to all tasks' PGDs. - */ - sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END); -} - -void vmalloc_sync_unmappings(void) -{ - /* - * Unmappings never allocate or free p4d/pud pages. - * No work is required here. - */ -} - -/* - * 64-bit: - * - * Handle a fault on the vmalloc area - */ -static noinline int vmalloc_fault(unsigned long address) -{ - pgd_t *pgd, *pgd_k; - p4d_t *p4d, *p4d_k; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - - /* Make sure we are in vmalloc area: */ - if (!(address >= VMALLOC_START && address < VMALLOC_END)) - return -1; - - /* - * Copy kernel mappings over when needed. This can also - * happen within a race in page table update. In the later - * case just flush: - */ - pgd = (pgd_t *)__va(read_cr3_pa()) + pgd_index(address); - pgd_k = pgd_offset_k(address); - if (pgd_none(*pgd_k)) - return -1; - - if (pgtable_l5_enabled()) { - if (pgd_none(*pgd)) { - set_pgd(pgd, *pgd_k); - arch_flush_lazy_mmu_mode(); - } else { - BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_k)); - } - } - - /* With 4-level paging, copying happens on the p4d level. */ - p4d = p4d_offset(pgd, address); - p4d_k = p4d_offset(pgd_k, address); - if (p4d_none(*p4d_k)) - return -1; - - if (p4d_none(*p4d) && !pgtable_l5_enabled()) { - set_p4d(p4d, *p4d_k); - arch_flush_lazy_mmu_mode(); - } else { - BUG_ON(p4d_pfn(*p4d) != p4d_pfn(*p4d_k)); - } - - BUILD_BUG_ON(CONFIG_PGTABLE_LEVELS < 4); - - pud = pud_offset(p4d, address); - if (pud_none(*pud)) - return -1; - - if (pud_large(*pud)) - return 0; - - pmd = pmd_offset(pud, address); - if (pmd_none(*pmd)) - return -1; - - if (pmd_large(*pmd)) - return 0; - - pte = pte_offset_kernel(pmd, address); - if (!pte_present(*pte)) - return -1; - - return 0; -} -NOKPROBE_SYMBOL(vmalloc_fault); - #ifdef CONFIG_CPU_SUP_AMD static const char errata93_warning[] = KERN_ERR @@ -554,21 +414,13 @@ static int is_errata100(struct pt_regs *regs, unsigned long address) return 0; } +/* Pentium F0 0F C7 C8 bug workaround: */ static int is_f00f_bug(struct pt_regs *regs, unsigned long address) { #ifdef CONFIG_X86_F00F_BUG - unsigned long nr; - - /* - * Pentium F0 0F C7 C8 bug workaround: - */ - if (boot_cpu_has_bug(X86_BUG_F00F)) { - nr = (address - idt_descr.address) >> 3; - - if (nr == 6) { - do_invalid_op(regs, 0); - return 1; - } + if (boot_cpu_has_bug(X86_BUG_F00F) && idt_is_f00f_address(address)) { + handle_invalid_op(regs); + return 1; } #endif return 0; @@ -926,6 +778,8 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, force_sig_fault(SIGSEGV, si_code, (void __user *)address); + local_irq_disable(); + return; } @@ -951,7 +805,7 @@ __bad_area(struct pt_regs *regs, unsigned long error_code, * Something tried to access memory that isn't in our memory map.. * Fix it, but check if it's kernel or user first.. */ - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); __bad_area_nosemaphore(regs, error_code, address, pkey, si_code); } @@ -1005,7 +859,7 @@ bad_area_access_error(struct pt_regs *regs, unsigned long error_code, * 2. T1 : set PKRU to deny access to pkey=4, touches page * 3. T1 : faults... * 4. T2: mprotect_key(foo, PAGE_SIZE, pkey=5); - * 5. T1 : enters fault handler, takes mmap_sem, etc... + * 5. T1 : enters fault handler, takes mmap_lock, etc... * 6. T1 : reaches here, sees vma_pkey(vma)=5, when we really * faulted on a pte with its pkey=4. */ @@ -1257,29 +1111,6 @@ do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code, */ WARN_ON_ONCE(hw_error_code & X86_PF_PK); - /* - * We can fault-in kernel-space virtual memory on-demand. The - * 'reference' page table is init_mm.pgd. - * - * NOTE! We MUST NOT take any locks for this case. We may - * be in an interrupt or a critical region, and should - * only copy the information from the master page table, - * nothing more. - * - * Before doing this on-demand faulting, ensure that the - * fault is not any of the following: - * 1. A fault on a PTE with a reserved bit set. - * 2. A fault caused by a user-mode access. (Do not demand- - * fault kernel memory due to user-mode accesses). - * 3. A fault caused by a page-level protection violation. - * (A demand fault would be on a non-present page which - * would have X86_PF_PROT==0). - */ - if (!(hw_error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) { - if (vmalloc_fault(address) >= 0) - return; - } - /* Was the fault spurious, caused by lazy TLB invalidation? */ if (spurious_kernel_fault(hw_error_code, address)) return; @@ -1394,15 +1225,15 @@ void do_user_addr_fault(struct pt_regs *regs, * Kernel-mode access to the user address space should only occur * on well-defined single instructions listed in the exception * tables. But, an erroneous kernel fault occurring outside one of - * those areas which also holds mmap_sem might deadlock attempting + * those areas which also holds mmap_lock might deadlock attempting * to validate the fault against the address space. * * Only do the expensive exception table search when we might be at * risk of a deadlock. This happens if we - * 1. Failed to acquire mmap_sem, and + * 1. Failed to acquire mmap_lock, and * 2. The access did not originate in userspace. */ - if (unlikely(!down_read_trylock(&mm->mmap_sem))) { + if (unlikely(!mmap_read_trylock(mm))) { if (!user_mode(regs) && !search_exception_tables(regs->ip)) { /* * Fault from code in kernel from @@ -1412,7 +1243,7 @@ void do_user_addr_fault(struct pt_regs *regs, return; } retry: - down_read(&mm->mmap_sem); + mmap_read_lock(mm); } else { /* * The above down_read_trylock() might have succeeded in @@ -1452,9 +1283,9 @@ good_area: * If for any reason at all we couldn't handle the fault, * make sure we exit gracefully rather than endlessly redo * the fault. Since we never set FAULT_FLAG_RETRY_NOWAIT, if - * we get VM_FAULT_RETRY back, the mmap_sem has been unlocked. + * we get VM_FAULT_RETRY back, the mmap_lock has been unlocked. * - * Note that handle_userfault() may also release and reacquire mmap_sem + * Note that handle_userfault() may also release and reacquire mmap_lock * (and not return with VM_FAULT_RETRY), when returning to userland to * repeat the page fault later with a VM_FAULT_NOPAGE retval * (potentially after handling any pending signal during the return to @@ -1473,7 +1304,7 @@ good_area: } /* - * If we need to retry the mmap_sem has already been released, + * If we need to retry the mmap_lock has already been released, * and if there is a fatal signal pending there is no guarantee * that we made any progress. Handle this case first. */ @@ -1483,7 +1314,7 @@ good_area: goto retry; } - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); if (unlikely(fault & VM_FAULT_ERROR)) { mm_fault_error(regs, hw_error_code, address, fault); return; @@ -1518,20 +1349,74 @@ trace_page_fault_entries(struct pt_regs *regs, unsigned long error_code, trace_page_fault_kernel(address, regs, error_code); } -dotraplinkage void -do_page_fault(struct pt_regs *regs, unsigned long hw_error_code, - unsigned long address) +static __always_inline void +handle_page_fault(struct pt_regs *regs, unsigned long error_code, + unsigned long address) { - prefetchw(¤t->mm->mmap_sem); - trace_page_fault_entries(regs, hw_error_code, address); + trace_page_fault_entries(regs, error_code, address); if (unlikely(kmmio_fault(regs, address))) return; /* Was the fault on kernel-controlled part of the address space? */ - if (unlikely(fault_in_kernel_space(address))) - do_kern_addr_fault(regs, hw_error_code, address); - else - do_user_addr_fault(regs, hw_error_code, address); + if (unlikely(fault_in_kernel_space(address))) { + do_kern_addr_fault(regs, error_code, address); + } else { + do_user_addr_fault(regs, error_code, address); + /* + * User address page fault handling might have reenabled + * interrupts. Fixing up all potential exit points of + * do_user_addr_fault() and its leaf functions is just not + * doable w/o creating an unholy mess or turning the code + * upside down. + */ + local_irq_disable(); + } +} + +DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault) +{ + unsigned long address = read_cr2(); + bool rcu_exit; + + prefetchw(¤t->mm->mmap_lock); + + /* + * KVM has two types of events that are, logically, interrupts, but + * are unfortunately delivered using the #PF vector. These events are + * "you just accessed valid memory, but the host doesn't have it right + * now, so I'll put you to sleep if you continue" and "that memory + * you tried to access earlier is available now." + * + * We are relying on the interrupted context being sane (valid RSP, + * relevant locks not held, etc.), which is fine as long as the + * interrupted context had IF=1. We are also relying on the KVM + * async pf type field and CR2 being read consistently instead of + * getting values from real and async page faults mixed up. + * + * Fingers crossed. + * + * The async #PF handling code takes care of idtentry handling + * itself. + */ + if (kvm_handle_async_pf(regs, (u32)address)) + return; + + /* + * Entry handling for valid #PF from kernel mode is slightly + * different: RCU is already watching and rcu_irq_enter() must not + * be invoked because a kernel fault on a user space address might + * sleep. + * + * In case the fault hit a RCU idle region the conditional entry + * code reenabled RCU to avoid subsequent wreckage which helps + * debugability. + */ + rcu_exit = idtentry_enter_cond_rcu(regs); + + instrumentation_begin(); + handle_page_fault(regs, error_code, address); + instrumentation_end(); + + idtentry_exit_cond_rcu(regs, rcu_exit); } -NOKPROBE_SYMBOL(do_page_fault); |