Diffstat (limited to 'arch')
574 files changed, 14908 insertions, 10525 deletions
diff --git a/arch/Kconfig b/arch/Kconfig index 21d0089117fe..2520ca5b42eb 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -931,6 +931,18 @@ config STRICT_MODULE_RWX config ARCH_WANT_RELAX_ORDER bool +config ARCH_HAS_REFCOUNT + bool + help + An architecture selects this when it has implemented refcount_t + using open coded assembly primitives that provide an optimized + refcount_t implementation, possibly at the expense of some full + refcount state checks of CONFIG_REFCOUNT_FULL=y. + + The refcount overflow check behavior, however, must be retained. + Catching overflows is the primary security concern for protecting + against bugs in reference counts. + config REFCOUNT_FULL bool "Perform full reference count validation at the expense of speed" help diff --git a/arch/alpha/defconfig b/arch/alpha/defconfig index 539e8b5a6cbd..f4ec420d7f2d 100644 --- a/arch/alpha/defconfig +++ b/arch/alpha/defconfig @@ -19,7 +19,6 @@ CONFIG_INET_AH=m CONFIG_INET_ESP=m # CONFIG_IPV6 is not set CONFIG_NETFILTER=y -CONFIG_IP_NF_QUEUE=m CONFIG_IP_NF_IPTABLES=m CONFIG_IP_NF_FILTER=m CONFIG_VLAN_8021Q=m @@ -57,7 +56,6 @@ CONFIG_SERIAL_8250_CONSOLE=y CONFIG_RTC=y CONFIG_EXT2_FS=y CONFIG_REISERFS_FS=m -CONFIG_AUTOFS_FS=m CONFIG_ISO9660_FS=y CONFIG_MSDOS_FS=y CONFIG_VFAT_FS=y diff --git a/arch/alpha/include/asm/Kbuild b/arch/alpha/include/asm/Kbuild index d103db5af5ff..5b974ab8425c 100644 --- a/arch/alpha/include/asm/Kbuild +++ b/arch/alpha/include/asm/Kbuild @@ -3,6 +3,7 @@ generic-y += clkdev.h generic-y += exec.h generic-y += export.h +generic-y += fb.h generic-y += irq_work.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h diff --git a/arch/alpha/include/asm/asm-prototypes.h b/arch/alpha/include/asm/asm-prototypes.h new file mode 100644 index 000000000000..d12c68ea340b --- /dev/null +++ b/arch/alpha/include/asm/asm-prototypes.h @@ -0,0 +1,18 @@ +#include <linux/spinlock.h> + +#include <asm/checksum.h> +#include <asm/console.h> +#include <asm/page.h> +#include <asm/string.h> +#include <asm/uaccess.h> + +#include <asm-generic/asm-prototypes.h> + +extern void __divl(void); +extern void __reml(void); +extern void __divq(void); +extern void __remq(void); +extern void __divlu(void); +extern void __remlu(void); +extern void __divqu(void); +extern void __remqu(void); diff --git a/arch/alpha/include/asm/core_marvel.h b/arch/alpha/include/asm/core_marvel.h index dad300fa14ce..8dcf9dbda618 100644 --- a/arch/alpha/include/asm/core_marvel.h +++ b/arch/alpha/include/asm/core_marvel.h @@ -312,7 +312,7 @@ struct io7 { io7_port7_csrs *csrs; struct io7_port ports[IO7_NUM_PORTS]; - spinlock_t irq_lock; + raw_spinlock_t irq_lock; }; #ifndef __EXTERN_INLINE diff --git a/arch/alpha/include/asm/fb.h b/arch/alpha/include/asm/fb.h deleted file mode 100644 index fa9bbb96b2b3..000000000000 --- a/arch/alpha/include/asm/fb.h +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef _ASM_FB_H_ -#define _ASM_FB_H_ -#include <linux/device.h> - -/* Caching is off in the I/O space quadrant by design. */ -#define fb_pgprotect(...) 
do {} while (0) - -static inline int fb_is_primary_device(struct fb_info *info) -{ - return 0; -} - -#endif /* _ASM_FB_H_ */ diff --git a/arch/alpha/include/asm/futex.h b/arch/alpha/include/asm/futex.h index fb01dfb760c2..05a70edd57b6 100644 --- a/arch/alpha/include/asm/futex.h +++ b/arch/alpha/include/asm/futex.h @@ -25,18 +25,10 @@ : "r" (uaddr), "r"(oparg) \ : "memory") -static inline int futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) +static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, + u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; pagefault_disable(); @@ -62,17 +54,9 @@ static inline int futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } diff --git a/arch/alpha/include/asm/io.h b/arch/alpha/include/asm/io.h index ff4049155c84..4d61d2a50c52 100644 --- a/arch/alpha/include/asm/io.h +++ b/arch/alpha/include/asm/io.h @@ -299,6 +299,7 @@ static inline void __iomem * ioremap_nocache(unsigned long offset, return ioremap(offset, size); } +#define ioremap_wc ioremap_nocache #define ioremap_uc ioremap_nocache static inline void iounmap(volatile void __iomem *addr) diff --git a/arch/alpha/include/asm/spinlock.h b/arch/alpha/include/asm/spinlock.h index a40b9fc0c6c3..718ac0b64adf 100644 --- a/arch/alpha/include/asm/spinlock.h +++ b/arch/alpha/include/asm/spinlock.h @@ -16,11 +16,6 @@ #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) #define arch_spin_is_locked(x) ((x)->lock != 0) -static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - smp_cond_load_acquire(&lock->lock, !VAL); -} - static inline int arch_spin_value_unlocked(arch_spinlock_t lock) { return lock.lock == 0; diff --git a/arch/alpha/include/asm/types.h b/arch/alpha/include/asm/types.h index 4cb4b6d3452c..0bc66e1d3a7e 100644 --- a/arch/alpha/include/asm/types.h +++ b/arch/alpha/include/asm/types.h @@ -1,6 +1,6 @@ #ifndef _ALPHA_TYPES_H #define _ALPHA_TYPES_H -#include <asm-generic/int-ll64.h> +#include <uapi/asm/types.h> #endif /* _ALPHA_TYPES_H */ diff --git a/arch/alpha/include/asm/unistd.h b/arch/alpha/include/asm/unistd.h index b37153ecf2ac..db7fc0f511e2 100644 --- a/arch/alpha/include/asm/unistd.h +++ b/arch/alpha/include/asm/unistd.h @@ -3,7 +3,7 @@ #include <uapi/asm/unistd.h> -#define NR_SYSCALLS 514 +#define NR_SYSCALLS 523 #define __ARCH_WANT_OLD_READDIR #define __ARCH_WANT_STAT64 diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h index 02760f6e6ca4..3b26cc62dadb 100644 --- a/arch/alpha/include/uapi/asm/mman.h +++ b/arch/alpha/include/uapi/asm/mman.h @@ -64,20 +64,12 @@ overrides the coredump filter bits */ #define MADV_DODUMP 17 /* Clear the MADV_NODUMP flag */ +#define MADV_WIPEONFORK 18 /* Zero memory on fork, child only */ +#define MADV_KEEPONFORK 19 /* Undo 
MADV_WIPEONFORK */ + /* compatibility flags */ #define MAP_FILE 0 -/* - * When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size. - * This gives us 6 bits, which is enough until someone invents 128 bit address - * spaces. - * - * Assume these are all power of twos. - * When 0 use the default page size. - */ -#define MAP_HUGE_SHIFT 26 -#define MAP_HUGE_MASK 0x3f - #define PKEY_DISABLE_ACCESS 0x1 #define PKEY_DISABLE_WRITE 0x2 #define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\ diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h index 7b285dd4fe05..c6133a045352 100644 --- a/arch/alpha/include/uapi/asm/socket.h +++ b/arch/alpha/include/uapi/asm/socket.h @@ -109,4 +109,6 @@ #define SO_PEERGROUPS 59 +#define SO_ZEROCOPY 60 + #endif /* _UAPI_ASM_SOCKET_H */ diff --git a/arch/alpha/include/uapi/asm/types.h b/arch/alpha/include/uapi/asm/types.h index 9fd3cd459777..8d1024d7be05 100644 --- a/arch/alpha/include/uapi/asm/types.h +++ b/arch/alpha/include/uapi/asm/types.h @@ -9,8 +9,18 @@ * need to be careful to avoid a name clashes. */ -#ifndef __KERNEL__ +/* + * This is here because we used to use l64 for alpha + * and we don't want to impact user mode with our change to ll64 + * in the kernel. + * + * However, some user programs are fine with this. They can + * flag __SANE_USERSPACE_TYPES__ to get int-ll64.h here. + */ +#if !defined(__SANE_USERSPACE_TYPES__) && !defined(__KERNEL__) #include <asm-generic/int-l64.h> +#else +#include <asm-generic/int-ll64.h> #endif #endif /* _UAPI_ALPHA_TYPES_H */ diff --git a/arch/alpha/include/uapi/asm/unistd.h b/arch/alpha/include/uapi/asm/unistd.h index aa33bf5aacb6..53de540e39a7 100644 --- a/arch/alpha/include/uapi/asm/unistd.h +++ b/arch/alpha/include/uapi/asm/unistd.h @@ -366,11 +366,6 @@ #define __NR_epoll_create 407 #define __NR_epoll_ctl 408 #define __NR_epoll_wait 409 -/* Feb 2007: These three sys_epoll defines shouldn't be here but culling - * them would break userspace apps ... we'll kill them off in 2010 :) */ -#define __NR_sys_epoll_create __NR_epoll_create -#define __NR_sys_epoll_ctl __NR_epoll_ctl -#define __NR_sys_epoll_wait __NR_epoll_wait #define __NR_remap_file_pages 410 #define __NR_set_tid_address 411 #define __NR_restart_syscall 412 @@ -475,5 +470,19 @@ #define __NR_getrandom 511 #define __NR_memfd_create 512 #define __NR_execveat 513 +#define __NR_seccomp 514 +#define __NR_bpf 515 +#define __NR_userfaultfd 516 +#define __NR_membarrier 517 +#define __NR_mlock2 518 +#define __NR_copy_file_range 519 +#define __NR_preadv2 520 +#define __NR_pwritev2 521 +#define __NR_statx 522 + +/* Alpha doesn't have protection keys. 
*/ +#define __IGNORE_pkey_mprotect +#define __IGNORE_pkey_alloc +#define __IGNORE_pkey_free #endif /* _UAPI_ALPHA_UNISTD_H */ diff --git a/arch/alpha/kernel/core_marvel.c b/arch/alpha/kernel/core_marvel.c index d5f0580746a5..b10c316475dd 100644 --- a/arch/alpha/kernel/core_marvel.c +++ b/arch/alpha/kernel/core_marvel.c @@ -118,7 +118,7 @@ alloc_io7(unsigned int pe) io7 = alloc_bootmem(sizeof(*io7)); io7->pe = pe; - spin_lock_init(&io7->irq_lock); + raw_spin_lock_init(&io7->irq_lock); for (h = 0; h < 4; h++) { io7->ports[h].io7 = io7; @@ -351,7 +351,7 @@ marvel_init_io7(struct io7 *io7) } } -void +void __init marvel_io7_present(gct6_node *node) { int pe; @@ -369,6 +369,7 @@ marvel_io7_present(gct6_node *node) static void __init marvel_find_console_vga_hose(void) { +#ifdef CONFIG_VGA_HOSE u64 *pu64 = (u64 *)((u64)hwrpb + hwrpb->ctbt_offset); if (pu64[7] == 3) { /* TERM_TYPE == graphics */ @@ -402,9 +403,10 @@ marvel_find_console_vga_hose(void) pci_vga_hose = hose; } } +#endif } -gct6_search_struct gct_wanted_node_list[] = { +gct6_search_struct gct_wanted_node_list[] __initdata = { { GCT_TYPE_HOSE, GCT_SUBTYPE_IO_PORT_MODULE, marvel_io7_present }, { 0, 0, NULL } }; diff --git a/arch/alpha/kernel/core_titan.c b/arch/alpha/kernel/core_titan.c index 219bf271c0ba..b532d925443d 100644 --- a/arch/alpha/kernel/core_titan.c +++ b/arch/alpha/kernel/core_titan.c @@ -461,6 +461,7 @@ titan_ioremap(unsigned long addr, unsigned long size) unsigned long *ptes; unsigned long pfn; +#ifdef CONFIG_VGA_HOSE /* * Adjust the address and hose, if necessary. */ @@ -468,6 +469,7 @@ titan_ioremap(unsigned long addr, unsigned long size) h = pci_vga_hose->index; addr += pci_vga_hose->mem_space->start; } +#endif /* * Find the hose. diff --git a/arch/alpha/kernel/module.c b/arch/alpha/kernel/module.c index 936bc8f89a67..47632fa8c24e 100644 --- a/arch/alpha/kernel/module.c +++ b/arch/alpha/kernel/module.c @@ -181,6 +181,9 @@ apply_relocate_add(Elf64_Shdr *sechdrs, const char *strtab, switch (r_type) { case R_ALPHA_NONE: break; + case R_ALPHA_REFLONG: + *(u32 *)location = value; + break; case R_ALPHA_REFQUAD: /* BUG() can produce misaligned relocations. */ ((u32 *)location)[0] = value; diff --git a/arch/alpha/kernel/pci-noop.c b/arch/alpha/kernel/pci-noop.c index ffbdb3fb672f..676bab6e3123 100644 --- a/arch/alpha/kernel/pci-noop.c +++ b/arch/alpha/kernel/pci-noop.c @@ -42,11 +42,7 @@ alloc_pci_controller(void) struct resource * __init alloc_resource(void) { - struct resource *res; - - res = alloc_bootmem(sizeof(*res)); - - return res; + return alloc_bootmem(sizeof(struct resource)); } asmlinkage long diff --git a/arch/alpha/kernel/pci-sysfs.c b/arch/alpha/kernel/pci-sysfs.c index 92c0d460815b..cbecd527c696 100644 --- a/arch/alpha/kernel/pci-sysfs.c +++ b/arch/alpha/kernel/pci-sysfs.c @@ -38,7 +38,7 @@ static int __pci_mmap_fits(struct pci_dev *pdev, int num, unsigned long nr, start, size; int shift = sparse ? 
5 : 0; - nr = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + nr = vma_pages(vma); start = vma->vm_pgoff; size = ((pci_resource_len(pdev, num) - 1) >> (PAGE_SHIFT - shift)) + 1; @@ -64,8 +64,7 @@ static int pci_mmap_resource(struct kobject *kobj, struct bin_attribute *attr, struct vm_area_struct *vma, int sparse) { - struct pci_dev *pdev = to_pci_dev(container_of(kobj, - struct device, kobj)); + struct pci_dev *pdev = to_pci_dev(kobj_to_dev(kobj)); struct resource *res = attr->private; enum pci_mmap_state mmap_type; struct pci_bus_region bar; @@ -255,7 +254,7 @@ static int __legacy_mmap_fits(struct pci_controller *hose, { unsigned long nr, start, size; - nr = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + nr = vma_pages(vma); start = vma->vm_pgoff; size = ((res_size - 1) >> PAGE_SHIFT) + 1; diff --git a/arch/alpha/kernel/pci.c b/arch/alpha/kernel/pci.c index 5f387ee5b5c5..8322df174bbf 100644 --- a/arch/alpha/kernel/pci.c +++ b/arch/alpha/kernel/pci.c @@ -379,11 +379,7 @@ alloc_pci_controller(void) struct resource * __init alloc_resource(void) { - struct resource *res; - - res = alloc_bootmem(sizeof(*res)); - - return res; + return alloc_bootmem(sizeof(struct resource)); } diff --git a/arch/alpha/kernel/setup.c b/arch/alpha/kernel/setup.c index 491e6a604e82..249229ab4942 100644 --- a/arch/alpha/kernel/setup.c +++ b/arch/alpha/kernel/setup.c @@ -1094,8 +1094,9 @@ get_sysnames(unsigned long type, unsigned long variation, unsigned long cpu, default: /* default to variation "0" for now */ break; case ST_DEC_EB164: - if (member < ARRAY_SIZE(eb164_indices)) - *variation_name = eb164_names[eb164_indices[member]]; + if (member >= ARRAY_SIZE(eb164_indices)) + break; + *variation_name = eb164_names[eb164_indices[member]]; /* PC164 may show as EB164 variation, but with EV56 CPU, so, since no true EB164 had anything but EV5... */ if (eb164_indices[member] == 0 && cpu == EV56_CPU) diff --git a/arch/alpha/kernel/smc37c669.c b/arch/alpha/kernel/smc37c669.c index c803fc76ae4f..4dbd4e415041 100644 --- a/arch/alpha/kernel/smc37c669.c +++ b/arch/alpha/kernel/smc37c669.c @@ -2007,11 +2007,8 @@ static void __init SMC37c669_config_mode( static unsigned char __init SMC37c669_read_config( unsigned char index ) { - unsigned char data; - - wb( &SMC37c669->index_port, index ); - data = rb( &SMC37c669->data_port ); - return data; + wb(&SMC37c669->index_port, index); + return rb(&SMC37c669->data_port); } /* diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c index 9fc560459ebd..f6726a746427 100644 --- a/arch/alpha/kernel/smp.c +++ b/arch/alpha/kernel/smp.c @@ -115,7 +115,7 @@ wait_boot_cpu_to_stop(int cpuid) /* * Where secondaries begin a life of C. 
*/ -void +void __init smp_callin(void) { int cpuid = hard_smp_processor_id(); diff --git a/arch/alpha/kernel/sys_marvel.c b/arch/alpha/kernel/sys_marvel.c index 24e41bd7d3c9..3e533920371f 100644 --- a/arch/alpha/kernel/sys_marvel.c +++ b/arch/alpha/kernel/sys_marvel.c @@ -115,11 +115,11 @@ io7_enable_irq(struct irq_data *d) return; } - spin_lock(&io7->irq_lock); + raw_spin_lock(&io7->irq_lock); *ctl |= 1UL << 24; mb(); *ctl; - spin_unlock(&io7->irq_lock); + raw_spin_unlock(&io7->irq_lock); } static void @@ -136,11 +136,11 @@ io7_disable_irq(struct irq_data *d) return; } - spin_lock(&io7->irq_lock); + raw_spin_lock(&io7->irq_lock); *ctl &= ~(1UL << 24); mb(); *ctl; - spin_unlock(&io7->irq_lock); + raw_spin_unlock(&io7->irq_lock); } static void @@ -263,7 +263,7 @@ init_io7_irqs(struct io7 *io7, */ printk(" Interrupts reported to CPU at PE %u\n", boot_cpuid); - spin_lock(&io7->irq_lock); + raw_spin_lock(&io7->irq_lock); /* set up the error irqs */ io7_redirect_irq(io7, &io7->csrs->HLT_CTL.csr, boot_cpuid); @@ -295,7 +295,7 @@ init_io7_irqs(struct io7 *io7, for (i = 0; i < 16; ++i) init_one_io7_msi(io7, i, boot_cpuid); - spin_unlock(&io7->irq_lock); + raw_spin_unlock(&io7->irq_lock); } static void __init diff --git a/arch/alpha/kernel/systbls.S b/arch/alpha/kernel/systbls.S index 9b62e3fd4f03..5b4514abb234 100644 --- a/arch/alpha/kernel/systbls.S +++ b/arch/alpha/kernel/systbls.S @@ -532,6 +532,15 @@ sys_call_table: .quad sys_getrandom .quad sys_memfd_create .quad sys_execveat + .quad sys_seccomp + .quad sys_bpf /* 515 */ + .quad sys_userfaultfd + .quad sys_membarrier + .quad sys_mlock2 + .quad sys_copy_file_range + .quad sys_preadv2 /* 520 */ + .quad sys_pwritev2 + .quad sys_statx .size sys_call_table, . - sys_call_table .type sys_call_table, @object diff --git a/arch/alpha/kernel/traps.c b/arch/alpha/kernel/traps.c index 65bb102d985b..ddb89a18cf26 100644 --- a/arch/alpha/kernel/traps.c +++ b/arch/alpha/kernel/traps.c @@ -193,8 +193,10 @@ die_if_kernel(char * str, struct pt_regs *regs, long err, unsigned long *r9_15) static long dummy_emul(void) { return 0; } long (*alpha_fp_emul_imprecise)(struct pt_regs *regs, unsigned long writemask) = (void *)dummy_emul; +EXPORT_SYMBOL_GPL(alpha_fp_emul_imprecise); long (*alpha_fp_emul) (unsigned long pc) = (void *)dummy_emul; +EXPORT_SYMBOL_GPL(alpha_fp_emul); #else long alpha_fp_emul_imprecise(struct pt_regs *regs, unsigned long writemask); long alpha_fp_emul (unsigned long pc); diff --git a/arch/alpha/lib/Makefile b/arch/alpha/lib/Makefile index 7083434dd241..a80815960364 100644 --- a/arch/alpha/lib/Makefile +++ b/arch/alpha/lib/Makefile @@ -20,12 +20,8 @@ lib-y = __divqu.o __remqu.o __divlu.o __remlu.o \ checksum.o \ csum_partial_copy.o \ $(ev67-y)strlen.o \ - $(ev67-y)strcat.o \ - strcpy.o \ - $(ev67-y)strncat.o \ - strncpy.o \ - $(ev6-y)stxcpy.o \ - $(ev6-y)stxncpy.o \ + stycpy.o \ + styncpy.o \ $(ev67-y)strchr.o \ $(ev67-y)strrchr.o \ $(ev6-y)memchr.o \ @@ -49,3 +45,17 @@ AFLAGS___remlu.o = -DREM -DINTSIZE $(addprefix $(obj)/,__divqu.o __remqu.o __divlu.o __remlu.o): \ $(src)/$(ev6-y)divide.S FORCE $(call if_changed_rule,as_o_S) + +# There are direct branches between {str*cpy,str*cat} and stx*cpy. +# Ensure the branches are within range by merging these objects. 
+ +LDFLAGS_stycpy.o := -r +LDFLAGS_styncpy.o := -r + +$(obj)/stycpy.o: $(obj)/strcpy.o $(obj)/$(ev67-y)strcat.o \ + $(obj)/$(ev6-y)stxcpy.o FORCE + $(call if_changed,ld) + +$(obj)/styncpy.o: $(obj)/strncpy.o $(obj)/$(ev67-y)strncat.o \ + $(obj)/$(ev6-y)stxncpy.o FORCE + $(call if_changed,ld) diff --git a/arch/alpha/lib/copy_user.S b/arch/alpha/lib/copy_user.S index 159f1b7e6e49..c277a1a4383e 100644 --- a/arch/alpha/lib/copy_user.S +++ b/arch/alpha/lib/copy_user.S @@ -34,7 +34,7 @@ .ent __copy_user __copy_user: .prologue 0 - and $18,$18,$0 + mov $18,$0 and $16,7,$3 beq $0,$35 beq $3,$36 diff --git a/arch/alpha/lib/ev6-copy_user.S b/arch/alpha/lib/ev6-copy_user.S index 35e6710d0700..954ca03ebebe 100644 --- a/arch/alpha/lib/ev6-copy_user.S +++ b/arch/alpha/lib/ev6-copy_user.S @@ -45,9 +45,10 @@ # Pipeline info: Slotting & Comments __copy_user: .prologue 0 - andq $18, $18, $0 - subq $18, 32, $1 # .. E .. .. : Is this going to be a small copy? - beq $0, $zerolength # U .. .. .. : U L U L + mov $18, $0 # .. .. .. E + subq $18, 32, $1 # .. .. E. .. : Is this going to be a small copy? + nop # .. E .. .. + beq $18, $zerolength # U .. .. .. : U L U L and $16,7,$3 # .. .. .. E : is leading dest misalignment ble $1, $onebyteloop # .. .. U .. : 1st branch : small amount of data diff --git a/arch/alpha/math-emu/math.c b/arch/alpha/math-emu/math.c index d17d705f6545..1c2d456da7f2 100644 --- a/arch/alpha/math-emu/math.c +++ b/arch/alpha/math-emu/math.c @@ -53,6 +53,7 @@ extern void alpha_write_fp_reg_s (unsigned long reg, unsigned long val); #ifdef MODULE MODULE_DESCRIPTION("FP Software completion module"); +MODULE_LICENSE("GPL v2"); extern long (*alpha_fp_emul_imprecise)(struct pt_regs *, unsigned long); extern long (*alpha_fp_emul) (unsigned long pc); diff --git a/arch/arc/include/asm/atomic.h b/arch/arc/include/asm/atomic.h index 54b54da6384c..11859287c52a 100644 --- a/arch/arc/include/asm/atomic.h +++ b/arch/arc/include/asm/atomic.h @@ -123,6 +123,8 @@ static inline void atomic_set(atomic_t *v, int i) atomic_ops_unlock(flags); } +#define atomic_set_release(v, i) atomic_set((v), (i)) + #endif /* diff --git a/arch/arc/include/asm/futex.h b/arch/arc/include/asm/futex.h index 11e1b1f3acda..eb887dd13e74 100644 --- a/arch/arc/include/asm/futex.h +++ b/arch/arc/include/asm/futex.h @@ -73,20 +73,11 @@ #endif -static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) +static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, + u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) - return -EFAULT; - #ifndef CONFIG_ARC_HAS_LLSC preempt_disable(); /* to guarantee atomic r-m-w of futex op */ #endif @@ -118,30 +109,9 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) preempt_enable(); #endif - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: - ret = (oldval == cmparg); - break; - case FUTEX_OP_CMP_NE: - ret = (oldval != cmparg); - break; - case FUTEX_OP_CMP_LT: - ret = (oldval < cmparg); - break; - case FUTEX_OP_CMP_GE: - ret = (oldval >= cmparg); - break; - case FUTEX_OP_CMP_LE: - ret = (oldval <= cmparg); - break; - case FUTEX_OP_CMP_GT: - ret = (oldval > cmparg); - break; - default: - ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } diff --git 
a/arch/arc/include/asm/spinlock.h b/arch/arc/include/asm/spinlock.h index 233d5ffe6ec7..a325e6a36523 100644 --- a/arch/arc/include/asm/spinlock.h +++ b/arch/arc/include/asm/spinlock.h @@ -16,11 +16,6 @@ #define arch_spin_is_locked(x) ((x)->slock != __ARCH_SPIN_LOCK_UNLOCKED__) #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) -static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - smp_cond_load_acquire(&lock->slock, !VAL); -} - #ifdef CONFIG_ARC_HAS_LLSC static inline void arch_spin_lock(arch_spinlock_t *lock) diff --git a/arch/arc/kernel/intc-arcv2.c b/arch/arc/kernel/intc-arcv2.c index cf90714a676d..067ea362fb3e 100644 --- a/arch/arc/kernel/intc-arcv2.c +++ b/arch/arc/kernel/intc-arcv2.c @@ -75,13 +75,20 @@ void arc_init_IRQ(void) * Set a default priority for all available interrupts to prevent * switching of register banks if Fast IRQ and multiple register banks * are supported by CPU. - * Also disable all IRQ lines so faulty external hardware won't + * Also disable private-per-core IRQ lines so faulty external HW won't * trigger interrupt that kernel is not ready to handle. */ for (i = NR_EXCEPTIONS; i < irq_bcr.irqs + NR_EXCEPTIONS; i++) { write_aux_reg(AUX_IRQ_SELECT, i); write_aux_reg(AUX_IRQ_PRIORITY, ARCV2_IRQ_DEF_PRIO); - write_aux_reg(AUX_IRQ_ENABLE, 0); + + /* + * Only mask cpu private IRQs here. + * "common" interrupts are masked at IDU, otherwise it would + * need to be unmasked at each cpu, with IPIs + */ + if (i < FIRST_EXT_IRQ) + write_aux_reg(AUX_IRQ_ENABLE, 0); } /* setup status32, don't enable intr yet as kernel doesn't want */ diff --git a/arch/arc/kernel/intc-compact.c b/arch/arc/kernel/intc-compact.c index cef388025adf..47b421fa0147 100644 --- a/arch/arc/kernel/intc-compact.c +++ b/arch/arc/kernel/intc-compact.c @@ -27,7 +27,7 @@ */ void arc_init_IRQ(void) { - int level_mask = 0, i; + unsigned int level_mask = 0, i; /* Is timer high priority Interrupt (Level2 in ARCompact jargon) */ level_mask |= IS_ENABLED(CONFIG_ARC_COMPACT_IRQ_LEVELS) << TIMER0_IRQ; diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 61a0cb15067e..f1b3f1d575d4 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -50,7 +50,7 @@ config ARM select HAVE_ARCH_SECCOMP_FILTER if (AEABI && !OABI_COMPAT) select HAVE_ARCH_TRACEHOOK select HAVE_ARM_SMCCC if CPU_V7 - select HAVE_CBPF_JIT + select HAVE_EBPF_JIT if !CPU_ENDIAN_BE32 select HAVE_CC_STACKPROTECTOR select HAVE_CONTEXT_TRACKING select HAVE_C_RECORDMCOUNT diff --git a/arch/arm/boot/dts/imx6q-evi.dts b/arch/arm/boot/dts/imx6q-evi.dts index 1f0f950dc11e..e0aea782c666 100644 --- a/arch/arm/boot/dts/imx6q-evi.dts +++ b/arch/arm/boot/dts/imx6q-evi.dts @@ -94,6 +94,15 @@ pinctrl-names = "default"; pinctrl-0 = <&pinctrl_ecspi1 &pinctrl_ecspi1cs>; status = "okay"; + + fpga: fpga@0 { + compatible = "altr,fpga-passive-serial"; + spi-max-frequency = <20000000>; + reg = <0>; + pinctrl-0 = <&pinctrl_fpgaspi>; + nconfig-gpios = <&gpio4 9 GPIO_ACTIVE_LOW>; + nstat-gpios = <&gpio4 11 GPIO_ACTIVE_LOW>; + }; }; &ecspi3 { @@ -319,6 +328,13 @@ >; }; + pinctrl_fpgaspi: fpgaspigrp { + fsl,pins = < + MX6QDL_PAD_KEY_ROW1__GPIO4_IO09 0x1b0b0 + MX6QDL_PAD_KEY_ROW2__GPIO4_IO11 0x1b0b0 + >; + }; + pinctrl_gpminand: gpminandgrp { fsl,pins = < MX6QDL_PAD_NANDF_CLE__NAND_CLE 0xb0b1 diff --git a/arch/arm/boot/dts/imx7ulp-pinfunc.h b/arch/arm/boot/dts/imx7ulp-pinfunc.h new file mode 100644 index 000000000000..fe511775b518 --- /dev/null +++ b/arch/arm/boot/dts/imx7ulp-pinfunc.h @@ -0,0 +1,468 @@ +/* + * Copyright 2016 Freescale Semiconductor, Inc. 
+ * Copyright 2017 NXP + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#ifndef __DTS_IMX7ULP_PINFUNC_H +#define __DTS_IMX7ULP_PINFUNC_H + +/* + * The pin function ID is a tuple of + * <mux_conf_reg input_reg mux_mode input_val> + */ + +#define IMX7ULP_PAD_PTC0__PTC0 0x0000 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTC0__TRACE_D15 0x0000 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTC0__LPUART4_CTS_B 0x0000 0x0244 0x4 0x1 +#define IMX7ULP_PAD_PTC0__LPI2C4_SCL 0x0000 0x0278 0x5 0x1 +#define IMX7ULP_PAD_PTC0__TPM4_CLKIN 0x0000 0x0298 0x6 0x1 +#define IMX7ULP_PAD_PTC0__FB_AD0 0x0000 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTC1__PTC1 0x0004 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTC1__TRACE_D14 0x0004 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTC1__LPUART4_RTS_B 0x0004 0x0000 0x4 0x0 +#define IMX7ULP_PAD_PTC1__LPI2C4_SDA 0x0004 0x027c 0x5 0x1 +#define IMX7ULP_PAD_PTC1__TPM4_CH0 0x0004 0x0280 0x6 0x1 +#define IMX7ULP_PAD_PTC1__FB_AD1 0x0004 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTC2__PTC2 0x0008 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTC2__TRACE_D13 0x0008 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTC2__LPUART4_TX 0x0008 0x024c 0x4 0x1 +#define IMX7ULP_PAD_PTC2__LPI2C4_HREQ 0x0008 0x0274 0x5 0x1 +#define IMX7ULP_PAD_PTC2__TPM4_CH1 0x0008 0x0284 0x6 0x1 +#define IMX7ULP_PAD_PTC2__FB_AD2 0x0008 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTC3__PTC3 0x000c 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTC3__TRACE_D12 0x000c 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTC3__LPUART4_RX 0x000c 0x0248 0x4 0x1 +#define IMX7ULP_PAD_PTC3__TPM4_CH2 0x000c 0x0288 0x6 0x1 +#define IMX7ULP_PAD_PTC3__FB_AD3 0x000c 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTC4__PTC4 0x0010 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTC4__TRACE_D11 0x0010 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTC4__FXIO1_D0 0x0010 0x0204 0x2 0x1 +#define IMX7ULP_PAD_PTC4__LPSPI2_PCS1 0x0010 0x02a0 0x3 0x1 +#define IMX7ULP_PAD_PTC4__LPUART5_CTS_B 0x0010 0x0250 0x4 0x1 +#define IMX7ULP_PAD_PTC4__LPI2C5_SCL 0x0010 0x02bc 0x5 0x1 +#define IMX7ULP_PAD_PTC4__TPM4_CH3 0x0010 0x028c 0x6 0x1 +#define IMX7ULP_PAD_PTC4__FB_AD4 0x0010 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTC5__PTC5 0x0014 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTC5__TRACE_D10 0x0014 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTC5__FXIO1_D1 0x0014 0x0208 0x2 0x1 +#define IMX7ULP_PAD_PTC5__LPSPI2_PCS2 0x0014 0x02a4 0x3 0x1 +#define IMX7ULP_PAD_PTC5__LPUART5_RTS_B 0x0014 0x0000 0x4 0x0 +#define IMX7ULP_PAD_PTC5__LPI2C5_SDA 0x0014 0x02c0 0x5 0x1 +#define IMX7ULP_PAD_PTC5__TPM4_CH4 0x0014 0x0290 0x6 0x1 +#define IMX7ULP_PAD_PTC5__FB_AD5 0x0014 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTC6__PTC6 0x0018 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTC6__TRACE_D9 0x0018 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTC6__FXIO1_D2 0x0018 0x020c 0x2 0x1 +#define IMX7ULP_PAD_PTC6__LPSPI2_PCS3 0x0018 0x02a8 0x3 0x1 +#define IMX7ULP_PAD_PTC6__LPUART5_TX 0x0018 0x0258 0x4 0x1 +#define IMX7ULP_PAD_PTC6__LPI2C5_HREQ 0x0018 0x02b8 0x5 0x1 +#define IMX7ULP_PAD_PTC6__TPM4_CH5 0x0018 0x0294 0x6 0x1 +#define IMX7ULP_PAD_PTC6__FB_AD6 0x0018 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTC7__PTC7 0x001c 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTC7__TRACE_D8 0x001c 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTC7__FXIO1_D3 0x001c 0x0210 0x2 0x1 +#define IMX7ULP_PAD_PTC7__LPUART5_RX 0x001c 0x0254 0x4 0x1 +#define IMX7ULP_PAD_PTC7__TPM5_CH1 0x001c 0x02c8 0x6 0x1 +#define IMX7ULP_PAD_PTC7__FB_AD7 0x001c 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTC8__PTC8 0x0020 0x0000 0x1 0x0 +#define 
IMX7ULP_PAD_PTC8__TRACE_D7 0x0020 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTC8__FXIO1_D4 0x0020 0x0214 0x2 0x1 +#define IMX7ULP_PAD_PTC8__LPSPI2_SIN 0x0020 0x02b0 0x3 0x1 +#define IMX7ULP_PAD_PTC8__LPUART6_CTS_B 0x0020 0x025c 0x4 0x1 +#define IMX7ULP_PAD_PTC8__LPI2C6_SCL 0x0020 0x02fc 0x5 0x1 +#define IMX7ULP_PAD_PTC8__TPM5_CLKIN 0x0020 0x02cc 0x6 0x1 +#define IMX7ULP_PAD_PTC8__FB_AD8 0x0020 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTC9__PTC9 0x0024 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTC9__TRACE_D6 0x0024 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTC9__FXIO1_D5 0x0024 0x0218 0x2 0x1 +#define IMX7ULP_PAD_PTC9__LPSPI2_SOUT 0x0024 0x02b4 0x3 0x1 +#define IMX7ULP_PAD_PTC9__LPUART6_RTS_B 0x0024 0x0000 0x4 0x0 +#define IMX7ULP_PAD_PTC9__LPI2C6_SDA 0x0024 0x0300 0x5 0x1 +#define IMX7ULP_PAD_PTC9__TPM5_CH0 0x0024 0x02c4 0x6 0x1 +#define IMX7ULP_PAD_PTC9__FB_AD9 0x0024 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTC10__PTC10 0x0028 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTC10__TRACE_D5 0x0028 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTC10__FXIO1_D6 0x0028 0x021c 0x2 0x1 +#define IMX7ULP_PAD_PTC10__LPSPI2_SCK 0x0028 0x02ac 0x3 0x1 +#define IMX7ULP_PAD_PTC10__LPUART6_TX 0x0028 0x0264 0x4 0x1 +#define IMX7ULP_PAD_PTC10__LPI2C6_HREQ 0x0028 0x02f8 0x5 0x1 +#define IMX7ULP_PAD_PTC10__TPM7_CH3 0x0028 0x02e8 0x6 0x1 +#define IMX7ULP_PAD_PTC10__FB_AD10 0x0028 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTC11__PTC11 0x002c 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTC11__TRACE_D4 0x002c 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTC11__FXIO1_D7 0x002c 0x0220 0x2 0x1 +#define IMX7ULP_PAD_PTC11__LPSPI2_PCS0 0x002c 0x029c 0x3 0x1 +#define IMX7ULP_PAD_PTC11__LPUART6_RX 0x002c 0x0260 0x4 0x1 +#define IMX7ULP_PAD_PTC11__TPM7_CH4 0x002c 0x02ec 0x6 0x1 +#define IMX7ULP_PAD_PTC11__FB_AD11 0x002c 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTC12__PTC12 0x0030 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTC12__TRACE_D3 0x0030 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTC12__FXIO1_D8 0x0030 0x0224 0x2 0x1 +#define IMX7ULP_PAD_PTC12__LPSPI3_PCS1 0x0030 0x0314 0x3 0x1 +#define IMX7ULP_PAD_PTC12__LPUART7_CTS_B 0x0030 0x0268 0x4 0x1 +#define IMX7ULP_PAD_PTC12__LPI2C7_SCL 0x0030 0x0308 0x5 0x1 +#define IMX7ULP_PAD_PTC12__TPM7_CH5 0x0030 0x02f0 0x6 0x1 +#define IMX7ULP_PAD_PTC12__FB_AD12 0x0030 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTC13__PTC13 0x0034 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTC13__TRACE_D2 0x0034 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTC13__FXIO1_D9 0x0034 0x0228 0x2 0x1 +#define IMX7ULP_PAD_PTC13__LPSPI3_PCS2 0x0034 0x0318 0x3 0x1 +#define IMX7ULP_PAD_PTC13__LPUART7_RTS_B 0x0034 0x0000 0x4 0x0 +#define IMX7ULP_PAD_PTC13__LPI2C7_SDA 0x0034 0x030c 0x5 0x1 +#define IMX7ULP_PAD_PTC13__TPM7_CLKIN 0x0034 0x02f4 0x6 0x1 +#define IMX7ULP_PAD_PTC13__FB_AD13 0x0034 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTC14__PTC14 0x0038 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTC14__TRACE_D1 0x0038 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTC14__FXIO1_D10 0x0038 0x022c 0x2 0x1 +#define IMX7ULP_PAD_PTC14__LPSPI3_PCS3 0x0038 0x031c 0x3 0x1 +#define IMX7ULP_PAD_PTC14__LPUART7_TX 0x0038 0x0270 0x4 0x1 +#define IMX7ULP_PAD_PTC14__LPI2C7_HREQ 0x0038 0x0304 0x5 0x1 +#define IMX7ULP_PAD_PTC14__TPM7_CH0 0x0038 0x02dc 0x6 0x1 +#define IMX7ULP_PAD_PTC14__FB_AD14 0x0038 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTC15__PTC15 0x003c 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTC15__TRACE_D0 0x003c 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTC15__FXIO1_D11 0x003c 0x0230 0x2 0x1 +#define IMX7ULP_PAD_PTC15__LPUART7_RX 0x003c 0x026c 0x4 0x1 +#define IMX7ULP_PAD_PTC15__TPM7_CH1 0x003c 0x02e0 0x6 0x1 +#define IMX7ULP_PAD_PTC15__FB_AD15 0x003c 0x0000 0x9 0x0 
+#define IMX7ULP_PAD_PTC16__PTC16 0x0040 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTC16__TRACE_CLKOUT 0x0040 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTC16__FXIO1_D12 0x0040 0x0234 0x2 0x1 +#define IMX7ULP_PAD_PTC16__LPSPI3_SIN 0x0040 0x0324 0x3 0x1 +#define IMX7ULP_PAD_PTC16__TPM7_CH2 0x0040 0x02e4 0x6 0x1 +#define IMX7ULP_PAD_PTC16__FB_ALE_FB_CS1_B_FB_TS_B 0x0040 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTC17__PTC17 0x0044 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTC17__FXIO1_D13 0x0044 0x0238 0x2 0x1 +#define IMX7ULP_PAD_PTC17__LPSPI3_SOUT 0x0044 0x0328 0x3 0x1 +#define IMX7ULP_PAD_PTC17__TPM6_CLKIN 0x0044 0x02d8 0x6 0x1 +#define IMX7ULP_PAD_PTC17__FB_CS0_B 0x0044 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTC18__PTC18 0x0048 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTC18__FXIO1_D14 0x0048 0x023c 0x2 0x1 +#define IMX7ULP_PAD_PTC18__LPSPI3_SCK 0x0048 0x0320 0x3 0x1 +#define IMX7ULP_PAD_PTC18__TPM6_CH0 0x0048 0x02d0 0x6 0x1 +#define IMX7ULP_PAD_PTC18__FB_OE_B 0x0048 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTC19__PTC19 0x004c 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTC19__FXIO1_D15 0x004c 0x0240 0x2 0x1 +#define IMX7ULP_PAD_PTC19__LPSPI3_PCS0 0x004c 0x0310 0x3 0x1 +#define IMX7ULP_PAD_PTC19__TPM6_CH1 0x004c 0x02d4 0x6 0x1 +#define IMX7ULP_PAD_PTC19__FB_A16 0x004c 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTD0__PTD0 0x0080 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTD0__SDHC0_RESET_B 0x0080 0x0000 0x8 0x0 +#define IMX7ULP_PAD_PTD1__PTD1 0x0084 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTD1__SDHC0_CMD 0x0084 0x0000 0x8 0x0 +#define IMX7ULP_PAD_PTD2__PTD2 0x0088 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTD2__SDHC0_CLK 0x0088 0x0000 0x8 0x0 +#define IMX7ULP_PAD_PTD3__PTD3 0x008c 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTD3__SDHC0_D7 0x008c 0x0000 0x8 0x0 +#define IMX7ULP_PAD_PTD4__PTD4 0x0090 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTD4__SDHC0_D6 0x0090 0x0000 0x8 0x0 +#define IMX7ULP_PAD_PTD5__PTD5 0x0094 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTD5__SDHC0_D5 0x0094 0x0000 0x8 0x0 +#define IMX7ULP_PAD_PTD6__PTD6 0x0098 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTD6__SDHC0_D4 0x0098 0x0000 0x8 0x0 +#define IMX7ULP_PAD_PTD7__PTD7 0x009c 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTD7__SDHC0_D3 0x009c 0x0000 0x8 0x0 +#define IMX7ULP_PAD_PTD8__PTD8 0x00a0 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTD8__TPM4_CLKIN 0x00a0 0x0298 0x6 0x2 +#define IMX7ULP_PAD_PTD8__SDHC0_D2 0x00a0 0x0000 0x8 0x0 +#define IMX7ULP_PAD_PTD9__PTD9 0x00a4 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTD9__TPM4_CH0 0x00a4 0x0280 0x6 0x2 +#define IMX7ULP_PAD_PTD9__SDHC0_D1 0x00a4 0x0000 0x8 0x0 +#define IMX7ULP_PAD_PTD10__PTD10 0x00a8 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTD10__TPM4_CH1 0x00a8 0x0284 0x6 0x2 +#define IMX7ULP_PAD_PTD10__SDHC0_D0 0x00a8 0x0000 0x8 0x0 +#define IMX7ULP_PAD_PTD11__PTD11 0x00ac 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTD11__TPM4_CH2 0x00ac 0x0288 0x6 0x2 +#define IMX7ULP_PAD_PTD11__SDHC0_DQS 0x00ac 0x0000 0x8 0x0 +#define IMX7ULP_PAD_PTE0__PTE0 0x0100 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTE0__FXIO1_D31 0x0100 0x0000 0x2 0x0 +#define IMX7ULP_PAD_PTE0__LPSPI2_PCS1 0x0100 0x02a0 0x3 0x2 +#define IMX7ULP_PAD_PTE0__LPUART4_CTS_B 0x0100 0x0244 0x4 0x2 +#define IMX7ULP_PAD_PTE0__LPI2C4_SCL 0x0100 0x0278 0x5 0x2 +#define IMX7ULP_PAD_PTE0__SDHC1_D1 0x0100 0x0000 0x8 0x0 +#define IMX7ULP_PAD_PTE0__FB_A25 0x0100 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTE1__PTE1 0x0104 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTE1__FXIO1_D30 0x0104 0x0000 0x2 0x0 +#define IMX7ULP_PAD_PTE1__LPSPI2_PCS2 0x0104 0x02a4 0x3 0x2 +#define IMX7ULP_PAD_PTE1__LPUART4_RTS_B 0x0104 0x0000 0x4 0x0 +#define IMX7ULP_PAD_PTE1__LPI2C4_SDA 0x0104 0x027c 
0x5 0x2 +#define IMX7ULP_PAD_PTE1__SDHC1_D0 0x0104 0x0000 0x8 0x0 +#define IMX7ULP_PAD_PTE1__FB_A26 0x0104 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTE2__PTE2 0x0108 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTE2__FXIO1_D29 0x0108 0x0000 0x2 0x0 +#define IMX7ULP_PAD_PTE2__LPSPI2_PCS3 0x0108 0x02a8 0x3 0x2 +#define IMX7ULP_PAD_PTE2__LPUART4_TX 0x0108 0x024c 0x4 0x2 +#define IMX7ULP_PAD_PTE2__LPI2C4_HREQ 0x0108 0x0274 0x5 0x2 +#define IMX7ULP_PAD_PTE2__SDHC1_CLK 0x0108 0x0000 0x8 0x0 +#define IMX7ULP_PAD_PTE3__PTE3 0x010c 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTE3__FXIO1_D28 0x010c 0x0000 0x2 0x0 +#define IMX7ULP_PAD_PTE3__LPUART4_RX 0x010c 0x0248 0x4 0x2 +#define IMX7ULP_PAD_PTE3__TPM5_CH1 0x010c 0x02c8 0x6 0x2 +#define IMX7ULP_PAD_PTE3__SDHC1_CMD 0x010c 0x0000 0x8 0x0 +#define IMX7ULP_PAD_PTE4__PTE4 0x0110 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTE4__FXIO1_D27 0x0110 0x0000 0x2 0x0 +#define IMX7ULP_PAD_PTE4__LPSPI2_SIN 0x0110 0x02b0 0x3 0x2 +#define IMX7ULP_PAD_PTE4__LPUART5_CTS_B 0x0110 0x0250 0x4 0x2 +#define IMX7ULP_PAD_PTE4__LPI2C5_SCL 0x0110 0x02bc 0x5 0x2 +#define IMX7ULP_PAD_PTE4__TPM5_CLKIN 0x0110 0x02cc 0x6 0x2 +#define IMX7ULP_PAD_PTE4__SDHC1_D3 0x0110 0x0000 0x8 0x0 +#define IMX7ULP_PAD_PTE5__PTE5 0x0114 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTE5__FXIO1_D26 0x0114 0x0000 0x2 0x0 +#define IMX7ULP_PAD_PTE5__LPSPI2_SOUT 0x0114 0x02b4 0x3 0x2 +#define IMX7ULP_PAD_PTE5__LPUART5_RTS_B 0x0114 0x0000 0x4 0x0 +#define IMX7ULP_PAD_PTE5__LPI2C5_SDA 0x0114 0x02c0 0x5 0x2 +#define IMX7ULP_PAD_PTE5__TPM5_CH0 0x0114 0x02c4 0x6 0x2 +#define IMX7ULP_PAD_PTE5__SDHC1_D2 0x0114 0x0000 0x8 0x0 +#define IMX7ULP_PAD_PTE6__PTE6 0x0118 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTE6__FXIO1_D25 0x0118 0x0000 0x2 0x0 +#define IMX7ULP_PAD_PTE6__LPSPI2_SCK 0x0118 0x02ac 0x3 0x2 +#define IMX7ULP_PAD_PTE6__LPUART5_TX 0x0118 0x0258 0x4 0x2 +#define IMX7ULP_PAD_PTE6__LPI2C5_HREQ 0x0118 0x02b8 0x5 0x2 +#define IMX7ULP_PAD_PTE6__TPM7_CH3 0x0118 0x02e8 0x6 0x2 +#define IMX7ULP_PAD_PTE6__SDHC1_D4 0x0118 0x0000 0x8 0x0 +#define IMX7ULP_PAD_PTE6__FB_A17 0x0118 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTE7__PTE7 0x011c 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTE7__TRACE_D7 0x011c 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTE7__VIU_FID 0x011c 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTE7__FXIO1_D24 0x011c 0x0000 0x2 0x0 +#define IMX7ULP_PAD_PTE7__LPSPI2_PCS0 0x011c 0x029c 0x3 0x2 +#define IMX7ULP_PAD_PTE7__LPUART5_RX 0x011c 0x0254 0x4 0x2 +#define IMX7ULP_PAD_PTE7__TPM7_CH4 0x011c 0x02ec 0x6 0x2 +#define IMX7ULP_PAD_PTE7__SDHC1_D5 0x011c 0x0000 0x8 0x0 +#define IMX7ULP_PAD_PTE7__FB_A18 0x011c 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTE8__PTE8 0x0120 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTE8__TRACE_D6 0x0120 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTE8__VIU_D16 0x0120 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTE8__FXIO1_D23 0x0120 0x0000 0x2 0x0 +#define IMX7ULP_PAD_PTE8__LPSPI3_PCS1 0x0120 0x0314 0x3 0x2 +#define IMX7ULP_PAD_PTE8__LPUART6_CTS_B 0x0120 0x025c 0x4 0x2 +#define IMX7ULP_PAD_PTE8__LPI2C6_SCL 0x0120 0x02fc 0x5 0x2 +#define IMX7ULP_PAD_PTE8__TPM7_CH5 0x0120 0x02f0 0x6 0x2 +#define IMX7ULP_PAD_PTE8__SDHC1_WP 0x0120 0x0200 0x7 0x1 +#define IMX7ULP_PAD_PTE8__SDHC1_D6 0x0120 0x0000 0x8 0x0 +#define IMX7ULP_PAD_PTE8__FB_CS3_B_FB_BE7_0_BLS31_24_B 0x0120 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTE9__PTE9 0x0124 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTE9__TRACE_D5 0x0124 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTE9__VIU_D17 0x0124 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTE9__FXIO1_D22 0x0124 0x0000 0x2 0x0 +#define IMX7ULP_PAD_PTE9__LPSPI3_PCS2 0x0124 0x0318 0x3 0x2 +#define 
IMX7ULP_PAD_PTE9__LPUART6_RTS_B 0x0124 0x0000 0x4 0x0 +#define IMX7ULP_PAD_PTE9__LPI2C6_SDA 0x0124 0x0300 0x5 0x2 +#define IMX7ULP_PAD_PTE9__TPM7_CLKIN 0x0124 0x02f4 0x6 0x2 +#define IMX7ULP_PAD_PTE9__SDHC1_CD 0x0124 0x032c 0x7 0x1 +#define IMX7ULP_PAD_PTE9__SDHC1_D7 0x0124 0x0000 0x8 0x0 +#define IMX7ULP_PAD_PTE9__FB_TBST_B_FB_CS2_B_FB_BE15_8_BLS23_16_B 0x0124 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTE10__PTE10 0x0128 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTE10__TRACE_D4 0x0128 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTE10__VIU_D18 0x0128 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTE10__FXIO1_D21 0x0128 0x0000 0x2 0x0 +#define IMX7ULP_PAD_PTE10__LPSPI3_PCS3 0x0128 0x031c 0x3 0x2 +#define IMX7ULP_PAD_PTE10__LPUART6_TX 0x0128 0x0264 0x4 0x2 +#define IMX7ULP_PAD_PTE10__LPI2C6_HREQ 0x0128 0x02f8 0x5 0x2 +#define IMX7ULP_PAD_PTE10__TPM7_CH0 0x0128 0x02dc 0x6 0x2 +#define IMX7ULP_PAD_PTE10__SDHC1_VS 0x0128 0x0000 0x7 0x0 +#define IMX7ULP_PAD_PTE10__SDHC1_DQS 0x0128 0x0000 0x8 0x0 +#define IMX7ULP_PAD_PTE10__FB_A19 0x0128 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTE11__PTE11 0x012c 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTE11__TRACE_D3 0x012c 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTE11__VIU_D19 0x012c 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTE11__FXIO1_D20 0x012c 0x0000 0x2 0x0 +#define IMX7ULP_PAD_PTE11__LPUART6_RX 0x012c 0x0260 0x4 0x2 +#define IMX7ULP_PAD_PTE11__TPM7_CH1 0x012c 0x02e0 0x6 0x2 +#define IMX7ULP_PAD_PTE11__SDHC1_RESET_B 0x012c 0x0000 0x8 0x0 +#define IMX7ULP_PAD_PTE11__FB_A20 0x012c 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTE12__PTE12 0x0130 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTE12__TRACE_D2 0x0130 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTE12__VIU_D20 0x0130 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTE12__FXIO1_D19 0x0130 0x0000 0x2 0x0 +#define IMX7ULP_PAD_PTE12__LPSPI3_SIN 0x0130 0x0324 0x3 0x2 +#define IMX7ULP_PAD_PTE12__LPUART7_CTS_B 0x0130 0x0268 0x4 0x2 +#define IMX7ULP_PAD_PTE12__LPI2C7_SCL 0x0130 0x0308 0x5 0x2 +#define IMX7ULP_PAD_PTE12__TPM7_CH2 0x0130 0x02e4 0x6 0x2 +#define IMX7ULP_PAD_PTE12__SDHC1_WP 0x0130 0x0200 0x8 0x2 +#define IMX7ULP_PAD_PTE12__FB_A21 0x0130 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTE13__PTE13 0x0134 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTE13__TRACE_D1 0x0134 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTE13__VIU_D21 0x0134 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTE13__FXIO1_D18 0x0134 0x0000 0x2 0x0 +#define IMX7ULP_PAD_PTE13__LPSPI3_SOUT 0x0134 0x0328 0x3 0x2 +#define IMX7ULP_PAD_PTE13__LPUART7_RTS_B 0x0134 0x0000 0x4 0x0 +#define IMX7ULP_PAD_PTE13__LPI2C7_SDA 0x0134 0x030c 0x5 0x2 +#define IMX7ULP_PAD_PTE13__TPM6_CLKIN 0x0134 0x02d8 0x6 0x2 +#define IMX7ULP_PAD_PTE13__SDHC1_CD 0x0134 0x032c 0x8 0x2 +#define IMX7ULP_PAD_PTE13__FB_A22 0x0134 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTE14__PTE14 0x0138 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTE14__TRACE_D0 0x0138 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTE14__VIU_D22 0x0138 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTE14__FXIO1_D17 0x0138 0x0000 0x2 0x0 +#define IMX7ULP_PAD_PTE14__LPSPI3_SCK 0x0138 0x0320 0x3 0x2 +#define IMX7ULP_PAD_PTE14__LPUART7_TX 0x0138 0x0270 0x4 0x2 +#define IMX7ULP_PAD_PTE14__LPI2C7_HREQ 0x0138 0x0304 0x5 0x2 +#define IMX7ULP_PAD_PTE14__TPM6_CH0 0x0138 0x02d0 0x6 0x2 +#define IMX7ULP_PAD_PTE14__SDHC1_VS 0x0138 0x0000 0x8 0x0 +#define IMX7ULP_PAD_PTE14__FB_A23 0x0138 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTE15__PTE15 0x013c 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTE15__TRACE_CLKOUT 0x013c 0x0000 0xa 0x0 +#define IMX7ULP_PAD_PTE15__VIU_D23 0x013c 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTE15__FXIO1_D16 0x013c 0x0000 0x2 0x0 +#define 
IMX7ULP_PAD_PTE15__LPSPI3_PCS0 0x013c 0x0310 0x3 0x2 +#define IMX7ULP_PAD_PTE15__LPUART7_RX 0x013c 0x026c 0x4 0x2 +#define IMX7ULP_PAD_PTE15__TPM6_CH1 0x013c 0x02d4 0x6 0x2 +#define IMX7ULP_PAD_PTE15__FB_A24 0x013c 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTF0__PTF0 0x0180 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTF0__VIU_DE 0x0180 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTF0__LPUART4_CTS_B 0x0180 0x0244 0x4 0x3 +#define IMX7ULP_PAD_PTF0__LPI2C4_SCL 0x0180 0x0278 0x5 0x3 +#define IMX7ULP_PAD_PTF0__TPM4_CLKIN 0x0180 0x0298 0x6 0x3 +#define IMX7ULP_PAD_PTF0__FB_RW_B 0x0180 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTF1__PTF1 0x0184 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTF1__VIU_HSYNC 0x0184 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTF1__LPUART4_RTS_B 0x0184 0x0000 0x4 0x0 +#define IMX7ULP_PAD_PTF1__LPI2C4_SDA 0x0184 0x027c 0x5 0x3 +#define IMX7ULP_PAD_PTF1__TPM4_CH0 0x0184 0x0280 0x6 0x3 +#define IMX7ULP_PAD_PTF1__CLKOUT 0x0184 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTF2__PTF2 0x0188 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTF2__VIU_VSYNC 0x0188 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTF2__LPUART4_TX 0x0188 0x024c 0x4 0x3 +#define IMX7ULP_PAD_PTF2__LPI2C4_HREQ 0x0188 0x0274 0x5 0x3 +#define IMX7ULP_PAD_PTF2__TPM4_CH1 0x0188 0x0284 0x6 0x3 +#define IMX7ULP_PAD_PTF2__FB_TSIZ1_FB_CS5_B_FB_BE23_16_BLS15_8_B 0x0188 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTF3__PTF3 0x018c 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTF3__VIU_PCLK 0x018c 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTF3__LPUART4_RX 0x018c 0x0248 0x4 0x3 +#define IMX7ULP_PAD_PTF3__TPM4_CH2 0x018c 0x0288 0x6 0x3 +#define IMX7ULP_PAD_PTF3__FB_AD16 0x018c 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTF4__PTF4 0x0190 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTF4__VIU_D0 0x0190 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTF4__FXIO1_D0 0x0190 0x0204 0x2 0x2 +#define IMX7ULP_PAD_PTF4__LPSPI2_PCS1 0x0190 0x02a0 0x3 0x3 +#define IMX7ULP_PAD_PTF4__LPUART5_CTS_B 0x0190 0x0250 0x4 0x3 +#define IMX7ULP_PAD_PTF4__LPI2C5_SCL 0x0190 0x02bc 0x5 0x3 +#define IMX7ULP_PAD_PTF4__TPM4_CH3 0x0190 0x028c 0x6 0x2 +#define IMX7ULP_PAD_PTF4__FB_AD17 0x0190 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTF5__PTF5 0x0194 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTF5__VIU_D1 0x0194 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTF5__FXIO1_D1 0x0194 0x0208 0x2 0x2 +#define IMX7ULP_PAD_PTF5__LPSPI2_PCS2 0x0194 0x02a4 0x3 0x3 +#define IMX7ULP_PAD_PTF5__LPUART5_RTS_B 0x0194 0x0000 0x4 0x0 +#define IMX7ULP_PAD_PTF5__LPI2C5_SDA 0x0194 0x02c0 0x5 0x3 +#define IMX7ULP_PAD_PTF5__TPM4_CH4 0x0194 0x0290 0x6 0x2 +#define IMX7ULP_PAD_PTF5__FB_AD18 0x0194 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTF6__PTF6 0x0198 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTF6__VIU_D2 0x0198 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTF6__FXIO1_D2 0x0198 0x020c 0x2 0x2 +#define IMX7ULP_PAD_PTF6__LPSPI2_PCS3 0x0198 0x02a8 0x3 0x3 +#define IMX7ULP_PAD_PTF6__LPUART5_TX 0x0198 0x0258 0x4 0x3 +#define IMX7ULP_PAD_PTF6__LPI2C5_HREQ 0x0198 0x02b8 0x5 0x3 +#define IMX7ULP_PAD_PTF6__TPM4_CH5 0x0198 0x0294 0x6 0x2 +#define IMX7ULP_PAD_PTF6__FB_AD19 0x0198 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTF7__PTF7 0x019c 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTF7__VIU_D3 0x019c 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTF7__FXIO1_D3 0x019c 0x0210 0x2 0x2 +#define IMX7ULP_PAD_PTF7__LPUART5_RX 0x019c 0x0254 0x4 0x3 +#define IMX7ULP_PAD_PTF7__TPM5_CH1 0x019c 0x02c8 0x6 0x3 +#define IMX7ULP_PAD_PTF7__FB_AD20 0x019c 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTF8__PTF8 0x01a0 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTF8__USB1_ULPI_CLK 0x01a0 0x0000 0xb 0x0 +#define IMX7ULP_PAD_PTF8__VIU_D4 0x01a0 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTF8__FXIO1_D4 0x01a0 
0x0214 0x2 0x2 +#define IMX7ULP_PAD_PTF8__LPSPI2_SIN 0x01a0 0x02b0 0x3 0x3 +#define IMX7ULP_PAD_PTF8__LPUART6_CTS_B 0x01a0 0x025c 0x4 0x3 +#define IMX7ULP_PAD_PTF8__LPI2C6_SCL 0x01a0 0x02fc 0x5 0x3 +#define IMX7ULP_PAD_PTF8__TPM5_CLKIN 0x01a0 0x02cc 0x6 0x3 +#define IMX7ULP_PAD_PTF8__FB_AD21 0x01a0 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTF9__PTF9 0x01a4 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTF9__USB1_ULPI_NXT 0x01a4 0x0000 0xb 0x0 +#define IMX7ULP_PAD_PTF9__VIU_D5 0x01a4 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTF9__FXIO1_D5 0x01a4 0x0218 0x2 0x2 +#define IMX7ULP_PAD_PTF9__LPSPI2_SOUT 0x01a4 0x02b4 0x3 0x3 +#define IMX7ULP_PAD_PTF9__LPUART6_RTS_B 0x01a4 0x0000 0x4 0x0 +#define IMX7ULP_PAD_PTF9__LPI2C6_SDA 0x01a4 0x0300 0x5 0x3 +#define IMX7ULP_PAD_PTF9__TPM5_CH0 0x01a4 0x02c4 0x6 0x3 +#define IMX7ULP_PAD_PTF9__FB_AD22 0x01a4 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTF10__PTF10 0x01a8 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTF10__USB1_ULPI_STP 0x01a8 0x0000 0xb 0x0 +#define IMX7ULP_PAD_PTF10__VIU_D6 0x01a8 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTF10__FXIO1_D6 0x01a8 0x021c 0x2 0x2 +#define IMX7ULP_PAD_PTF10__LPSPI2_SCK 0x01a8 0x02ac 0x3 0x3 +#define IMX7ULP_PAD_PTF10__LPUART6_TX 0x01a8 0x0264 0x4 0x3 +#define IMX7ULP_PAD_PTF10__LPI2C6_HREQ 0x01a8 0x02f8 0x5 0x3 +#define IMX7ULP_PAD_PTF10__TPM7_CH3 0x01a8 0x02e8 0x6 0x3 +#define IMX7ULP_PAD_PTF10__FB_AD23 0x01a8 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTF11__PTF11 0x01ac 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTF11__USB1_ULPI_DIR 0x01ac 0x0000 0xb 0x0 +#define IMX7ULP_PAD_PTF11__VIU_D7 0x01ac 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTF11__FXIO1_D7 0x01ac 0x0220 0x2 0x2 +#define IMX7ULP_PAD_PTF11__LPSPI2_PCS0 0x01ac 0x029c 0x3 0x3 +#define IMX7ULP_PAD_PTF11__LPUART6_RX 0x01ac 0x0260 0x4 0x3 +#define IMX7ULP_PAD_PTF11__TPM7_CH4 0x01ac 0x02ec 0x6 0x3 +#define IMX7ULP_PAD_PTF11__FB_CS4_B_FB_TSIZ0_FB_BE31_24_BLS7_0_B 0x01ac 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTF12__PTF12 0x01b0 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTF12__USB1_ULPI_DATA0 0x01b0 0x0000 0xb 0x0 +#define IMX7ULP_PAD_PTF12__VIU_D8 0x01b0 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTF12__FXIO1_D8 0x01b0 0x0224 0x2 0x2 +#define IMX7ULP_PAD_PTF12__LPSPI3_PCS1 0x01b0 0x0314 0x3 0x3 +#define IMX7ULP_PAD_PTF12__LPUART7_CTS_B 0x01b0 0x0268 0x4 0x3 +#define IMX7ULP_PAD_PTF12__LPI2C7_SCL 0x01b0 0x0308 0x5 0x3 +#define IMX7ULP_PAD_PTF12__TPM7_CH5 0x01b0 0x02f0 0x6 0x3 +#define IMX7ULP_PAD_PTF12__FB_AD24 0x01b0 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTF13__PTF13 0x01b4 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTF13__USB1_ULPI_DATA1 0x01b4 0x0000 0xb 0x0 +#define IMX7ULP_PAD_PTF13__VIU_D9 0x01b4 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTF13__FXIO1_D9 0x01b4 0x0228 0x2 0x2 +#define IMX7ULP_PAD_PTF13__LPSPI3_PCS2 0x01b4 0x0318 0x3 0x3 +#define IMX7ULP_PAD_PTF13__LPUART7_RTS_B 0x01b4 0x0000 0x4 0x0 +#define IMX7ULP_PAD_PTF13__LPI2C7_SDA 0x01b4 0x030c 0x5 0x3 +#define IMX7ULP_PAD_PTF13__TPM7_CLKIN 0x01b4 0x02f4 0x6 0x3 +#define IMX7ULP_PAD_PTF13__FB_AD25 0x01b4 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTF14__PTF14 0x01b8 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTF14__USB1_ULPI_DATA2 0x01b8 0x0000 0xb 0x0 +#define IMX7ULP_PAD_PTF14__VIU_D10 0x01b8 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTF14__FXIO1_D10 0x01b8 0x022c 0x2 0x2 +#define IMX7ULP_PAD_PTF14__LPSPI3_PCS3 0x01b8 0x031c 0x3 0x3 +#define IMX7ULP_PAD_PTF14__LPUART7_TX 0x01b8 0x0270 0x4 0x3 +#define IMX7ULP_PAD_PTF14__LPI2C7_HREQ 0x01b8 0x0304 0x5 0x3 +#define IMX7ULP_PAD_PTF14__TPM7_CH0 0x01b8 0x02dc 0x6 0x3 +#define IMX7ULP_PAD_PTF14__FB_AD26 0x01b8 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTF15__PTF15 
0x01bc 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTF15__USB1_ULPI_DATA3 0x01bc 0x0000 0xb 0x0 +#define IMX7ULP_PAD_PTF15__VIU_D11 0x01bc 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTF15__FXIO1_D11 0x01bc 0x0230 0x2 0x2 +#define IMX7ULP_PAD_PTF15__LPUART7_RX 0x01bc 0x026c 0x4 0x3 +#define IMX7ULP_PAD_PTF15__TPM7_CH1 0x01bc 0x02e0 0x6 0x3 +#define IMX7ULP_PAD_PTF15__FB_AD27 0x01bc 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTF16__PTF16 0x01c0 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTF16__USB1_ULPI_DATA4 0x01c0 0x0000 0xb 0x0 +#define IMX7ULP_PAD_PTF16__VIU_D12 0x01c0 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTF16__FXIO1_D12 0x01c0 0x0234 0x2 0x2 +#define IMX7ULP_PAD_PTF16__LPSPI3_SIN 0x01c0 0x0324 0x3 0x3 +#define IMX7ULP_PAD_PTF16__TPM7_CH2 0x01c0 0x02e4 0x6 0x3 +#define IMX7ULP_PAD_PTF16__FB_AD28 0x01c0 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTF17__PTF17 0x01c4 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTF17__USB1_ULPI_DATA5 0x01c4 0x0000 0xb 0x0 +#define IMX7ULP_PAD_PTF17__VIU_D13 0x01c4 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTF17__FXIO1_D13 0x01c4 0x0238 0x2 0x2 +#define IMX7ULP_PAD_PTF17__LPSPI3_SOUT 0x01c4 0x0328 0x3 0x3 +#define IMX7ULP_PAD_PTF17__TPM6_CLKIN 0x01c4 0x02d8 0x6 0x3 +#define IMX7ULP_PAD_PTF17__FB_AD29 0x01c4 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTF18__PTF18 0x01c8 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTF18__USB1_ULPI_DATA6 0x01c8 0x0000 0xb 0x0 +#define IMX7ULP_PAD_PTF18__VIU_D14 0x01c8 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTF18__FXIO1_D14 0x01c8 0x023c 0x2 0x2 +#define IMX7ULP_PAD_PTF18__LPSPI3_SCK 0x01c8 0x0320 0x3 0x3 +#define IMX7ULP_PAD_PTF18__TPM6_CH0 0x01c8 0x02d0 0x6 0x3 +#define IMX7ULP_PAD_PTF18__FB_AD30 0x01c8 0x0000 0x9 0x0 +#define IMX7ULP_PAD_PTF19__PTF19 0x01cc 0x0000 0x1 0x0 +#define IMX7ULP_PAD_PTF19__USB1_ULPI_DATA7 0x01cc 0x0000 0xb 0x0 +#define IMX7ULP_PAD_PTF19__VIU_D15 0x01cc 0x0000 0xc 0x0 +#define IMX7ULP_PAD_PTF19__FXIO1_D15 0x01cc 0x0240 0x2 0x2 +#define IMX7ULP_PAD_PTF19__LPSPI3_PCS0 0x01cc 0x0310 0x3 0x3 +#define IMX7ULP_PAD_PTF19__TPM6_CH1 0x01cc 0x02d4 0x6 0x3 +#define IMX7ULP_PAD_PTF19__FB_AD31 0x01cc 0x0000 0x9 0x0 + +#endif /* __DTS_IMX7ULP_PINFUNC_H */ diff --git a/arch/arm/boot/dts/ls1021a.dtsi b/arch/arm/boot/dts/ls1021a.dtsi index 7bb9df2c1460..9319e1f0f1d8 100644 --- a/arch/arm/boot/dts/ls1021a.dtsi +++ b/arch/arm/boot/dts/ls1021a.dtsi @@ -129,14 +129,14 @@ }; msi1: msi-controller@1570e00 { - compatible = "fsl,1s1021a-msi"; + compatible = "fsl,ls1021a-msi"; reg = <0x0 0x1570e00 0x0 0x8>; msi-controller; interrupts = <GIC_SPI 179 IRQ_TYPE_LEVEL_HIGH>; }; msi2: msi-controller@1570e08 { - compatible = "fsl,1s1021a-msi"; + compatible = "fsl,ls1021a-msi"; reg = <0x0 0x1570e08 0x0 0x8>; msi-controller; interrupts = <GIC_SPI 180 IRQ_TYPE_LEVEL_HIGH>; @@ -699,7 +699,7 @@ bus-range = <0x0 0xff>; ranges = <0x81000000 0x0 0x00000000 0x40 0x00010000 0x0 0x00010000 /* downstream I/O */ 0x82000000 0x0 0x40000000 0x40 0x40000000 0x0 0x40000000>; /* non-prefetchable memory */ - msi-parent = <&msi1>; + msi-parent = <&msi1>, <&msi2>; #interrupt-cells = <1>; interrupt-map-mask = <0 0 0 7>; interrupt-map = <0000 0 0 1 &gic GIC_SPI 91 IRQ_TYPE_LEVEL_HIGH>, @@ -722,7 +722,7 @@ bus-range = <0x0 0xff>; ranges = <0x81000000 0x0 0x00000000 0x48 0x00010000 0x0 0x00010000 /* downstream I/O */ 0x82000000 0x0 0x40000000 0x48 0x40000000 0x0 0x40000000>; /* non-prefetchable memory */ - msi-parent = <&msi2>; + msi-parent = <&msi1>, <&msi2>; #interrupt-cells = <1>; interrupt-map-mask = <0 0 0 7>; interrupt-map = <0000 0 0 1 &gic GIC_SPI 92 IRQ_TYPE_LEVEL_HIGH>, diff --git a/arch/arm/boot/dts/rk3228-evb.dts 
b/arch/arm/boot/dts/rk3228-evb.dts index 58834330a5ba..1be9daacc4f9 100644 --- a/arch/arm/boot/dts/rk3228-evb.dts +++ b/arch/arm/boot/dts/rk3228-evb.dts @@ -50,6 +50,16 @@ device_type = "memory"; reg = <0x60000000 0x40000000>; }; + + vcc_phy: vcc-phy-regulator { + compatible = "regulator-fixed"; + enable-active-high; + regulator-name = "vcc_phy"; + regulator-min-microvolt = <1800000>; + regulator-max-microvolt = <1800000>; + regulator-always-on; + regulator-boot-on; + }; }; &emmc { @@ -60,6 +70,30 @@ status = "okay"; }; +&gmac { + assigned-clocks = <&cru SCLK_MAC_SRC>; + assigned-clock-rates = <50000000>; + clock_in_out = "output"; + phy-supply = <&vcc_phy>; + phy-mode = "rmii"; + phy-handle = <&phy>; + status = "okay"; + + mdio { + compatible = "snps,dwmac-mdio"; + #address-cells = <1>; + #size-cells = <0>; + + phy: phy@0 { + compatible = "ethernet-phy-id1234.d400", "ethernet-phy-ieee802.3-c22"; + reg = <0>; + clocks = <&cru SCLK_MAC_PHY>; + resets = <&cru SRST_MACPHY>; + phy-is-integrated; + }; + }; +}; + &tsadc { status = "okay"; diff --git a/arch/arm/boot/dts/sun8i-h2-plus-orangepi-zero.dts b/arch/arm/boot/dts/sun8i-h2-plus-orangepi-zero.dts index 6713d0f2b3f4..b1502df7b509 100644 --- a/arch/arm/boot/dts/sun8i-h2-plus-orangepi-zero.dts +++ b/arch/arm/boot/dts/sun8i-h2-plus-orangepi-zero.dts @@ -56,8 +56,6 @@ aliases { serial0 = &uart0; - /* ethernet0 is the H3 emac, defined in sun8i-h3.dtsi */ - ethernet0 = &emac; ethernet1 = &xr819; }; @@ -104,13 +102,6 @@ status = "okay"; }; -&emac { - phy-handle = <&int_mii_phy>; - phy-mode = "mii"; - allwinner,leds-active-low; - status = "okay"; -}; - &mmc0 { pinctrl-names = "default"; pinctrl-0 = <&mmc0_pins_a>; diff --git a/arch/arm/boot/dts/sun8i-h3-bananapi-m2-plus.dts b/arch/arm/boot/dts/sun8i-h3-bananapi-m2-plus.dts index d756ff825116..a337af1de322 100644 --- a/arch/arm/boot/dts/sun8i-h3-bananapi-m2-plus.dts +++ b/arch/arm/boot/dts/sun8i-h3-bananapi-m2-plus.dts @@ -52,7 +52,6 @@ compatible = "sinovoip,bpi-m2-plus", "allwinner,sun8i-h3"; aliases { - ethernet0 = &emac; serial0 = &uart0; serial1 = &uart1; }; @@ -115,30 +114,12 @@ status = "okay"; }; -&emac { - pinctrl-names = "default"; - pinctrl-0 = <&emac_rgmii_pins>; - phy-supply = <®_gmac_3v3>; - phy-handle = <&ext_rgmii_phy>; - phy-mode = "rgmii"; - - allwinner,leds-active-low; - status = "okay"; -}; - &ir { pinctrl-names = "default"; pinctrl-0 = <&ir_pins_a>; status = "okay"; }; -&mdio { - ext_rgmii_phy: ethernet-phy@1 { - compatible = "ethernet-phy-ieee802.3-c22"; - reg = <0>; - }; -}; - &mmc0 { pinctrl-names = "default"; pinctrl-0 = <&mmc0_pins_a>, <&mmc0_cd_pin>; diff --git a/arch/arm/boot/dts/sun8i-h3-nanopi-neo.dts b/arch/arm/boot/dts/sun8i-h3-nanopi-neo.dts index 78f6c24952dd..8d2cc6e9a03f 100644 --- a/arch/arm/boot/dts/sun8i-h3-nanopi-neo.dts +++ b/arch/arm/boot/dts/sun8i-h3-nanopi-neo.dts @@ -46,10 +46,3 @@ model = "FriendlyARM NanoPi NEO"; compatible = "friendlyarm,nanopi-neo", "allwinner,sun8i-h3"; }; - -&emac { - phy-handle = <&int_mii_phy>; - phy-mode = "mii"; - allwinner,leds-active-low; - status = "okay"; -}; diff --git a/arch/arm/boot/dts/sun8i-h3-orangepi-2.dts b/arch/arm/boot/dts/sun8i-h3-orangepi-2.dts index 17cdeae19c6f..8ff71b1bb45b 100644 --- a/arch/arm/boot/dts/sun8i-h3-orangepi-2.dts +++ b/arch/arm/boot/dts/sun8i-h3-orangepi-2.dts @@ -54,7 +54,6 @@ aliases { serial0 = &uart0; /* ethernet0 is the H3 emac, defined in sun8i-h3.dtsi */ - ethernet0 = &emac; ethernet1 = &rtl8189; }; @@ -118,13 +117,6 @@ status = "okay"; }; -&emac { - phy-handle = <&int_mii_phy>; - phy-mode 
= "mii"; - allwinner,leds-active-low; - status = "okay"; -}; - &ir { pinctrl-names = "default"; pinctrl-0 = <&ir_pins_a>; diff --git a/arch/arm/boot/dts/sun8i-h3-orangepi-one.dts b/arch/arm/boot/dts/sun8i-h3-orangepi-one.dts index 6880268e8b87..5fea430e0eb1 100644 --- a/arch/arm/boot/dts/sun8i-h3-orangepi-one.dts +++ b/arch/arm/boot/dts/sun8i-h3-orangepi-one.dts @@ -52,7 +52,6 @@ compatible = "xunlong,orangepi-one", "allwinner,sun8i-h3"; aliases { - ethernet0 = &emac; serial0 = &uart0; }; @@ -98,13 +97,6 @@ status = "okay"; }; -&emac { - phy-handle = <&int_mii_phy>; - phy-mode = "mii"; - allwinner,leds-active-low; - status = "okay"; -}; - &mmc0 { pinctrl-names = "default"; pinctrl-0 = <&mmc0_pins_a>, <&mmc0_cd_pin>; diff --git a/arch/arm/boot/dts/sun8i-h3-orangepi-pc-plus.dts b/arch/arm/boot/dts/sun8i-h3-orangepi-pc-plus.dts index a10281b455f5..8b93f5c781a7 100644 --- a/arch/arm/boot/dts/sun8i-h3-orangepi-pc-plus.dts +++ b/arch/arm/boot/dts/sun8i-h3-orangepi-pc-plus.dts @@ -53,11 +53,6 @@ }; }; -&emac { - /* LEDs changed to active high on the plus */ - /delete-property/ allwinner,leds-active-low; -}; - &mmc1 { pinctrl-names = "default"; pinctrl-0 = <&mmc1_pins_a>; diff --git a/arch/arm/boot/dts/sun8i-h3-orangepi-pc.dts b/arch/arm/boot/dts/sun8i-h3-orangepi-pc.dts index 998b60f8d295..1a044b17d6c6 100644 --- a/arch/arm/boot/dts/sun8i-h3-orangepi-pc.dts +++ b/arch/arm/boot/dts/sun8i-h3-orangepi-pc.dts @@ -52,7 +52,6 @@ compatible = "xunlong,orangepi-pc", "allwinner,sun8i-h3"; aliases { - ethernet0 = &emac; serial0 = &uart0; }; @@ -114,13 +113,6 @@ status = "okay"; }; -&emac { - phy-handle = <&int_mii_phy>; - phy-mode = "mii"; - allwinner,leds-active-low; - status = "okay"; -}; - &ir { pinctrl-names = "default"; pinctrl-0 = <&ir_pins_a>; diff --git a/arch/arm/boot/dts/sun8i-h3-orangepi-plus.dts b/arch/arm/boot/dts/sun8i-h3-orangepi-plus.dts index 331ed683ac62..828ae7a526d9 100644 --- a/arch/arm/boot/dts/sun8i-h3-orangepi-plus.dts +++ b/arch/arm/boot/dts/sun8i-h3-orangepi-plus.dts @@ -47,10 +47,6 @@ model = "Xunlong Orange Pi Plus / Plus 2"; compatible = "xunlong,orangepi-plus", "allwinner,sun8i-h3"; - aliases { - ethernet0 = &emac; - }; - reg_gmac_3v3: gmac-3v3 { compatible = "regulator-fixed"; regulator-name = "gmac-3v3"; @@ -78,24 +74,6 @@ status = "okay"; }; -&emac { - pinctrl-names = "default"; - pinctrl-0 = <&emac_rgmii_pins>; - phy-supply = <®_gmac_3v3>; - phy-handle = <&ext_rgmii_phy>; - phy-mode = "rgmii"; - - allwinner,leds-active-low; - status = "okay"; -}; - -&mdio { - ext_rgmii_phy: ethernet-phy@1 { - compatible = "ethernet-phy-ieee802.3-c22"; - reg = <0>; - }; -}; - &mmc2 { pinctrl-names = "default"; pinctrl-0 = <&mmc2_8bit_pins>; diff --git a/arch/arm/boot/dts/sun8i-h3-orangepi-plus2e.dts b/arch/arm/boot/dts/sun8i-h3-orangepi-plus2e.dts index 80026f3caafc..97920b12a944 100644 --- a/arch/arm/boot/dts/sun8i-h3-orangepi-plus2e.dts +++ b/arch/arm/boot/dts/sun8i-h3-orangepi-plus2e.dts @@ -61,19 +61,3 @@ gpio = <&pio 3 6 GPIO_ACTIVE_HIGH>; /* PD6 */ }; }; - -&emac { - pinctrl-names = "default"; - pinctrl-0 = <&emac_rgmii_pins>; - phy-supply = <®_gmac_3v3>; - phy-handle = <&ext_rgmii_phy>; - phy-mode = "rgmii"; - status = "okay"; -}; - -&mdio { - ext_rgmii_phy: ethernet-phy@1 { - compatible = "ethernet-phy-ieee802.3-c22"; - reg = <1>; - }; -}; diff --git a/arch/arm/boot/dts/sunxi-h3-h5.dtsi b/arch/arm/boot/dts/sunxi-h3-h5.dtsi index d38282b9e5d4..11240a8313c2 100644 --- a/arch/arm/boot/dts/sunxi-h3-h5.dtsi +++ b/arch/arm/boot/dts/sunxi-h3-h5.dtsi @@ -391,32 +391,6 @@ clocks = <&osc24M>; 
}; - emac: ethernet@1c30000 { - compatible = "allwinner,sun8i-h3-emac"; - syscon = <&syscon>; - reg = <0x01c30000 0x10000>; - interrupts = <GIC_SPI 82 IRQ_TYPE_LEVEL_HIGH>; - interrupt-names = "macirq"; - resets = <&ccu RST_BUS_EMAC>; - reset-names = "stmmaceth"; - clocks = <&ccu CLK_BUS_EMAC>; - clock-names = "stmmaceth"; - #address-cells = <1>; - #size-cells = <0>; - status = "disabled"; - - mdio: mdio { - #address-cells = <1>; - #size-cells = <0>; - int_mii_phy: ethernet-phy@1 { - compatible = "ethernet-phy-ieee802.3-c22"; - reg = <1>; - clocks = <&ccu CLK_BUS_EPHY>; - resets = <&ccu RST_BUS_EPHY>; - }; - }; - }; - spi0: spi@01c68000 { compatible = "allwinner,sun8i-h3-spi"; reg = <0x01c68000 0x1000>; diff --git a/arch/arm/boot/dts/tango4-smp8758.dtsi b/arch/arm/boot/dts/tango4-smp8758.dtsi index d2e65c46bcc7..eca33d568690 100644 --- a/arch/arm/boot/dts/tango4-smp8758.dtsi +++ b/arch/arm/boot/dts/tango4-smp8758.dtsi @@ -13,7 +13,6 @@ reg = <0>; clocks = <&clkgen CPU_CLK>; clock-latency = <1>; - operating-points = <1215000 0 607500 0 405000 0 243000 0 135000 0>; }; cpu1: cpu@1 { diff --git a/arch/arm/configs/multi_v7_defconfig b/arch/arm/configs/multi_v7_defconfig index 4d19c1b4b8e7..94d7e71c69c4 100644 --- a/arch/arm/configs/multi_v7_defconfig +++ b/arch/arm/configs/multi_v7_defconfig @@ -270,6 +270,7 @@ CONFIG_ICPLUS_PHY=y CONFIG_REALTEK_PHY=y CONFIG_MICREL_PHY=y CONFIG_FIXED_PHY=y +CONFIG_ROCKCHIP_PHY=y CONFIG_USB_PEGASUS=y CONFIG_USB_RTL8152=m CONFIG_USB_USBNET=y diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig index b9adedcc5b2e..ec72752d5668 100644 --- a/arch/arm/crypto/Kconfig +++ b/arch/arm/crypto/Kconfig @@ -94,14 +94,15 @@ config CRYPTO_AES_ARM_CE ARMv8 Crypto Extensions config CRYPTO_GHASH_ARM_CE - tristate "PMULL-accelerated GHASH using ARMv8 Crypto Extensions" + tristate "PMULL-accelerated GHASH using NEON/ARMv8 Crypto Extensions" depends on KERNEL_MODE_NEON select CRYPTO_HASH select CRYPTO_CRYPTD help Use an implementation of GHASH (used by the GCM AEAD chaining mode) that uses the 64x64 to 128 bit polynomial multiplication (vmull.p64) - that is part of the ARMv8 Crypto Extensions + that is part of the ARMv8 Crypto Extensions, or a slower variant that + uses the vmull.p8 instruction that is part of the basic NEON ISA. 
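
The Kconfig help text above distinguishes two GHASH paths: a single vmull.p64 (64x64 -> 128 bit carryless multiply) on CPUs with the ARMv8 Crypto Extensions, and a slower fallback that reassembles the same product from vmull.p8 (8x8 -> 16 bit) partial multiplies. As a minimal sketch only — clmul64 is an illustrative name, not a kernel symbol — the following portable C shows what one 64x64 -> 128 bit polynomial multiplication computes, so the help text and the __pmull_p8 macro further down in this patch are easier to follow:

    /*
     * Reference model of a 64x64 -> 128 bit carryless (polynomial)
     * multiplication: bit i of 'b' selects 'a << i', and partial
     * products are combined with XOR instead of addition, so no
     * carries propagate.  This is the operation a single vmull.p64
     * performs, and what the vmull.p8-based fallback reconstructs.
     */
    #include <stdint.h>

    static void clmul64(uint64_t a, uint64_t b, uint64_t *lo, uint64_t *hi)
    {
            uint64_t rl = 0, rh = 0;
            int i;

            for (i = 0; i < 64; i++) {
                    if ((b >> i) & 1) {
                            rl ^= a << i;
                            if (i)
                                    rh ^= a >> (64 - i);
                    }
            }
            *lo = rl;
            *hi = rh;
    }

GHASH itself is this multiply extended to GF(2^128) followed by reduction modulo the field polynomial; the __pmull_reduce_p64 and __pmull_reduce_p8 macros added later in this patch implement that reduction step for the two variants.
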
config CRYPTO_CRCT10DIF_ARM_CE tristate "CRCT10DIF digest algorithm using PMULL instructions" diff --git a/arch/arm/crypto/aes-ce-glue.c b/arch/arm/crypto/aes-ce-glue.c index 0f966a8ca1ce..d0a9cec73707 100644 --- a/arch/arm/crypto/aes-ce-glue.c +++ b/arch/arm/crypto/aes-ce-glue.c @@ -285,9 +285,7 @@ static int ctr_encrypt(struct skcipher_request *req) ce_aes_ctr_encrypt(tail, NULL, (u8 *)ctx->key_enc, num_rounds(ctx), blocks, walk.iv); - if (tdst != tsrc) - memcpy(tdst, tsrc, nbytes); - crypto_xor(tdst, tail, nbytes); + crypto_xor_cpy(tdst, tsrc, tail, nbytes); err = skcipher_walk_done(&walk, 0); } kernel_neon_end(); diff --git a/arch/arm/crypto/aes-cipher-core.S b/arch/arm/crypto/aes-cipher-core.S index c817a86c4ca8..54b384084637 100644 --- a/arch/arm/crypto/aes-cipher-core.S +++ b/arch/arm/crypto/aes-cipher-core.S @@ -10,6 +10,7 @@ */ #include <linux/linkage.h> +#include <asm/cache.h> .text .align 5 @@ -32,19 +33,19 @@ .endif .endm - .macro __load, out, in, idx + .macro __load, out, in, idx, sz, op .if __LINUX_ARM_ARCH__ < 7 && \idx > 0 - ldr \out, [ttab, \in, lsr #(8 * \idx) - 2] + ldr\op \out, [ttab, \in, lsr #(8 * \idx) - \sz] .else - ldr \out, [ttab, \in, lsl #2] + ldr\op \out, [ttab, \in, lsl #\sz] .endif .endm - .macro __hround, out0, out1, in0, in1, in2, in3, t3, t4, enc + .macro __hround, out0, out1, in0, in1, in2, in3, t3, t4, enc, sz, op __select \out0, \in0, 0 __select t0, \in1, 1 - __load \out0, \out0, 0 - __load t0, t0, 1 + __load \out0, \out0, 0, \sz, \op + __load t0, t0, 1, \sz, \op .if \enc __select \out1, \in1, 0 @@ -53,10 +54,10 @@ __select \out1, \in3, 0 __select t1, \in0, 1 .endif - __load \out1, \out1, 0 + __load \out1, \out1, 0, \sz, \op __select t2, \in2, 2 - __load t1, t1, 1 - __load t2, t2, 2 + __load t1, t1, 1, \sz, \op + __load t2, t2, 2, \sz, \op eor \out0, \out0, t0, ror #24 @@ -68,9 +69,9 @@ __select \t3, \in1, 2 __select \t4, \in2, 3 .endif - __load \t3, \t3, 2 - __load t0, t0, 3 - __load \t4, \t4, 3 + __load \t3, \t3, 2, \sz, \op + __load t0, t0, 3, \sz, \op + __load \t4, \t4, 3, \sz, \op eor \out1, \out1, t1, ror #24 eor \out0, \out0, t2, ror #16 @@ -82,14 +83,14 @@ eor \out1, \out1, t2 .endm - .macro fround, out0, out1, out2, out3, in0, in1, in2, in3 - __hround \out0, \out1, \in0, \in1, \in2, \in3, \out2, \out3, 1 - __hround \out2, \out3, \in2, \in3, \in0, \in1, \in1, \in2, 1 + .macro fround, out0, out1, out2, out3, in0, in1, in2, in3, sz=2, op + __hround \out0, \out1, \in0, \in1, \in2, \in3, \out2, \out3, 1, \sz, \op + __hround \out2, \out3, \in2, \in3, \in0, \in1, \in1, \in2, 1, \sz, \op .endm - .macro iround, out0, out1, out2, out3, in0, in1, in2, in3 - __hround \out0, \out1, \in0, \in3, \in2, \in1, \out2, \out3, 0 - __hround \out2, \out3, \in2, \in1, \in0, \in3, \in1, \in0, 0 + .macro iround, out0, out1, out2, out3, in0, in1, in2, in3, sz=2, op + __hround \out0, \out1, \in0, \in3, \in2, \in1, \out2, \out3, 0, \sz, \op + __hround \out2, \out3, \in2, \in1, \in0, \in3, \in1, \in0, 0, \sz, \op .endm .macro __rev, out, in @@ -114,7 +115,7 @@ .endif .endm - .macro do_crypt, round, ttab, ltab + .macro do_crypt, round, ttab, ltab, bsz push {r3-r11, lr} ldr r4, [in] @@ -146,9 +147,12 @@ 1: subs rounds, rounds, #4 \round r8, r9, r10, r11, r4, r5, r6, r7 - __adrl ttab, \ltab, ls + bls 2f \round r4, r5, r6, r7, r8, r9, r10, r11 - bhi 0b + b 0b + +2: __adrl ttab, \ltab + \round r4, r5, r6, r7, r8, r9, r10, r11, \bsz, b #ifdef CONFIG_CPU_BIG_ENDIAN __rev r4, r4 @@ -170,10 +174,48 @@ .ltorg .endm + .align L1_CACHE_SHIFT + .type __aes_arm_inverse_sbox, %object 
+__aes_arm_inverse_sbox: + .byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38 + .byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb + .byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87 + .byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb + .byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d + .byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e + .byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2 + .byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25 + .byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16 + .byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92 + .byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda + .byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84 + .byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a + .byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06 + .byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02 + .byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b + .byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea + .byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73 + .byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85 + .byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e + .byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89 + .byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b + .byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20 + .byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4 + .byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31 + .byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f + .byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d + .byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef + .byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0 + .byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61 + .byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26 + .byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d + .size __aes_arm_inverse_sbox, . - __aes_arm_inverse_sbox + ENTRY(__aes_arm_encrypt) - do_crypt fround, crypto_ft_tab, crypto_fl_tab + do_crypt fround, crypto_ft_tab, crypto_ft_tab + 1, 2 ENDPROC(__aes_arm_encrypt) + .align 5 ENTRY(__aes_arm_decrypt) - do_crypt iround, crypto_it_tab, crypto_il_tab + do_crypt iround, crypto_it_tab, __aes_arm_inverse_sbox, 0 ENDPROC(__aes_arm_decrypt) diff --git a/arch/arm/crypto/aes-neonbs-glue.c b/arch/arm/crypto/aes-neonbs-glue.c index c76377961444..18768f330449 100644 --- a/arch/arm/crypto/aes-neonbs-glue.c +++ b/arch/arm/crypto/aes-neonbs-glue.c @@ -221,9 +221,8 @@ static int ctr_encrypt(struct skcipher_request *req) u8 *dst = walk.dst.virt.addr + blocks * AES_BLOCK_SIZE; u8 *src = walk.src.virt.addr + blocks * AES_BLOCK_SIZE; - if (dst != src) - memcpy(dst, src, walk.total % AES_BLOCK_SIZE); - crypto_xor(dst, final, walk.total % AES_BLOCK_SIZE); + crypto_xor_cpy(dst, src, final, + walk.total % AES_BLOCK_SIZE); err = skcipher_walk_done(&walk, 0); break; diff --git a/arch/arm/crypto/ghash-ce-core.S b/arch/arm/crypto/ghash-ce-core.S index f6ab8bcc9efe..2f78c10b1881 100644 --- a/arch/arm/crypto/ghash-ce-core.S +++ b/arch/arm/crypto/ghash-ce-core.S @@ -1,7 +1,7 @@ /* - * Accelerated GHASH implementation with ARMv8 vmull.p64 instructions. + * Accelerated GHASH implementation with NEON/ARMv8 vmull.p8/64 instructions. * - * Copyright (C) 2015 Linaro Ltd. <ard.biesheuvel@linaro.org> + * Copyright (C) 2015 - 2017 Linaro Ltd. 
<ard.biesheuvel@linaro.org> * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 as published @@ -12,40 +12,162 @@ #include <asm/assembler.h> SHASH .req q0 - SHASH2 .req q1 - T1 .req q2 - T2 .req q3 - MASK .req q4 - XL .req q5 - XM .req q6 - XH .req q7 - IN1 .req q7 + T1 .req q1 + XL .req q2 + XM .req q3 + XH .req q4 + IN1 .req q4 SHASH_L .req d0 SHASH_H .req d1 - SHASH2_L .req d2 - T1_L .req d4 - MASK_L .req d8 - XL_L .req d10 - XL_H .req d11 - XM_L .req d12 - XM_H .req d13 - XH_L .req d14 + T1_L .req d2 + T1_H .req d3 + XL_L .req d4 + XL_H .req d5 + XM_L .req d6 + XM_H .req d7 + XH_L .req d8 + + t0l .req d10 + t0h .req d11 + t1l .req d12 + t1h .req d13 + t2l .req d14 + t2h .req d15 + t3l .req d16 + t3h .req d17 + t4l .req d18 + t4h .req d19 + + t0q .req q5 + t1q .req q6 + t2q .req q7 + t3q .req q8 + t4q .req q9 + T2 .req q9 + + s1l .req d20 + s1h .req d21 + s2l .req d22 + s2h .req d23 + s3l .req d24 + s3h .req d25 + s4l .req d26 + s4h .req d27 + + MASK .req d28 + SHASH2_p8 .req d28 + + k16 .req d29 + k32 .req d30 + k48 .req d31 + SHASH2_p64 .req d31 .text .fpu crypto-neon-fp-armv8 + .macro __pmull_p64, rd, rn, rm, b1, b2, b3, b4 + vmull.p64 \rd, \rn, \rm + .endm + /* - * void pmull_ghash_update(int blocks, u64 dg[], const char *src, - * struct ghash_key const *k, const char *head) + * This implementation of 64x64 -> 128 bit polynomial multiplication + * using vmull.p8 instructions (8x8 -> 16) is taken from the paper + * "Fast Software Polynomial Multiplication on ARM Processors Using + * the NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and + * Ricardo Dahab (https://hal.inria.fr/hal-01506572) + * + * It has been slightly tweaked for in-order performance, and to allow + * 'rq' to overlap with 'ad' or 'bd'. */ -ENTRY(pmull_ghash_update) - vld1.64 {SHASH}, [r3] + .macro __pmull_p8, rq, ad, bd, b1=t4l, b2=t3l, b3=t4l, b4=t3l + vext.8 t0l, \ad, \ad, #1 @ A1 + .ifc \b1, t4l + vext.8 t4l, \bd, \bd, #1 @ B1 + .endif + vmull.p8 t0q, t0l, \bd @ F = A1*B + vext.8 t1l, \ad, \ad, #2 @ A2 + vmull.p8 t4q, \ad, \b1 @ E = A*B1 + .ifc \b2, t3l + vext.8 t3l, \bd, \bd, #2 @ B2 + .endif + vmull.p8 t1q, t1l, \bd @ H = A2*B + vext.8 t2l, \ad, \ad, #3 @ A3 + vmull.p8 t3q, \ad, \b2 @ G = A*B2 + veor t0q, t0q, t4q @ L = E + F + .ifc \b3, t4l + vext.8 t4l, \bd, \bd, #3 @ B3 + .endif + vmull.p8 t2q, t2l, \bd @ J = A3*B + veor t0l, t0l, t0h @ t0 = (L) (P0 + P1) << 8 + veor t1q, t1q, t3q @ M = G + H + .ifc \b4, t3l + vext.8 t3l, \bd, \bd, #4 @ B4 + .endif + vmull.p8 t4q, \ad, \b3 @ I = A*B3 + veor t1l, t1l, t1h @ t1 = (M) (P2 + P3) << 16 + vmull.p8 t3q, \ad, \b4 @ K = A*B4 + vand t0h, t0h, k48 + vand t1h, t1h, k32 + veor t2q, t2q, t4q @ N = I + J + veor t0l, t0l, t0h + veor t1l, t1l, t1h + veor t2l, t2l, t2h @ t2 = (N) (P4 + P5) << 24 + vand t2h, t2h, k16 + veor t3l, t3l, t3h @ t3 = (K) (P6 + P7) << 32 + vmov.i64 t3h, #0 + vext.8 t0q, t0q, t0q, #15 + veor t2l, t2l, t2h + vext.8 t1q, t1q, t1q, #14 + vmull.p8 \rq, \ad, \bd @ D = A*B + vext.8 t2q, t2q, t2q, #13 + vext.8 t3q, t3q, t3q, #12 + veor t0q, t0q, t1q + veor t2q, t2q, t3q + veor \rq, \rq, t0q + veor \rq, \rq, t2q + .endm + + // + // PMULL (64x64->128) based reduction for CPUs that can do + // it in a single instruction. 
+ // + .macro __pmull_reduce_p64 + vmull.p64 T1, XL_L, MASK + + veor XH_L, XH_L, XM_H + vext.8 T1, T1, T1, #8 + veor XL_H, XL_H, XM_L + veor T1, T1, XL + + vmull.p64 XL, T1_H, MASK + .endm + + // + // Alternative reduction for CPUs that lack support for the + // 64x64->128 PMULL instruction + // + .macro __pmull_reduce_p8 + veor XL_H, XL_H, XM_L + veor XH_L, XH_L, XM_H + + vshl.i64 T1, XL, #57 + vshl.i64 T2, XL, #62 + veor T1, T1, T2 + vshl.i64 T2, XL, #63 + veor T1, T1, T2 + veor XL_H, XL_H, T1_L + veor XH_L, XH_L, T1_H + + vshr.u64 T1, XL, #1 + veor XH, XH, XL + veor XL, XL, T1 + vshr.u64 T1, T1, #6 + vshr.u64 XL, XL, #1 + .endm + + .macro ghash_update, pn vld1.64 {XL}, [r1] - vmov.i8 MASK, #0xe1 - vext.8 SHASH2, SHASH, SHASH, #8 - vshl.u64 MASK, MASK, #57 - veor SHASH2, SHASH2, SHASH /* do the head block first, if supplied */ ldr ip, [sp] @@ -62,33 +184,59 @@ ENTRY(pmull_ghash_update) #ifndef CONFIG_CPU_BIG_ENDIAN vrev64.8 T1, T1 #endif - vext.8 T2, XL, XL, #8 vext.8 IN1, T1, T1, #8 - veor T1, T1, T2 + veor T1_L, T1_L, XL_H veor XL, XL, IN1 - vmull.p64 XH, SHASH_H, XL_H @ a1 * b1 + __pmull_\pn XH, XL_H, SHASH_H, s1h, s2h, s3h, s4h @ a1 * b1 veor T1, T1, XL - vmull.p64 XL, SHASH_L, XL_L @ a0 * b0 - vmull.p64 XM, SHASH2_L, T1_L @ (a1 + a0)(b1 + b0) + __pmull_\pn XL, XL_L, SHASH_L, s1l, s2l, s3l, s4l @ a0 * b0 + __pmull_\pn XM, T1_L, SHASH2_\pn @ (a1+a0)(b1+b0) - vext.8 T1, XL, XH, #8 - veor T2, XL, XH + veor T1, XL, XH veor XM, XM, T1 - veor XM, XM, T2 - vmull.p64 T2, XL_L, MASK_L - vmov XH_L, XM_H - vmov XM_H, XL_L + __pmull_reduce_\pn - veor XL, XM, T2 - vext.8 T2, XL, XL, #8 - vmull.p64 XL, XL_L, MASK_L - veor T2, T2, XH - veor XL, XL, T2 + veor T1, T1, XH + veor XL, XL, T1 bne 0b vst1.64 {XL}, [r1] bx lr -ENDPROC(pmull_ghash_update) + .endm + + /* + * void pmull_ghash_update(int blocks, u64 dg[], const char *src, + * struct ghash_key const *k, const char *head) + */ +ENTRY(pmull_ghash_update_p64) + vld1.64 {SHASH}, [r3] + veor SHASH2_p64, SHASH_L, SHASH_H + + vmov.i8 MASK, #0xe1 + vshl.u64 MASK, MASK, #57 + + ghash_update p64 +ENDPROC(pmull_ghash_update_p64) + +ENTRY(pmull_ghash_update_p8) + vld1.64 {SHASH}, [r3] + veor SHASH2_p8, SHASH_L, SHASH_H + + vext.8 s1l, SHASH_L, SHASH_L, #1 + vext.8 s2l, SHASH_L, SHASH_L, #2 + vext.8 s3l, SHASH_L, SHASH_L, #3 + vext.8 s4l, SHASH_L, SHASH_L, #4 + vext.8 s1h, SHASH_H, SHASH_H, #1 + vext.8 s2h, SHASH_H, SHASH_H, #2 + vext.8 s3h, SHASH_H, SHASH_H, #3 + vext.8 s4h, SHASH_H, SHASH_H, #4 + + vmov.i64 k16, #0xffff + vmov.i64 k32, #0xffffffff + vmov.i64 k48, #0xffffffffffff + + ghash_update p8 +ENDPROC(pmull_ghash_update_p8) diff --git a/arch/arm/crypto/ghash-ce-glue.c b/arch/arm/crypto/ghash-ce-glue.c index 6bac8bea9f1e..d9bb52cae2ac 100644 --- a/arch/arm/crypto/ghash-ce-glue.c +++ b/arch/arm/crypto/ghash-ce-glue.c @@ -22,6 +22,7 @@ MODULE_DESCRIPTION("GHASH secure hash using ARMv8 Crypto Extensions"); MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); MODULE_LICENSE("GPL v2"); +MODULE_ALIAS_CRYPTO("ghash"); #define GHASH_BLOCK_SIZE 16 #define GHASH_DIGEST_SIZE 16 @@ -41,8 +42,17 @@ struct ghash_async_ctx { struct cryptd_ahash *cryptd_tfm; }; -asmlinkage void pmull_ghash_update(int blocks, u64 dg[], const char *src, - struct ghash_key const *k, const char *head); +asmlinkage void pmull_ghash_update_p64(int blocks, u64 dg[], const char *src, + struct ghash_key const *k, + const char *head); + +asmlinkage void pmull_ghash_update_p8(int blocks, u64 dg[], const char *src, + struct ghash_key const *k, + const char *head); + +static void 
(*pmull_ghash_update)(int blocks, u64 dg[], const char *src, + struct ghash_key const *k, + const char *head); static int ghash_init(struct shash_desc *desc) { @@ -312,6 +322,14 @@ static int __init ghash_ce_mod_init(void) { int err; + if (!(elf_hwcap & HWCAP_NEON)) + return -ENODEV; + + if (elf_hwcap2 & HWCAP2_PMULL) + pmull_ghash_update = pmull_ghash_update_p64; + else + pmull_ghash_update = pmull_ghash_update_p8; + err = crypto_register_shash(&ghash_alg); if (err) return err; @@ -332,5 +350,5 @@ static void __exit ghash_ce_mod_exit(void) crypto_unregister_shash(&ghash_alg); } -module_cpu_feature_match(PMULL, ghash_ce_mod_init); +module_init(ghash_ce_mod_init); module_exit(ghash_ce_mod_exit); diff --git a/arch/arm/include/asm/arch_gicv3.h b/arch/arm/include/asm/arch_gicv3.h index 27475904e096..eee269321923 100644 --- a/arch/arm/include/asm/arch_gicv3.h +++ b/arch/arm/include/asm/arch_gicv3.h @@ -276,6 +276,12 @@ static inline u64 __gic_readq_nonatomic(const volatile void __iomem *addr) #define gicr_write_pendbaser(v, c) __gic_writeq_nonatomic(v, c) /* + * GICR_xLPIR - only the lower bits are significant + */ +#define gic_read_lpir(c) readl_relaxed(c) +#define gic_write_lpir(v, c) writel_relaxed(lower_32_bits(v), c) + +/* * GITS_TYPER is an ID register and doesn't need atomicity. */ #define gits_read_typer(c) __gic_readq_nonatomic(c) @@ -291,5 +297,33 @@ static inline u64 __gic_readq_nonatomic(const volatile void __iomem *addr) */ #define gits_write_cwriter(v, c) __gic_writeq_nonatomic(v, c) +/* + * GITS_VPROPBASER - hi and lo bits may be accessed independently. + */ +#define gits_write_vpropbaser(v, c) __gic_writeq_nonatomic(v, c) + +/* + * GITS_VPENDBASER - the Valid bit must be cleared before changing + * anything else. + */ +static inline void gits_write_vpendbaser(u64 val, void * __iomem addr) +{ + u32 tmp; + + tmp = readl_relaxed(addr + 4); + if (tmp & (GICR_VPENDBASER_Valid >> 32)) { + tmp &= ~(GICR_VPENDBASER_Valid >> 32); + writel_relaxed(tmp, addr + 4); + } + + /* + * Use the fact that __gic_writeq_nonatomic writes the second + * half of the 64bit quantity after the first. 
+ */ + __gic_writeq_nonatomic(val, addr); +} + +#define gits_read_vpendbaser(c) __gic_readq_nonatomic(c) + #endif /* !__ASSEMBLY__ */ #endif /* !__ASM_ARCH_GICV3_H */ diff --git a/arch/arm/include/asm/futex.h b/arch/arm/include/asm/futex.h index 6795368ad023..cc414382dab4 100644 --- a/arch/arm/include/asm/futex.h +++ b/arch/arm/include/asm/futex.h @@ -128,20 +128,10 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, #endif /* !SMP */ static inline int -futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) +arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret, tmp; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; - #ifndef CONFIG_SMP preempt_disable(); #endif @@ -172,17 +162,9 @@ futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) preempt_enable(); #endif - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 127e2dd2e21c..4a879f6ff13b 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -225,12 +225,6 @@ int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices); int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); -/* We do not have shadow page tables, hence the empty hooks */ -static inline void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm, - unsigned long address) -{ -} - struct kvm_vcpu *kvm_arm_get_running_vcpu(void); struct kvm_vcpu __percpu **kvm_get_running_vcpus(void); void kvm_arm_halt_guest(struct kvm *kvm); diff --git a/arch/arm/include/asm/spinlock.h b/arch/arm/include/asm/spinlock.h index 4bec45442072..c030143c18c6 100644 --- a/arch/arm/include/asm/spinlock.h +++ b/arch/arm/include/asm/spinlock.h @@ -52,22 +52,6 @@ static inline void dsb_sev(void) * memory. 
*/ -static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - u16 owner = READ_ONCE(lock->tickets.owner); - - for (;;) { - arch_spinlock_t tmp = READ_ONCE(*lock); - - if (tmp.tickets.owner == tmp.tickets.next || - tmp.tickets.owner != owner) - break; - - wfe(); - } - smp_acquire__after_ctrl_dep(); -} - #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) static inline void arch_spin_lock(arch_spinlock_t *lock) diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h index 776757d1604a..1d468b527b7b 100644 --- a/arch/arm/include/asm/thread_info.h +++ b/arch/arm/include/asm/thread_info.h @@ -139,10 +139,11 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *, #define TIF_NEED_RESCHED 1 /* rescheduling necessary */ #define TIF_NOTIFY_RESUME 2 /* callback before returning to user */ #define TIF_UPROBE 3 /* breakpointed or singlestepping */ -#define TIF_SYSCALL_TRACE 4 /* syscall trace active */ -#define TIF_SYSCALL_AUDIT 5 /* syscall auditing active */ -#define TIF_SYSCALL_TRACEPOINT 6 /* syscall tracepoint instrumentation */ -#define TIF_SECCOMP 7 /* seccomp syscall filtering active */ +#define TIF_FSCHECK 4 /* Check FS is USER_DS on return */ +#define TIF_SYSCALL_TRACE 5 /* syscall trace active */ +#define TIF_SYSCALL_AUDIT 6 /* syscall auditing active */ +#define TIF_SYSCALL_TRACEPOINT 7 /* syscall tracepoint instrumentation */ +#define TIF_SECCOMP 8 /* seccomp syscall filtering active */ #define TIF_NOHZ 12 /* in adaptive nohz mode */ #define TIF_USING_IWMMXT 17 @@ -153,6 +154,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *, #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) #define _TIF_UPROBE (1 << TIF_UPROBE) +#define _TIF_FSCHECK (1 << TIF_FSCHECK) #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) @@ -166,8 +168,9 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *, /* * Change these and you break ASM code in entry-common.S */ -#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ - _TIF_NOTIFY_RESUME | _TIF_UPROBE) +#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ + _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ + _TIF_FSCHECK) #endif /* __KERNEL__ */ #endif /* __ASM_ARM_THREAD_INFO_H */ diff --git a/arch/arm/include/asm/traps.h b/arch/arm/include/asm/traps.h index f555bb3664dc..683d9230984a 100644 --- a/arch/arm/include/asm/traps.h +++ b/arch/arm/include/asm/traps.h @@ -18,7 +18,6 @@ struct undef_hook { void register_undef_hook(struct undef_hook *hook); void unregister_undef_hook(struct undef_hook *hook); -#ifdef CONFIG_FUNCTION_GRAPH_TRACER static inline int __in_irqentry_text(unsigned long ptr) { extern char __irqentry_text_start[]; @@ -27,12 +26,6 @@ static inline int __in_irqentry_text(unsigned long ptr) return ptr >= (unsigned long)&__irqentry_text_start && ptr < (unsigned long)&__irqentry_text_end; } -#else -static inline int __in_irqentry_text(unsigned long ptr) -{ - return 0; -} -#endif static inline int in_exception_text(unsigned long ptr) { diff --git a/arch/arm/include/asm/uaccess.h b/arch/arm/include/asm/uaccess.h index 0bf2347495f1..87936dd5d151 100644 --- a/arch/arm/include/asm/uaccess.h +++ b/arch/arm/include/asm/uaccess.h @@ -70,6 +70,8 @@ static inline void set_fs(mm_segment_t fs) { current_thread_info()->addr_limit = fs; modify_domain(DOMAIN_KERNEL, fs ? 
DOMAIN_CLIENT : DOMAIN_MANAGER); + /* On user-mode return, check fs is correct */ + set_thread_flag(TIF_FSCHECK); } #define segment_eq(a, b) ((a) == (b)) diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S index eb5cd77bf1d8..e33c32d56193 100644 --- a/arch/arm/kernel/entry-common.S +++ b/arch/arm/kernel/entry-common.S @@ -41,7 +41,9 @@ ret_fast_syscall: UNWIND(.cantunwind ) disable_irq_notrace @ disable interrupts ldr r1, [tsk, #TI_FLAGS] @ re-check for syscall tracing - tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK + tst r1, #_TIF_SYSCALL_WORK + bne fast_work_pending + tst r1, #_TIF_WORK_MASK bne fast_work_pending /* perform architecture specific actions before user return */ @@ -67,12 +69,15 @@ ret_fast_syscall: str r0, [sp, #S_R0 + S_OFF]! @ save returned r0 disable_irq_notrace @ disable interrupts ldr r1, [tsk, #TI_FLAGS] @ re-check for syscall tracing - tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK + tst r1, #_TIF_SYSCALL_WORK + bne fast_work_pending + tst r1, #_TIF_WORK_MASK beq no_work_pending UNWIND(.fnend ) ENDPROC(ret_fast_syscall) /* Slower path - fall through to work_pending */ +fast_work_pending: #endif tst r1, #_TIF_SYSCALL_WORK diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c index 5814298ef0b7..e2de50bf8742 100644 --- a/arch/arm/kernel/signal.c +++ b/arch/arm/kernel/signal.c @@ -14,6 +14,7 @@ #include <linux/uaccess.h> #include <linux/tracehook.h> #include <linux/uprobes.h> +#include <linux/syscalls.h> #include <asm/elf.h> #include <asm/cacheflush.h> @@ -613,6 +614,10 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall) * Update the trace code with the current status. */ trace_hardirqs_off(); + + /* Check valid user FS if needed */ + addr_limit_user_check(); + do { if (likely(thread_flags & _TIF_NEED_RESCHED)) { schedule(); diff --git a/arch/arm/mach-hisi/Kconfig b/arch/arm/mach-hisi/Kconfig index a3b091a4d344..65a048fa08ec 100644 --- a/arch/arm/mach-hisi/Kconfig +++ b/arch/arm/mach-hisi/Kconfig @@ -39,6 +39,7 @@ config ARCH_HIP04 select HAVE_ARM_ARCH_TIMER select MCPM if SMP select MCPM_QUAD_CLUSTER if SMP + select GENERIC_IRQ_EFFECTIVE_AFF_MASK help Support for Hisilicon HiP04 SoC family diff --git a/arch/arm/mach-omap2/Makefile b/arch/arm/mach-omap2/Makefile index 779fb1f680b3..b3b3b3a19183 100644 --- a/arch/arm/mach-omap2/Makefile +++ b/arch/arm/mach-omap2/Makefile @@ -8,7 +8,7 @@ ccflags-y := -I$(srctree)/$(src)/include \ # Common support obj-y := id.o io.o control.o devices.o fb.o timer.o pm.o \ common.o dma.o wd_timer.o display.o i2c.o hdq1w.o omap_hwmod.o \ - omap_device.o omap-headsmp.o sram.o drm.o + omap_device.o omap-headsmp.o sram.o hwmod-common = omap_hwmod.o omap_hwmod_reset.o \ omap_hwmod_common_data.o diff --git a/arch/arm/mach-omap2/board-generic.c b/arch/arm/mach-omap2/board-generic.c index b1e661bb5521..583fc39d84cd 100644 --- a/arch/arm/mach-omap2/board-generic.c +++ b/arch/arm/mach-omap2/board-generic.c @@ -33,6 +33,7 @@ static void __init __maybe_unused omap_generic_init(void) pdata_quirks_init(omap_dt_match_table); omapdss_init_of(); + omap_soc_device_init(); } #ifdef CONFIG_SOC_OMAP2420 diff --git a/arch/arm/mach-omap2/display.c b/arch/arm/mach-omap2/display.c index 8fa01c0ecdb2..b3f6eb5d04a2 100644 --- a/arch/arm/mach-omap2/display.c +++ b/arch/arm/mach-omap2/display.c @@ -66,6 +66,7 @@ */ #define FRAMEDONE_IRQ_TIMEOUT 100 +#if defined(CONFIG_FB_OMAP2) static struct platform_device omap_display_device = { .name = "omapdss", .id = -1, @@ -163,6 +164,65 @@ static enum omapdss_version 
__init omap_display_get_version(void) return OMAPDSS_VER_UNKNOWN; } +static int __init omapdss_init_fbdev(void) +{ + static struct omap_dss_board_info board_data = { + .dsi_enable_pads = omap_dsi_enable_pads, + .dsi_disable_pads = omap_dsi_disable_pads, + .set_min_bus_tput = omap_dss_set_min_bus_tput, + }; + struct device_node *node; + int r; + + board_data.version = omap_display_get_version(); + if (board_data.version == OMAPDSS_VER_UNKNOWN) { + pr_err("DSS not supported on this SoC\n"); + return -ENODEV; + } + + omap_display_device.dev.platform_data = &board_data; + + r = platform_device_register(&omap_display_device); + if (r < 0) { + pr_err("Unable to register omapdss device\n"); + return r; + } + + /* create vrfb device */ + r = omap_init_vrfb(); + if (r < 0) { + pr_err("Unable to register omapvrfb device\n"); + return r; + } + + /* create FB device */ + r = omap_init_fb(); + if (r < 0) { + pr_err("Unable to register omapfb device\n"); + return r; + } + + /* create V4L2 display device */ + r = omap_init_vout(); + if (r < 0) { + pr_err("Unable to register omap_vout device\n"); + return r; + } + + /* add DSI info for omap4 */ + node = of_find_node_by_name(NULL, "omap4_padconf_global"); + if (node) + omap4_dsi_mux_syscon = syscon_node_to_regmap(node); + + return 0; +} +#else +static inline int omapdss_init_fbdev(void) +{ + return 0; +} +#endif /* CONFIG_FB_OMAP2 */ + static void dispc_disable_outputs(void) { u32 v, irq_mask = 0; @@ -335,16 +395,9 @@ static struct device_node * __init omapdss_find_dss_of_node(void) int __init omapdss_init_of(void) { int r; - enum omapdss_version ver; struct device_node *node; struct platform_device *pdev; - static struct omap_dss_board_info board_data = { - .dsi_enable_pads = omap_dsi_enable_pads, - .dsi_disable_pads = omap_dsi_disable_pads, - .set_min_bus_tput = omap_dss_set_min_bus_tput, - }; - /* only create dss helper devices if dss is enabled in the .dts */ node = omapdss_find_dss_of_node(); @@ -354,13 +407,6 @@ int __init omapdss_init_of(void) if (!of_device_is_available(node)) return 0; - ver = omap_display_get_version(); - - if (ver == OMAPDSS_VER_UNKNOWN) { - pr_err("DSS not supported on this SoC\n"); - return -ENODEV; - } - pdev = of_find_device_by_node(node); if (!pdev) { @@ -374,48 +420,5 @@ int __init omapdss_init_of(void) return r; } - board_data.version = ver; - - omap_display_device.dev.platform_data = &board_data; - - r = platform_device_register(&omap_display_device); - if (r < 0) { - pr_err("Unable to register omapdss device\n"); - return r; - } - - /* create DRM device */ - r = omap_init_drm(); - if (r < 0) { - pr_err("Unable to register omapdrm device\n"); - return r; - } - - /* create vrfb device */ - r = omap_init_vrfb(); - if (r < 0) { - pr_err("Unable to register omapvrfb device\n"); - return r; - } - - /* create FB device */ - r = omap_init_fb(); - if (r < 0) { - pr_err("Unable to register omapfb device\n"); - return r; - } - - /* create V4L2 display device */ - r = omap_init_vout(); - if (r < 0) { - pr_err("Unable to register omap_vout device\n"); - return r; - } - - /* add DSI info for omap4 */ - node = of_find_node_by_name(NULL, "omap4_padconf_global"); - if (node) - omap4_dsi_mux_syscon = syscon_node_to_regmap(node); - - return 0; + return omapdss_init_fbdev(); } diff --git a/arch/arm/mach-omap2/display.h b/arch/arm/mach-omap2/display.h index 9a39646d4316..42ec2e99a2f4 100644 --- a/arch/arm/mach-omap2/display.h +++ b/arch/arm/mach-omap2/display.h @@ -26,7 +26,6 @@ struct omap_dss_dispc_dev_attr { bool has_framedonetv_irq; }; 
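
The display.c changes above move all fbdev-related device registration into omapdss_init_fbdev(), which is compiled only when CONFIG_FB_OMAP2 is set and replaced by an empty static inline stub otherwise, so omapdss_init_of() can call it unconditionally. A minimal sketch of that config-gated helper idiom follows; CONFIG_EXAMPLE_FEATURE and example_init_feature() are illustrative names, not symbols from this patch:

    /* Real implementation only when the feature is enabled ... */
    #if defined(CONFIG_EXAMPLE_FEATURE)
    static int __init example_init_feature(void)
    {
            /* register platform devices, parse DT nodes, etc. */
            return 0;
    }
    #else
    /* ... otherwise a stub keeps the caller unconditional. */
    static inline int example_init_feature(void)
    {
            return 0;
    }
    #endif

    static int __init example_init(void)
    {
            /* common init that runs regardless of the feature */
            return example_init_feature();
    }

The benefit, as in this patch, is that the common path (omapdss_init_of) no longer needs its own #ifdef blocks around the fbdev-specific registration.
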
-int omap_init_drm(void); int omap_init_vrfb(void); int omap_init_fb(void); int omap_init_vout(void); diff --git a/arch/arm/mach-omap2/drm.c b/arch/arm/mach-omap2/drm.c deleted file mode 100644 index 44fef961bb70..000000000000 --- a/arch/arm/mach-omap2/drm.c +++ /dev/null @@ -1,53 +0,0 @@ -/* - * DRM/KMS device registration for TI OMAP platforms - * - * Copyright (C) 2012 Texas Instruments - * Author: Rob Clark <rob.clark@linaro.org> - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 as published by - * the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program. If not, see <http://www.gnu.org/licenses/>. - */ - -#include <linux/module.h> -#include <linux/kernel.h> -#include <linux/mm.h> -#include <linux/init.h> -#include <linux/platform_device.h> -#include <linux/dma-mapping.h> -#include <linux/platform_data/omap_drm.h> - -#include "soc.h" -#include "display.h" - -#if IS_ENABLED(CONFIG_DRM_OMAP) - -static struct omap_drm_platform_data platform_data; - -static struct platform_device omap_drm_device = { - .dev = { - .coherent_dma_mask = DMA_BIT_MASK(32), - .platform_data = &platform_data, - }, - .name = "omapdrm", - .id = 0, -}; - -int __init omap_init_drm(void) -{ - platform_data.omaprev = GET_OMAP_TYPE; - - return platform_device_register(&omap_drm_device); - -} -#else -int __init omap_init_drm(void) { return 0; } -#endif diff --git a/arch/arm/mach-omap2/io.c b/arch/arm/mach-omap2/io.c index 1cd20e4d56b0..cb5d7314cf99 100644 --- a/arch/arm/mach-omap2/io.c +++ b/arch/arm/mach-omap2/io.c @@ -428,7 +428,6 @@ static void __init __maybe_unused omap_hwmod_init_postsetup(void) static void __init __maybe_unused omap_common_late_init(void) { omap2_common_pm_late_init(); - omap_soc_device_init(); } #ifdef CONFIG_SOC_OMAP2420 diff --git a/arch/arm/mach-tegra/cpuidle-tegra114.c b/arch/arm/mach-tegra/cpuidle-tegra114.c index d3aa9be16621..e3fbcfedf845 100644 --- a/arch/arm/mach-tegra/cpuidle-tegra114.c +++ b/arch/arm/mach-tegra/cpuidle-tegra114.c @@ -60,7 +60,7 @@ static int tegra114_idle_power_down(struct cpuidle_device *dev, return index; } -static void tegra114_idle_enter_freeze(struct cpuidle_device *dev, +static void tegra114_idle_enter_s2idle(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index) { @@ -77,7 +77,7 @@ static struct cpuidle_driver tegra_idle_driver = { #ifdef CONFIG_PM_SLEEP [1] = { .enter = tegra114_idle_power_down, - .enter_freeze = tegra114_idle_enter_freeze, + .enter_s2idle = tegra114_idle_enter_s2idle, .exit_latency = 500, .target_residency = 1000, .flags = CPUIDLE_FLAG_TIMER_STOP, diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c index d5b9fa19b684..c199990e12b6 100644 --- a/arch/arm/net/bpf_jit_32.c +++ b/arch/arm/net/bpf_jit_32.c @@ -1,6 +1,7 @@ /* - * Just-In-Time compiler for BPF filters on 32bit ARM + * Just-In-Time compiler for eBPF filters on 32bit ARM * + * Copyright (c) 2017 Shubham Bansal <illusionist.neo@gmail.com> * Copyright (c) 2011 Mircea Gherzan <mgherzan@gmail.com> * * This program is free software; you can redistribute it and/or modify it @@ -8,6 +9,7 @@ * Free Software Foundation; version 2 of the License. 
*/ +#include <linux/bpf.h> #include <linux/bitops.h> #include <linux/compiler.h> #include <linux/errno.h> @@ -18,54 +20,101 @@ #include <linux/if_vlan.h> #include <asm/cacheflush.h> -#include <asm/set_memory.h> #include <asm/hwcap.h> #include <asm/opcodes.h> #include "bpf_jit_32.h" +int bpf_jit_enable __read_mostly; + +#define STACK_OFFSET(k) (k) +#define TMP_REG_1 (MAX_BPF_JIT_REG + 0) /* TEMP Register 1 */ +#define TMP_REG_2 (MAX_BPF_JIT_REG + 1) /* TEMP Register 2 */ +#define TCALL_CNT (MAX_BPF_JIT_REG + 2) /* Tail Call Count */ + +/* Flags used for JIT optimization */ +#define SEEN_CALL (1 << 0) + +#define FLAG_IMM_OVERFLOW (1 << 0) + /* - * ABI: + * Map eBPF registers to ARM 32bit registers or stack scratch space. + * + * 1. First argument is passed using the arm 32bit registers and rest of the + * arguments are passed on stack scratch space. + * 2. First callee-saved arugument is mapped to arm 32 bit registers and rest + * arguments are mapped to scratch space on stack. + * 3. We need two 64 bit temp registers to do complex operations on eBPF + * registers. + * + * As the eBPF registers are all 64 bit registers and arm has only 32 bit + * registers, we have to map each eBPF registers with two arm 32 bit regs or + * scratch memory space and we have to build eBPF 64 bit register from those. * - * r0 scratch register - * r4 BPF register A - * r5 BPF register X - * r6 pointer to the skb - * r7 skb->data - * r8 skb_headlen(skb) */ +static const u8 bpf2a32[][2] = { + /* return value from in-kernel function, and exit value from eBPF */ + [BPF_REG_0] = {ARM_R1, ARM_R0}, + /* arguments from eBPF program to in-kernel function */ + [BPF_REG_1] = {ARM_R3, ARM_R2}, + /* Stored on stack scratch space */ + [BPF_REG_2] = {STACK_OFFSET(0), STACK_OFFSET(4)}, + [BPF_REG_3] = {STACK_OFFSET(8), STACK_OFFSET(12)}, + [BPF_REG_4] = {STACK_OFFSET(16), STACK_OFFSET(20)}, + [BPF_REG_5] = {STACK_OFFSET(24), STACK_OFFSET(28)}, + /* callee saved registers that in-kernel function will preserve */ + [BPF_REG_6] = {ARM_R5, ARM_R4}, + /* Stored on stack scratch space */ + [BPF_REG_7] = {STACK_OFFSET(32), STACK_OFFSET(36)}, + [BPF_REG_8] = {STACK_OFFSET(40), STACK_OFFSET(44)}, + [BPF_REG_9] = {STACK_OFFSET(48), STACK_OFFSET(52)}, + /* Read only Frame Pointer to access Stack */ + [BPF_REG_FP] = {STACK_OFFSET(56), STACK_OFFSET(60)}, + /* Temporary Register for internal BPF JIT, can be used + * for constant blindings and others. + */ + [TMP_REG_1] = {ARM_R7, ARM_R6}, + [TMP_REG_2] = {ARM_R10, ARM_R8}, + /* Tail call count. Stored on stack scratch space. */ + [TCALL_CNT] = {STACK_OFFSET(64), STACK_OFFSET(68)}, + /* temporary register for blinding constants. + * Stored on stack scratch space. 
+ */ + [BPF_REG_AX] = {STACK_OFFSET(72), STACK_OFFSET(76)}, +}; -#define r_scratch ARM_R0 -/* r1-r3 are (also) used for the unaligned loads on the non-ARMv7 slowpath */ -#define r_off ARM_R1 -#define r_A ARM_R4 -#define r_X ARM_R5 -#define r_skb ARM_R6 -#define r_skb_data ARM_R7 -#define r_skb_hl ARM_R8 - -#define SCRATCH_SP_OFFSET 0 -#define SCRATCH_OFF(k) (SCRATCH_SP_OFFSET + 4 * (k)) - -#define SEEN_MEM ((1 << BPF_MEMWORDS) - 1) -#define SEEN_MEM_WORD(k) (1 << (k)) -#define SEEN_X (1 << BPF_MEMWORDS) -#define SEEN_CALL (1 << (BPF_MEMWORDS + 1)) -#define SEEN_SKB (1 << (BPF_MEMWORDS + 2)) -#define SEEN_DATA (1 << (BPF_MEMWORDS + 3)) +#define dst_lo dst[1] +#define dst_hi dst[0] +#define src_lo src[1] +#define src_hi src[0] -#define FLAG_NEED_X_RESET (1 << 0) -#define FLAG_IMM_OVERFLOW (1 << 1) +/* + * JIT Context: + * + * prog : bpf_prog + * idx : index of current last JITed instruction. + * prologue_bytes : bytes used in prologue. + * epilogue_offset : offset of epilogue starting. + * seen : bit mask used for JIT optimization. + * offsets : array of eBPF instruction offsets in + * JITed code. + * target : final JITed code. + * epilogue_bytes : no of bytes used in epilogue. + * imm_count : no of immediate counts used for global + * variables. + * imms : array of global variable addresses. + */ struct jit_ctx { - const struct bpf_prog *skf; - unsigned idx; - unsigned prologue_bytes; - int ret0_fp_idx; + const struct bpf_prog *prog; + unsigned int idx; + unsigned int prologue_bytes; + unsigned int epilogue_offset; u32 seen; u32 flags; u32 *offsets; u32 *target; + u32 stack_size; #if __LINUX_ARM_ARCH__ < 7 u16 epilogue_bytes; u16 imm_count; @@ -73,68 +122,16 @@ struct jit_ctx { #endif }; -int bpf_jit_enable __read_mostly; - -static inline int call_neg_helper(struct sk_buff *skb, int offset, void *ret, - unsigned int size) -{ - void *ptr = bpf_internal_load_pointer_neg_helper(skb, offset, size); - - if (!ptr) - return -EFAULT; - memcpy(ret, ptr, size); - return 0; -} - -static u64 jit_get_skb_b(struct sk_buff *skb, int offset) -{ - u8 ret; - int err; - - if (offset < 0) - err = call_neg_helper(skb, offset, &ret, 1); - else - err = skb_copy_bits(skb, offset, &ret, 1); - - return (u64)err << 32 | ret; -} - -static u64 jit_get_skb_h(struct sk_buff *skb, int offset) -{ - u16 ret; - int err; - - if (offset < 0) - err = call_neg_helper(skb, offset, &ret, 2); - else - err = skb_copy_bits(skb, offset, &ret, 2); - - return (u64)err << 32 | ntohs(ret); -} - -static u64 jit_get_skb_w(struct sk_buff *skb, int offset) -{ - u32 ret; - int err; - - if (offset < 0) - err = call_neg_helper(skb, offset, &ret, 4); - else - err = skb_copy_bits(skb, offset, &ret, 4); - - return (u64)err << 32 | ntohl(ret); -} - /* * Wrappers which handle both OABI and EABI and assures Thumb2 interworking * (where the assembly routines like __aeabi_uidiv could cause problems). */ -static u32 jit_udiv(u32 dividend, u32 divisor) +static u32 jit_udiv32(u32 dividend, u32 divisor) { return dividend / divisor; } -static u32 jit_mod(u32 dividend, u32 divisor) +static u32 jit_mod32(u32 dividend, u32 divisor) { return dividend % divisor; } @@ -158,36 +155,22 @@ static inline void emit(u32 inst, struct jit_ctx *ctx) _emit(ARM_COND_AL, inst, ctx); } -static u16 saved_regs(struct jit_ctx *ctx) +/* + * Checks if immediate value can be converted to imm12(12 bits) value. 
+ */ +static int16_t imm8m(u32 x) { - u16 ret = 0; - - if ((ctx->skf->len > 1) || - (ctx->skf->insns[0].code == (BPF_RET | BPF_A))) - ret |= 1 << r_A; - -#ifdef CONFIG_FRAME_POINTER - ret |= (1 << ARM_FP) | (1 << ARM_IP) | (1 << ARM_LR) | (1 << ARM_PC); -#else - if (ctx->seen & SEEN_CALL) - ret |= 1 << ARM_LR; -#endif - if (ctx->seen & (SEEN_DATA | SEEN_SKB)) - ret |= 1 << r_skb; - if (ctx->seen & SEEN_DATA) - ret |= (1 << r_skb_data) | (1 << r_skb_hl); - if (ctx->seen & SEEN_X) - ret |= 1 << r_X; - - return ret; -} + u32 rot; -static inline int mem_words_used(struct jit_ctx *ctx) -{ - /* yes, we do waste some stack space IF there are "holes" in the set" */ - return fls(ctx->seen & SEEN_MEM); + for (rot = 0; rot < 16; rot++) + if ((x & ~ror32(0xff, 2 * rot)) == 0) + return rol32(x, 2 * rot) | (rot << 8); + return -1; } +/* + * Initializes the JIT space with undefined instructions. + */ static void jit_fill_hole(void *area, unsigned int size) { u32 *ptr; @@ -196,88 +179,34 @@ static void jit_fill_hole(void *area, unsigned int size) *ptr++ = __opcode_to_mem_arm(ARM_INST_UDF); } -static void build_prologue(struct jit_ctx *ctx) -{ - u16 reg_set = saved_regs(ctx); - u16 off; - -#ifdef CONFIG_FRAME_POINTER - emit(ARM_MOV_R(ARM_IP, ARM_SP), ctx); - emit(ARM_PUSH(reg_set), ctx); - emit(ARM_SUB_I(ARM_FP, ARM_IP, 4), ctx); -#else - if (reg_set) - emit(ARM_PUSH(reg_set), ctx); -#endif - - if (ctx->seen & (SEEN_DATA | SEEN_SKB)) - emit(ARM_MOV_R(r_skb, ARM_R0), ctx); - - if (ctx->seen & SEEN_DATA) { - off = offsetof(struct sk_buff, data); - emit(ARM_LDR_I(r_skb_data, r_skb, off), ctx); - /* headlen = len - data_len */ - off = offsetof(struct sk_buff, len); - emit(ARM_LDR_I(r_skb_hl, r_skb, off), ctx); - off = offsetof(struct sk_buff, data_len); - emit(ARM_LDR_I(r_scratch, r_skb, off), ctx); - emit(ARM_SUB_R(r_skb_hl, r_skb_hl, r_scratch), ctx); - } - - if (ctx->flags & FLAG_NEED_X_RESET) - emit(ARM_MOV_I(r_X, 0), ctx); - - /* do not leak kernel data to userspace */ - if (bpf_needs_clear_a(&ctx->skf->insns[0])) - emit(ARM_MOV_I(r_A, 0), ctx); - - /* stack space for the BPF_MEM words */ - if (ctx->seen & SEEN_MEM) - emit(ARM_SUB_I(ARM_SP, ARM_SP, mem_words_used(ctx) * 4), ctx); -} - -static void build_epilogue(struct jit_ctx *ctx) -{ - u16 reg_set = saved_regs(ctx); - - if (ctx->seen & SEEN_MEM) - emit(ARM_ADD_I(ARM_SP, ARM_SP, mem_words_used(ctx) * 4), ctx); - - reg_set &= ~(1 << ARM_LR); +/* Stack must be multiples of 16 Bytes */ +#define STACK_ALIGN(sz) (((sz) + 3) & ~3) -#ifdef CONFIG_FRAME_POINTER - /* the first instruction of the prologue was: mov ip, sp */ - reg_set &= ~(1 << ARM_IP); - reg_set |= (1 << ARM_SP); - emit(ARM_LDM(ARM_SP, reg_set), ctx); -#else - if (reg_set) { - if (ctx->seen & SEEN_CALL) - reg_set |= 1 << ARM_PC; - emit(ARM_POP(reg_set), ctx); - } +/* Stack space for BPF_REG_2, BPF_REG_3, BPF_REG_4, + * BPF_REG_5, BPF_REG_7, BPF_REG_8, BPF_REG_9, + * BPF_REG_FP and Tail call counts. + */ +#define SCRATCH_SIZE 80 - if (!(ctx->seen & SEEN_CALL)) - emit(ARM_BX(ARM_LR), ctx); -#endif -} +/* total stack size used in JITed code */ +#define _STACK_SIZE \ + (ctx->prog->aux->stack_depth + \ + + SCRATCH_SIZE + \ + + 4 /* extra for skb_copy_bits buffer */) -static int16_t imm8m(u32 x) -{ - u32 rot; +#define STACK_SIZE STACK_ALIGN(_STACK_SIZE) - for (rot = 0; rot < 16; rot++) - if ((x & ~ror32(0xff, 2 * rot)) == 0) - return rol32(x, 2 * rot) | (rot << 8); +/* Get the offset of eBPF REGISTERs stored on scratch space. 
*/ +#define STACK_VAR(off) (STACK_SIZE-off-4) - return -1; -} +/* Offset of skb_copy_bits buffer */ +#define SKB_BUFFER STACK_VAR(SCRATCH_SIZE) #if __LINUX_ARM_ARCH__ < 7 static u16 imm_offset(u32 k, struct jit_ctx *ctx) { - unsigned i = 0, offset; + unsigned int i = 0, offset; u16 imm; /* on the "fake" run we just count them (duplicates included) */ @@ -296,7 +225,7 @@ static u16 imm_offset(u32 k, struct jit_ctx *ctx) ctx->imms[i] = k; /* constants go just after the epilogue */ - offset = ctx->offsets[ctx->skf->len]; + offset = ctx->offsets[ctx->prog->len - 1] * 4; offset += ctx->prologue_bytes; offset += ctx->epilogue_bytes; offset += i * 4; @@ -320,10 +249,22 @@ static u16 imm_offset(u32 k, struct jit_ctx *ctx) #endif /* __LINUX_ARM_ARCH__ */ +static inline int bpf2a32_offset(int bpf_to, int bpf_from, + const struct jit_ctx *ctx) { + int to, from; + + if (ctx->target == NULL) + return 0; + to = ctx->offsets[bpf_to]; + from = ctx->offsets[bpf_from]; + + return to - from - 1; +} + /* * Move an immediate that's not an imm8m to a core register. */ -static inline void emit_mov_i_no8m(int rd, u32 val, struct jit_ctx *ctx) +static inline void emit_mov_i_no8m(const u8 rd, u32 val, struct jit_ctx *ctx) { #if __LINUX_ARM_ARCH__ < 7 emit(ARM_LDR_I(rd, ARM_PC, imm_offset(val, ctx)), ctx); @@ -334,7 +275,7 @@ static inline void emit_mov_i_no8m(int rd, u32 val, struct jit_ctx *ctx) #endif } -static inline void emit_mov_i(int rd, u32 val, struct jit_ctx *ctx) +static inline void emit_mov_i(const u8 rd, u32 val, struct jit_ctx *ctx) { int imm12 = imm8m(val); @@ -344,676 +285,1594 @@ static inline void emit_mov_i(int rd, u32 val, struct jit_ctx *ctx) emit_mov_i_no8m(rd, val, ctx); } -#if __LINUX_ARM_ARCH__ < 6 - -static void emit_load_be32(u8 cond, u8 r_res, u8 r_addr, struct jit_ctx *ctx) +static inline void emit_blx_r(u8 tgt_reg, struct jit_ctx *ctx) { - _emit(cond, ARM_LDRB_I(ARM_R3, r_addr, 1), ctx); - _emit(cond, ARM_LDRB_I(ARM_R1, r_addr, 0), ctx); - _emit(cond, ARM_LDRB_I(ARM_R2, r_addr, 3), ctx); - _emit(cond, ARM_LSL_I(ARM_R3, ARM_R3, 16), ctx); - _emit(cond, ARM_LDRB_I(ARM_R0, r_addr, 2), ctx); - _emit(cond, ARM_ORR_S(ARM_R3, ARM_R3, ARM_R1, SRTYPE_LSL, 24), ctx); - _emit(cond, ARM_ORR_R(ARM_R3, ARM_R3, ARM_R2), ctx); - _emit(cond, ARM_ORR_S(r_res, ARM_R3, ARM_R0, SRTYPE_LSL, 8), ctx); + ctx->seen |= SEEN_CALL; +#if __LINUX_ARM_ARCH__ < 5 + emit(ARM_MOV_R(ARM_LR, ARM_PC), ctx); + + if (elf_hwcap & HWCAP_THUMB) + emit(ARM_BX(tgt_reg), ctx); + else + emit(ARM_MOV_R(ARM_PC, tgt_reg), ctx); +#else + emit(ARM_BLX_R(tgt_reg), ctx); +#endif } -static void emit_load_be16(u8 cond, u8 r_res, u8 r_addr, struct jit_ctx *ctx) +static inline int epilogue_offset(const struct jit_ctx *ctx) { - _emit(cond, ARM_LDRB_I(ARM_R1, r_addr, 0), ctx); - _emit(cond, ARM_LDRB_I(ARM_R2, r_addr, 1), ctx); - _emit(cond, ARM_ORR_S(r_res, ARM_R2, ARM_R1, SRTYPE_LSL, 8), ctx); + int to, from; + /* No need for 1st dummy run */ + if (ctx->target == NULL) + return 0; + to = ctx->epilogue_offset; + from = ctx->idx; + + return to - from - 2; } -static inline void emit_swap16(u8 r_dst, u8 r_src, struct jit_ctx *ctx) +static inline void emit_udivmod(u8 rd, u8 rm, u8 rn, struct jit_ctx *ctx, u8 op) { - /* r_dst = (r_src << 8) | (r_src >> 8) */ - emit(ARM_LSL_I(ARM_R1, r_src, 8), ctx); - emit(ARM_ORR_S(r_dst, ARM_R1, r_src, SRTYPE_LSR, 8), ctx); + const u8 *tmp = bpf2a32[TMP_REG_1]; + s32 jmp_offset; + + /* checks if divisor is zero or not. If it is, then + * exit directly. 
+ */ + emit(ARM_CMP_I(rn, 0), ctx); + _emit(ARM_COND_EQ, ARM_MOV_I(ARM_R0, 0), ctx); + jmp_offset = epilogue_offset(ctx); + _emit(ARM_COND_EQ, ARM_B(jmp_offset), ctx); +#if __LINUX_ARM_ARCH__ == 7 + if (elf_hwcap & HWCAP_IDIVA) { + if (op == BPF_DIV) + emit(ARM_UDIV(rd, rm, rn), ctx); + else { + emit(ARM_UDIV(ARM_IP, rm, rn), ctx); + emit(ARM_MLS(rd, rn, ARM_IP, rm), ctx); + } + return; + } +#endif /* - * we need to mask out the bits set in r_dst[23:16] due to - * the first shift instruction. - * - * note that 0x8ff is the encoded immediate 0x00ff0000. + * For BPF_ALU | BPF_DIV | BPF_K instructions + * As ARM_R1 and ARM_R0 contains 1st argument of bpf + * function, we need to save it on caller side to save + * it from getting destroyed within callee. + * After the return from the callee, we restore ARM_R0 + * ARM_R1. */ - emit(ARM_BIC_I(r_dst, r_dst, 0x8ff), ctx); -} + if (rn != ARM_R1) { + emit(ARM_MOV_R(tmp[0], ARM_R1), ctx); + emit(ARM_MOV_R(ARM_R1, rn), ctx); + } + if (rm != ARM_R0) { + emit(ARM_MOV_R(tmp[1], ARM_R0), ctx); + emit(ARM_MOV_R(ARM_R0, rm), ctx); + } -#else /* ARMv6+ */ + /* Call appropriate function */ + ctx->seen |= SEEN_CALL; + emit_mov_i(ARM_IP, op == BPF_DIV ? + (u32)jit_udiv32 : (u32)jit_mod32, ctx); + emit_blx_r(ARM_IP, ctx); -static void emit_load_be32(u8 cond, u8 r_res, u8 r_addr, struct jit_ctx *ctx) -{ - _emit(cond, ARM_LDR_I(r_res, r_addr, 0), ctx); -#ifdef __LITTLE_ENDIAN - _emit(cond, ARM_REV(r_res, r_res), ctx); -#endif + /* Save return value */ + if (rd != ARM_R0) + emit(ARM_MOV_R(rd, ARM_R0), ctx); + + /* Restore ARM_R0 and ARM_R1 */ + if (rn != ARM_R1) + emit(ARM_MOV_R(ARM_R1, tmp[0]), ctx); + if (rm != ARM_R0) + emit(ARM_MOV_R(ARM_R0, tmp[1]), ctx); } -static void emit_load_be16(u8 cond, u8 r_res, u8 r_addr, struct jit_ctx *ctx) +/* Checks whether BPF register is on scratch stack space or not. */ +static inline bool is_on_stack(u8 bpf_reg) { - _emit(cond, ARM_LDRH_I(r_res, r_addr, 0), ctx); -#ifdef __LITTLE_ENDIAN - _emit(cond, ARM_REV16(r_res, r_res), ctx); -#endif + static u8 stack_regs[] = {BPF_REG_AX, BPF_REG_3, BPF_REG_4, BPF_REG_5, + BPF_REG_7, BPF_REG_8, BPF_REG_9, TCALL_CNT, + BPF_REG_2, BPF_REG_FP}; + int i, reg_len = sizeof(stack_regs); + + for (i = 0 ; i < reg_len ; i++) { + if (bpf_reg == stack_regs[i]) + return true; + } + return false; } -static inline void emit_swap16(u8 r_dst __maybe_unused, - u8 r_src __maybe_unused, - struct jit_ctx *ctx __maybe_unused) +static inline void emit_a32_mov_i(const u8 dst, const u32 val, + bool dstk, struct jit_ctx *ctx) { -#ifdef __LITTLE_ENDIAN - emit(ARM_REV16(r_dst, r_src), ctx); -#endif + const u8 *tmp = bpf2a32[TMP_REG_1]; + + if (dstk) { + emit_mov_i(tmp[1], val, ctx); + emit(ARM_STR_I(tmp[1], ARM_SP, STACK_VAR(dst)), ctx); + } else { + emit_mov_i(dst, val, ctx); + } } -#endif /* __LINUX_ARM_ARCH__ < 6 */ +/* Sign extended move */ +static inline void emit_a32_mov_i64(const bool is64, const u8 dst[], + const u32 val, bool dstk, + struct jit_ctx *ctx) { + u32 hi = 0; + if (is64 && (val & (1<<31))) + hi = (u32)~0; + emit_a32_mov_i(dst_lo, val, dstk, ctx); + emit_a32_mov_i(dst_hi, hi, dstk, ctx); +} -/* Compute the immediate value for a PC-relative branch. 
*/ -static inline u32 b_imm(unsigned tgt, struct jit_ctx *ctx) -{ - u32 imm; +static inline void emit_a32_add_r(const u8 dst, const u8 src, + const bool is64, const bool hi, + struct jit_ctx *ctx) { + /* 64 bit : + * adds dst_lo, dst_lo, src_lo + * adc dst_hi, dst_hi, src_hi + * 32 bit : + * add dst_lo, dst_lo, src_lo + */ + if (!hi && is64) + emit(ARM_ADDS_R(dst, dst, src), ctx); + else if (hi && is64) + emit(ARM_ADC_R(dst, dst, src), ctx); + else + emit(ARM_ADD_R(dst, dst, src), ctx); +} - if (ctx->target == NULL) - return 0; - /* - * BPF allows only forward jumps and the offset of the target is - * still the one computed during the first pass. +static inline void emit_a32_sub_r(const u8 dst, const u8 src, + const bool is64, const bool hi, + struct jit_ctx *ctx) { + /* 64 bit : + * subs dst_lo, dst_lo, src_lo + * sbc dst_hi, dst_hi, src_hi + * 32 bit : + * sub dst_lo, dst_lo, src_lo */ - imm = ctx->offsets[tgt] + ctx->prologue_bytes - (ctx->idx * 4 + 8); + if (!hi && is64) + emit(ARM_SUBS_R(dst, dst, src), ctx); + else if (hi && is64) + emit(ARM_SBC_R(dst, dst, src), ctx); + else + emit(ARM_SUB_R(dst, dst, src), ctx); +} - return imm >> 2; +static inline void emit_alu_r(const u8 dst, const u8 src, const bool is64, + const bool hi, const u8 op, struct jit_ctx *ctx){ + switch (BPF_OP(op)) { + /* dst = dst + src */ + case BPF_ADD: + emit_a32_add_r(dst, src, is64, hi, ctx); + break; + /* dst = dst - src */ + case BPF_SUB: + emit_a32_sub_r(dst, src, is64, hi, ctx); + break; + /* dst = dst | src */ + case BPF_OR: + emit(ARM_ORR_R(dst, dst, src), ctx); + break; + /* dst = dst & src */ + case BPF_AND: + emit(ARM_AND_R(dst, dst, src), ctx); + break; + /* dst = dst ^ src */ + case BPF_XOR: + emit(ARM_EOR_R(dst, dst, src), ctx); + break; + /* dst = dst * src */ + case BPF_MUL: + emit(ARM_MUL(dst, dst, src), ctx); + break; + /* dst = dst << src */ + case BPF_LSH: + emit(ARM_LSL_R(dst, dst, src), ctx); + break; + /* dst = dst >> src */ + case BPF_RSH: + emit(ARM_LSR_R(dst, dst, src), ctx); + break; + /* dst = dst >> src (signed)*/ + case BPF_ARSH: + emit(ARM_MOV_SR(dst, dst, SRTYPE_ASR, src), ctx); + break; + } } -#define OP_IMM3(op, r1, r2, imm_val, ctx) \ - do { \ - imm12 = imm8m(imm_val); \ - if (imm12 < 0) { \ - emit_mov_i_no8m(r_scratch, imm_val, ctx); \ - emit(op ## _R((r1), (r2), r_scratch), ctx); \ - } else { \ - emit(op ## _I((r1), (r2), imm12), ctx); \ - } \ - } while (0) - -static inline void emit_err_ret(u8 cond, struct jit_ctx *ctx) -{ - if (ctx->ret0_fp_idx >= 0) { - _emit(cond, ARM_B(b_imm(ctx->ret0_fp_idx, ctx)), ctx); - /* NOP to keep the size constant between passes */ - emit(ARM_MOV_R(ARM_R0, ARM_R0), ctx); +/* ALU operation (32 bit) + * dst = dst (op) src + */ +static inline void emit_a32_alu_r(const u8 dst, const u8 src, + bool dstk, bool sstk, + struct jit_ctx *ctx, const bool is64, + const bool hi, const u8 op) { + const u8 *tmp = bpf2a32[TMP_REG_1]; + u8 rn = sstk ? 
tmp[1] : src; + + if (sstk) + emit(ARM_LDR_I(rn, ARM_SP, STACK_VAR(src)), ctx); + + /* ALU operation */ + if (dstk) { + emit(ARM_LDR_I(tmp[0], ARM_SP, STACK_VAR(dst)), ctx); + emit_alu_r(tmp[0], rn, is64, hi, op, ctx); + emit(ARM_STR_I(tmp[0], ARM_SP, STACK_VAR(dst)), ctx); } else { - _emit(cond, ARM_MOV_I(ARM_R0, 0), ctx); - _emit(cond, ARM_B(b_imm(ctx->skf->len, ctx)), ctx); + emit_alu_r(dst, rn, is64, hi, op, ctx); } } -static inline void emit_blx_r(u8 tgt_reg, struct jit_ctx *ctx) -{ -#if __LINUX_ARM_ARCH__ < 5 - emit(ARM_MOV_R(ARM_LR, ARM_PC), ctx); +/* ALU operation (64 bit) */ +static inline void emit_a32_alu_r64(const bool is64, const u8 dst[], + const u8 src[], bool dstk, + bool sstk, struct jit_ctx *ctx, + const u8 op) { + emit_a32_alu_r(dst_lo, src_lo, dstk, sstk, ctx, is64, false, op); + if (is64) + emit_a32_alu_r(dst_hi, src_hi, dstk, sstk, ctx, is64, true, op); + else + emit_a32_mov_i(dst_hi, 0, dstk, ctx); +} - if (elf_hwcap & HWCAP_THUMB) - emit(ARM_BX(tgt_reg), ctx); +/* dst = imm (4 bytes)*/ +static inline void emit_a32_mov_r(const u8 dst, const u8 src, + bool dstk, bool sstk, + struct jit_ctx *ctx) { + const u8 *tmp = bpf2a32[TMP_REG_1]; + u8 rt = sstk ? tmp[0] : src; + + if (sstk) + emit(ARM_LDR_I(tmp[0], ARM_SP, STACK_VAR(src)), ctx); + if (dstk) + emit(ARM_STR_I(rt, ARM_SP, STACK_VAR(dst)), ctx); else - emit(ARM_MOV_R(ARM_PC, tgt_reg), ctx); -#else - emit(ARM_BLX_R(tgt_reg), ctx); -#endif + emit(ARM_MOV_R(dst, rt), ctx); } -static inline void emit_udivmod(u8 rd, u8 rm, u8 rn, struct jit_ctx *ctx, - int bpf_op) -{ -#if __LINUX_ARM_ARCH__ == 7 - if (elf_hwcap & HWCAP_IDIVA) { - if (bpf_op == BPF_DIV) - emit(ARM_UDIV(rd, rm, rn), ctx); - else { - emit(ARM_UDIV(ARM_R3, rm, rn), ctx); - emit(ARM_MLS(rd, rn, ARM_R3, rm), ctx); - } - return; +/* dst = src */ +static inline void emit_a32_mov_r64(const bool is64, const u8 dst[], + const u8 src[], bool dstk, + bool sstk, struct jit_ctx *ctx) { + emit_a32_mov_r(dst_lo, src_lo, dstk, sstk, ctx); + if (is64) { + /* complete 8 byte move */ + emit_a32_mov_r(dst_hi, src_hi, dstk, sstk, ctx); + } else { + /* Zero out high 4 bytes */ + emit_a32_mov_i(dst_hi, 0, dstk, ctx); } -#endif +} - /* - * For BPF_ALU | BPF_DIV | BPF_K instructions, rm is ARM_R4 - * (r_A) and rn is ARM_R0 (r_scratch) so load rn first into - * ARM_R1 to avoid accidentally overwriting ARM_R0 with rm - * before using it as a source for ARM_R1. - * - * For BPF_ALU | BPF_DIV | BPF_X rm is ARM_R4 (r_A) and rn is - * ARM_R5 (r_X) so there is no particular register overlap - * issues. - */ - if (rn != ARM_R1) - emit(ARM_MOV_R(ARM_R1, rn), ctx); - if (rm != ARM_R0) - emit(ARM_MOV_R(ARM_R0, rm), ctx); +/* Shift operations */ +static inline void emit_a32_alu_i(const u8 dst, const u32 val, bool dstk, + struct jit_ctx *ctx, const u8 op) { + const u8 *tmp = bpf2a32[TMP_REG_1]; + u8 rd = dstk ? tmp[0] : dst; + + if (dstk) + emit(ARM_LDR_I(rd, ARM_SP, STACK_VAR(dst)), ctx); + + /* Do shift operation */ + switch (op) { + case BPF_LSH: + emit(ARM_LSL_I(rd, rd, val), ctx); + break; + case BPF_RSH: + emit(ARM_LSR_I(rd, rd, val), ctx); + break; + case BPF_NEG: + emit(ARM_RSB_I(rd, rd, val), ctx); + break; + } + if (dstk) + emit(ARM_STR_I(rd, ARM_SP, STACK_VAR(dst)), ctx); +} + +/* dst = ~dst (64 bit) */ +static inline void emit_a32_neg64(const u8 dst[], bool dstk, + struct jit_ctx *ctx){ + const u8 *tmp = bpf2a32[TMP_REG_1]; + u8 rd = dstk ? tmp[1] : dst[1]; + u8 rm = dstk ? 
tmp[0] : dst[0]; + + /* Setup Operand */ + if (dstk) { + emit(ARM_LDR_I(rd, ARM_SP, STACK_VAR(dst_lo)), ctx); + emit(ARM_LDR_I(rm, ARM_SP, STACK_VAR(dst_hi)), ctx); + } + + /* Do Negate Operation */ + emit(ARM_RSBS_I(rd, rd, 0), ctx); + emit(ARM_RSC_I(rm, rm, 0), ctx); + + if (dstk) { + emit(ARM_STR_I(rd, ARM_SP, STACK_VAR(dst_lo)), ctx); + emit(ARM_STR_I(rm, ARM_SP, STACK_VAR(dst_hi)), ctx); + } +} + +/* dst = dst << src */ +static inline void emit_a32_lsh_r64(const u8 dst[], const u8 src[], bool dstk, + bool sstk, struct jit_ctx *ctx) { + const u8 *tmp = bpf2a32[TMP_REG_1]; + const u8 *tmp2 = bpf2a32[TMP_REG_2]; + + /* Setup Operands */ + u8 rt = sstk ? tmp2[1] : src_lo; + u8 rd = dstk ? tmp[1] : dst_lo; + u8 rm = dstk ? tmp[0] : dst_hi; + + if (sstk) + emit(ARM_LDR_I(rt, ARM_SP, STACK_VAR(src_lo)), ctx); + if (dstk) { + emit(ARM_LDR_I(rd, ARM_SP, STACK_VAR(dst_lo)), ctx); + emit(ARM_LDR_I(rm, ARM_SP, STACK_VAR(dst_hi)), ctx); + } + + /* Do LSH operation */ + emit(ARM_SUB_I(ARM_IP, rt, 32), ctx); + emit(ARM_RSB_I(tmp2[0], rt, 32), ctx); + /* As we are using ARM_LR */ ctx->seen |= SEEN_CALL; - emit_mov_i(ARM_R3, bpf_op == BPF_DIV ? (u32)jit_udiv : (u32)jit_mod, - ctx); - emit_blx_r(ARM_R3, ctx); + emit(ARM_MOV_SR(ARM_LR, rm, SRTYPE_ASL, rt), ctx); + emit(ARM_ORR_SR(ARM_LR, ARM_LR, rd, SRTYPE_ASL, ARM_IP), ctx); + emit(ARM_ORR_SR(ARM_IP, ARM_LR, rd, SRTYPE_LSR, tmp2[0]), ctx); + emit(ARM_MOV_SR(ARM_LR, rd, SRTYPE_ASL, rt), ctx); + + if (dstk) { + emit(ARM_STR_I(ARM_LR, ARM_SP, STACK_VAR(dst_lo)), ctx); + emit(ARM_STR_I(ARM_IP, ARM_SP, STACK_VAR(dst_hi)), ctx); + } else { + emit(ARM_MOV_R(rd, ARM_LR), ctx); + emit(ARM_MOV_R(rm, ARM_IP), ctx); + } +} - if (rd != ARM_R0) - emit(ARM_MOV_R(rd, ARM_R0), ctx); +/* dst = dst >> src (signed)*/ +static inline void emit_a32_arsh_r64(const u8 dst[], const u8 src[], bool dstk, + bool sstk, struct jit_ctx *ctx) { + const u8 *tmp = bpf2a32[TMP_REG_1]; + const u8 *tmp2 = bpf2a32[TMP_REG_2]; + /* Setup Operands */ + u8 rt = sstk ? tmp2[1] : src_lo; + u8 rd = dstk ? tmp[1] : dst_lo; + u8 rm = dstk ? tmp[0] : dst_hi; + + if (sstk) + emit(ARM_LDR_I(rt, ARM_SP, STACK_VAR(src_lo)), ctx); + if (dstk) { + emit(ARM_LDR_I(rd, ARM_SP, STACK_VAR(dst_lo)), ctx); + emit(ARM_LDR_I(rm, ARM_SP, STACK_VAR(dst_hi)), ctx); + } + + /* Do the ARSH operation */ + emit(ARM_RSB_I(ARM_IP, rt, 32), ctx); + emit(ARM_SUBS_I(tmp2[0], rt, 32), ctx); + /* As we are using ARM_LR */ + ctx->seen |= SEEN_CALL; + emit(ARM_MOV_SR(ARM_LR, rd, SRTYPE_LSR, rt), ctx); + emit(ARM_ORR_SR(ARM_LR, ARM_LR, rm, SRTYPE_ASL, ARM_IP), ctx); + _emit(ARM_COND_MI, ARM_B(0), ctx); + emit(ARM_ORR_SR(ARM_LR, ARM_LR, rm, SRTYPE_ASR, tmp2[0]), ctx); + emit(ARM_MOV_SR(ARM_IP, rm, SRTYPE_ASR, rt), ctx); + if (dstk) { + emit(ARM_STR_I(ARM_LR, ARM_SP, STACK_VAR(dst_lo)), ctx); + emit(ARM_STR_I(ARM_IP, ARM_SP, STACK_VAR(dst_hi)), ctx); + } else { + emit(ARM_MOV_R(rd, ARM_LR), ctx); + emit(ARM_MOV_R(rm, ARM_IP), ctx); + } +} + +/* dst = dst >> src */ +static inline void emit_a32_lsr_r64(const u8 dst[], const u8 src[], bool dstk, + bool sstk, struct jit_ctx *ctx) { + const u8 *tmp = bpf2a32[TMP_REG_1]; + const u8 *tmp2 = bpf2a32[TMP_REG_2]; + /* Setup Operands */ + u8 rt = sstk ? tmp2[1] : src_lo; + u8 rd = dstk ? tmp[1] : dst_lo; + u8 rm = dstk ? 
tmp[0] : dst_hi; + + if (sstk) + emit(ARM_LDR_I(rt, ARM_SP, STACK_VAR(src_lo)), ctx); + if (dstk) { + emit(ARM_LDR_I(rd, ARM_SP, STACK_VAR(dst_lo)), ctx); + emit(ARM_LDR_I(rm, ARM_SP, STACK_VAR(dst_hi)), ctx); + } + + /* Do LSH operation */ + emit(ARM_RSB_I(ARM_IP, rt, 32), ctx); + emit(ARM_SUBS_I(tmp2[0], rt, 32), ctx); + /* As we are using ARM_LR */ + ctx->seen |= SEEN_CALL; + emit(ARM_MOV_SR(ARM_LR, rd, SRTYPE_LSR, rt), ctx); + emit(ARM_ORR_SR(ARM_LR, ARM_LR, rm, SRTYPE_ASL, ARM_IP), ctx); + emit(ARM_ORR_SR(ARM_LR, ARM_LR, rm, SRTYPE_LSR, tmp2[0]), ctx); + emit(ARM_MOV_SR(ARM_IP, rm, SRTYPE_LSR, rt), ctx); + if (dstk) { + emit(ARM_STR_I(ARM_LR, ARM_SP, STACK_VAR(dst_lo)), ctx); + emit(ARM_STR_I(ARM_IP, ARM_SP, STACK_VAR(dst_hi)), ctx); + } else { + emit(ARM_MOV_R(rd, ARM_LR), ctx); + emit(ARM_MOV_R(rm, ARM_IP), ctx); + } } -static inline void update_on_xread(struct jit_ctx *ctx) +/* dst = dst << val */ +static inline void emit_a32_lsh_i64(const u8 dst[], bool dstk, + const u32 val, struct jit_ctx *ctx){ + const u8 *tmp = bpf2a32[TMP_REG_1]; + const u8 *tmp2 = bpf2a32[TMP_REG_2]; + /* Setup operands */ + u8 rd = dstk ? tmp[1] : dst_lo; + u8 rm = dstk ? tmp[0] : dst_hi; + + if (dstk) { + emit(ARM_LDR_I(rd, ARM_SP, STACK_VAR(dst_lo)), ctx); + emit(ARM_LDR_I(rm, ARM_SP, STACK_VAR(dst_hi)), ctx); + } + + /* Do LSH operation */ + if (val < 32) { + emit(ARM_MOV_SI(tmp2[0], rm, SRTYPE_ASL, val), ctx); + emit(ARM_ORR_SI(rm, tmp2[0], rd, SRTYPE_LSR, 32 - val), ctx); + emit(ARM_MOV_SI(rd, rd, SRTYPE_ASL, val), ctx); + } else { + if (val == 32) + emit(ARM_MOV_R(rm, rd), ctx); + else + emit(ARM_MOV_SI(rm, rd, SRTYPE_ASL, val - 32), ctx); + emit(ARM_EOR_R(rd, rd, rd), ctx); + } + + if (dstk) { + emit(ARM_STR_I(rd, ARM_SP, STACK_VAR(dst_lo)), ctx); + emit(ARM_STR_I(rm, ARM_SP, STACK_VAR(dst_hi)), ctx); + } +} + +/* dst = dst >> val */ +static inline void emit_a32_lsr_i64(const u8 dst[], bool dstk, + const u32 val, struct jit_ctx *ctx) { + const u8 *tmp = bpf2a32[TMP_REG_1]; + const u8 *tmp2 = bpf2a32[TMP_REG_2]; + /* Setup operands */ + u8 rd = dstk ? tmp[1] : dst_lo; + u8 rm = dstk ? tmp[0] : dst_hi; + + if (dstk) { + emit(ARM_LDR_I(rd, ARM_SP, STACK_VAR(dst_lo)), ctx); + emit(ARM_LDR_I(rm, ARM_SP, STACK_VAR(dst_hi)), ctx); + } + + /* Do LSR operation */ + if (val < 32) { + emit(ARM_MOV_SI(tmp2[1], rd, SRTYPE_LSR, val), ctx); + emit(ARM_ORR_SI(rd, tmp2[1], rm, SRTYPE_ASL, 32 - val), ctx); + emit(ARM_MOV_SI(rm, rm, SRTYPE_LSR, val), ctx); + } else if (val == 32) { + emit(ARM_MOV_R(rd, rm), ctx); + emit(ARM_MOV_I(rm, 0), ctx); + } else { + emit(ARM_MOV_SI(rd, rm, SRTYPE_LSR, val - 32), ctx); + emit(ARM_MOV_I(rm, 0), ctx); + } + + if (dstk) { + emit(ARM_STR_I(rd, ARM_SP, STACK_VAR(dst_lo)), ctx); + emit(ARM_STR_I(rm, ARM_SP, STACK_VAR(dst_hi)), ctx); + } +} + +/* dst = dst >> val (signed) */ +static inline void emit_a32_arsh_i64(const u8 dst[], bool dstk, + const u32 val, struct jit_ctx *ctx){ + const u8 *tmp = bpf2a32[TMP_REG_1]; + const u8 *tmp2 = bpf2a32[TMP_REG_2]; + /* Setup operands */ + u8 rd = dstk ? tmp[1] : dst_lo; + u8 rm = dstk ? 
tmp[0] : dst_hi; + + if (dstk) { + emit(ARM_LDR_I(rd, ARM_SP, STACK_VAR(dst_lo)), ctx); + emit(ARM_LDR_I(rm, ARM_SP, STACK_VAR(dst_hi)), ctx); + } + + /* Do ARSH operation */ + if (val < 32) { + emit(ARM_MOV_SI(tmp2[1], rd, SRTYPE_LSR, val), ctx); + emit(ARM_ORR_SI(rd, tmp2[1], rm, SRTYPE_ASL, 32 - val), ctx); + emit(ARM_MOV_SI(rm, rm, SRTYPE_ASR, val), ctx); + } else if (val == 32) { + emit(ARM_MOV_R(rd, rm), ctx); + emit(ARM_MOV_SI(rm, rm, SRTYPE_ASR, 31), ctx); + } else { + emit(ARM_MOV_SI(rd, rm, SRTYPE_ASR, val - 32), ctx); + emit(ARM_MOV_SI(rm, rm, SRTYPE_ASR, 31), ctx); + } + + if (dstk) { + emit(ARM_STR_I(rd, ARM_SP, STACK_VAR(dst_lo)), ctx); + emit(ARM_STR_I(rm, ARM_SP, STACK_VAR(dst_hi)), ctx); + } +} + +static inline void emit_a32_mul_r64(const u8 dst[], const u8 src[], bool dstk, + bool sstk, struct jit_ctx *ctx) { + const u8 *tmp = bpf2a32[TMP_REG_1]; + const u8 *tmp2 = bpf2a32[TMP_REG_2]; + /* Setup operands for multiplication */ + u8 rd = dstk ? tmp[1] : dst_lo; + u8 rm = dstk ? tmp[0] : dst_hi; + u8 rt = sstk ? tmp2[1] : src_lo; + u8 rn = sstk ? tmp2[0] : src_hi; + + if (dstk) { + emit(ARM_LDR_I(rd, ARM_SP, STACK_VAR(dst_lo)), ctx); + emit(ARM_LDR_I(rm, ARM_SP, STACK_VAR(dst_hi)), ctx); + } + if (sstk) { + emit(ARM_LDR_I(rt, ARM_SP, STACK_VAR(src_lo)), ctx); + emit(ARM_LDR_I(rn, ARM_SP, STACK_VAR(src_hi)), ctx); + } + + /* Do Multiplication */ + emit(ARM_MUL(ARM_IP, rd, rn), ctx); + emit(ARM_MUL(ARM_LR, rm, rt), ctx); + /* As we are using ARM_LR */ + ctx->seen |= SEEN_CALL; + emit(ARM_ADD_R(ARM_LR, ARM_IP, ARM_LR), ctx); + + emit(ARM_UMULL(ARM_IP, rm, rd, rt), ctx); + emit(ARM_ADD_R(rm, ARM_LR, rm), ctx); + if (dstk) { + emit(ARM_STR_I(ARM_IP, ARM_SP, STACK_VAR(dst_lo)), ctx); + emit(ARM_STR_I(rm, ARM_SP, STACK_VAR(dst_hi)), ctx); + } else { + emit(ARM_MOV_R(rd, ARM_IP), ctx); + } +} + +/* *(size *)(dst + off) = src */ +static inline void emit_str_r(const u8 dst, const u8 src, bool dstk, + const s32 off, struct jit_ctx *ctx, const u8 sz){ + const u8 *tmp = bpf2a32[TMP_REG_1]; + u8 rd = dstk ? tmp[1] : dst; + + if (dstk) + emit(ARM_LDR_I(rd, ARM_SP, STACK_VAR(dst)), ctx); + if (off) { + emit_a32_mov_i(tmp[0], off, false, ctx); + emit(ARM_ADD_R(tmp[0], rd, tmp[0]), ctx); + rd = tmp[0]; + } + switch (sz) { + case BPF_W: + /* Store a Word */ + emit(ARM_STR_I(src, rd, 0), ctx); + break; + case BPF_H: + /* Store a HalfWord */ + emit(ARM_STRH_I(src, rd, 0), ctx); + break; + case BPF_B: + /* Store a Byte */ + emit(ARM_STRB_I(src, rd, 0), ctx); + break; + } +} + +/* dst = *(size*)(src + off) */ +static inline void emit_ldx_r(const u8 dst, const u8 src, bool dstk, + const s32 off, struct jit_ctx *ctx, const u8 sz){ + const u8 *tmp = bpf2a32[TMP_REG_1]; + u8 rd = dstk ? 
tmp[1] : dst; + u8 rm = src; + + if (off) { + emit_a32_mov_i(tmp[0], off, false, ctx); + emit(ARM_ADD_R(tmp[0], tmp[0], src), ctx); + rm = tmp[0]; + } + switch (sz) { + case BPF_W: + /* Load a Word */ + emit(ARM_LDR_I(rd, rm, 0), ctx); + break; + case BPF_H: + /* Load a HalfWord */ + emit(ARM_LDRH_I(rd, rm, 0), ctx); + break; + case BPF_B: + /* Load a Byte */ + emit(ARM_LDRB_I(rd, rm, 0), ctx); + break; + } + if (dstk) + emit(ARM_STR_I(rd, ARM_SP, STACK_VAR(dst)), ctx); +} + +/* Arithmatic Operation */ +static inline void emit_ar_r(const u8 rd, const u8 rt, const u8 rm, + const u8 rn, struct jit_ctx *ctx, u8 op) { + switch (op) { + case BPF_JSET: + ctx->seen |= SEEN_CALL; + emit(ARM_AND_R(ARM_IP, rt, rn), ctx); + emit(ARM_AND_R(ARM_LR, rd, rm), ctx); + emit(ARM_ORRS_R(ARM_IP, ARM_LR, ARM_IP), ctx); + break; + case BPF_JEQ: + case BPF_JNE: + case BPF_JGT: + case BPF_JGE: + case BPF_JLE: + case BPF_JLT: + emit(ARM_CMP_R(rd, rm), ctx); + _emit(ARM_COND_EQ, ARM_CMP_R(rt, rn), ctx); + break; + case BPF_JSLE: + case BPF_JSGT: + emit(ARM_CMP_R(rn, rt), ctx); + emit(ARM_SBCS_R(ARM_IP, rm, rd), ctx); + break; + case BPF_JSLT: + case BPF_JSGE: + emit(ARM_CMP_R(rt, rn), ctx); + emit(ARM_SBCS_R(ARM_IP, rd, rm), ctx); + break; + } +} + +static int out_offset = -1; /* initialized on the first pass of build_body() */ +static int emit_bpf_tail_call(struct jit_ctx *ctx) { - if (!(ctx->seen & SEEN_X)) - ctx->flags |= FLAG_NEED_X_RESET; - ctx->seen |= SEEN_X; + /* bpf_tail_call(void *prog_ctx, struct bpf_array *array, u64 index) */ + const u8 *r2 = bpf2a32[BPF_REG_2]; + const u8 *r3 = bpf2a32[BPF_REG_3]; + const u8 *tmp = bpf2a32[TMP_REG_1]; + const u8 *tmp2 = bpf2a32[TMP_REG_2]; + const u8 *tcc = bpf2a32[TCALL_CNT]; + const int idx0 = ctx->idx; +#define cur_offset (ctx->idx - idx0) +#define jmp_offset (out_offset - (cur_offset)) + u32 off, lo, hi; + + /* if (index >= array->map.max_entries) + * goto out; + */ + off = offsetof(struct bpf_array, map.max_entries); + /* array->map.max_entries */ + emit_a32_mov_i(tmp[1], off, false, ctx); + emit(ARM_LDR_I(tmp2[1], ARM_SP, STACK_VAR(r2[1])), ctx); + emit(ARM_LDR_R(tmp[1], tmp2[1], tmp[1]), ctx); + /* index (64 bit) */ + emit(ARM_LDR_I(tmp2[1], ARM_SP, STACK_VAR(r3[1])), ctx); + /* index >= array->map.max_entries */ + emit(ARM_CMP_R(tmp2[1], tmp[1]), ctx); + _emit(ARM_COND_CS, ARM_B(jmp_offset), ctx); + + /* if (tail_call_cnt > MAX_TAIL_CALL_CNT) + * goto out; + * tail_call_cnt++; + */ + lo = (u32)MAX_TAIL_CALL_CNT; + hi = (u32)((u64)MAX_TAIL_CALL_CNT >> 32); + emit(ARM_LDR_I(tmp[1], ARM_SP, STACK_VAR(tcc[1])), ctx); + emit(ARM_LDR_I(tmp[0], ARM_SP, STACK_VAR(tcc[0])), ctx); + emit(ARM_CMP_I(tmp[0], hi), ctx); + _emit(ARM_COND_EQ, ARM_CMP_I(tmp[1], lo), ctx); + _emit(ARM_COND_HI, ARM_B(jmp_offset), ctx); + emit(ARM_ADDS_I(tmp[1], tmp[1], 1), ctx); + emit(ARM_ADC_I(tmp[0], tmp[0], 0), ctx); + emit(ARM_STR_I(tmp[1], ARM_SP, STACK_VAR(tcc[1])), ctx); + emit(ARM_STR_I(tmp[0], ARM_SP, STACK_VAR(tcc[0])), ctx); + + /* prog = array->ptrs[index] + * if (prog == NULL) + * goto out; + */ + off = offsetof(struct bpf_array, ptrs); + emit_a32_mov_i(tmp[1], off, false, ctx); + emit(ARM_LDR_I(tmp2[1], ARM_SP, STACK_VAR(r2[1])), ctx); + emit(ARM_ADD_R(tmp[1], tmp2[1], tmp[1]), ctx); + emit(ARM_LDR_I(tmp2[1], ARM_SP, STACK_VAR(r3[1])), ctx); + emit(ARM_MOV_SI(tmp[0], tmp2[1], SRTYPE_ASL, 2), ctx); + emit(ARM_LDR_R(tmp[1], tmp[1], tmp[0]), ctx); + emit(ARM_CMP_I(tmp[1], 0), ctx); + _emit(ARM_COND_EQ, ARM_B(jmp_offset), ctx); + + /* goto *(prog->bpf_func + prologue_size); */ + off = 
offsetof(struct bpf_prog, bpf_func); + emit_a32_mov_i(tmp2[1], off, false, ctx); + emit(ARM_LDR_R(tmp[1], tmp[1], tmp2[1]), ctx); + emit(ARM_ADD_I(tmp[1], tmp[1], ctx->prologue_bytes), ctx); + emit(ARM_BX(tmp[1]), ctx); + + /* out: */ + if (out_offset == -1) + out_offset = cur_offset; + if (cur_offset != out_offset) { + pr_err_once("tail_call out_offset = %d, expected %d!\n", + cur_offset, out_offset); + return -1; + } + return 0; +#undef cur_offset +#undef jmp_offset } -static int build_body(struct jit_ctx *ctx) +/* 0xabcd => 0xcdab */ +static inline void emit_rev16(const u8 rd, const u8 rn, struct jit_ctx *ctx) { - void *load_func[] = {jit_get_skb_b, jit_get_skb_h, jit_get_skb_w}; - const struct bpf_prog *prog = ctx->skf; - const struct sock_filter *inst; - unsigned i, load_order, off, condt; - int imm12; - u32 k; +#if __LINUX_ARM_ARCH__ < 6 + const u8 *tmp2 = bpf2a32[TMP_REG_2]; + + emit(ARM_AND_I(tmp2[1], rn, 0xff), ctx); + emit(ARM_MOV_SI(tmp2[0], rn, SRTYPE_LSR, 8), ctx); + emit(ARM_AND_I(tmp2[0], tmp2[0], 0xff), ctx); + emit(ARM_ORR_SI(rd, tmp2[0], tmp2[1], SRTYPE_LSL, 8), ctx); +#else /* ARMv6+ */ + emit(ARM_REV16(rd, rn), ctx); +#endif +} - for (i = 0; i < prog->len; i++) { - u16 code; +/* 0xabcdefgh => 0xghefcdab */ +static inline void emit_rev32(const u8 rd, const u8 rn, struct jit_ctx *ctx) +{ +#if __LINUX_ARM_ARCH__ < 6 + const u8 *tmp2 = bpf2a32[TMP_REG_2]; + + emit(ARM_AND_I(tmp2[1], rn, 0xff), ctx); + emit(ARM_MOV_SI(tmp2[0], rn, SRTYPE_LSR, 24), ctx); + emit(ARM_ORR_SI(ARM_IP, tmp2[0], tmp2[1], SRTYPE_LSL, 24), ctx); + + emit(ARM_MOV_SI(tmp2[1], rn, SRTYPE_LSR, 8), ctx); + emit(ARM_AND_I(tmp2[1], tmp2[1], 0xff), ctx); + emit(ARM_MOV_SI(tmp2[0], rn, SRTYPE_LSR, 16), ctx); + emit(ARM_AND_I(tmp2[0], tmp2[0], 0xff), ctx); + emit(ARM_MOV_SI(tmp2[0], tmp2[0], SRTYPE_LSL, 8), ctx); + emit(ARM_ORR_SI(tmp2[0], tmp2[0], tmp2[1], SRTYPE_LSL, 16), ctx); + emit(ARM_ORR_R(rd, ARM_IP, tmp2[0]), ctx); + +#else /* ARMv6+ */ + emit(ARM_REV(rd, rn), ctx); +#endif +} - inst = &(prog->insns[i]); - /* K as an immediate value operand */ - k = inst->k; - code = bpf_anc_helper(inst); +// push the scratch stack register on top of the stack +static inline void emit_push_r64(const u8 src[], const u8 shift, + struct jit_ctx *ctx) +{ + const u8 *tmp2 = bpf2a32[TMP_REG_2]; + u16 reg_set = 0; - /* compute offsets only in the fake pass */ - if (ctx->target == NULL) - ctx->offsets[i] = ctx->idx * 4; + emit(ARM_LDR_I(tmp2[1], ARM_SP, STACK_VAR(src[1]+shift)), ctx); + emit(ARM_LDR_I(tmp2[0], ARM_SP, STACK_VAR(src[0]+shift)), ctx); + + reg_set = (1 << tmp2[1]) | (1 << tmp2[0]); + emit(ARM_PUSH(reg_set), ctx); +} + +static void build_prologue(struct jit_ctx *ctx) +{ + const u8 r0 = bpf2a32[BPF_REG_0][1]; + const u8 r2 = bpf2a32[BPF_REG_1][1]; + const u8 r3 = bpf2a32[BPF_REG_1][0]; + const u8 r4 = bpf2a32[BPF_REG_6][1]; + const u8 r5 = bpf2a32[BPF_REG_6][0]; + const u8 r6 = bpf2a32[TMP_REG_1][1]; + const u8 r7 = bpf2a32[TMP_REG_1][0]; + const u8 r8 = bpf2a32[TMP_REG_2][1]; + const u8 r10 = bpf2a32[TMP_REG_2][0]; + const u8 fplo = bpf2a32[BPF_REG_FP][1]; + const u8 fphi = bpf2a32[BPF_REG_FP][0]; + const u8 sp = ARM_SP; + const u8 *tcc = bpf2a32[TCALL_CNT]; + + u16 reg_set = 0; + + /* + * eBPF prog stack layout + * + * high + * original ARM_SP => +-----+ eBPF prologue + * |FP/LR| + * current ARM_FP => +-----+ + * | ... | callee saved registers + * eBPF fp register => +-----+ <= (BPF_FP) + * | ... 
| eBPF JIT scratch space + * | | eBPF prog stack + * +-----+ + * |RSVD | JIT scratchpad + * current A64_SP => +-----+ <= (BPF_FP - STACK_SIZE) + * | | + * | ... | Function call stack + * | | + * +-----+ + * low + */ + + /* Save callee saved registers. */ + reg_set |= (1<<r4) | (1<<r5) | (1<<r6) | (1<<r7) | (1<<r8) | (1<<r10); +#ifdef CONFIG_FRAME_POINTER + reg_set |= (1<<ARM_FP) | (1<<ARM_IP) | (1<<ARM_LR) | (1<<ARM_PC); + emit(ARM_MOV_R(ARM_IP, sp), ctx); + emit(ARM_PUSH(reg_set), ctx); + emit(ARM_SUB_I(ARM_FP, ARM_IP, 4), ctx); +#else + /* Check if call instruction exists in BPF body */ + if (ctx->seen & SEEN_CALL) + reg_set |= (1<<ARM_LR); + emit(ARM_PUSH(reg_set), ctx); +#endif + /* Save frame pointer for later */ + emit(ARM_SUB_I(ARM_IP, sp, SCRATCH_SIZE), ctx); + + ctx->stack_size = imm8m(STACK_SIZE); + + /* Set up function call stack */ + emit(ARM_SUB_I(ARM_SP, ARM_SP, ctx->stack_size), ctx); - switch (code) { - case BPF_LD | BPF_IMM: - emit_mov_i(r_A, k, ctx); + /* Set up BPF prog stack base register */ + emit_a32_mov_r(fplo, ARM_IP, true, false, ctx); + emit_a32_mov_i(fphi, 0, true, ctx); + + /* mov r4, 0 */ + emit(ARM_MOV_I(r4, 0), ctx); + + /* Move BPF_CTX to BPF_R1 */ + emit(ARM_MOV_R(r3, r4), ctx); + emit(ARM_MOV_R(r2, r0), ctx); + /* Initialize Tail Count */ + emit(ARM_STR_I(r4, ARM_SP, STACK_VAR(tcc[0])), ctx); + emit(ARM_STR_I(r4, ARM_SP, STACK_VAR(tcc[1])), ctx); + /* end of prologue */ +} + +static void build_epilogue(struct jit_ctx *ctx) +{ + const u8 r4 = bpf2a32[BPF_REG_6][1]; + const u8 r5 = bpf2a32[BPF_REG_6][0]; + const u8 r6 = bpf2a32[TMP_REG_1][1]; + const u8 r7 = bpf2a32[TMP_REG_1][0]; + const u8 r8 = bpf2a32[TMP_REG_2][1]; + const u8 r10 = bpf2a32[TMP_REG_2][0]; + u16 reg_set = 0; + + /* unwind function call stack */ + emit(ARM_ADD_I(ARM_SP, ARM_SP, ctx->stack_size), ctx); + + /* restore callee saved registers. */ + reg_set |= (1<<r4) | (1<<r5) | (1<<r6) | (1<<r7) | (1<<r8) | (1<<r10); +#ifdef CONFIG_FRAME_POINTER + /* the first instruction of the prologue was: mov ip, sp */ + reg_set |= (1<<ARM_FP) | (1<<ARM_SP) | (1<<ARM_PC); + emit(ARM_LDM(ARM_SP, reg_set), ctx); +#else + if (ctx->seen & SEEN_CALL) + reg_set |= (1<<ARM_PC); + /* Restore callee saved registers. */ + emit(ARM_POP(reg_set), ctx); + /* Return back to the callee function */ + if (!(ctx->seen & SEEN_CALL)) + emit(ARM_BX(ARM_LR), ctx); +#endif +} + +/* + * Convert an eBPF instruction to native instruction, i.e + * JITs an eBPF instruction. + * Returns : + * 0 - Successfully JITed an 8-byte eBPF instruction + * >0 - Successfully JITed a 16-byte eBPF instruction + * <0 - Failed to JIT. 
+ */ +static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx) +{ + const u8 code = insn->code; + const u8 *dst = bpf2a32[insn->dst_reg]; + const u8 *src = bpf2a32[insn->src_reg]; + const u8 *tmp = bpf2a32[TMP_REG_1]; + const u8 *tmp2 = bpf2a32[TMP_REG_2]; + const s16 off = insn->off; + const s32 imm = insn->imm; + const int i = insn - ctx->prog->insnsi; + const bool is64 = BPF_CLASS(code) == BPF_ALU64; + const bool dstk = is_on_stack(insn->dst_reg); + const bool sstk = is_on_stack(insn->src_reg); + u8 rd, rt, rm, rn; + s32 jmp_offset; + +#define check_imm(bits, imm) do { \ + if ((((imm) > 0) && ((imm) >> (bits))) || \ + (((imm) < 0) && (~(imm) >> (bits)))) { \ + pr_info("[%2d] imm=%d(0x%x) out of range\n", \ + i, imm, imm); \ + return -EINVAL; \ + } \ +} while (0) +#define check_imm24(imm) check_imm(24, imm) + + switch (code) { + /* ALU operations */ + + /* dst = src */ + case BPF_ALU | BPF_MOV | BPF_K: + case BPF_ALU | BPF_MOV | BPF_X: + case BPF_ALU64 | BPF_MOV | BPF_K: + case BPF_ALU64 | BPF_MOV | BPF_X: + switch (BPF_SRC(code)) { + case BPF_X: + emit_a32_mov_r64(is64, dst, src, dstk, sstk, ctx); break; - case BPF_LD | BPF_W | BPF_LEN: - ctx->seen |= SEEN_SKB; - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4); - emit(ARM_LDR_I(r_A, r_skb, - offsetof(struct sk_buff, len)), ctx); + case BPF_K: + /* Sign-extend immediate value to destination reg */ + emit_a32_mov_i64(is64, dst, imm, dstk, ctx); break; - case BPF_LD | BPF_MEM: - /* A = scratch[k] */ - ctx->seen |= SEEN_MEM_WORD(k); - emit(ARM_LDR_I(r_A, ARM_SP, SCRATCH_OFF(k)), ctx); + } + break; + /* dst = dst + src/imm */ + /* dst = dst - src/imm */ + /* dst = dst | src/imm */ + /* dst = dst & src/imm */ + /* dst = dst ^ src/imm */ + /* dst = dst * src/imm */ + /* dst = dst << src */ + /* dst = dst >> src */ + case BPF_ALU | BPF_ADD | BPF_K: + case BPF_ALU | BPF_ADD | BPF_X: + case BPF_ALU | BPF_SUB | BPF_K: + case BPF_ALU | BPF_SUB | BPF_X: + case BPF_ALU | BPF_OR | BPF_K: + case BPF_ALU | BPF_OR | BPF_X: + case BPF_ALU | BPF_AND | BPF_K: + case BPF_ALU | BPF_AND | BPF_X: + case BPF_ALU | BPF_XOR | BPF_K: + case BPF_ALU | BPF_XOR | BPF_X: + case BPF_ALU | BPF_MUL | BPF_K: + case BPF_ALU | BPF_MUL | BPF_X: + case BPF_ALU | BPF_LSH | BPF_X: + case BPF_ALU | BPF_RSH | BPF_X: + case BPF_ALU | BPF_ARSH | BPF_K: + case BPF_ALU | BPF_ARSH | BPF_X: + case BPF_ALU64 | BPF_ADD | BPF_K: + case BPF_ALU64 | BPF_ADD | BPF_X: + case BPF_ALU64 | BPF_SUB | BPF_K: + case BPF_ALU64 | BPF_SUB | BPF_X: + case BPF_ALU64 | BPF_OR | BPF_K: + case BPF_ALU64 | BPF_OR | BPF_X: + case BPF_ALU64 | BPF_AND | BPF_K: + case BPF_ALU64 | BPF_AND | BPF_X: + case BPF_ALU64 | BPF_XOR | BPF_K: + case BPF_ALU64 | BPF_XOR | BPF_X: + switch (BPF_SRC(code)) { + case BPF_X: + emit_a32_alu_r64(is64, dst, src, dstk, sstk, + ctx, BPF_OP(code)); break; - case BPF_LD | BPF_W | BPF_ABS: - load_order = 2; - goto load; - case BPF_LD | BPF_H | BPF_ABS: - load_order = 1; - goto load; - case BPF_LD | BPF_B | BPF_ABS: - load_order = 0; -load: - emit_mov_i(r_off, k, ctx); -load_common: - ctx->seen |= SEEN_DATA | SEEN_CALL; - - if (load_order > 0) { - emit(ARM_SUB_I(r_scratch, r_skb_hl, - 1 << load_order), ctx); - emit(ARM_CMP_R(r_scratch, r_off), ctx); - condt = ARM_COND_GE; - } else { - emit(ARM_CMP_R(r_skb_hl, r_off), ctx); - condt = ARM_COND_HI; - } - - /* - * test for negative offset, only if we are - * currently scheduled to take the fast - * path. this will update the flags so that - * the slowpath instruction are ignored if the - * offset is negative. 
- * - * for loard_order == 0 the HI condition will - * make loads at offset 0 take the slow path too. + case BPF_K: + /* Move immediate value to the temporary register + * and then do the ALU operation on the temporary + * register as this will sign-extend the immediate + * value into temporary reg and then it would be + * safe to do the operation on it. */ - _emit(condt, ARM_CMP_I(r_off, 0), ctx); - - _emit(condt, ARM_ADD_R(r_scratch, r_off, r_skb_data), - ctx); - - if (load_order == 0) - _emit(condt, ARM_LDRB_I(r_A, r_scratch, 0), - ctx); - else if (load_order == 1) - emit_load_be16(condt, r_A, r_scratch, ctx); - else if (load_order == 2) - emit_load_be32(condt, r_A, r_scratch, ctx); - - _emit(condt, ARM_B(b_imm(i + 1, ctx)), ctx); - - /* the slowpath */ - emit_mov_i(ARM_R3, (u32)load_func[load_order], ctx); - emit(ARM_MOV_R(ARM_R0, r_skb), ctx); - /* the offset is already in R1 */ - emit_blx_r(ARM_R3, ctx); - /* check the result of skb_copy_bits */ - emit(ARM_CMP_I(ARM_R1, 0), ctx); - emit_err_ret(ARM_COND_NE, ctx); - emit(ARM_MOV_R(r_A, ARM_R0), ctx); + emit_a32_mov_i64(is64, tmp2, imm, false, ctx); + emit_a32_alu_r64(is64, dst, tmp2, dstk, false, + ctx, BPF_OP(code)); break; - case BPF_LD | BPF_W | BPF_IND: - load_order = 2; - goto load_ind; - case BPF_LD | BPF_H | BPF_IND: - load_order = 1; - goto load_ind; - case BPF_LD | BPF_B | BPF_IND: - load_order = 0; -load_ind: - update_on_xread(ctx); - OP_IMM3(ARM_ADD, r_off, r_X, k, ctx); - goto load_common; - case BPF_LDX | BPF_IMM: - ctx->seen |= SEEN_X; - emit_mov_i(r_X, k, ctx); + } + break; + /* dst = dst / src(imm) */ + /* dst = dst % src(imm) */ + case BPF_ALU | BPF_DIV | BPF_K: + case BPF_ALU | BPF_DIV | BPF_X: + case BPF_ALU | BPF_MOD | BPF_K: + case BPF_ALU | BPF_MOD | BPF_X: + rt = src_lo; + rd = dstk ? tmp2[1] : dst_lo; + if (dstk) + emit(ARM_LDR_I(rd, ARM_SP, STACK_VAR(dst_lo)), ctx); + switch (BPF_SRC(code)) { + case BPF_X: + rt = sstk ? 
tmp2[0] : rt; + if (sstk) + emit(ARM_LDR_I(rt, ARM_SP, STACK_VAR(src_lo)), + ctx); break; - case BPF_LDX | BPF_W | BPF_LEN: - ctx->seen |= SEEN_X | SEEN_SKB; - emit(ARM_LDR_I(r_X, r_skb, - offsetof(struct sk_buff, len)), ctx); + case BPF_K: + rt = tmp2[0]; + emit_a32_mov_i(rt, imm, false, ctx); break; - case BPF_LDX | BPF_MEM: - ctx->seen |= SEEN_X | SEEN_MEM_WORD(k); - emit(ARM_LDR_I(r_X, ARM_SP, SCRATCH_OFF(k)), ctx); + } + emit_udivmod(rd, rd, rt, ctx, BPF_OP(code)); + if (dstk) + emit(ARM_STR_I(rd, ARM_SP, STACK_VAR(dst_lo)), ctx); + emit_a32_mov_i(dst_hi, 0, dstk, ctx); + break; + case BPF_ALU64 | BPF_DIV | BPF_K: + case BPF_ALU64 | BPF_DIV | BPF_X: + case BPF_ALU64 | BPF_MOD | BPF_K: + case BPF_ALU64 | BPF_MOD | BPF_X: + goto notyet; + /* dst = dst >> imm */ + /* dst = dst << imm */ + case BPF_ALU | BPF_RSH | BPF_K: + case BPF_ALU | BPF_LSH | BPF_K: + if (unlikely(imm > 31)) + return -EINVAL; + if (imm) + emit_a32_alu_i(dst_lo, imm, dstk, ctx, BPF_OP(code)); + emit_a32_mov_i(dst_hi, 0, dstk, ctx); + break; + /* dst = dst << imm */ + case BPF_ALU64 | BPF_LSH | BPF_K: + if (unlikely(imm > 63)) + return -EINVAL; + emit_a32_lsh_i64(dst, dstk, imm, ctx); + break; + /* dst = dst >> imm */ + case BPF_ALU64 | BPF_RSH | BPF_K: + if (unlikely(imm > 63)) + return -EINVAL; + emit_a32_lsr_i64(dst, dstk, imm, ctx); + break; + /* dst = dst << src */ + case BPF_ALU64 | BPF_LSH | BPF_X: + emit_a32_lsh_r64(dst, src, dstk, sstk, ctx); + break; + /* dst = dst >> src */ + case BPF_ALU64 | BPF_RSH | BPF_X: + emit_a32_lsr_r64(dst, src, dstk, sstk, ctx); + break; + /* dst = dst >> src (signed) */ + case BPF_ALU64 | BPF_ARSH | BPF_X: + emit_a32_arsh_r64(dst, src, dstk, sstk, ctx); + break; + /* dst = dst >> imm (signed) */ + case BPF_ALU64 | BPF_ARSH | BPF_K: + if (unlikely(imm > 63)) + return -EINVAL; + emit_a32_arsh_i64(dst, dstk, imm, ctx); + break; + /* dst = ~dst */ + case BPF_ALU | BPF_NEG: + emit_a32_alu_i(dst_lo, 0, dstk, ctx, BPF_OP(code)); + emit_a32_mov_i(dst_hi, 0, dstk, ctx); + break; + /* dst = ~dst (64 bit) */ + case BPF_ALU64 | BPF_NEG: + emit_a32_neg64(dst, dstk, ctx); + break; + /* dst = dst * src/imm */ + case BPF_ALU64 | BPF_MUL | BPF_X: + case BPF_ALU64 | BPF_MUL | BPF_K: + switch (BPF_SRC(code)) { + case BPF_X: + emit_a32_mul_r64(dst, src, dstk, sstk, ctx); break; - case BPF_LDX | BPF_B | BPF_MSH: - /* x = ((*(frame + k)) & 0xf) << 2; */ - ctx->seen |= SEEN_X | SEEN_DATA | SEEN_CALL; - /* the interpreter should deal with the negative K */ - if ((int)k < 0) - return -1; - /* offset in r1: we might have to take the slow path */ - emit_mov_i(r_off, k, ctx); - emit(ARM_CMP_R(r_skb_hl, r_off), ctx); - - /* load in r0: common with the slowpath */ - _emit(ARM_COND_HI, ARM_LDRB_R(ARM_R0, r_skb_data, - ARM_R1), ctx); - /* - * emit_mov_i() might generate one or two instructions, - * the same holds for emit_blx_r() + case BPF_K: + /* Move immediate value to the temporary register + * and then do the multiplication on it as this + * will sign-extend the immediate value into temp + * reg then it would be safe to do the operation + * on it. 
*/ - _emit(ARM_COND_HI, ARM_B(b_imm(i + 1, ctx) - 2), ctx); - - emit(ARM_MOV_R(ARM_R0, r_skb), ctx); - /* r_off is r1 */ - emit_mov_i(ARM_R3, (u32)jit_get_skb_b, ctx); - emit_blx_r(ARM_R3, ctx); - /* check the return value of skb_copy_bits */ - emit(ARM_CMP_I(ARM_R1, 0), ctx); - emit_err_ret(ARM_COND_NE, ctx); - - emit(ARM_AND_I(r_X, ARM_R0, 0x00f), ctx); - emit(ARM_LSL_I(r_X, r_X, 2), ctx); - break; - case BPF_ST: - ctx->seen |= SEEN_MEM_WORD(k); - emit(ARM_STR_I(r_A, ARM_SP, SCRATCH_OFF(k)), ctx); - break; - case BPF_STX: - update_on_xread(ctx); - ctx->seen |= SEEN_MEM_WORD(k); - emit(ARM_STR_I(r_X, ARM_SP, SCRATCH_OFF(k)), ctx); - break; - case BPF_ALU | BPF_ADD | BPF_K: - /* A += K */ - OP_IMM3(ARM_ADD, r_A, r_A, k, ctx); - break; - case BPF_ALU | BPF_ADD | BPF_X: - update_on_xread(ctx); - emit(ARM_ADD_R(r_A, r_A, r_X), ctx); - break; - case BPF_ALU | BPF_SUB | BPF_K: - /* A -= K */ - OP_IMM3(ARM_SUB, r_A, r_A, k, ctx); - break; - case BPF_ALU | BPF_SUB | BPF_X: - update_on_xread(ctx); - emit(ARM_SUB_R(r_A, r_A, r_X), ctx); - break; - case BPF_ALU | BPF_MUL | BPF_K: - /* A *= K */ - emit_mov_i(r_scratch, k, ctx); - emit(ARM_MUL(r_A, r_A, r_scratch), ctx); - break; - case BPF_ALU | BPF_MUL | BPF_X: - update_on_xread(ctx); - emit(ARM_MUL(r_A, r_A, r_X), ctx); + emit_a32_mov_i64(is64, tmp2, imm, false, ctx); + emit_a32_mul_r64(dst, tmp2, dstk, false, ctx); break; - case BPF_ALU | BPF_DIV | BPF_K: - if (k == 1) - break; - emit_mov_i(r_scratch, k, ctx); - emit_udivmod(r_A, r_A, r_scratch, ctx, BPF_DIV); - break; - case BPF_ALU | BPF_DIV | BPF_X: - update_on_xread(ctx); - emit(ARM_CMP_I(r_X, 0), ctx); - emit_err_ret(ARM_COND_EQ, ctx); - emit_udivmod(r_A, r_A, r_X, ctx, BPF_DIV); + } + break; + /* dst = htole(dst) */ + /* dst = htobe(dst) */ + case BPF_ALU | BPF_END | BPF_FROM_LE: + case BPF_ALU | BPF_END | BPF_FROM_BE: + rd = dstk ? tmp[0] : dst_hi; + rt = dstk ? 
tmp[1] : dst_lo; + if (dstk) { + emit(ARM_LDR_I(rt, ARM_SP, STACK_VAR(dst_lo)), ctx); + emit(ARM_LDR_I(rd, ARM_SP, STACK_VAR(dst_hi)), ctx); + } + if (BPF_SRC(code) == BPF_FROM_LE) + goto emit_bswap_uxt; + switch (imm) { + case 16: + emit_rev16(rt, rt, ctx); + goto emit_bswap_uxt; + case 32: + emit_rev32(rt, rt, ctx); + goto emit_bswap_uxt; + case 64: + /* Because of the usage of ARM_LR */ + ctx->seen |= SEEN_CALL; + emit_rev32(ARM_LR, rt, ctx); + emit_rev32(rt, rd, ctx); + emit(ARM_MOV_R(rd, ARM_LR), ctx); break; - case BPF_ALU | BPF_MOD | BPF_K: - if (k == 1) { - emit_mov_i(r_A, 0, ctx); - break; - } - emit_mov_i(r_scratch, k, ctx); - emit_udivmod(r_A, r_A, r_scratch, ctx, BPF_MOD); + } + goto exit; +emit_bswap_uxt: + switch (imm) { + case 16: + /* zero-extend 16 bits into 64 bits */ +#if __LINUX_ARM_ARCH__ < 6 + emit_a32_mov_i(tmp2[1], 0xffff, false, ctx); + emit(ARM_AND_R(rt, rt, tmp2[1]), ctx); +#else /* ARMv6+ */ + emit(ARM_UXTH(rt, rt), ctx); +#endif + emit(ARM_EOR_R(rd, rd, rd), ctx); break; - case BPF_ALU | BPF_MOD | BPF_X: - update_on_xread(ctx); - emit(ARM_CMP_I(r_X, 0), ctx); - emit_err_ret(ARM_COND_EQ, ctx); - emit_udivmod(r_A, r_A, r_X, ctx, BPF_MOD); + case 32: + /* zero-extend 32 bits into 64 bits */ + emit(ARM_EOR_R(rd, rd, rd), ctx); break; - case BPF_ALU | BPF_OR | BPF_K: - /* A |= K */ - OP_IMM3(ARM_ORR, r_A, r_A, k, ctx); + case 64: + /* nop */ break; - case BPF_ALU | BPF_OR | BPF_X: - update_on_xread(ctx); - emit(ARM_ORR_R(r_A, r_A, r_X), ctx); + } +exit: + if (dstk) { + emit(ARM_STR_I(rt, ARM_SP, STACK_VAR(dst_lo)), ctx); + emit(ARM_STR_I(rd, ARM_SP, STACK_VAR(dst_hi)), ctx); + } + break; + /* dst = imm64 */ + case BPF_LD | BPF_IMM | BPF_DW: + { + const struct bpf_insn insn1 = insn[1]; + u32 hi, lo = imm; + + hi = insn1.imm; + emit_a32_mov_i(dst_lo, lo, dstk, ctx); + emit_a32_mov_i(dst_hi, hi, dstk, ctx); + + return 1; + } + /* LDX: dst = *(size *)(src + off) */ + case BPF_LDX | BPF_MEM | BPF_W: + case BPF_LDX | BPF_MEM | BPF_H: + case BPF_LDX | BPF_MEM | BPF_B: + case BPF_LDX | BPF_MEM | BPF_DW: + rn = sstk ? tmp2[1] : src_lo; + if (sstk) + emit(ARM_LDR_I(rn, ARM_SP, STACK_VAR(src_lo)), ctx); + switch (BPF_SIZE(code)) { + case BPF_W: + /* Load a Word */ + case BPF_H: + /* Load a Half-Word */ + case BPF_B: + /* Load a Byte */ + emit_ldx_r(dst_lo, rn, dstk, off, ctx, BPF_SIZE(code)); + emit_a32_mov_i(dst_hi, 0, dstk, ctx); break; - case BPF_ALU | BPF_XOR | BPF_K: - /* A ^= K; */ - OP_IMM3(ARM_EOR, r_A, r_A, k, ctx); + case BPF_DW: + /* Load a double word */ + emit_ldx_r(dst_lo, rn, dstk, off, ctx, BPF_W); + emit_ldx_r(dst_hi, rn, dstk, off+4, ctx, BPF_W); break; - case BPF_ANC | SKF_AD_ALU_XOR_X: - case BPF_ALU | BPF_XOR | BPF_X: - /* A ^= X */ - update_on_xread(ctx); - emit(ARM_EOR_R(r_A, r_A, r_X), ctx); + } + break; + /* R0 = ntohx(*(size *)(((struct sk_buff *)R6)->data + imm)) */ + case BPF_LD | BPF_ABS | BPF_W: + case BPF_LD | BPF_ABS | BPF_H: + case BPF_LD | BPF_ABS | BPF_B: + /* R0 = ntohx(*(size *)(((struct sk_buff *)R6)->data + src + imm)) */ + case BPF_LD | BPF_IND | BPF_W: + case BPF_LD | BPF_IND | BPF_H: + case BPF_LD | BPF_IND | BPF_B: + { + const u8 r4 = bpf2a32[BPF_REG_6][1]; /* r4 = ptr to sk_buff */ + const u8 r0 = bpf2a32[BPF_REG_0][1]; /*r0: struct sk_buff *skb*/ + /* rtn value */ + const u8 r1 = bpf2a32[BPF_REG_0][0]; /* r1: int k */ + const u8 r2 = bpf2a32[BPF_REG_1][1]; /* r2: unsigned int size */ + const u8 r3 = bpf2a32[BPF_REG_1][0]; /* r3: void *buffer */ + const u8 r6 = bpf2a32[TMP_REG_1][1]; /* r6: void *(*func)(..) 
*/ + int size; + + /* Setting up first argument */ + emit(ARM_MOV_R(r0, r4), ctx); + + /* Setting up second argument */ + emit_a32_mov_i(r1, imm, false, ctx); + if (BPF_MODE(code) == BPF_IND) + emit_a32_alu_r(r1, src_lo, false, sstk, ctx, + false, false, BPF_ADD); + + /* Setting up third argument */ + switch (BPF_SIZE(code)) { + case BPF_W: + size = 4; break; - case BPF_ALU | BPF_AND | BPF_K: - /* A &= K */ - OP_IMM3(ARM_AND, r_A, r_A, k, ctx); + case BPF_H: + size = 2; break; - case BPF_ALU | BPF_AND | BPF_X: - update_on_xread(ctx); - emit(ARM_AND_R(r_A, r_A, r_X), ctx); + case BPF_B: + size = 1; break; - case BPF_ALU | BPF_LSH | BPF_K: - if (unlikely(k > 31)) - return -1; - emit(ARM_LSL_I(r_A, r_A, k), ctx); + default: + return -EINVAL; + } + emit_a32_mov_i(r2, size, false, ctx); + + /* Setting up fourth argument */ + emit(ARM_ADD_I(r3, ARM_SP, imm8m(SKB_BUFFER)), ctx); + + /* Setting up function pointer to call */ + emit_a32_mov_i(r6, (unsigned int)bpf_load_pointer, false, ctx); + emit_blx_r(r6, ctx); + + emit(ARM_EOR_R(r1, r1, r1), ctx); + /* Check if return address is NULL or not. + * if NULL then jump to epilogue + * else continue to load the value from retn address + */ + emit(ARM_CMP_I(r0, 0), ctx); + jmp_offset = epilogue_offset(ctx); + check_imm24(jmp_offset); + _emit(ARM_COND_EQ, ARM_B(jmp_offset), ctx); + + /* Load value from the address */ + switch (BPF_SIZE(code)) { + case BPF_W: + emit(ARM_LDR_I(r0, r0, 0), ctx); + emit_rev32(r0, r0, ctx); break; - case BPF_ALU | BPF_LSH | BPF_X: - update_on_xread(ctx); - emit(ARM_LSL_R(r_A, r_A, r_X), ctx); + case BPF_H: + emit(ARM_LDRH_I(r0, r0, 0), ctx); + emit_rev16(r0, r0, ctx); break; - case BPF_ALU | BPF_RSH | BPF_K: - if (unlikely(k > 31)) - return -1; - if (k) - emit(ARM_LSR_I(r_A, r_A, k), ctx); + case BPF_B: + emit(ARM_LDRB_I(r0, r0, 0), ctx); + /* No need to reverse */ break; - case BPF_ALU | BPF_RSH | BPF_X: - update_on_xread(ctx); - emit(ARM_LSR_R(r_A, r_A, r_X), ctx); + } + break; + } + /* ST: *(size *)(dst + off) = imm */ + case BPF_ST | BPF_MEM | BPF_W: + case BPF_ST | BPF_MEM | BPF_H: + case BPF_ST | BPF_MEM | BPF_B: + case BPF_ST | BPF_MEM | BPF_DW: + switch (BPF_SIZE(code)) { + case BPF_DW: + /* Sign-extend immediate value into temp reg */ + emit_a32_mov_i64(true, tmp2, imm, false, ctx); + emit_str_r(dst_lo, tmp2[1], dstk, off, ctx, BPF_W); + emit_str_r(dst_lo, tmp2[0], dstk, off+4, ctx, BPF_W); break; - case BPF_ALU | BPF_NEG: - /* A = -A */ - emit(ARM_RSB_I(r_A, r_A, 0), ctx); + case BPF_W: + case BPF_H: + case BPF_B: + emit_a32_mov_i(tmp2[1], imm, false, ctx); + emit_str_r(dst_lo, tmp2[1], dstk, off, ctx, + BPF_SIZE(code)); break; - case BPF_JMP | BPF_JA: - /* pc += K */ - emit(ARM_B(b_imm(i + k + 1, ctx)), ctx); + } + break; + /* STX XADD: lock *(u32 *)(dst + off) += src */ + case BPF_STX | BPF_XADD | BPF_W: + /* STX XADD: lock *(u64 *)(dst + off) += src */ + case BPF_STX | BPF_XADD | BPF_DW: + goto notyet; + /* STX: *(size *)(dst + off) = src */ + case BPF_STX | BPF_MEM | BPF_W: + case BPF_STX | BPF_MEM | BPF_H: + case BPF_STX | BPF_MEM | BPF_B: + case BPF_STX | BPF_MEM | BPF_DW: + { + u8 sz = BPF_SIZE(code); + + rn = sstk ? tmp2[1] : src_lo; + rm = sstk ? 
tmp2[0] : src_hi; + if (sstk) { + emit(ARM_LDR_I(rn, ARM_SP, STACK_VAR(src_lo)), ctx); + emit(ARM_LDR_I(rm, ARM_SP, STACK_VAR(src_hi)), ctx); + } + + /* Store the value */ + if (BPF_SIZE(code) == BPF_DW) { + emit_str_r(dst_lo, rn, dstk, off, ctx, BPF_W); + emit_str_r(dst_lo, rm, dstk, off+4, ctx, BPF_W); + } else { + emit_str_r(dst_lo, rn, dstk, off, ctx, sz); + } + break; + } + /* PC += off if dst == src */ + /* PC += off if dst > src */ + /* PC += off if dst >= src */ + /* PC += off if dst < src */ + /* PC += off if dst <= src */ + /* PC += off if dst != src */ + /* PC += off if dst > src (signed) */ + /* PC += off if dst >= src (signed) */ + /* PC += off if dst < src (signed) */ + /* PC += off if dst <= src (signed) */ + /* PC += off if dst & src */ + case BPF_JMP | BPF_JEQ | BPF_X: + case BPF_JMP | BPF_JGT | BPF_X: + case BPF_JMP | BPF_JGE | BPF_X: + case BPF_JMP | BPF_JNE | BPF_X: + case BPF_JMP | BPF_JSGT | BPF_X: + case BPF_JMP | BPF_JSGE | BPF_X: + case BPF_JMP | BPF_JSET | BPF_X: + case BPF_JMP | BPF_JLE | BPF_X: + case BPF_JMP | BPF_JLT | BPF_X: + case BPF_JMP | BPF_JSLT | BPF_X: + case BPF_JMP | BPF_JSLE | BPF_X: + /* Setup source registers */ + rm = sstk ? tmp2[0] : src_hi; + rn = sstk ? tmp2[1] : src_lo; + if (sstk) { + emit(ARM_LDR_I(rn, ARM_SP, STACK_VAR(src_lo)), ctx); + emit(ARM_LDR_I(rm, ARM_SP, STACK_VAR(src_hi)), ctx); + } + goto go_jmp; + /* PC += off if dst == imm */ + /* PC += off if dst > imm */ + /* PC += off if dst >= imm */ + /* PC += off if dst < imm */ + /* PC += off if dst <= imm */ + /* PC += off if dst != imm */ + /* PC += off if dst > imm (signed) */ + /* PC += off if dst >= imm (signed) */ + /* PC += off if dst < imm (signed) */ + /* PC += off if dst <= imm (signed) */ + /* PC += off if dst & imm */ + case BPF_JMP | BPF_JEQ | BPF_K: + case BPF_JMP | BPF_JGT | BPF_K: + case BPF_JMP | BPF_JGE | BPF_K: + case BPF_JMP | BPF_JNE | BPF_K: + case BPF_JMP | BPF_JSGT | BPF_K: + case BPF_JMP | BPF_JSGE | BPF_K: + case BPF_JMP | BPF_JSET | BPF_K: + case BPF_JMP | BPF_JLT | BPF_K: + case BPF_JMP | BPF_JLE | BPF_K: + case BPF_JMP | BPF_JSLT | BPF_K: + case BPF_JMP | BPF_JSLE | BPF_K: + if (off == 0) break; - case BPF_JMP | BPF_JEQ | BPF_K: - /* pc += (A == K) ? pc->jt : pc->jf */ - condt = ARM_COND_EQ; - goto cmp_imm; - case BPF_JMP | BPF_JGT | BPF_K: - /* pc += (A > K) ? pc->jt : pc->jf */ - condt = ARM_COND_HI; - goto cmp_imm; - case BPF_JMP | BPF_JGE | BPF_K: - /* pc += (A >= K) ? pc->jt : pc->jf */ - condt = ARM_COND_HS; -cmp_imm: - imm12 = imm8m(k); - if (imm12 < 0) { - emit_mov_i_no8m(r_scratch, k, ctx); - emit(ARM_CMP_R(r_A, r_scratch), ctx); - } else { - emit(ARM_CMP_I(r_A, imm12), ctx); - } -cond_jump: - if (inst->jt) - _emit(condt, ARM_B(b_imm(i + inst->jt + 1, - ctx)), ctx); - if (inst->jf) - _emit(condt ^ 1, ARM_B(b_imm(i + inst->jf + 1, - ctx)), ctx); + rm = tmp2[0]; + rn = tmp2[1]; + /* Sign-extend immediate value */ + emit_a32_mov_i64(true, tmp2, imm, false, ctx); +go_jmp: + /* Setup destination register */ + rd = dstk ? tmp[0] : dst_hi; + rt = dstk ? tmp[1] : dst_lo; + if (dstk) { + emit(ARM_LDR_I(rt, ARM_SP, STACK_VAR(dst_lo)), ctx); + emit(ARM_LDR_I(rd, ARM_SP, STACK_VAR(dst_hi)), ctx); + } + + /* Check for the condition */ + emit_ar_r(rd, rt, rm, rn, ctx, BPF_OP(code)); + + /* Setup JUMP instruction */ + jmp_offset = bpf2a32_offset(i+off, i, ctx); + switch (BPF_OP(code)) { + case BPF_JNE: + case BPF_JSET: + _emit(ARM_COND_NE, ARM_B(jmp_offset), ctx); break; - case BPF_JMP | BPF_JEQ | BPF_X: - /* pc += (A == X) ? 
pc->jt : pc->jf */ - condt = ARM_COND_EQ; - goto cmp_x; - case BPF_JMP | BPF_JGT | BPF_X: - /* pc += (A > X) ? pc->jt : pc->jf */ - condt = ARM_COND_HI; - goto cmp_x; - case BPF_JMP | BPF_JGE | BPF_X: - /* pc += (A >= X) ? pc->jt : pc->jf */ - condt = ARM_COND_CS; -cmp_x: - update_on_xread(ctx); - emit(ARM_CMP_R(r_A, r_X), ctx); - goto cond_jump; - case BPF_JMP | BPF_JSET | BPF_K: - /* pc += (A & K) ? pc->jt : pc->jf */ - condt = ARM_COND_NE; - /* not set iff all zeroes iff Z==1 iff EQ */ - - imm12 = imm8m(k); - if (imm12 < 0) { - emit_mov_i_no8m(r_scratch, k, ctx); - emit(ARM_TST_R(r_A, r_scratch), ctx); - } else { - emit(ARM_TST_I(r_A, imm12), ctx); - } - goto cond_jump; - case BPF_JMP | BPF_JSET | BPF_X: - /* pc += (A & X) ? pc->jt : pc->jf */ - update_on_xread(ctx); - condt = ARM_COND_NE; - emit(ARM_TST_R(r_A, r_X), ctx); - goto cond_jump; - case BPF_RET | BPF_A: - emit(ARM_MOV_R(ARM_R0, r_A), ctx); - goto b_epilogue; - case BPF_RET | BPF_K: - if ((k == 0) && (ctx->ret0_fp_idx < 0)) - ctx->ret0_fp_idx = i; - emit_mov_i(ARM_R0, k, ctx); -b_epilogue: - if (i != ctx->skf->len - 1) - emit(ARM_B(b_imm(prog->len, ctx)), ctx); + case BPF_JEQ: + _emit(ARM_COND_EQ, ARM_B(jmp_offset), ctx); break; - case BPF_MISC | BPF_TAX: - /* X = A */ - ctx->seen |= SEEN_X; - emit(ARM_MOV_R(r_X, r_A), ctx); + case BPF_JGT: + _emit(ARM_COND_HI, ARM_B(jmp_offset), ctx); break; - case BPF_MISC | BPF_TXA: - /* A = X */ - update_on_xread(ctx); - emit(ARM_MOV_R(r_A, r_X), ctx); + case BPF_JGE: + _emit(ARM_COND_CS, ARM_B(jmp_offset), ctx); break; - case BPF_ANC | SKF_AD_PROTOCOL: - /* A = ntohs(skb->protocol) */ - ctx->seen |= SEEN_SKB; - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, - protocol) != 2); - off = offsetof(struct sk_buff, protocol); - emit(ARM_LDRH_I(r_scratch, r_skb, off), ctx); - emit_swap16(r_A, r_scratch, ctx); + case BPF_JSGT: + _emit(ARM_COND_LT, ARM_B(jmp_offset), ctx); break; - case BPF_ANC | SKF_AD_CPU: - /* r_scratch = current_thread_info() */ - OP_IMM3(ARM_BIC, r_scratch, ARM_SP, THREAD_SIZE - 1, ctx); - /* A = current_thread_info()->cpu */ - BUILD_BUG_ON(FIELD_SIZEOF(struct thread_info, cpu) != 4); - off = offsetof(struct thread_info, cpu); - emit(ARM_LDR_I(r_A, r_scratch, off), ctx); + case BPF_JSGE: + _emit(ARM_COND_GE, ARM_B(jmp_offset), ctx); break; - case BPF_ANC | SKF_AD_IFINDEX: - case BPF_ANC | SKF_AD_HATYPE: - /* A = skb->dev->ifindex */ - /* A = skb->dev->type */ - ctx->seen |= SEEN_SKB; - off = offsetof(struct sk_buff, dev); - emit(ARM_LDR_I(r_scratch, r_skb, off), ctx); - - emit(ARM_CMP_I(r_scratch, 0), ctx); - emit_err_ret(ARM_COND_EQ, ctx); - - BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, - ifindex) != 4); - BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, - type) != 2); - - if (code == (BPF_ANC | SKF_AD_IFINDEX)) { - off = offsetof(struct net_device, ifindex); - emit(ARM_LDR_I(r_A, r_scratch, off), ctx); - } else { - /* - * offset of field "type" in "struct - * net_device" is above what can be - * used in the ldrh rd, [rn, #imm] - * instruction, so load the offset in - * a register and use ldrh rd, [rn, rm] - */ - off = offsetof(struct net_device, type); - emit_mov_i(ARM_R3, off, ctx); - emit(ARM_LDRH_R(r_A, r_scratch, ARM_R3), ctx); - } + case BPF_JLE: + _emit(ARM_COND_LS, ARM_B(jmp_offset), ctx); break; - case BPF_ANC | SKF_AD_MARK: - ctx->seen |= SEEN_SKB; - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4); - off = offsetof(struct sk_buff, mark); - emit(ARM_LDR_I(r_A, r_skb, off), ctx); + case BPF_JLT: + _emit(ARM_COND_CC, ARM_B(jmp_offset), ctx); break; - case BPF_ANC | 
SKF_AD_RXHASH: - ctx->seen |= SEEN_SKB; - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4); - off = offsetof(struct sk_buff, hash); - emit(ARM_LDR_I(r_A, r_skb, off), ctx); + case BPF_JSLT: + _emit(ARM_COND_LT, ARM_B(jmp_offset), ctx); break; - case BPF_ANC | SKF_AD_VLAN_TAG: - case BPF_ANC | SKF_AD_VLAN_TAG_PRESENT: - ctx->seen |= SEEN_SKB; - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_tci) != 2); - off = offsetof(struct sk_buff, vlan_tci); - emit(ARM_LDRH_I(r_A, r_skb, off), ctx); - if (code == (BPF_ANC | SKF_AD_VLAN_TAG)) - OP_IMM3(ARM_AND, r_A, r_A, ~VLAN_TAG_PRESENT, ctx); - else { - OP_IMM3(ARM_LSR, r_A, r_A, 12, ctx); - OP_IMM3(ARM_AND, r_A, r_A, 0x1, ctx); - } + case BPF_JSLE: + _emit(ARM_COND_GE, ARM_B(jmp_offset), ctx); break; - case BPF_ANC | SKF_AD_PKTTYPE: - ctx->seen |= SEEN_SKB; - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, - __pkt_type_offset[0]) != 1); - off = PKT_TYPE_OFFSET(); - emit(ARM_LDRB_I(r_A, r_skb, off), ctx); - emit(ARM_AND_I(r_A, r_A, PKT_TYPE_MAX), ctx); -#ifdef __BIG_ENDIAN_BITFIELD - emit(ARM_LSR_I(r_A, r_A, 5), ctx); -#endif + } + break; + /* JMP OFF */ + case BPF_JMP | BPF_JA: + { + if (off == 0) break; - case BPF_ANC | SKF_AD_QUEUE: - ctx->seen |= SEEN_SKB; - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, - queue_mapping) != 2); - BUILD_BUG_ON(offsetof(struct sk_buff, - queue_mapping) > 0xff); - off = offsetof(struct sk_buff, queue_mapping); - emit(ARM_LDRH_I(r_A, r_skb, off), ctx); + jmp_offset = bpf2a32_offset(i+off, i, ctx); + check_imm24(jmp_offset); + emit(ARM_B(jmp_offset), ctx); + break; + } + /* tail call */ + case BPF_JMP | BPF_TAIL_CALL: + if (emit_bpf_tail_call(ctx)) + return -EFAULT; + break; + /* function call */ + case BPF_JMP | BPF_CALL: + { + const u8 *r0 = bpf2a32[BPF_REG_0]; + const u8 *r1 = bpf2a32[BPF_REG_1]; + const u8 *r2 = bpf2a32[BPF_REG_2]; + const u8 *r3 = bpf2a32[BPF_REG_3]; + const u8 *r4 = bpf2a32[BPF_REG_4]; + const u8 *r5 = bpf2a32[BPF_REG_5]; + const u32 func = (u32)__bpf_call_base + (u32)imm; + + emit_a32_mov_r64(true, r0, r1, false, false, ctx); + emit_a32_mov_r64(true, r1, r2, false, true, ctx); + emit_push_r64(r5, 0, ctx); + emit_push_r64(r4, 8, ctx); + emit_push_r64(r3, 16, ctx); + + emit_a32_mov_i(tmp[1], func, false, ctx); + emit_blx_r(tmp[1], ctx); + + emit(ARM_ADD_I(ARM_SP, ARM_SP, imm8m(24)), ctx); // callee clean + break; + } + /* function return */ + case BPF_JMP | BPF_EXIT: + /* Optimization: when last instruction is EXIT + * simply fallthrough to epilogue. + */ + if (i == ctx->prog->len - 1) break; - case BPF_ANC | SKF_AD_PAY_OFFSET: - ctx->seen |= SEEN_SKB | SEEN_CALL; + jmp_offset = epilogue_offset(ctx); + check_imm24(jmp_offset); + emit(ARM_B(jmp_offset), ctx); + break; +notyet: + pr_info_once("*** NOT YET: opcode %02x ***\n", code); + return -EFAULT; + default: + pr_err_once("unknown opcode %02x\n", code); + return -EINVAL; + } - emit(ARM_MOV_R(ARM_R0, r_skb), ctx); - emit_mov_i(ARM_R3, (unsigned int)skb_get_poff, ctx); - emit_blx_r(ARM_R3, ctx); - emit(ARM_MOV_R(r_A, ARM_R0), ctx); - break; - case BPF_LDX | BPF_W | BPF_ABS: - /* - * load a 32bit word from struct seccomp_data. - * seccomp_check_filter() will already have checked - * that k is 32bit aligned and lies within the - * struct seccomp_data. - */ - ctx->seen |= SEEN_SKB; - emit(ARM_LDR_I(r_A, r_skb, k), ctx); - break; - default: - return -1; + if (ctx->flags & FLAG_IMM_OVERFLOW) + /* + * this instruction generated an overflow when + * trying to access the literal pool, so + * delegate this filter to the kernel interpreter. 
+ */ + return -1; + return 0; +} + +static int build_body(struct jit_ctx *ctx) +{ + const struct bpf_prog *prog = ctx->prog; + unsigned int i; + + for (i = 0; i < prog->len; i++) { + const struct bpf_insn *insn = &(prog->insnsi[i]); + int ret; + + ret = build_insn(insn, ctx); + + /* It's used with loading the 64 bit immediate value. */ + if (ret > 0) { + i++; + if (ctx->target == NULL) + ctx->offsets[i] = ctx->idx; + continue; } - if (ctx->flags & FLAG_IMM_OVERFLOW) - /* - * this instruction generated an overflow when - * trying to access the literal pool, so - * delegate this filter to the kernel interpreter. - */ - return -1; + if (ctx->target == NULL) + ctx->offsets[i] = ctx->idx; + + /* If unsuccesfull, return with error code */ + if (ret) + return ret; } + return 0; +} - /* compute offsets only during the first pass */ - if (ctx->target == NULL) - ctx->offsets[i] = ctx->idx * 4; +static int validate_code(struct jit_ctx *ctx) +{ + int i; + + for (i = 0; i < ctx->idx; i++) { + if (ctx->target[i] == __opcode_to_mem_arm(ARM_INST_UDF)) + return -1; + } return 0; } +void bpf_jit_compile(struct bpf_prog *prog) +{ + /* Nothing to do here. We support Internal BPF. */ +} -void bpf_jit_compile(struct bpf_prog *fp) +struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) { + struct bpf_prog *tmp, *orig_prog = prog; struct bpf_binary_header *header; + bool tmp_blinded = false; struct jit_ctx ctx; - unsigned tmp_idx; - unsigned alloc_size; - u8 *target_ptr; + unsigned int tmp_idx; + unsigned int image_size; + u8 *image_ptr; + /* If BPF JIT was not enabled then we must fall back to + * the interpreter. + */ if (!bpf_jit_enable) - return; + return orig_prog; - memset(&ctx, 0, sizeof(ctx)); - ctx.skf = fp; - ctx.ret0_fp_idx = -1; + /* If constant blinding was enabled and we failed during blinding + * then we must fall back to the interpreter. Otherwise, we save + * the new JITed code. + */ + tmp = bpf_jit_blind_constants(prog); - ctx.offsets = kzalloc(4 * (ctx.skf->len + 1), GFP_KERNEL); - if (ctx.offsets == NULL) - return; + if (IS_ERR(tmp)) + return orig_prog; + if (tmp != prog) { + tmp_blinded = true; + prog = tmp; + } - /* fake pass to fill in the ctx->seen */ - if (unlikely(build_body(&ctx))) + memset(&ctx, 0, sizeof(ctx)); + ctx.prog = prog; + + /* Not able to allocate memory for offsets[] , then + * we must fall back to the interpreter + */ + ctx.offsets = kcalloc(prog->len, sizeof(int), GFP_KERNEL); + if (ctx.offsets == NULL) { + prog = orig_prog; goto out; + } + + /* 1) fake pass to find in the length of the JITed code, + * to compute ctx->offsets and other context variables + * needed to compute final JITed code. + * Also, calculate random starting pointer/start of JITed code + * which is prefixed by random number of fault instructions. + * + * If the first pass fails then there is no chance of it + * being successful in the second pass, so just fall back + * to the interpreter. 
+ */ + if (build_body(&ctx)) { + prog = orig_prog; + goto out_off; + } tmp_idx = ctx.idx; build_prologue(&ctx); ctx.prologue_bytes = (ctx.idx - tmp_idx) * 4; + ctx.epilogue_offset = ctx.idx; + #if __LINUX_ARM_ARCH__ < 7 tmp_idx = ctx.idx; build_epilogue(&ctx); @@ -1021,64 +1880,83 @@ void bpf_jit_compile(struct bpf_prog *fp) ctx.idx += ctx.imm_count; if (ctx.imm_count) { - ctx.imms = kzalloc(4 * ctx.imm_count, GFP_KERNEL); - if (ctx.imms == NULL) - goto out; + ctx.imms = kcalloc(ctx.imm_count, sizeof(u32), GFP_KERNEL); + if (ctx.imms == NULL) { + prog = orig_prog; + goto out_off; + } } #else - /* there's nothing after the epilogue on ARMv7 */ + /* there's nothing about the epilogue on ARMv7 */ build_epilogue(&ctx); #endif - alloc_size = 4 * ctx.idx; - header = bpf_jit_binary_alloc(alloc_size, &target_ptr, - 4, jit_fill_hole); - if (header == NULL) - goto out; + /* Now we can get the actual image size of the JITed arm code. + * Currently, we are not considering the THUMB-2 instructions + * for jit, although it can decrease the size of the image. + * + * As each arm instruction is of length 32bit, we are translating + * number of JITed intructions into the size required to store these + * JITed code. + */ + image_size = sizeof(u32) * ctx.idx; - ctx.target = (u32 *) target_ptr; + /* Now we know the size of the structure to make */ + header = bpf_jit_binary_alloc(image_size, &image_ptr, + sizeof(u32), jit_fill_hole); + /* Not able to allocate memory for the structure then + * we must fall back to the interpretation + */ + if (header == NULL) { + prog = orig_prog; + goto out_imms; + } + + /* 2.) Actual pass to generate final JIT code */ + ctx.target = (u32 *) image_ptr; ctx.idx = 0; build_prologue(&ctx); + + /* If building the body of the JITed code fails somehow, + * we fall back to the interpretation. + */ if (build_body(&ctx) < 0) { -#if __LINUX_ARM_ARCH__ < 7 - if (ctx.imm_count) - kfree(ctx.imms); -#endif + image_ptr = NULL; bpf_jit_binary_free(header); - goto out; + prog = orig_prog; + goto out_imms; } build_epilogue(&ctx); + /* 3.) Extra pass to validate JITed Code */ + if (validate_code(&ctx)) { + image_ptr = NULL; + bpf_jit_binary_free(header); + prog = orig_prog; + goto out_imms; + } flush_icache_range((u32)header, (u32)(ctx.target + ctx.idx)); -#if __LINUX_ARM_ARCH__ < 7 - if (ctx.imm_count) - kfree(ctx.imms); -#endif - if (bpf_jit_enable > 1) /* there are 2 passes here */ - bpf_jit_dump(fp->len, alloc_size, 2, ctx.target); + bpf_jit_dump(prog->len, image_size, 2, ctx.target); set_memory_ro((unsigned long)header, header->pages); - fp->bpf_func = (void *)ctx.target; - fp->jited = 1; -out: + prog->bpf_func = (void *)ctx.target; + prog->jited = 1; + prog->jited_len = image_size; + +out_imms: +#if __LINUX_ARM_ARCH__ < 7 + if (ctx.imm_count) + kfree(ctx.imms); +#endif +out_off: kfree(ctx.offsets); - return; +out: + if (tmp_blinded) + bpf_jit_prog_release_other(prog, prog == orig_prog ? 
+ tmp : orig_prog); + return prog; } -void bpf_jit_free(struct bpf_prog *fp) -{ - unsigned long addr = (unsigned long)fp->bpf_func & PAGE_MASK; - struct bpf_binary_header *header = (void *)addr; - - if (!fp->jited) - goto free_filter; - - set_memory_rw(addr, header->pages); - bpf_jit_binary_free(header); - -free_filter: - bpf_prog_unlock_free(fp); -} diff --git a/arch/arm/net/bpf_jit_32.h b/arch/arm/net/bpf_jit_32.h index c46fca2972f7..d5cf5f6208aa 100644 --- a/arch/arm/net/bpf_jit_32.h +++ b/arch/arm/net/bpf_jit_32.h @@ -11,6 +11,7 @@ #ifndef PFILTER_OPCODES_ARM_H #define PFILTER_OPCODES_ARM_H +/* ARM 32bit Registers */ #define ARM_R0 0 #define ARM_R1 1 #define ARM_R2 2 @@ -22,38 +23,43 @@ #define ARM_R8 8 #define ARM_R9 9 #define ARM_R10 10 -#define ARM_FP 11 -#define ARM_IP 12 -#define ARM_SP 13 -#define ARM_LR 14 -#define ARM_PC 15 - -#define ARM_COND_EQ 0x0 -#define ARM_COND_NE 0x1 -#define ARM_COND_CS 0x2 +#define ARM_FP 11 /* Frame Pointer */ +#define ARM_IP 12 /* Intra-procedure scratch register */ +#define ARM_SP 13 /* Stack pointer: as load/store base reg */ +#define ARM_LR 14 /* Link Register */ +#define ARM_PC 15 /* Program counter */ + +#define ARM_COND_EQ 0x0 /* == */ +#define ARM_COND_NE 0x1 /* != */ +#define ARM_COND_CS 0x2 /* unsigned >= */ #define ARM_COND_HS ARM_COND_CS -#define ARM_COND_CC 0x3 +#define ARM_COND_CC 0x3 /* unsigned < */ #define ARM_COND_LO ARM_COND_CC -#define ARM_COND_MI 0x4 -#define ARM_COND_PL 0x5 -#define ARM_COND_VS 0x6 -#define ARM_COND_VC 0x7 -#define ARM_COND_HI 0x8 -#define ARM_COND_LS 0x9 -#define ARM_COND_GE 0xa -#define ARM_COND_LT 0xb -#define ARM_COND_GT 0xc -#define ARM_COND_LE 0xd -#define ARM_COND_AL 0xe +#define ARM_COND_MI 0x4 /* < 0 */ +#define ARM_COND_PL 0x5 /* >= 0 */ +#define ARM_COND_VS 0x6 /* Signed Overflow */ +#define ARM_COND_VC 0x7 /* No Signed Overflow */ +#define ARM_COND_HI 0x8 /* unsigned > */ +#define ARM_COND_LS 0x9 /* unsigned <= */ +#define ARM_COND_GE 0xa /* Signed >= */ +#define ARM_COND_LT 0xb /* Signed < */ +#define ARM_COND_GT 0xc /* Signed > */ +#define ARM_COND_LE 0xd /* Signed <= */ +#define ARM_COND_AL 0xe /* None */ /* register shift types */ #define SRTYPE_LSL 0 #define SRTYPE_LSR 1 #define SRTYPE_ASR 2 #define SRTYPE_ROR 3 +#define SRTYPE_ASL (SRTYPE_LSL) #define ARM_INST_ADD_R 0x00800000 +#define ARM_INST_ADDS_R 0x00900000 +#define ARM_INST_ADC_R 0x00a00000 +#define ARM_INST_ADC_I 0x02a00000 #define ARM_INST_ADD_I 0x02800000 +#define ARM_INST_ADDS_I 0x02900000 #define ARM_INST_AND_R 0x00000000 #define ARM_INST_AND_I 0x02000000 @@ -76,8 +82,10 @@ #define ARM_INST_LDRH_I 0x01d000b0 #define ARM_INST_LDRH_R 0x019000b0 #define ARM_INST_LDR_I 0x05900000 +#define ARM_INST_LDR_R 0x07900000 #define ARM_INST_LDM 0x08900000 +#define ARM_INST_LDM_IA 0x08b00000 #define ARM_INST_LSL_I 0x01a00000 #define ARM_INST_LSL_R 0x01a00010 @@ -86,6 +94,7 @@ #define ARM_INST_LSR_R 0x01a00030 #define ARM_INST_MOV_R 0x01a00000 +#define ARM_INST_MOVS_R 0x01b00000 #define ARM_INST_MOV_I 0x03a00000 #define ARM_INST_MOVW 0x03000000 #define ARM_INST_MOVT 0x03400000 @@ -96,17 +105,28 @@ #define ARM_INST_PUSH 0x092d0000 #define ARM_INST_ORR_R 0x01800000 +#define ARM_INST_ORRS_R 0x01900000 #define ARM_INST_ORR_I 0x03800000 #define ARM_INST_REV 0x06bf0f30 #define ARM_INST_REV16 0x06bf0fb0 #define ARM_INST_RSB_I 0x02600000 +#define ARM_INST_RSBS_I 0x02700000 +#define ARM_INST_RSC_I 0x02e00000 #define ARM_INST_SUB_R 0x00400000 +#define ARM_INST_SUBS_R 0x00500000 +#define ARM_INST_RSB_R 0x00600000 #define ARM_INST_SUB_I 0x02400000 +#define 
ARM_INST_SUBS_I 0x02500000 +#define ARM_INST_SBC_I 0x02c00000 +#define ARM_INST_SBC_R 0x00c00000 +#define ARM_INST_SBCS_R 0x00d00000 #define ARM_INST_STR_I 0x05800000 +#define ARM_INST_STRB_I 0x05c00000 +#define ARM_INST_STRH_I 0x01c000b0 #define ARM_INST_TST_R 0x01100000 #define ARM_INST_TST_I 0x03100000 @@ -117,6 +137,8 @@ #define ARM_INST_MLS 0x00600090 +#define ARM_INST_UXTH 0x06ff0070 + /* * Use a suitable undefined instruction to use for ARM/Thumb2 faulting. * We need to be careful not to conflict with those used by other modules @@ -135,9 +157,15 @@ #define _AL3_R(op, rd, rn, rm) ((op ## _R) | (rd) << 12 | (rn) << 16 | (rm)) /* immediate */ #define _AL3_I(op, rd, rn, imm) ((op ## _I) | (rd) << 12 | (rn) << 16 | (imm)) +/* register with register-shift */ +#define _AL3_SR(inst) (inst | (1 << 4)) #define ARM_ADD_R(rd, rn, rm) _AL3_R(ARM_INST_ADD, rd, rn, rm) +#define ARM_ADDS_R(rd, rn, rm) _AL3_R(ARM_INST_ADDS, rd, rn, rm) #define ARM_ADD_I(rd, rn, imm) _AL3_I(ARM_INST_ADD, rd, rn, imm) +#define ARM_ADDS_I(rd, rn, imm) _AL3_I(ARM_INST_ADDS, rd, rn, imm) +#define ARM_ADC_R(rd, rn, rm) _AL3_R(ARM_INST_ADC, rd, rn, rm) +#define ARM_ADC_I(rd, rn, imm) _AL3_I(ARM_INST_ADC, rd, rn, imm) #define ARM_AND_R(rd, rn, rm) _AL3_R(ARM_INST_AND, rd, rn, rm) #define ARM_AND_I(rd, rn, imm) _AL3_I(ARM_INST_AND, rd, rn, imm) @@ -156,7 +184,9 @@ #define ARM_EOR_I(rd, rn, imm) _AL3_I(ARM_INST_EOR, rd, rn, imm) #define ARM_LDR_I(rt, rn, off) (ARM_INST_LDR_I | (rt) << 12 | (rn) << 16 \ - | (off)) + | ((off) & 0xfff)) +#define ARM_LDR_R(rt, rn, rm) (ARM_INST_LDR_R | (rt) << 12 | (rn) << 16 \ + | (rm)) #define ARM_LDRB_I(rt, rn, off) (ARM_INST_LDRB_I | (rt) << 12 | (rn) << 16 \ | (off)) #define ARM_LDRB_R(rt, rn, rm) (ARM_INST_LDRB_R | (rt) << 12 | (rn) << 16 \ @@ -167,15 +197,23 @@ | (rm)) #define ARM_LDM(rn, regs) (ARM_INST_LDM | (rn) << 16 | (regs)) +#define ARM_LDM_IA(rn, regs) (ARM_INST_LDM_IA | (rn) << 16 | (regs)) #define ARM_LSL_R(rd, rn, rm) (_AL3_R(ARM_INST_LSL, rd, 0, rn) | (rm) << 8) #define ARM_LSL_I(rd, rn, imm) (_AL3_I(ARM_INST_LSL, rd, 0, rn) | (imm) << 7) #define ARM_LSR_R(rd, rn, rm) (_AL3_R(ARM_INST_LSR, rd, 0, rn) | (rm) << 8) #define ARM_LSR_I(rd, rn, imm) (_AL3_I(ARM_INST_LSR, rd, 0, rn) | (imm) << 7) +#define ARM_ASR_R(rd, rn, rm) (_AL3_R(ARM_INST_ASR, rd, 0, rn) | (rm) << 8) +#define ARM_ASR_I(rd, rn, imm) (_AL3_I(ARM_INST_ASR, rd, 0, rn) | (imm) << 7) #define ARM_MOV_R(rd, rm) _AL3_R(ARM_INST_MOV, rd, 0, rm) +#define ARM_MOVS_R(rd, rm) _AL3_R(ARM_INST_MOVS, rd, 0, rm) #define ARM_MOV_I(rd, imm) _AL3_I(ARM_INST_MOV, rd, 0, imm) +#define ARM_MOV_SR(rd, rm, type, rs) \ + (_AL3_SR(ARM_MOV_R(rd, rm)) | (type) << 5 | (rs) << 8) +#define ARM_MOV_SI(rd, rm, type, imm6) \ + (ARM_MOV_R(rd, rm) | (type) << 5 | (imm6) << 7) #define ARM_MOVW(rd, imm) \ (ARM_INST_MOVW | ((imm) >> 12) << 16 | (rd) << 12 | ((imm) & 0x0fff)) @@ -190,19 +228,38 @@ #define ARM_ORR_R(rd, rn, rm) _AL3_R(ARM_INST_ORR, rd, rn, rm) #define ARM_ORR_I(rd, rn, imm) _AL3_I(ARM_INST_ORR, rd, rn, imm) -#define ARM_ORR_S(rd, rn, rm, type, rs) \ - (ARM_ORR_R(rd, rn, rm) | (type) << 5 | (rs) << 7) +#define ARM_ORR_SR(rd, rn, rm, type, rs) \ + (_AL3_SR(ARM_ORR_R(rd, rn, rm)) | (type) << 5 | (rs) << 8) +#define ARM_ORRS_R(rd, rn, rm) _AL3_R(ARM_INST_ORRS, rd, rn, rm) +#define ARM_ORRS_SR(rd, rn, rm, type, rs) \ + (_AL3_SR(ARM_ORRS_R(rd, rn, rm)) | (type) << 5 | (rs) << 8) +#define ARM_ORR_SI(rd, rn, rm, type, imm6) \ + (ARM_ORR_R(rd, rn, rm) | (type) << 5 | (imm6) << 7) +#define ARM_ORRS_SI(rd, rn, rm, type, imm6) \ + (ARM_ORRS_R(rd, 
rn, rm) | (type) << 5 | (imm6) << 7) #define ARM_REV(rd, rm) (ARM_INST_REV | (rd) << 12 | (rm)) #define ARM_REV16(rd, rm) (ARM_INST_REV16 | (rd) << 12 | (rm)) #define ARM_RSB_I(rd, rn, imm) _AL3_I(ARM_INST_RSB, rd, rn, imm) +#define ARM_RSBS_I(rd, rn, imm) _AL3_I(ARM_INST_RSBS, rd, rn, imm) +#define ARM_RSC_I(rd, rn, imm) _AL3_I(ARM_INST_RSC, rd, rn, imm) #define ARM_SUB_R(rd, rn, rm) _AL3_R(ARM_INST_SUB, rd, rn, rm) +#define ARM_SUBS_R(rd, rn, rm) _AL3_R(ARM_INST_SUBS, rd, rn, rm) +#define ARM_RSB_R(rd, rn, rm) _AL3_R(ARM_INST_RSB, rd, rn, rm) +#define ARM_SBC_R(rd, rn, rm) _AL3_R(ARM_INST_SBC, rd, rn, rm) +#define ARM_SBCS_R(rd, rn, rm) _AL3_R(ARM_INST_SBCS, rd, rn, rm) #define ARM_SUB_I(rd, rn, imm) _AL3_I(ARM_INST_SUB, rd, rn, imm) +#define ARM_SUBS_I(rd, rn, imm) _AL3_I(ARM_INST_SUBS, rd, rn, imm) +#define ARM_SBC_I(rd, rn, imm) _AL3_I(ARM_INST_SBC, rd, rn, imm) #define ARM_STR_I(rt, rn, off) (ARM_INST_STR_I | (rt) << 12 | (rn) << 16 \ - | (off)) + | ((off) & 0xfff)) +#define ARM_STRH_I(rt, rn, off) (ARM_INST_STRH_I | (rt) << 12 | (rn) << 16 \ + | (((off) & 0xf0) << 4) | ((off) & 0xf)) +#define ARM_STRB_I(rt, rn, off) (ARM_INST_STRB_I | (rt) << 12 | (rn) << 16 \ + | (((off) & 0xf0) << 4) | ((off) & 0xf)) #define ARM_TST_R(rn, rm) _AL3_R(ARM_INST_TST, 0, rn, rm) #define ARM_TST_I(rn, imm) _AL3_I(ARM_INST_TST, 0, rn, imm) @@ -214,5 +271,6 @@ #define ARM_MLS(rd, rn, rm, ra) (ARM_INST_MLS | (rd) << 16 | (rn) | (rm) << 8 \ | (ra) << 12) +#define ARM_UXTH(rd, rm) (ARM_INST_UXTH | (rd) << 12 | (rm)) #endif /* PFILTER_OPCODES_ARM_H */ diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index dfd908630631..0df64a6a56d4 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -75,6 +75,7 @@ config ARM64 select HAVE_ARCH_SECCOMP_FILTER select HAVE_ARCH_TRACEHOOK select HAVE_ARCH_TRANSPARENT_HUGEPAGE + select HAVE_ARCH_VMAP_STACK select HAVE_ARM_SMCCC select HAVE_EBPF_JIT select HAVE_C_RECORDMCOUNT @@ -960,6 +961,18 @@ config ARM64_UAO regular load/store instructions if the cpu does not implement the feature. +config ARM64_PMEM + bool "Enable support for persistent memory" + select ARCH_HAS_PMEM_API + select ARCH_HAS_UACCESS_FLUSHCACHE + help + Say Y to enable support for the persistent memory API based on the + ARMv8.2 DCPoP feature. + + The feature is detected at runtime, and the kernel will use DC CVAC + operations if DC CVAP is not supported (following the behaviour of + DC CVAP itself if the system does not define a point of persistence). 
+ endmenu config ARM64_MODULE_CMODEL_LARGE diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64-bananapi-m64.dts b/arch/arm64/boot/dts/allwinner/sun50i-a64-bananapi-m64.dts index ba2fde2909f9..6872135d7f84 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-a64-bananapi-m64.dts +++ b/arch/arm64/boot/dts/allwinner/sun50i-a64-bananapi-m64.dts @@ -51,7 +51,6 @@ compatible = "sinovoip,bananapi-m64", "allwinner,sun50i-a64"; aliases { - ethernet0 = &emac; serial0 = &uart0; serial1 = &uart1; }; @@ -68,14 +67,6 @@ }; }; -&emac { - pinctrl-names = "default"; - pinctrl-0 = <&rgmii_pins>; - phy-mode = "rgmii"; - phy-handle = <&ext_rgmii_phy>; - status = "okay"; -}; - &i2c1 { pinctrl-names = "default"; pinctrl-0 = <&i2c1_pins>; @@ -86,13 +77,6 @@ bias-pull-up; }; -&mdio { - ext_rgmii_phy: ethernet-phy@1 { - compatible = "ethernet-phy-ieee802.3-c22"; - reg = <1>; - }; -}; - &mmc0 { pinctrl-names = "default"; pinctrl-0 = <&mmc0_pins>; diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64-pine64-plus.dts b/arch/arm64/boot/dts/allwinner/sun50i-a64-pine64-plus.dts index 24f1aac366d6..f82ccf332c0f 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-a64-pine64-plus.dts +++ b/arch/arm64/boot/dts/allwinner/sun50i-a64-pine64-plus.dts @@ -48,18 +48,3 @@ /* TODO: Camera, touchscreen, etc. */ }; - -&emac { - pinctrl-names = "default"; - pinctrl-0 = <&rgmii_pins>; - phy-mode = "rgmii"; - phy-handle = <&ext_rgmii_phy>; - status = "okay"; -}; - -&mdio { - ext_rgmii_phy: ethernet-phy@1 { - compatible = "ethernet-phy-ieee802.3-c22"; - reg = <1>; - }; -}; diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64-pine64.dts b/arch/arm64/boot/dts/allwinner/sun50i-a64-pine64.dts index 827168bc22ed..7c533b6d4ba9 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-a64-pine64.dts +++ b/arch/arm64/boot/dts/allwinner/sun50i-a64-pine64.dts @@ -51,7 +51,6 @@ compatible = "pine64,pine64", "allwinner,sun50i-a64"; aliases { - ethernet0 = &emac; serial0 = &uart0; serial1 = &uart1; serial2 = &uart2; @@ -79,15 +78,6 @@ status = "okay"; }; -&emac { - pinctrl-names = "default"; - pinctrl-0 = <&rmii_pins>; - phy-mode = "rmii"; - phy-handle = <&ext_rmii_phy1>; - status = "okay"; - -}; - &i2c1 { pinctrl-names = "default"; pinctrl-0 = <&i2c1_pins>; @@ -98,13 +88,6 @@ bias-pull-up; }; -&mdio { - ext_rmii_phy1: ethernet-phy@1 { - compatible = "ethernet-phy-ieee802.3-c22"; - reg = <1>; - }; -}; - &mmc0 { pinctrl-names = "default"; pinctrl-0 = <&mmc0_pins>; diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64-sopine-baseboard.dts b/arch/arm64/boot/dts/allwinner/sun50i-a64-sopine-baseboard.dts index 216e3a5dafae..d891a1a27f6c 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-a64-sopine-baseboard.dts +++ b/arch/arm64/boot/dts/allwinner/sun50i-a64-sopine-baseboard.dts @@ -53,7 +53,6 @@ "allwinner,sun50i-a64"; aliases { - ethernet0 = &emac; serial0 = &uart0; }; @@ -77,21 +76,6 @@ status = "okay"; }; -&emac { - pinctrl-names = "default"; - pinctrl-0 = <&rgmii_pins>; - phy-mode = "rgmii"; - phy-handle = <&ext_rgmii_phy>; - status = "okay"; -}; - -&mdio { - ext_rgmii_phy: ethernet-phy@1 { - compatible = "ethernet-phy-ieee802.3-c22"; - reg = <1>; - }; -}; - &mmc2 { pinctrl-names = "default"; pinctrl-0 = <&mmc2_pins>; diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64.dtsi b/arch/arm64/boot/dts/allwinner/sun50i-a64.dtsi index bd0f33b77f57..68aadc9b96dc 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-a64.dtsi +++ b/arch/arm64/boot/dts/allwinner/sun50i-a64.dtsi @@ -449,26 +449,6 @@ #size-cells = <0>; }; - emac: ethernet@1c30000 { - compatible = 
"allwinner,sun50i-a64-emac"; - syscon = <&syscon>; - reg = <0x01c30000 0x10000>; - interrupts = <GIC_SPI 82 IRQ_TYPE_LEVEL_HIGH>; - interrupt-names = "macirq"; - resets = <&ccu RST_BUS_EMAC>; - reset-names = "stmmaceth"; - clocks = <&ccu CLK_BUS_EMAC>; - clock-names = "stmmaceth"; - status = "disabled"; - #address-cells = <1>; - #size-cells = <0>; - - mdio: mdio { - #address-cells = <1>; - #size-cells = <0>; - }; - }; - gic: interrupt-controller@1c81000 { compatible = "arm,gic-400"; reg = <0x01c81000 0x1000>, diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h5-nanopi-neo2.dts b/arch/arm64/boot/dts/allwinner/sun50i-h5-nanopi-neo2.dts index 968908761194..1c2387bd5df6 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-h5-nanopi-neo2.dts +++ b/arch/arm64/boot/dts/allwinner/sun50i-h5-nanopi-neo2.dts @@ -50,7 +50,6 @@ compatible = "friendlyarm,nanopi-neo2", "allwinner,sun50i-h5"; aliases { - ethernet0 = &emac; serial0 = &uart0; }; @@ -109,22 +108,6 @@ status = "okay"; }; -&emac { - pinctrl-names = "default"; - pinctrl-0 = <&emac_rgmii_pins>; - phy-supply = <®_gmac_3v3>; - phy-handle = <&ext_rgmii_phy>; - phy-mode = "rgmii"; - status = "okay"; -}; - -&mdio { - ext_rgmii_phy: ethernet-phy@7 { - compatible = "ethernet-phy-ieee802.3-c22"; - reg = <7>; - }; -}; - &mmc0 { pinctrl-names = "default"; pinctrl-0 = <&mmc0_pins_a>, <&mmc0_cd_pin>; diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h5-orangepi-pc2.dts b/arch/arm64/boot/dts/allwinner/sun50i-h5-orangepi-pc2.dts index a8296feee884..4f77c8470f6c 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-h5-orangepi-pc2.dts +++ b/arch/arm64/boot/dts/allwinner/sun50i-h5-orangepi-pc2.dts @@ -59,7 +59,6 @@ }; aliases { - ethernet0 = &emac; serial0 = &uart0; }; @@ -137,28 +136,12 @@ status = "okay"; }; -&emac { - pinctrl-names = "default"; - pinctrl-0 = <&emac_rgmii_pins>; - phy-supply = <®_gmac_3v3>; - phy-handle = <&ext_rgmii_phy>; - phy-mode = "rgmii"; - status = "okay"; -}; - &ir { pinctrl-names = "default"; pinctrl-0 = <&ir_pins_a>; status = "okay"; }; -&mdio { - ext_rgmii_phy: ethernet-phy@1 { - compatible = "ethernet-phy-ieee802.3-c22"; - reg = <1>; - }; -}; - &mmc0 { pinctrl-names = "default"; pinctrl-0 = <&mmc0_pins_a>, <&mmc0_cd_pin>; diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h5-orangepi-prime.dts b/arch/arm64/boot/dts/allwinner/sun50i-h5-orangepi-prime.dts index d906b302cbcd..6be06873e5af 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-h5-orangepi-prime.dts +++ b/arch/arm64/boot/dts/allwinner/sun50i-h5-orangepi-prime.dts @@ -54,7 +54,6 @@ compatible = "xunlong,orangepi-prime", "allwinner,sun50i-h5"; aliases { - ethernet0 = &emac; serial0 = &uart0; }; @@ -144,28 +143,12 @@ status = "okay"; }; -&emac { - pinctrl-names = "default"; - pinctrl-0 = <&emac_rgmii_pins>; - phy-supply = <®_gmac_3v3>; - phy-handle = <&ext_rgmii_phy>; - phy-mode = "rgmii"; - status = "okay"; -}; - &ir { pinctrl-names = "default"; pinctrl-0 = <&ir_pins_a>; status = "okay"; }; -&mdio { - ext_rgmii_phy: ethernet-phy@1 { - compatible = "ethernet-phy-ieee802.3-c22"; - reg = <1>; - }; -}; - &mmc0 { pinctrl-names = "default"; pinctrl-0 = <&mmc0_pins_a>, <&mmc0_cd_pin>; diff --git a/arch/arm64/boot/dts/exynos/exynos5433-tm2-common.dtsi b/arch/arm64/boot/dts/exynos/exynos5433-tm2-common.dtsi index e2b0da2c0bc7..105b2938082f 100644 --- a/arch/arm64/boot/dts/exynos/exynos5433-tm2-common.dtsi +++ b/arch/arm64/boot/dts/exynos/exynos5433-tm2-common.dtsi @@ -280,9 +280,6 @@ &decon { status = "okay"; - - i80-if-timings { - }; }; &decon_tv { @@ -1116,9 +1113,6 @@ &mic { status = 
"okay"; - - i80-if-timings { - }; }; &pmu_system_controller { diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi index 31fd77f82ced..d16b9cc1e825 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi +++ b/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi @@ -653,21 +653,21 @@ }; msi1: msi-controller1@1571000 { - compatible = "fsl,1s1043a-msi"; + compatible = "fsl,ls1043a-msi"; reg = <0x0 0x1571000 0x0 0x8>; msi-controller; interrupts = <0 116 0x4>; }; msi2: msi-controller2@1572000 { - compatible = "fsl,1s1043a-msi"; + compatible = "fsl,ls1043a-msi"; reg = <0x0 0x1572000 0x0 0x8>; msi-controller; interrupts = <0 126 0x4>; }; msi3: msi-controller3@1573000 { - compatible = "fsl,1s1043a-msi"; + compatible = "fsl,ls1043a-msi"; reg = <0x0 0x1573000 0x0 0x8>; msi-controller; interrupts = <0 160 0x4>; @@ -689,7 +689,7 @@ bus-range = <0x0 0xff>; ranges = <0x81000000 0x0 0x00000000 0x40 0x00010000 0x0 0x00010000 /* downstream I/O */ 0x82000000 0x0 0x40000000 0x40 0x40000000 0x0 0x40000000>; /* non-prefetchable memory */ - msi-parent = <&msi1>; + msi-parent = <&msi1>, <&msi2>, <&msi3>; #interrupt-cells = <1>; interrupt-map-mask = <0 0 0 7>; interrupt-map = <0000 0 0 1 &gic 0 110 0x4>, @@ -714,7 +714,7 @@ bus-range = <0x0 0xff>; ranges = <0x81000000 0x0 0x00000000 0x48 0x00010000 0x0 0x00010000 /* downstream I/O */ 0x82000000 0x0 0x40000000 0x48 0x40000000 0x0 0x40000000>; /* non-prefetchable memory */ - msi-parent = <&msi2>; + msi-parent = <&msi1>, <&msi2>, <&msi3>; #interrupt-cells = <1>; interrupt-map-mask = <0 0 0 7>; interrupt-map = <0000 0 0 1 &gic 0 120 0x4>, @@ -739,7 +739,7 @@ bus-range = <0x0 0xff>; ranges = <0x81000000 0x0 0x00000000 0x50 0x00010000 0x0 0x00010000 /* downstream I/O */ 0x82000000 0x0 0x40000000 0x50 0x40000000 0x0 0x40000000>; /* non-prefetchable memory */ - msi-parent = <&msi3>; + msi-parent = <&msi1>, <&msi2>, <&msi3>; #interrupt-cells = <1>; interrupt-map-mask = <0 0 0 7>; interrupt-map = <0000 0 0 1 &gic 0 154 0x4>, diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi index dc1640be0345..c8ff0baddf1d 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi +++ b/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi @@ -630,6 +630,37 @@ interrupts = <GIC_SPI 69 IRQ_TYPE_LEVEL_HIGH>; clocks = <&clockgen 4 1>; }; + + msi1: msi-controller@1580000 { + compatible = "fsl,ls1046a-msi"; + msi-controller; + reg = <0x0 0x1580000 0x0 0x10000>; + interrupts = <GIC_SPI 116 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 111 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 112 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 113 IRQ_TYPE_LEVEL_HIGH>; + }; + + msi2: msi-controller@1590000 { + compatible = "fsl,ls1046a-msi"; + msi-controller; + reg = <0x0 0x1590000 0x0 0x10000>; + interrupts = <GIC_SPI 126 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 121 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 122 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 123 IRQ_TYPE_LEVEL_HIGH>; + }; + + msi3: msi-controller@15a0000 { + compatible = "fsl,ls1046a-msi"; + msi-controller; + reg = <0x0 0x15a0000 0x0 0x10000>; + interrupts = <GIC_SPI 160 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 155 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 156 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 157 IRQ_TYPE_LEVEL_HIGH>; + }; + }; reserved-memory { diff --git a/arch/arm64/boot/dts/marvell/armada-ap806.dtsi b/arch/arm64/boot/dts/marvell/armada-ap806.dtsi index 1eb1f1e9aac4..4d360713ed12 100644 --- a/arch/arm64/boot/dts/marvell/armada-ap806.dtsi +++ b/arch/arm64/boot/dts/marvell/armada-ap806.dtsi @@ -268,10 +268,10 @@ 
ap_gpio: gpio { compatible = "marvell,armada-8k-gpio"; offset = <0x1040>; - ngpios = <19>; + ngpios = <20>; gpio-controller; #gpio-cells = <2>; - gpio-ranges = <&ap_pinctrl 0 0 19>; + gpio-ranges = <&ap_pinctrl 0 0 20>; }; }; }; diff --git a/arch/arm64/boot/dts/rockchip/rk3328-evb.dts b/arch/arm64/boot/dts/rockchip/rk3328-evb.dts index cf272392cebf..b9f36dad17e6 100644 --- a/arch/arm64/boot/dts/rockchip/rk3328-evb.dts +++ b/arch/arm64/boot/dts/rockchip/rk3328-evb.dts @@ -50,6 +50,23 @@ chosen { stdout-path = "serial2:1500000n8"; }; + + vcc_phy: vcc-phy-regulator { + compatible = "regulator-fixed"; + regulator-name = "vcc_phy"; + regulator-always-on; + regulator-boot-on; + }; +}; + +&gmac2phy { + phy-supply = <&vcc_phy>; + clock_in_out = "output"; + assigned-clocks = <&cru SCLK_MAC2PHY_SRC>; + assigned-clock-rate = <50000000>; + assigned-clocks = <&cru SCLK_MAC2PHY>; + assigned-clock-parents = <&cru SCLK_MAC2PHY_SRC>; + status = "okay"; }; &uart2 { diff --git a/arch/arm64/boot/dts/rockchip/rk3328.dtsi b/arch/arm64/boot/dts/rockchip/rk3328.dtsi index 0be96cee27bd..d48bf5d9f8bd 100644 --- a/arch/arm64/boot/dts/rockchip/rk3328.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3328.dtsi @@ -63,6 +63,8 @@ i2c1 = &i2c1; i2c2 = &i2c2; i2c3 = &i2c3; + ethernet0 = &gmac2io; + ethernet1 = &gmac2phy; }; cpus { @@ -424,6 +426,43 @@ status = "disabled"; }; + gmac2phy: ethernet@ff550000 { + compatible = "rockchip,rk3328-gmac"; + reg = <0x0 0xff550000 0x0 0x10000>; + rockchip,grf = <&grf>; + interrupts = <GIC_SPI 21 IRQ_TYPE_LEVEL_HIGH>; + interrupt-names = "macirq"; + clocks = <&cru SCLK_MAC2PHY_SRC>, <&cru SCLK_MAC2PHY_RXTX>, + <&cru SCLK_MAC2PHY_RXTX>, <&cru SCLK_MAC2PHY_REF>, + <&cru ACLK_MAC2PHY>, <&cru PCLK_MAC2PHY>, + <&cru SCLK_MAC2PHY_OUT>; + clock-names = "stmmaceth", "mac_clk_rx", + "mac_clk_tx", "clk_mac_ref", + "aclk_mac", "pclk_mac", + "clk_macphy"; + resets = <&cru SRST_GMAC2PHY_A>, <&cru SRST_MACPHY>; + reset-names = "stmmaceth", "mac-phy"; + phy-mode = "rmii"; + phy-handle = <&phy>; + status = "disabled"; + + mdio { + compatible = "snps,dwmac-mdio"; + #address-cells = <1>; + #size-cells = <0>; + + phy: phy@0 { + compatible = "ethernet-phy-id1234.d400", "ethernet-phy-ieee802.3-c22"; + reg = <0>; + clocks = <&cru SCLK_MAC2PHY_OUT>; + resets = <&cru SRST_MACPHY>; + pinctrl-names = "default"; + pinctrl-0 = <&fephyled_rxm1 &fephyled_linkm1>; + phy-is-integrated; + }; + }; + }; + gic: interrupt-controller@ff811000 { compatible = "arm,gic-400"; #interrupt-cells = <3>; diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig index b4ca115b3be1..cdde4f56a281 100644 --- a/arch/arm64/configs/defconfig +++ b/arch/arm64/configs/defconfig @@ -203,6 +203,7 @@ CONFIG_MARVELL_PHY=m CONFIG_MESON_GXL_PHY=m CONFIG_MICREL_PHY=y CONFIG_REALTEK_PHY=m +CONFIG_ROCKCHIP_PHY=y CONFIG_USB_PEGASUS=m CONFIG_USB_RTL8150=m CONFIG_USB_RTL8152=m diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig index d92293747d63..7ca54a76f6b9 100644 --- a/arch/arm64/crypto/Kconfig +++ b/arch/arm64/crypto/Kconfig @@ -18,18 +18,23 @@ config CRYPTO_SHA512_ARM64 config CRYPTO_SHA1_ARM64_CE tristate "SHA-1 digest algorithm (ARMv8 Crypto Extensions)" - depends on ARM64 && KERNEL_MODE_NEON + depends on KERNEL_MODE_NEON select CRYPTO_HASH + select CRYPTO_SHA1 config CRYPTO_SHA2_ARM64_CE tristate "SHA-224/SHA-256 digest algorithm (ARMv8 Crypto Extensions)" - depends on ARM64 && KERNEL_MODE_NEON + depends on KERNEL_MODE_NEON select CRYPTO_HASH + select CRYPTO_SHA256_ARM64 config CRYPTO_GHASH_ARM64_CE - tristate "GHASH (for 
GCM chaining mode) using ARMv8 Crypto Extensions" - depends on ARM64 && KERNEL_MODE_NEON + tristate "GHASH/AES-GCM using ARMv8 Crypto Extensions" + depends on KERNEL_MODE_NEON select CRYPTO_HASH + select CRYPTO_GF128MUL + select CRYPTO_AES + select CRYPTO_AES_ARM64 config CRYPTO_CRCT10DIF_ARM64_CE tristate "CRCT10DIF digest algorithm using PMULL instructions" @@ -49,25 +54,29 @@ config CRYPTO_AES_ARM64_CE tristate "AES core cipher using ARMv8 Crypto Extensions" depends on ARM64 && KERNEL_MODE_NEON select CRYPTO_ALGAPI + select CRYPTO_AES_ARM64 config CRYPTO_AES_ARM64_CE_CCM tristate "AES in CCM mode using ARMv8 Crypto Extensions" depends on ARM64 && KERNEL_MODE_NEON select CRYPTO_ALGAPI select CRYPTO_AES_ARM64_CE + select CRYPTO_AES_ARM64 select CRYPTO_AEAD config CRYPTO_AES_ARM64_CE_BLK tristate "AES in ECB/CBC/CTR/XTS modes using ARMv8 Crypto Extensions" - depends on ARM64 && KERNEL_MODE_NEON + depends on KERNEL_MODE_NEON select CRYPTO_BLKCIPHER select CRYPTO_AES_ARM64_CE + select CRYPTO_AES_ARM64 select CRYPTO_SIMD config CRYPTO_AES_ARM64_NEON_BLK tristate "AES in ECB/CBC/CTR/XTS modes using NEON instructions" - depends on ARM64 && KERNEL_MODE_NEON + depends on KERNEL_MODE_NEON select CRYPTO_BLKCIPHER + select CRYPTO_AES_ARM64 select CRYPTO_AES select CRYPTO_SIMD @@ -82,6 +91,7 @@ config CRYPTO_AES_ARM64_BS depends on KERNEL_MODE_NEON select CRYPTO_BLKCIPHER select CRYPTO_AES_ARM64_NEON_BLK + select CRYPTO_AES_ARM64 select CRYPTO_SIMD endif diff --git a/arch/arm64/crypto/aes-ce-ccm-core.S b/arch/arm64/crypto/aes-ce-ccm-core.S index 3363560c79b7..e3a375c4cb83 100644 --- a/arch/arm64/crypto/aes-ce-ccm-core.S +++ b/arch/arm64/crypto/aes-ce-ccm-core.S @@ -1,7 +1,7 @@ /* * aesce-ccm-core.S - AES-CCM transform for ARMv8 with Crypto Extensions * - * Copyright (C) 2013 - 2014 Linaro Ltd <ard.biesheuvel@linaro.org> + * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -32,7 +32,7 @@ ENTRY(ce_aes_ccm_auth_data) beq 8f /* out of input? */ cbnz w8, 0b eor v0.16b, v0.16b, v1.16b -1: ld1 {v3.16b}, [x4] /* load first round key */ +1: ld1 {v3.4s}, [x4] /* load first round key */ prfm pldl1strm, [x1] cmp w5, #12 /* which key size? */ add x6, x4, #16 @@ -42,17 +42,17 @@ ENTRY(ce_aes_ccm_auth_data) mov v5.16b, v3.16b b 4f 2: mov v4.16b, v3.16b - ld1 {v5.16b}, [x6], #16 /* load 2nd round key */ + ld1 {v5.4s}, [x6], #16 /* load 2nd round key */ 3: aese v0.16b, v4.16b aesmc v0.16b, v0.16b -4: ld1 {v3.16b}, [x6], #16 /* load next round key */ +4: ld1 {v3.4s}, [x6], #16 /* load next round key */ aese v0.16b, v5.16b aesmc v0.16b, v0.16b -5: ld1 {v4.16b}, [x6], #16 /* load next round key */ +5: ld1 {v4.4s}, [x6], #16 /* load next round key */ subs w7, w7, #3 aese v0.16b, v3.16b aesmc v0.16b, v0.16b - ld1 {v5.16b}, [x6], #16 /* load next round key */ + ld1 {v5.4s}, [x6], #16 /* load next round key */ bpl 3b aese v0.16b, v4.16b subs w2, w2, #16 /* last data? */ @@ -90,7 +90,7 @@ ENDPROC(ce_aes_ccm_auth_data) * u32 rounds); */ ENTRY(ce_aes_ccm_final) - ld1 {v3.16b}, [x2], #16 /* load first round key */ + ld1 {v3.4s}, [x2], #16 /* load first round key */ ld1 {v0.16b}, [x0] /* load mac */ cmp w3, #12 /* which key size? 
*/ sub w3, w3, #2 /* modified # of rounds */ @@ -100,17 +100,17 @@ ENTRY(ce_aes_ccm_final) mov v5.16b, v3.16b b 2f 0: mov v4.16b, v3.16b -1: ld1 {v5.16b}, [x2], #16 /* load next round key */ +1: ld1 {v5.4s}, [x2], #16 /* load next round key */ aese v0.16b, v4.16b aesmc v0.16b, v0.16b aese v1.16b, v4.16b aesmc v1.16b, v1.16b -2: ld1 {v3.16b}, [x2], #16 /* load next round key */ +2: ld1 {v3.4s}, [x2], #16 /* load next round key */ aese v0.16b, v5.16b aesmc v0.16b, v0.16b aese v1.16b, v5.16b aesmc v1.16b, v1.16b -3: ld1 {v4.16b}, [x2], #16 /* load next round key */ +3: ld1 {v4.4s}, [x2], #16 /* load next round key */ subs w3, w3, #3 aese v0.16b, v3.16b aesmc v0.16b, v0.16b @@ -137,31 +137,31 @@ CPU_LE( rev x8, x8 ) /* keep swabbed ctr in reg */ cmp w4, #12 /* which key size? */ sub w7, w4, #2 /* get modified # of rounds */ ins v1.d[1], x9 /* no carry in lower ctr */ - ld1 {v3.16b}, [x3] /* load first round key */ + ld1 {v3.4s}, [x3] /* load first round key */ add x10, x3, #16 bmi 1f bne 4f mov v5.16b, v3.16b b 3f 1: mov v4.16b, v3.16b - ld1 {v5.16b}, [x10], #16 /* load 2nd round key */ + ld1 {v5.4s}, [x10], #16 /* load 2nd round key */ 2: /* inner loop: 3 rounds, 2x interleaved */ aese v0.16b, v4.16b aesmc v0.16b, v0.16b aese v1.16b, v4.16b aesmc v1.16b, v1.16b -3: ld1 {v3.16b}, [x10], #16 /* load next round key */ +3: ld1 {v3.4s}, [x10], #16 /* load next round key */ aese v0.16b, v5.16b aesmc v0.16b, v0.16b aese v1.16b, v5.16b aesmc v1.16b, v1.16b -4: ld1 {v4.16b}, [x10], #16 /* load next round key */ +4: ld1 {v4.4s}, [x10], #16 /* load next round key */ subs w7, w7, #3 aese v0.16b, v3.16b aesmc v0.16b, v0.16b aese v1.16b, v3.16b aesmc v1.16b, v1.16b - ld1 {v5.16b}, [x10], #16 /* load next round key */ + ld1 {v5.4s}, [x10], #16 /* load next round key */ bpl 2b aese v0.16b, v4.16b aese v1.16b, v4.16b diff --git a/arch/arm64/crypto/aes-ce-ccm-glue.c b/arch/arm64/crypto/aes-ce-ccm-glue.c index 6a7dbc7c83a6..a1254036f2b1 100644 --- a/arch/arm64/crypto/aes-ce-ccm-glue.c +++ b/arch/arm64/crypto/aes-ce-ccm-glue.c @@ -1,7 +1,7 @@ /* * aes-ccm-glue.c - AES-CCM transform for ARMv8 with Crypto Extensions * - * Copyright (C) 2013 - 2014 Linaro Ltd <ard.biesheuvel@linaro.org> + * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -9,6 +9,7 @@ */ #include <asm/neon.h> +#include <asm/simd.h> #include <asm/unaligned.h> #include <crypto/aes.h> #include <crypto/scatterwalk.h> @@ -44,6 +45,8 @@ asmlinkage void ce_aes_ccm_decrypt(u8 out[], u8 const in[], u32 cbytes, asmlinkage void ce_aes_ccm_final(u8 mac[], u8 const ctr[], u32 const rk[], u32 rounds); +asmlinkage void __aes_arm64_encrypt(u32 *rk, u8 *out, const u8 *in, int rounds); + static int ccm_setkey(struct crypto_aead *tfm, const u8 *in_key, unsigned int key_len) { @@ -103,7 +106,45 @@ static int ccm_init_mac(struct aead_request *req, u8 maciv[], u32 msglen) return 0; } -static void ccm_calculate_auth_mac(struct aead_request *req, u8 mac[]) +static void ccm_update_mac(struct crypto_aes_ctx *key, u8 mac[], u8 const in[], + u32 abytes, u32 *macp, bool use_neon) +{ + if (likely(use_neon)) { + ce_aes_ccm_auth_data(mac, in, abytes, macp, key->key_enc, + num_rounds(key)); + } else { + if (*macp > 0 && *macp < AES_BLOCK_SIZE) { + int added = min(abytes, AES_BLOCK_SIZE - *macp); + + crypto_xor(&mac[*macp], in, added); + + *macp += added; + in += added; + abytes -= added; + } + + while (abytes > 
AES_BLOCK_SIZE) { + __aes_arm64_encrypt(key->key_enc, mac, mac, + num_rounds(key)); + crypto_xor(mac, in, AES_BLOCK_SIZE); + + in += AES_BLOCK_SIZE; + abytes -= AES_BLOCK_SIZE; + } + + if (abytes > 0) { + __aes_arm64_encrypt(key->key_enc, mac, mac, + num_rounds(key)); + crypto_xor(mac, in, abytes); + *macp = abytes; + } else { + *macp = 0; + } + } +} + +static void ccm_calculate_auth_mac(struct aead_request *req, u8 mac[], + bool use_neon) { struct crypto_aead *aead = crypto_aead_reqtfm(req); struct crypto_aes_ctx *ctx = crypto_aead_ctx(aead); @@ -122,8 +163,7 @@ static void ccm_calculate_auth_mac(struct aead_request *req, u8 mac[]) ltag.len = 6; } - ce_aes_ccm_auth_data(mac, (u8 *)<ag, ltag.len, &macp, ctx->key_enc, - num_rounds(ctx)); + ccm_update_mac(ctx, mac, (u8 *)<ag, ltag.len, &macp, use_neon); scatterwalk_start(&walk, req->src); do { @@ -135,8 +175,7 @@ static void ccm_calculate_auth_mac(struct aead_request *req, u8 mac[]) n = scatterwalk_clamp(&walk, len); } p = scatterwalk_map(&walk); - ce_aes_ccm_auth_data(mac, p, n, &macp, ctx->key_enc, - num_rounds(ctx)); + ccm_update_mac(ctx, mac, p, n, &macp, use_neon); len -= n; scatterwalk_unmap(p); @@ -145,6 +184,56 @@ static void ccm_calculate_auth_mac(struct aead_request *req, u8 mac[]) } while (len); } +static int ccm_crypt_fallback(struct skcipher_walk *walk, u8 mac[], u8 iv0[], + struct crypto_aes_ctx *ctx, bool enc) +{ + u8 buf[AES_BLOCK_SIZE]; + int err = 0; + + while (walk->nbytes) { + int blocks = walk->nbytes / AES_BLOCK_SIZE; + u32 tail = walk->nbytes % AES_BLOCK_SIZE; + u8 *dst = walk->dst.virt.addr; + u8 *src = walk->src.virt.addr; + u32 nbytes = walk->nbytes; + + if (nbytes == walk->total && tail > 0) { + blocks++; + tail = 0; + } + + do { + u32 bsize = AES_BLOCK_SIZE; + + if (nbytes < AES_BLOCK_SIZE) + bsize = nbytes; + + crypto_inc(walk->iv, AES_BLOCK_SIZE); + __aes_arm64_encrypt(ctx->key_enc, buf, walk->iv, + num_rounds(ctx)); + __aes_arm64_encrypt(ctx->key_enc, mac, mac, + num_rounds(ctx)); + if (enc) + crypto_xor(mac, src, bsize); + crypto_xor_cpy(dst, src, buf, bsize); + if (!enc) + crypto_xor(mac, dst, bsize); + dst += bsize; + src += bsize; + nbytes -= bsize; + } while (--blocks); + + err = skcipher_walk_done(walk, tail); + } + + if (!err) { + __aes_arm64_encrypt(ctx->key_enc, buf, iv0, num_rounds(ctx)); + __aes_arm64_encrypt(ctx->key_enc, mac, mac, num_rounds(ctx)); + crypto_xor(mac, buf, AES_BLOCK_SIZE); + } + return err; +} + static int ccm_encrypt(struct aead_request *req) { struct crypto_aead *aead = crypto_aead_reqtfm(req); @@ -153,39 +242,46 @@ static int ccm_encrypt(struct aead_request *req) u8 __aligned(8) mac[AES_BLOCK_SIZE]; u8 buf[AES_BLOCK_SIZE]; u32 len = req->cryptlen; + bool use_neon = may_use_simd(); int err; err = ccm_init_mac(req, mac, len); if (err) return err; - kernel_neon_begin_partial(6); + if (likely(use_neon)) + kernel_neon_begin(); if (req->assoclen) - ccm_calculate_auth_mac(req, mac); + ccm_calculate_auth_mac(req, mac, use_neon); /* preserve the original iv for the final round */ memcpy(buf, req->iv, AES_BLOCK_SIZE); err = skcipher_walk_aead_encrypt(&walk, req, true); - while (walk.nbytes) { - u32 tail = walk.nbytes % AES_BLOCK_SIZE; - - if (walk.nbytes == walk.total) - tail = 0; + if (likely(use_neon)) { + while (walk.nbytes) { + u32 tail = walk.nbytes % AES_BLOCK_SIZE; - ce_aes_ccm_encrypt(walk.dst.virt.addr, walk.src.virt.addr, - walk.nbytes - tail, ctx->key_enc, - num_rounds(ctx), mac, walk.iv); + if (walk.nbytes == walk.total) + tail = 0; - err = skcipher_walk_done(&walk, tail); - } - 
if (!err) - ce_aes_ccm_final(mac, buf, ctx->key_enc, num_rounds(ctx)); + ce_aes_ccm_encrypt(walk.dst.virt.addr, + walk.src.virt.addr, + walk.nbytes - tail, ctx->key_enc, + num_rounds(ctx), mac, walk.iv); - kernel_neon_end(); + err = skcipher_walk_done(&walk, tail); + } + if (!err) + ce_aes_ccm_final(mac, buf, ctx->key_enc, + num_rounds(ctx)); + kernel_neon_end(); + } else { + err = ccm_crypt_fallback(&walk, mac, buf, ctx, true); + } if (err) return err; @@ -205,38 +301,46 @@ static int ccm_decrypt(struct aead_request *req) u8 __aligned(8) mac[AES_BLOCK_SIZE]; u8 buf[AES_BLOCK_SIZE]; u32 len = req->cryptlen - authsize; + bool use_neon = may_use_simd(); int err; err = ccm_init_mac(req, mac, len); if (err) return err; - kernel_neon_begin_partial(6); + if (likely(use_neon)) + kernel_neon_begin(); if (req->assoclen) - ccm_calculate_auth_mac(req, mac); + ccm_calculate_auth_mac(req, mac, use_neon); /* preserve the original iv for the final round */ memcpy(buf, req->iv, AES_BLOCK_SIZE); err = skcipher_walk_aead_decrypt(&walk, req, true); - while (walk.nbytes) { - u32 tail = walk.nbytes % AES_BLOCK_SIZE; + if (likely(use_neon)) { + while (walk.nbytes) { + u32 tail = walk.nbytes % AES_BLOCK_SIZE; - if (walk.nbytes == walk.total) - tail = 0; + if (walk.nbytes == walk.total) + tail = 0; - ce_aes_ccm_decrypt(walk.dst.virt.addr, walk.src.virt.addr, - walk.nbytes - tail, ctx->key_enc, - num_rounds(ctx), mac, walk.iv); + ce_aes_ccm_decrypt(walk.dst.virt.addr, + walk.src.virt.addr, + walk.nbytes - tail, ctx->key_enc, + num_rounds(ctx), mac, walk.iv); - err = skcipher_walk_done(&walk, tail); - } - if (!err) - ce_aes_ccm_final(mac, buf, ctx->key_enc, num_rounds(ctx)); + err = skcipher_walk_done(&walk, tail); + } + if (!err) + ce_aes_ccm_final(mac, buf, ctx->key_enc, + num_rounds(ctx)); - kernel_neon_end(); + kernel_neon_end(); + } else { + err = ccm_crypt_fallback(&walk, mac, buf, ctx, false); + } if (err) return err; diff --git a/arch/arm64/crypto/aes-ce-cipher.c b/arch/arm64/crypto/aes-ce-cipher.c index 50d9fe11d0c8..6a75cd75ed11 100644 --- a/arch/arm64/crypto/aes-ce-cipher.c +++ b/arch/arm64/crypto/aes-ce-cipher.c @@ -1,7 +1,7 @@ /* * aes-ce-cipher.c - core AES cipher using ARMv8 Crypto Extensions * - * Copyright (C) 2013 - 2014 Linaro Ltd <ard.biesheuvel@linaro.org> + * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -9,6 +9,8 @@ */ #include <asm/neon.h> +#include <asm/simd.h> +#include <asm/unaligned.h> #include <crypto/aes.h> #include <linux/cpufeature.h> #include <linux/crypto.h> @@ -20,6 +22,9 @@ MODULE_DESCRIPTION("Synchronous AES cipher using ARMv8 Crypto Extensions"); MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); MODULE_LICENSE("GPL v2"); +asmlinkage void __aes_arm64_encrypt(u32 *rk, u8 *out, const u8 *in, int rounds); +asmlinkage void __aes_arm64_decrypt(u32 *rk, u8 *out, const u8 *in, int rounds); + struct aes_block { u8 b[AES_BLOCK_SIZE]; }; @@ -44,27 +49,32 @@ static void aes_cipher_encrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[]) void *dummy0; int dummy1; - kernel_neon_begin_partial(4); + if (!may_use_simd()) { + __aes_arm64_encrypt(ctx->key_enc, dst, src, num_rounds(ctx)); + return; + } + + kernel_neon_begin(); __asm__(" ld1 {v0.16b}, %[in] ;" - " ld1 {v1.16b}, [%[key]], #16 ;" + " ld1 {v1.4s}, [%[key]], #16 ;" " cmp %w[rounds], #10 ;" " bmi 0f ;" " bne 3f ;" " mov v3.16b, v1.16b ;" " b 2f ;" "0: mov v2.16b, 
v1.16b ;" - " ld1 {v3.16b}, [%[key]], #16 ;" + " ld1 {v3.4s}, [%[key]], #16 ;" "1: aese v0.16b, v2.16b ;" " aesmc v0.16b, v0.16b ;" - "2: ld1 {v1.16b}, [%[key]], #16 ;" + "2: ld1 {v1.4s}, [%[key]], #16 ;" " aese v0.16b, v3.16b ;" " aesmc v0.16b, v0.16b ;" - "3: ld1 {v2.16b}, [%[key]], #16 ;" + "3: ld1 {v2.4s}, [%[key]], #16 ;" " subs %w[rounds], %w[rounds], #3 ;" " aese v0.16b, v1.16b ;" " aesmc v0.16b, v0.16b ;" - " ld1 {v3.16b}, [%[key]], #16 ;" + " ld1 {v3.4s}, [%[key]], #16 ;" " bpl 1b ;" " aese v0.16b, v2.16b ;" " eor v0.16b, v0.16b, v3.16b ;" @@ -89,27 +99,32 @@ static void aes_cipher_decrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[]) void *dummy0; int dummy1; - kernel_neon_begin_partial(4); + if (!may_use_simd()) { + __aes_arm64_decrypt(ctx->key_dec, dst, src, num_rounds(ctx)); + return; + } + + kernel_neon_begin(); __asm__(" ld1 {v0.16b}, %[in] ;" - " ld1 {v1.16b}, [%[key]], #16 ;" + " ld1 {v1.4s}, [%[key]], #16 ;" " cmp %w[rounds], #10 ;" " bmi 0f ;" " bne 3f ;" " mov v3.16b, v1.16b ;" " b 2f ;" "0: mov v2.16b, v1.16b ;" - " ld1 {v3.16b}, [%[key]], #16 ;" + " ld1 {v3.4s}, [%[key]], #16 ;" "1: aesd v0.16b, v2.16b ;" " aesimc v0.16b, v0.16b ;" - "2: ld1 {v1.16b}, [%[key]], #16 ;" + "2: ld1 {v1.4s}, [%[key]], #16 ;" " aesd v0.16b, v3.16b ;" " aesimc v0.16b, v0.16b ;" - "3: ld1 {v2.16b}, [%[key]], #16 ;" + "3: ld1 {v2.4s}, [%[key]], #16 ;" " subs %w[rounds], %w[rounds], #3 ;" " aesd v0.16b, v1.16b ;" " aesimc v0.16b, v0.16b ;" - " ld1 {v3.16b}, [%[key]], #16 ;" + " ld1 {v3.4s}, [%[key]], #16 ;" " bpl 1b ;" " aesd v0.16b, v2.16b ;" " eor v0.16b, v0.16b, v3.16b ;" @@ -165,20 +180,16 @@ int ce_aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key, key_len != AES_KEYSIZE_256) return -EINVAL; - memcpy(ctx->key_enc, in_key, key_len); ctx->key_length = key_len; + for (i = 0; i < kwords; i++) + ctx->key_enc[i] = get_unaligned_le32(in_key + i * sizeof(u32)); - kernel_neon_begin_partial(2); + kernel_neon_begin(); for (i = 0; i < sizeof(rcon); i++) { u32 *rki = ctx->key_enc + (i * kwords); u32 *rko = rki + kwords; -#ifndef CONFIG_CPU_BIG_ENDIAN rko[0] = ror32(aes_sub(rki[kwords - 1]), 8) ^ rcon[i] ^ rki[0]; -#else - rko[0] = rol32(aes_sub(rki[kwords - 1]), 8) ^ (rcon[i] << 24) ^ - rki[0]; -#endif rko[1] = rko[0] ^ rki[1]; rko[2] = rko[1] ^ rki[2]; rko[3] = rko[2] ^ rki[3]; @@ -210,9 +221,9 @@ int ce_aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key, key_dec[0] = key_enc[j]; for (i = 1, j--; j > 0; i++, j--) - __asm__("ld1 {v0.16b}, %[in] ;" + __asm__("ld1 {v0.4s}, %[in] ;" "aesimc v1.16b, v0.16b ;" - "st1 {v1.16b}, %[out] ;" + "st1 {v1.4s}, %[out] ;" : [out] "=Q"(key_dec[i]) : [in] "Q"(key_enc[j]) diff --git a/arch/arm64/crypto/aes-ce.S b/arch/arm64/crypto/aes-ce.S index b46093d567e5..50330f5c3adc 100644 --- a/arch/arm64/crypto/aes-ce.S +++ b/arch/arm64/crypto/aes-ce.S @@ -2,7 +2,7 @@ * linux/arch/arm64/crypto/aes-ce.S - AES cipher for ARMv8 with * Crypto Extensions * - * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org> + * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -22,11 +22,11 @@ cmp \rounds, #12 blo 2222f /* 128 bits */ beq 1111f /* 192 bits */ - ld1 {v17.16b-v18.16b}, [\rk], #32 -1111: ld1 {v19.16b-v20.16b}, [\rk], #32 -2222: ld1 {v21.16b-v24.16b}, [\rk], #64 - ld1 {v25.16b-v28.16b}, [\rk], #64 - ld1 {v29.16b-v31.16b}, [\rk] + ld1 {v17.4s-v18.4s}, [\rk], #32 +1111: ld1 {v19.4s-v20.4s}, [\rk], #32 
+2222: ld1 {v21.4s-v24.4s}, [\rk], #64 + ld1 {v25.4s-v28.4s}, [\rk], #64 + ld1 {v29.4s-v31.4s}, [\rk] .endm /* prepare for encryption with key in rk[] */ diff --git a/arch/arm64/crypto/aes-cipher-core.S b/arch/arm64/crypto/aes-cipher-core.S index f2f9cc519309..6d2445d603cc 100644 --- a/arch/arm64/crypto/aes-cipher-core.S +++ b/arch/arm64/crypto/aes-cipher-core.S @@ -10,6 +10,7 @@ #include <linux/linkage.h> #include <asm/assembler.h> +#include <asm/cache.h> .text @@ -17,94 +18,155 @@ out .req x1 in .req x2 rounds .req x3 - tt .req x4 - lt .req x2 + tt .req x2 - .macro __pair, enc, reg0, reg1, in0, in1e, in1d, shift + .macro __pair1, sz, op, reg0, reg1, in0, in1e, in1d, shift + .ifc \op\shift, b0 + ubfiz \reg0, \in0, #2, #8 + ubfiz \reg1, \in1e, #2, #8 + .else ubfx \reg0, \in0, #\shift, #8 - .if \enc ubfx \reg1, \in1e, #\shift, #8 - .else - ubfx \reg1, \in1d, #\shift, #8 .endif + + /* + * AArch64 cannot do byte size indexed loads from a table containing + * 32-bit quantities, i.e., 'ldrb w12, [tt, w12, uxtw #2]' is not a + * valid instruction. So perform the shift explicitly first for the + * high bytes (the low byte is shifted implicitly by using ubfiz rather + * than ubfx above) + */ + .ifnc \op, b ldr \reg0, [tt, \reg0, uxtw #2] ldr \reg1, [tt, \reg1, uxtw #2] + .else + .if \shift > 0 + lsl \reg0, \reg0, #2 + lsl \reg1, \reg1, #2 + .endif + ldrb \reg0, [tt, \reg0, uxtw] + ldrb \reg1, [tt, \reg1, uxtw] + .endif .endm - .macro __hround, out0, out1, in0, in1, in2, in3, t0, t1, enc + .macro __pair0, sz, op, reg0, reg1, in0, in1e, in1d, shift + ubfx \reg0, \in0, #\shift, #8 + ubfx \reg1, \in1d, #\shift, #8 + ldr\op \reg0, [tt, \reg0, uxtw #\sz] + ldr\op \reg1, [tt, \reg1, uxtw #\sz] + .endm + + .macro __hround, out0, out1, in0, in1, in2, in3, t0, t1, enc, sz, op ldp \out0, \out1, [rk], #8 - __pair \enc, w13, w14, \in0, \in1, \in3, 0 - __pair \enc, w15, w16, \in1, \in2, \in0, 8 - __pair \enc, w17, w18, \in2, \in3, \in1, 16 - __pair \enc, \t0, \t1, \in3, \in0, \in2, 24 - - eor \out0, \out0, w13 - eor \out1, \out1, w14 - eor \out0, \out0, w15, ror #24 - eor \out1, \out1, w16, ror #24 - eor \out0, \out0, w17, ror #16 - eor \out1, \out1, w18, ror #16 + __pair\enc \sz, \op, w12, w13, \in0, \in1, \in3, 0 + __pair\enc \sz, \op, w14, w15, \in1, \in2, \in0, 8 + __pair\enc \sz, \op, w16, w17, \in2, \in3, \in1, 16 + __pair\enc \sz, \op, \t0, \t1, \in3, \in0, \in2, 24 + + eor \out0, \out0, w12 + eor \out1, \out1, w13 + eor \out0, \out0, w14, ror #24 + eor \out1, \out1, w15, ror #24 + eor \out0, \out0, w16, ror #16 + eor \out1, \out1, w17, ror #16 eor \out0, \out0, \t0, ror #8 eor \out1, \out1, \t1, ror #8 .endm - .macro fround, out0, out1, out2, out3, in0, in1, in2, in3 - __hround \out0, \out1, \in0, \in1, \in2, \in3, \out2, \out3, 1 - __hround \out2, \out3, \in2, \in3, \in0, \in1, \in1, \in2, 1 + .macro fround, out0, out1, out2, out3, in0, in1, in2, in3, sz=2, op + __hround \out0, \out1, \in0, \in1, \in2, \in3, \out2, \out3, 1, \sz, \op + __hround \out2, \out3, \in2, \in3, \in0, \in1, \in1, \in2, 1, \sz, \op .endm - .macro iround, out0, out1, out2, out3, in0, in1, in2, in3 - __hround \out0, \out1, \in0, \in3, \in2, \in1, \out2, \out3, 0 - __hround \out2, \out3, \in2, \in1, \in0, \in3, \in1, \in0, 0 + .macro iround, out0, out1, out2, out3, in0, in1, in2, in3, sz=2, op + __hround \out0, \out1, \in0, \in3, \in2, \in1, \out2, \out3, 0, \sz, \op + __hround \out2, \out3, \in2, \in1, \in0, \in3, \in1, \in0, 0, \sz, \op .endm - .macro do_crypt, round, ttab, ltab - ldp w5, w6, [in] - ldp w7, w8, [in, #8] - ldp w9, 
w10, [rk], #16 - ldp w11, w12, [rk, #-8] + .macro do_crypt, round, ttab, ltab, bsz + ldp w4, w5, [in] + ldp w6, w7, [in, #8] + ldp w8, w9, [rk], #16 + ldp w10, w11, [rk, #-8] +CPU_BE( rev w4, w4 ) CPU_BE( rev w5, w5 ) CPU_BE( rev w6, w6 ) CPU_BE( rev w7, w7 ) -CPU_BE( rev w8, w8 ) + eor w4, w4, w8 eor w5, w5, w9 eor w6, w6, w10 eor w7, w7, w11 - eor w8, w8, w12 adr_l tt, \ttab - adr_l lt, \ltab tbnz rounds, #1, 1f -0: \round w9, w10, w11, w12, w5, w6, w7, w8 - \round w5, w6, w7, w8, w9, w10, w11, w12 +0: \round w8, w9, w10, w11, w4, w5, w6, w7 + \round w4, w5, w6, w7, w8, w9, w10, w11 1: subs rounds, rounds, #4 - \round w9, w10, w11, w12, w5, w6, w7, w8 - csel tt, tt, lt, hi - \round w5, w6, w7, w8, w9, w10, w11, w12 - b.hi 0b - + \round w8, w9, w10, w11, w4, w5, w6, w7 + b.ls 3f +2: \round w4, w5, w6, w7, w8, w9, w10, w11 + b 0b +3: adr_l tt, \ltab + \round w4, w5, w6, w7, w8, w9, w10, w11, \bsz, b + +CPU_BE( rev w4, w4 ) CPU_BE( rev w5, w5 ) CPU_BE( rev w6, w6 ) CPU_BE( rev w7, w7 ) -CPU_BE( rev w8, w8 ) - stp w5, w6, [out] - stp w7, w8, [out, #8] + stp w4, w5, [out] + stp w6, w7, [out, #8] ret .endm - .align 5 + .align L1_CACHE_SHIFT + .type __aes_arm64_inverse_sbox, %object +__aes_arm64_inverse_sbox: + .byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38 + .byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb + .byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87 + .byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb + .byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d + .byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e + .byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2 + .byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25 + .byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16 + .byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92 + .byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda + .byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84 + .byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a + .byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06 + .byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02 + .byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b + .byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea + .byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73 + .byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85 + .byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e + .byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89 + .byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b + .byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20 + .byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4 + .byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31 + .byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f + .byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d + .byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef + .byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0 + .byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61 + .byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26 + .byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d + .size __aes_arm64_inverse_sbox, . 
- __aes_arm64_inverse_sbox + ENTRY(__aes_arm64_encrypt) - do_crypt fround, crypto_ft_tab, crypto_fl_tab + do_crypt fround, crypto_ft_tab, crypto_ft_tab + 1, 2 ENDPROC(__aes_arm64_encrypt) .align 5 ENTRY(__aes_arm64_decrypt) - do_crypt iround, crypto_it_tab, crypto_il_tab + do_crypt iround, crypto_it_tab, __aes_arm64_inverse_sbox, 0 ENDPROC(__aes_arm64_decrypt) diff --git a/arch/arm64/crypto/aes-ctr-fallback.h b/arch/arm64/crypto/aes-ctr-fallback.h new file mode 100644 index 000000000000..c9285717b6b5 --- /dev/null +++ b/arch/arm64/crypto/aes-ctr-fallback.h @@ -0,0 +1,53 @@ +/* + * Fallback for sync aes(ctr) in contexts where kernel mode NEON + * is not allowed + * + * Copyright (C) 2017 Linaro Ltd <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <crypto/aes.h> +#include <crypto/internal/skcipher.h> + +asmlinkage void __aes_arm64_encrypt(u32 *rk, u8 *out, const u8 *in, int rounds); + +static inline int aes_ctr_encrypt_fallback(struct crypto_aes_ctx *ctx, + struct skcipher_request *req) +{ + struct skcipher_walk walk; + u8 buf[AES_BLOCK_SIZE]; + int err; + + err = skcipher_walk_virt(&walk, req, true); + + while (walk.nbytes > 0) { + u8 *dst = walk.dst.virt.addr; + u8 *src = walk.src.virt.addr; + int nbytes = walk.nbytes; + int tail = 0; + + if (nbytes < walk.total) { + nbytes = round_down(nbytes, AES_BLOCK_SIZE); + tail = walk.nbytes % AES_BLOCK_SIZE; + } + + do { + int bsize = min(nbytes, AES_BLOCK_SIZE); + + __aes_arm64_encrypt(ctx->key_enc, buf, walk.iv, + 6 + ctx->key_length / 4); + crypto_xor_cpy(dst, src, buf, bsize); + crypto_inc(walk.iv, AES_BLOCK_SIZE); + + dst += AES_BLOCK_SIZE; + src += AES_BLOCK_SIZE; + nbytes -= AES_BLOCK_SIZE; + } while (nbytes > 0); + + err = skcipher_walk_done(&walk, tail); + } + return err; +} diff --git a/arch/arm64/crypto/aes-glue.c b/arch/arm64/crypto/aes-glue.c index bcf596b0197e..998ba519a026 100644 --- a/arch/arm64/crypto/aes-glue.c +++ b/arch/arm64/crypto/aes-glue.c @@ -10,6 +10,7 @@ #include <asm/neon.h> #include <asm/hwcap.h> +#include <asm/simd.h> #include <crypto/aes.h> #include <crypto/internal/hash.h> #include <crypto/internal/simd.h> @@ -19,6 +20,7 @@ #include <crypto/xts.h> #include "aes-ce-setkey.h" +#include "aes-ctr-fallback.h" #ifdef USE_V8_CRYPTO_EXTENSIONS #define MODE "ce" @@ -241,9 +243,7 @@ static int ctr_encrypt(struct skcipher_request *req) aes_ctr_encrypt(tail, NULL, (u8 *)ctx->key_enc, rounds, blocks, walk.iv, first); - if (tdst != tsrc) - memcpy(tdst, tsrc, nbytes); - crypto_xor(tdst, tail, nbytes); + crypto_xor_cpy(tdst, tsrc, tail, nbytes); err = skcipher_walk_done(&walk, 0); } kernel_neon_end(); @@ -251,6 +251,17 @@ static int ctr_encrypt(struct skcipher_request *req) return err; } +static int ctr_encrypt_sync(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm); + + if (!may_use_simd()) + return aes_ctr_encrypt_fallback(ctx, req); + + return ctr_encrypt(req); +} + static int xts_encrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); @@ -357,8 +368,8 @@ static struct skcipher_alg aes_algs[] = { { .ivsize = AES_BLOCK_SIZE, .chunksize = AES_BLOCK_SIZE, .setkey = skcipher_aes_setkey, - .encrypt = ctr_encrypt, - .decrypt = ctr_encrypt, + .encrypt = ctr_encrypt_sync, + .decrypt = ctr_encrypt_sync, }, { 
.base = { .cra_name = "__xts(aes)", @@ -460,11 +471,35 @@ static int mac_init(struct shash_desc *desc) return 0; } +static void mac_do_update(struct crypto_aes_ctx *ctx, u8 const in[], int blocks, + u8 dg[], int enc_before, int enc_after) +{ + int rounds = 6 + ctx->key_length / 4; + + if (may_use_simd()) { + kernel_neon_begin(); + aes_mac_update(in, ctx->key_enc, rounds, blocks, dg, enc_before, + enc_after); + kernel_neon_end(); + } else { + if (enc_before) + __aes_arm64_encrypt(ctx->key_enc, dg, dg, rounds); + + while (blocks--) { + crypto_xor(dg, in, AES_BLOCK_SIZE); + in += AES_BLOCK_SIZE; + + if (blocks || enc_after) + __aes_arm64_encrypt(ctx->key_enc, dg, dg, + rounds); + } + } +} + static int mac_update(struct shash_desc *desc, const u8 *p, unsigned int len) { struct mac_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); struct mac_desc_ctx *ctx = shash_desc_ctx(desc); - int rounds = 6 + tctx->key.key_length / 4; while (len > 0) { unsigned int l; @@ -476,10 +511,8 @@ static int mac_update(struct shash_desc *desc, const u8 *p, unsigned int len) len %= AES_BLOCK_SIZE; - kernel_neon_begin(); - aes_mac_update(p, tctx->key.key_enc, rounds, blocks, - ctx->dg, (ctx->len != 0), (len != 0)); - kernel_neon_end(); + mac_do_update(&tctx->key, p, blocks, ctx->dg, + (ctx->len != 0), (len != 0)); p += blocks * AES_BLOCK_SIZE; @@ -507,11 +540,8 @@ static int cbcmac_final(struct shash_desc *desc, u8 *out) { struct mac_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); struct mac_desc_ctx *ctx = shash_desc_ctx(desc); - int rounds = 6 + tctx->key.key_length / 4; - kernel_neon_begin(); - aes_mac_update(NULL, tctx->key.key_enc, rounds, 0, ctx->dg, 1, 0); - kernel_neon_end(); + mac_do_update(&tctx->key, NULL, 0, ctx->dg, 1, 0); memcpy(out, ctx->dg, AES_BLOCK_SIZE); @@ -522,7 +552,6 @@ static int cmac_final(struct shash_desc *desc, u8 *out) { struct mac_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); struct mac_desc_ctx *ctx = shash_desc_ctx(desc); - int rounds = 6 + tctx->key.key_length / 4; u8 *consts = tctx->consts; if (ctx->len != AES_BLOCK_SIZE) { @@ -530,9 +559,7 @@ static int cmac_final(struct shash_desc *desc, u8 *out) consts += AES_BLOCK_SIZE; } - kernel_neon_begin(); - aes_mac_update(consts, tctx->key.key_enc, rounds, 1, ctx->dg, 0, 1); - kernel_neon_end(); + mac_do_update(&tctx->key, consts, 1, ctx->dg, 0, 1); memcpy(out, ctx->dg, AES_BLOCK_SIZE); diff --git a/arch/arm64/crypto/aes-neonbs-glue.c b/arch/arm64/crypto/aes-neonbs-glue.c index db2501d93550..c55d68ccb89f 100644 --- a/arch/arm64/crypto/aes-neonbs-glue.c +++ b/arch/arm64/crypto/aes-neonbs-glue.c @@ -1,7 +1,7 @@ /* * Bit sliced AES using NEON instructions * - * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org> + * Copyright (C) 2016 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -9,12 +9,15 @@ */ #include <asm/neon.h> +#include <asm/simd.h> #include <crypto/aes.h> #include <crypto/internal/simd.h> #include <crypto/internal/skcipher.h> #include <crypto/xts.h> #include <linux/module.h> +#include "aes-ctr-fallback.h" + MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); MODULE_LICENSE("GPL v2"); @@ -58,6 +61,11 @@ struct aesbs_cbc_ctx { u32 enc[AES_MAX_KEYLENGTH_U32]; }; +struct aesbs_ctr_ctx { + struct aesbs_ctx key; /* must be first member */ + struct crypto_aes_ctx fallback; +}; + struct aesbs_xts_ctx { struct aesbs_ctx key; u32 twkey[AES_MAX_KEYLENGTH_U32]; @@ -196,6 +204,25 @@ static int 
cbc_decrypt(struct skcipher_request *req) return err; } +static int aesbs_ctr_setkey_sync(struct crypto_skcipher *tfm, const u8 *in_key, + unsigned int key_len) +{ + struct aesbs_ctr_ctx *ctx = crypto_skcipher_ctx(tfm); + int err; + + err = crypto_aes_expand_key(&ctx->fallback, in_key, key_len); + if (err) + return err; + + ctx->key.rounds = 6 + key_len / 4; + + kernel_neon_begin(); + aesbs_convert_key(ctx->key.rk, ctx->fallback.key_enc, ctx->key.rounds); + kernel_neon_end(); + + return 0; +} + static int ctr_encrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); @@ -224,9 +251,8 @@ static int ctr_encrypt(struct skcipher_request *req) u8 *dst = walk.dst.virt.addr + blocks * AES_BLOCK_SIZE; u8 *src = walk.src.virt.addr + blocks * AES_BLOCK_SIZE; - if (dst != src) - memcpy(dst, src, walk.total % AES_BLOCK_SIZE); - crypto_xor(dst, final, walk.total % AES_BLOCK_SIZE); + crypto_xor_cpy(dst, src, final, + walk.total % AES_BLOCK_SIZE); err = skcipher_walk_done(&walk, 0); break; @@ -260,6 +286,17 @@ static int aesbs_xts_setkey(struct crypto_skcipher *tfm, const u8 *in_key, return aesbs_setkey(tfm, in_key, key_len); } +static int ctr_encrypt_sync(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct aesbs_ctr_ctx *ctx = crypto_skcipher_ctx(tfm); + + if (!may_use_simd()) + return aes_ctr_encrypt_fallback(&ctx->fallback, req); + + return ctr_encrypt(req); +} + static int __xts_crypt(struct skcipher_request *req, void (*fn)(u8 out[], u8 const in[], u8 const rk[], int rounds, int blocks, u8 iv[])) @@ -356,7 +393,7 @@ static struct skcipher_alg aes_algs[] = { { .base.cra_driver_name = "ctr-aes-neonbs", .base.cra_priority = 250 - 1, .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct aesbs_ctx), + .base.cra_ctxsize = sizeof(struct aesbs_ctr_ctx), .base.cra_module = THIS_MODULE, .min_keysize = AES_MIN_KEY_SIZE, @@ -364,9 +401,9 @@ static struct skcipher_alg aes_algs[] = { { .chunksize = AES_BLOCK_SIZE, .walksize = 8 * AES_BLOCK_SIZE, .ivsize = AES_BLOCK_SIZE, - .setkey = aesbs_setkey, - .encrypt = ctr_encrypt, - .decrypt = ctr_encrypt, + .setkey = aesbs_ctr_setkey_sync, + .encrypt = ctr_encrypt_sync, + .decrypt = ctr_encrypt_sync, }, { .base.cra_name = "__xts(aes)", .base.cra_driver_name = "__xts-aes-neonbs", diff --git a/arch/arm64/crypto/chacha20-neon-glue.c b/arch/arm64/crypto/chacha20-neon-glue.c index a7cd575ea223..cbdb75d15cd0 100644 --- a/arch/arm64/crypto/chacha20-neon-glue.c +++ b/arch/arm64/crypto/chacha20-neon-glue.c @@ -1,7 +1,7 @@ /* * ChaCha20 256-bit cipher algorithm, RFC7539, arm64 NEON functions * - * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org> + * Copyright (C) 2016 - 2017 Linaro, Ltd. 
<ard.biesheuvel@linaro.org> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -26,6 +26,7 @@ #include <asm/hwcap.h> #include <asm/neon.h> +#include <asm/simd.h> asmlinkage void chacha20_block_xor_neon(u32 *state, u8 *dst, const u8 *src); asmlinkage void chacha20_4block_xor_neon(u32 *state, u8 *dst, const u8 *src); @@ -64,7 +65,7 @@ static int chacha20_neon(struct skcipher_request *req) u32 state[16]; int err; - if (req->cryptlen <= CHACHA20_BLOCK_SIZE) + if (!may_use_simd() || req->cryptlen <= CHACHA20_BLOCK_SIZE) return crypto_chacha20_crypt(req); err = skcipher_walk_virt(&walk, req, true); diff --git a/arch/arm64/crypto/crc32-ce-glue.c b/arch/arm64/crypto/crc32-ce-glue.c index eccb1ae90064..624f4137918c 100644 --- a/arch/arm64/crypto/crc32-ce-glue.c +++ b/arch/arm64/crypto/crc32-ce-glue.c @@ -1,7 +1,7 @@ /* * Accelerated CRC32(C) using arm64 NEON and Crypto Extensions instructions * - * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org> + * Copyright (C) 2016 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -19,6 +19,7 @@ #include <asm/hwcap.h> #include <asm/neon.h> +#include <asm/simd.h> #include <asm/unaligned.h> #define PMULL_MIN_LEN 64L /* minimum size of buffer @@ -105,10 +106,10 @@ static int crc32_pmull_update(struct shash_desc *desc, const u8 *data, length -= l; } - if (length >= PMULL_MIN_LEN) { + if (length >= PMULL_MIN_LEN && may_use_simd()) { l = round_down(length, SCALE_F); - kernel_neon_begin_partial(10); + kernel_neon_begin(); *crc = crc32_pmull_le(data, l, *crc); kernel_neon_end(); @@ -137,10 +138,10 @@ static int crc32c_pmull_update(struct shash_desc *desc, const u8 *data, length -= l; } - if (length >= PMULL_MIN_LEN) { + if (length >= PMULL_MIN_LEN && may_use_simd()) { l = round_down(length, SCALE_F); - kernel_neon_begin_partial(10); + kernel_neon_begin(); *crc = crc32c_pmull_le(data, l, *crc); kernel_neon_end(); diff --git a/arch/arm64/crypto/crct10dif-ce-glue.c b/arch/arm64/crypto/crct10dif-ce-glue.c index 60cb590c2590..96f0cae4a022 100644 --- a/arch/arm64/crypto/crct10dif-ce-glue.c +++ b/arch/arm64/crypto/crct10dif-ce-glue.c @@ -1,7 +1,7 @@ /* * Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions * - * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org> + * Copyright (C) 2016 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -18,6 +18,7 @@ #include <crypto/internal/hash.h> #include <asm/neon.h> +#include <asm/simd.h> #define CRC_T10DIF_PMULL_CHUNK_SIZE 16U @@ -48,9 +49,13 @@ static int crct10dif_update(struct shash_desc *desc, const u8 *data, } if (length > 0) { - kernel_neon_begin_partial(14); - *crc = crc_t10dif_pmull(*crc, data, length); - kernel_neon_end(); + if (may_use_simd()) { + kernel_neon_begin(); + *crc = crc_t10dif_pmull(*crc, data, length); + kernel_neon_end(); + } else { + *crc = crc_t10dif_generic(*crc, data, length); + } } return 0; diff --git a/arch/arm64/crypto/ghash-ce-core.S b/arch/arm64/crypto/ghash-ce-core.S index f0bb9f0b524f..11ebf1ae248a 100644 --- a/arch/arm64/crypto/ghash-ce-core.S +++ b/arch/arm64/crypto/ghash-ce-core.S @@ -1,7 +1,7 @@ /* * Accelerated GHASH implementation with ARMv8 PMULL instructions. 
* - * Copyright (C) 2014 Linaro Ltd. <ard.biesheuvel@linaro.org> + * Copyright (C) 2014 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org> * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 as published @@ -11,31 +11,215 @@ #include <linux/linkage.h> #include <asm/assembler.h> - SHASH .req v0 - SHASH2 .req v1 - T1 .req v2 - T2 .req v3 - MASK .req v4 - XL .req v5 - XM .req v6 - XH .req v7 - IN1 .req v7 + SHASH .req v0 + SHASH2 .req v1 + T1 .req v2 + T2 .req v3 + MASK .req v4 + XL .req v5 + XM .req v6 + XH .req v7 + IN1 .req v7 + + k00_16 .req v8 + k32_48 .req v9 + + t3 .req v10 + t4 .req v11 + t5 .req v12 + t6 .req v13 + t7 .req v14 + t8 .req v15 + t9 .req v16 + + perm1 .req v17 + perm2 .req v18 + perm3 .req v19 + + sh1 .req v20 + sh2 .req v21 + sh3 .req v22 + sh4 .req v23 + + ss1 .req v24 + ss2 .req v25 + ss3 .req v26 + ss4 .req v27 .text .arch armv8-a+crypto - /* - * void pmull_ghash_update(int blocks, u64 dg[], const char *src, - * struct ghash_key const *k, const char *head) - */ -ENTRY(pmull_ghash_update) + .macro __pmull_p64, rd, rn, rm + pmull \rd\().1q, \rn\().1d, \rm\().1d + .endm + + .macro __pmull2_p64, rd, rn, rm + pmull2 \rd\().1q, \rn\().2d, \rm\().2d + .endm + + .macro __pmull_p8, rq, ad, bd + ext t3.8b, \ad\().8b, \ad\().8b, #1 // A1 + ext t5.8b, \ad\().8b, \ad\().8b, #2 // A2 + ext t7.8b, \ad\().8b, \ad\().8b, #3 // A3 + + __pmull_p8_\bd \rq, \ad + .endm + + .macro __pmull2_p8, rq, ad, bd + tbl t3.16b, {\ad\().16b}, perm1.16b // A1 + tbl t5.16b, {\ad\().16b}, perm2.16b // A2 + tbl t7.16b, {\ad\().16b}, perm3.16b // A3 + + __pmull2_p8_\bd \rq, \ad + .endm + + .macro __pmull_p8_SHASH, rq, ad + __pmull_p8_tail \rq, \ad\().8b, SHASH.8b, 8b,, sh1, sh2, sh3, sh4 + .endm + + .macro __pmull_p8_SHASH2, rq, ad + __pmull_p8_tail \rq, \ad\().8b, SHASH2.8b, 8b,, ss1, ss2, ss3, ss4 + .endm + + .macro __pmull2_p8_SHASH, rq, ad + __pmull_p8_tail \rq, \ad\().16b, SHASH.16b, 16b, 2, sh1, sh2, sh3, sh4 + .endm + + .macro __pmull_p8_tail, rq, ad, bd, nb, t, b1, b2, b3, b4 + pmull\t t3.8h, t3.\nb, \bd // F = A1*B + pmull\t t4.8h, \ad, \b1\().\nb // E = A*B1 + pmull\t t5.8h, t5.\nb, \bd // H = A2*B + pmull\t t6.8h, \ad, \b2\().\nb // G = A*B2 + pmull\t t7.8h, t7.\nb, \bd // J = A3*B + pmull\t t8.8h, \ad, \b3\().\nb // I = A*B3 + pmull\t t9.8h, \ad, \b4\().\nb // K = A*B4 + pmull\t \rq\().8h, \ad, \bd // D = A*B + + eor t3.16b, t3.16b, t4.16b // L = E + F + eor t5.16b, t5.16b, t6.16b // M = G + H + eor t7.16b, t7.16b, t8.16b // N = I + J + + uzp1 t4.2d, t3.2d, t5.2d + uzp2 t3.2d, t3.2d, t5.2d + uzp1 t6.2d, t7.2d, t9.2d + uzp2 t7.2d, t7.2d, t9.2d + + // t3 = (L) (P0 + P1) << 8 + // t5 = (M) (P2 + P3) << 16 + eor t4.16b, t4.16b, t3.16b + and t3.16b, t3.16b, k32_48.16b + + // t7 = (N) (P4 + P5) << 24 + // t9 = (K) (P6 + P7) << 32 + eor t6.16b, t6.16b, t7.16b + and t7.16b, t7.16b, k00_16.16b + + eor t4.16b, t4.16b, t3.16b + eor t6.16b, t6.16b, t7.16b + + zip2 t5.2d, t4.2d, t3.2d + zip1 t3.2d, t4.2d, t3.2d + zip2 t9.2d, t6.2d, t7.2d + zip1 t7.2d, t6.2d, t7.2d + + ext t3.16b, t3.16b, t3.16b, #15 + ext t5.16b, t5.16b, t5.16b, #14 + ext t7.16b, t7.16b, t7.16b, #13 + ext t9.16b, t9.16b, t9.16b, #12 + + eor t3.16b, t3.16b, t5.16b + eor t7.16b, t7.16b, t9.16b + eor \rq\().16b, \rq\().16b, t3.16b + eor \rq\().16b, \rq\().16b, t7.16b + .endm + + .macro __pmull_pre_p64 + movi MASK.16b, #0xe1 + shl MASK.2d, MASK.2d, #57 + .endm + + .macro __pmull_pre_p8 + // k00_16 := 0x0000000000000000_000000000000ffff + // k32_48 := 
0x00000000ffffffff_0000ffffffffffff + movi k32_48.2d, #0xffffffff + mov k32_48.h[2], k32_48.h[0] + ushr k00_16.2d, k32_48.2d, #32 + + // prepare the permutation vectors + mov_q x5, 0x080f0e0d0c0b0a09 + movi T1.8b, #8 + dup perm1.2d, x5 + eor perm1.16b, perm1.16b, T1.16b + ushr perm2.2d, perm1.2d, #8 + ushr perm3.2d, perm1.2d, #16 + ushr T1.2d, perm1.2d, #24 + sli perm2.2d, perm1.2d, #56 + sli perm3.2d, perm1.2d, #48 + sli T1.2d, perm1.2d, #40 + + // precompute loop invariants + tbl sh1.16b, {SHASH.16b}, perm1.16b + tbl sh2.16b, {SHASH.16b}, perm2.16b + tbl sh3.16b, {SHASH.16b}, perm3.16b + tbl sh4.16b, {SHASH.16b}, T1.16b + ext ss1.8b, SHASH2.8b, SHASH2.8b, #1 + ext ss2.8b, SHASH2.8b, SHASH2.8b, #2 + ext ss3.8b, SHASH2.8b, SHASH2.8b, #3 + ext ss4.8b, SHASH2.8b, SHASH2.8b, #4 + .endm + + // + // PMULL (64x64->128) based reduction for CPUs that can do + // it in a single instruction. + // + .macro __pmull_reduce_p64 + pmull T2.1q, XL.1d, MASK.1d + eor XM.16b, XM.16b, T1.16b + + mov XH.d[0], XM.d[1] + mov XM.d[1], XL.d[0] + + eor XL.16b, XM.16b, T2.16b + ext T2.16b, XL.16b, XL.16b, #8 + pmull XL.1q, XL.1d, MASK.1d + .endm + + // + // Alternative reduction for CPUs that lack support for the + // 64x64->128 PMULL instruction + // + .macro __pmull_reduce_p8 + eor XM.16b, XM.16b, T1.16b + + mov XL.d[1], XM.d[0] + mov XH.d[0], XM.d[1] + + shl T1.2d, XL.2d, #57 + shl T2.2d, XL.2d, #62 + eor T2.16b, T2.16b, T1.16b + shl T1.2d, XL.2d, #63 + eor T2.16b, T2.16b, T1.16b + ext T1.16b, XL.16b, XH.16b, #8 + eor T2.16b, T2.16b, T1.16b + + mov XL.d[1], T2.d[0] + mov XH.d[0], T2.d[1] + + ushr T2.2d, XL.2d, #1 + eor XH.16b, XH.16b, XL.16b + eor XL.16b, XL.16b, T2.16b + ushr T2.2d, T2.2d, #6 + ushr XL.2d, XL.2d, #1 + .endm + + .macro __pmull_ghash, pn ld1 {SHASH.2d}, [x3] ld1 {XL.2d}, [x1] - movi MASK.16b, #0xe1 ext SHASH2.16b, SHASH.16b, SHASH.16b, #8 - shl MASK.2d, MASK.2d, #57 eor SHASH2.16b, SHASH2.16b, SHASH.16b + __pmull_pre_\pn + /* do the head block first, if supplied */ cbz x4, 0f ld1 {T1.2d}, [x4] @@ -52,28 +236,209 @@ CPU_LE( rev64 T1.16b, T1.16b ) eor T1.16b, T1.16b, T2.16b eor XL.16b, XL.16b, IN1.16b + __pmull2_\pn XH, XL, SHASH // a1 * b1 + eor T1.16b, T1.16b, XL.16b + __pmull_\pn XL, XL, SHASH // a0 * b0 + __pmull_\pn XM, T1, SHASH2 // (a1 + a0)(b1 + b0) + + eor T2.16b, XL.16b, XH.16b + ext T1.16b, XL.16b, XH.16b, #8 + eor XM.16b, XM.16b, T2.16b + + __pmull_reduce_\pn + + eor T2.16b, T2.16b, XH.16b + eor XL.16b, XL.16b, T2.16b + + cbnz w0, 0b + + st1 {XL.2d}, [x1] + ret + .endm + + /* + * void pmull_ghash_update(int blocks, u64 dg[], const char *src, + * struct ghash_key const *k, const char *head) + */ +ENTRY(pmull_ghash_update_p64) + __pmull_ghash p64 +ENDPROC(pmull_ghash_update_p64) + +ENTRY(pmull_ghash_update_p8) + __pmull_ghash p8 +ENDPROC(pmull_ghash_update_p8) + + KS .req v8 + CTR .req v9 + INP .req v10 + + .macro load_round_keys, rounds, rk + cmp \rounds, #12 + blo 2222f /* 128 bits */ + beq 1111f /* 192 bits */ + ld1 {v17.4s-v18.4s}, [\rk], #32 +1111: ld1 {v19.4s-v20.4s}, [\rk], #32 +2222: ld1 {v21.4s-v24.4s}, [\rk], #64 + ld1 {v25.4s-v28.4s}, [\rk], #64 + ld1 {v29.4s-v31.4s}, [\rk] + .endm + + .macro enc_round, state, key + aese \state\().16b, \key\().16b + aesmc \state\().16b, \state\().16b + .endm + + .macro enc_block, state, rounds + cmp \rounds, #12 + b.lo 2222f /* 128 bits */ + b.eq 1111f /* 192 bits */ + enc_round \state, v17 + enc_round \state, v18 +1111: enc_round \state, v19 + enc_round \state, v20 +2222: .irp key, v21, v22, v23, v24, v25, v26, v27, v28, v29 + enc_round \state, 
\key + .endr + aese \state\().16b, v30.16b + eor \state\().16b, \state\().16b, v31.16b + .endm + + .macro pmull_gcm_do_crypt, enc + ld1 {SHASH.2d}, [x4] + ld1 {XL.2d}, [x1] + ldr x8, [x5, #8] // load lower counter + + movi MASK.16b, #0xe1 + ext SHASH2.16b, SHASH.16b, SHASH.16b, #8 +CPU_LE( rev x8, x8 ) + shl MASK.2d, MASK.2d, #57 + eor SHASH2.16b, SHASH2.16b, SHASH.16b + + .if \enc == 1 + ld1 {KS.16b}, [x7] + .endif + +0: ld1 {CTR.8b}, [x5] // load upper counter + ld1 {INP.16b}, [x3], #16 + rev x9, x8 + add x8, x8, #1 + sub w0, w0, #1 + ins CTR.d[1], x9 // set lower counter + + .if \enc == 1 + eor INP.16b, INP.16b, KS.16b // encrypt input + st1 {INP.16b}, [x2], #16 + .endif + + rev64 T1.16b, INP.16b + + cmp w6, #12 + b.ge 2f // AES-192/256? + +1: enc_round CTR, v21 + + ext T2.16b, XL.16b, XL.16b, #8 + ext IN1.16b, T1.16b, T1.16b, #8 + + enc_round CTR, v22 + + eor T1.16b, T1.16b, T2.16b + eor XL.16b, XL.16b, IN1.16b + + enc_round CTR, v23 + pmull2 XH.1q, SHASH.2d, XL.2d // a1 * b1 eor T1.16b, T1.16b, XL.16b + + enc_round CTR, v24 + pmull XL.1q, SHASH.1d, XL.1d // a0 * b0 pmull XM.1q, SHASH2.1d, T1.1d // (a1 + a0)(b1 + b0) + enc_round CTR, v25 + ext T1.16b, XL.16b, XH.16b, #8 eor T2.16b, XL.16b, XH.16b eor XM.16b, XM.16b, T1.16b + + enc_round CTR, v26 + eor XM.16b, XM.16b, T2.16b pmull T2.1q, XL.1d, MASK.1d + enc_round CTR, v27 + mov XH.d[0], XM.d[1] mov XM.d[1], XL.d[0] + enc_round CTR, v28 + eor XL.16b, XM.16b, T2.16b + + enc_round CTR, v29 + ext T2.16b, XL.16b, XL.16b, #8 + + aese CTR.16b, v30.16b + pmull XL.1q, XL.1d, MASK.1d eor T2.16b, T2.16b, XH.16b + + eor KS.16b, CTR.16b, v31.16b + eor XL.16b, XL.16b, T2.16b + .if \enc == 0 + eor INP.16b, INP.16b, KS.16b + st1 {INP.16b}, [x2], #16 + .endif + cbnz w0, 0b +CPU_LE( rev x8, x8 ) st1 {XL.2d}, [x1] + str x8, [x5, #8] // store lower counter + + .if \enc == 1 + st1 {KS.16b}, [x7] + .endif + + ret + +2: b.eq 3f // AES-192? + enc_round CTR, v17 + enc_round CTR, v18 +3: enc_round CTR, v19 + enc_round CTR, v20 + b 1b + .endm + + /* + * void pmull_gcm_encrypt(int blocks, u64 dg[], u8 dst[], const u8 src[], + * struct ghash_key const *k, u8 ctr[], + * int rounds, u8 ks[]) + */ +ENTRY(pmull_gcm_encrypt) + pmull_gcm_do_crypt 1 +ENDPROC(pmull_gcm_encrypt) + + /* + * void pmull_gcm_decrypt(int blocks, u64 dg[], u8 dst[], const u8 src[], + * struct ghash_key const *k, u8 ctr[], + * int rounds) + */ +ENTRY(pmull_gcm_decrypt) + pmull_gcm_do_crypt 0 +ENDPROC(pmull_gcm_decrypt) + + /* + * void pmull_gcm_encrypt_block(u8 dst[], u8 src[], u8 rk[], int rounds) + */ +ENTRY(pmull_gcm_encrypt_block) + cbz x2, 0f + load_round_keys w3, x2 +0: ld1 {v0.16b}, [x1] + enc_block v0, w3 + st1 {v0.16b}, [x0] ret -ENDPROC(pmull_ghash_update) +ENDPROC(pmull_gcm_encrypt_block) diff --git a/arch/arm64/crypto/ghash-ce-glue.c b/arch/arm64/crypto/ghash-ce-glue.c index 833ec1e3f3e9..cfc9c92814fd 100644 --- a/arch/arm64/crypto/ghash-ce-glue.c +++ b/arch/arm64/crypto/ghash-ce-glue.c @@ -1,7 +1,7 @@ /* * Accelerated GHASH implementation with ARMv8 PMULL instructions. * - * Copyright (C) 2014 Linaro Ltd. <ard.biesheuvel@linaro.org> + * Copyright (C) 2014 - 2017 Linaro Ltd. 
<ard.biesheuvel@linaro.org> * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 as published @@ -9,22 +9,33 @@ */ #include <asm/neon.h> +#include <asm/simd.h> #include <asm/unaligned.h> +#include <crypto/aes.h> +#include <crypto/algapi.h> +#include <crypto/b128ops.h> +#include <crypto/gf128mul.h> +#include <crypto/internal/aead.h> #include <crypto/internal/hash.h> +#include <crypto/internal/skcipher.h> +#include <crypto/scatterwalk.h> #include <linux/cpufeature.h> #include <linux/crypto.h> #include <linux/module.h> -MODULE_DESCRIPTION("GHASH secure hash using ARMv8 Crypto Extensions"); +MODULE_DESCRIPTION("GHASH and AES-GCM using ARMv8 Crypto Extensions"); MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); MODULE_LICENSE("GPL v2"); +MODULE_ALIAS_CRYPTO("ghash"); #define GHASH_BLOCK_SIZE 16 #define GHASH_DIGEST_SIZE 16 +#define GCM_IV_SIZE 12 struct ghash_key { u64 a; u64 b; + be128 k; }; struct ghash_desc_ctx { @@ -33,8 +44,35 @@ struct ghash_desc_ctx { u32 count; }; -asmlinkage void pmull_ghash_update(int blocks, u64 dg[], const char *src, - struct ghash_key const *k, const char *head); +struct gcm_aes_ctx { + struct crypto_aes_ctx aes_key; + struct ghash_key ghash_key; +}; + +asmlinkage void pmull_ghash_update_p64(int blocks, u64 dg[], const char *src, + struct ghash_key const *k, + const char *head); + +asmlinkage void pmull_ghash_update_p8(int blocks, u64 dg[], const char *src, + struct ghash_key const *k, + const char *head); + +static void (*pmull_ghash_update)(int blocks, u64 dg[], const char *src, + struct ghash_key const *k, + const char *head); + +asmlinkage void pmull_gcm_encrypt(int blocks, u64 dg[], u8 dst[], + const u8 src[], struct ghash_key const *k, + u8 ctr[], int rounds, u8 ks[]); + +asmlinkage void pmull_gcm_decrypt(int blocks, u64 dg[], u8 dst[], + const u8 src[], struct ghash_key const *k, + u8 ctr[], int rounds); + +asmlinkage void pmull_gcm_encrypt_block(u8 dst[], u8 const src[], + u32 const rk[], int rounds); + +asmlinkage void __aes_arm64_encrypt(u32 *rk, u8 *out, const u8 *in, int rounds); static int ghash_init(struct shash_desc *desc) { @@ -44,6 +82,36 @@ static int ghash_init(struct shash_desc *desc) return 0; } +static void ghash_do_update(int blocks, u64 dg[], const char *src, + struct ghash_key *key, const char *head) +{ + if (likely(may_use_simd())) { + kernel_neon_begin(); + pmull_ghash_update(blocks, dg, src, key, head); + kernel_neon_end(); + } else { + be128 dst = { cpu_to_be64(dg[1]), cpu_to_be64(dg[0]) }; + + do { + const u8 *in = src; + + if (head) { + in = head; + blocks++; + head = NULL; + } else { + src += GHASH_BLOCK_SIZE; + } + + crypto_xor((u8 *)&dst, in, GHASH_BLOCK_SIZE); + gf128mul_lle(&dst, &key->k); + } while (--blocks); + + dg[0] = be64_to_cpu(dst.b); + dg[1] = be64_to_cpu(dst.a); + } +} + static int ghash_update(struct shash_desc *desc, const u8 *src, unsigned int len) { @@ -67,10 +135,9 @@ static int ghash_update(struct shash_desc *desc, const u8 *src, blocks = len / GHASH_BLOCK_SIZE; len %= GHASH_BLOCK_SIZE; - kernel_neon_begin_partial(8); - pmull_ghash_update(blocks, ctx->digest, src, key, - partial ? ctx->buf : NULL); - kernel_neon_end(); + ghash_do_update(blocks, ctx->digest, src, key, + partial ? 
ctx->buf : NULL); + src += blocks * GHASH_BLOCK_SIZE; partial = 0; } @@ -89,9 +156,7 @@ static int ghash_final(struct shash_desc *desc, u8 *dst) memset(ctx->buf + partial, 0, GHASH_BLOCK_SIZE - partial); - kernel_neon_begin_partial(8); - pmull_ghash_update(1, ctx->digest, ctx->buf, key, NULL); - kernel_neon_end(); + ghash_do_update(1, ctx->digest, ctx->buf, key, NULL); } put_unaligned_be64(ctx->digest[1], dst); put_unaligned_be64(ctx->digest[0], dst + 8); @@ -100,16 +165,13 @@ static int ghash_final(struct shash_desc *desc, u8 *dst) return 0; } -static int ghash_setkey(struct crypto_shash *tfm, - const u8 *inkey, unsigned int keylen) +static int __ghash_setkey(struct ghash_key *key, + const u8 *inkey, unsigned int keylen) { - struct ghash_key *key = crypto_shash_ctx(tfm); u64 a, b; - if (keylen != GHASH_BLOCK_SIZE) { - crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); - return -EINVAL; - } + /* needed for the fallback */ + memcpy(&key->k, inkey, GHASH_BLOCK_SIZE); /* perform multiplication by 'x' in GF(2^128) */ b = get_unaligned_be64(inkey); @@ -124,33 +186,418 @@ static int ghash_setkey(struct crypto_shash *tfm, return 0; } +static int ghash_setkey(struct crypto_shash *tfm, + const u8 *inkey, unsigned int keylen) +{ + struct ghash_key *key = crypto_shash_ctx(tfm); + + if (keylen != GHASH_BLOCK_SIZE) { + crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + return -EINVAL; + } + + return __ghash_setkey(key, inkey, keylen); +} + static struct shash_alg ghash_alg = { - .digestsize = GHASH_DIGEST_SIZE, - .init = ghash_init, - .update = ghash_update, - .final = ghash_final, - .setkey = ghash_setkey, - .descsize = sizeof(struct ghash_desc_ctx), - .base = { - .cra_name = "ghash", - .cra_driver_name = "ghash-ce", - .cra_priority = 200, - .cra_flags = CRYPTO_ALG_TYPE_SHASH, - .cra_blocksize = GHASH_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct ghash_key), - .cra_module = THIS_MODULE, - }, + .base.cra_name = "ghash", + .base.cra_driver_name = "ghash-ce", + .base.cra_priority = 200, + .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, + .base.cra_blocksize = GHASH_BLOCK_SIZE, + .base.cra_ctxsize = sizeof(struct ghash_key), + .base.cra_module = THIS_MODULE, + + .digestsize = GHASH_DIGEST_SIZE, + .init = ghash_init, + .update = ghash_update, + .final = ghash_final, + .setkey = ghash_setkey, + .descsize = sizeof(struct ghash_desc_ctx), +}; + +static int num_rounds(struct crypto_aes_ctx *ctx) +{ + /* + * # of rounds specified by AES: + * 128 bit key 10 rounds + * 192 bit key 12 rounds + * 256 bit key 14 rounds + * => n byte key => 6 + (n/4) rounds + */ + return 6 + ctx->key_length / 4; +} + +static int gcm_setkey(struct crypto_aead *tfm, const u8 *inkey, + unsigned int keylen) +{ + struct gcm_aes_ctx *ctx = crypto_aead_ctx(tfm); + u8 key[GHASH_BLOCK_SIZE]; + int ret; + + ret = crypto_aes_expand_key(&ctx->aes_key, inkey, keylen); + if (ret) { + tfm->base.crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; + return -EINVAL; + } + + __aes_arm64_encrypt(ctx->aes_key.key_enc, key, (u8[AES_BLOCK_SIZE]){}, + num_rounds(&ctx->aes_key)); + + return __ghash_setkey(&ctx->ghash_key, key, sizeof(key)); +} + +static int gcm_setauthsize(struct crypto_aead *tfm, unsigned int authsize) +{ + switch (authsize) { + case 4: + case 8: + case 12 ... 
16: + break; + default: + return -EINVAL; + } + return 0; +} + +static void gcm_update_mac(u64 dg[], const u8 *src, int count, u8 buf[], + int *buf_count, struct gcm_aes_ctx *ctx) +{ + if (*buf_count > 0) { + int buf_added = min(count, GHASH_BLOCK_SIZE - *buf_count); + + memcpy(&buf[*buf_count], src, buf_added); + + *buf_count += buf_added; + src += buf_added; + count -= buf_added; + } + + if (count >= GHASH_BLOCK_SIZE || *buf_count == GHASH_BLOCK_SIZE) { + int blocks = count / GHASH_BLOCK_SIZE; + + ghash_do_update(blocks, dg, src, &ctx->ghash_key, + *buf_count ? buf : NULL); + + src += blocks * GHASH_BLOCK_SIZE; + count %= GHASH_BLOCK_SIZE; + *buf_count = 0; + } + + if (count > 0) { + memcpy(buf, src, count); + *buf_count = count; + } +} + +static void gcm_calculate_auth_mac(struct aead_request *req, u64 dg[]) +{ + struct crypto_aead *aead = crypto_aead_reqtfm(req); + struct gcm_aes_ctx *ctx = crypto_aead_ctx(aead); + u8 buf[GHASH_BLOCK_SIZE]; + struct scatter_walk walk; + u32 len = req->assoclen; + int buf_count = 0; + + scatterwalk_start(&walk, req->src); + + do { + u32 n = scatterwalk_clamp(&walk, len); + u8 *p; + + if (!n) { + scatterwalk_start(&walk, sg_next(walk.sg)); + n = scatterwalk_clamp(&walk, len); + } + p = scatterwalk_map(&walk); + + gcm_update_mac(dg, p, n, buf, &buf_count, ctx); + len -= n; + + scatterwalk_unmap(p); + scatterwalk_advance(&walk, n); + scatterwalk_done(&walk, 0, len); + } while (len); + + if (buf_count) { + memset(&buf[buf_count], 0, GHASH_BLOCK_SIZE - buf_count); + ghash_do_update(1, dg, buf, &ctx->ghash_key, NULL); + } +} + +static void gcm_final(struct aead_request *req, struct gcm_aes_ctx *ctx, + u64 dg[], u8 tag[], int cryptlen) +{ + u8 mac[AES_BLOCK_SIZE]; + u128 lengths; + + lengths.a = cpu_to_be64(req->assoclen * 8); + lengths.b = cpu_to_be64(cryptlen * 8); + + ghash_do_update(1, dg, (void *)&lengths, &ctx->ghash_key, NULL); + + put_unaligned_be64(dg[1], mac); + put_unaligned_be64(dg[0], mac + 8); + + crypto_xor(tag, mac, AES_BLOCK_SIZE); +} + +static int gcm_encrypt(struct aead_request *req) +{ + struct crypto_aead *aead = crypto_aead_reqtfm(req); + struct gcm_aes_ctx *ctx = crypto_aead_ctx(aead); + struct skcipher_walk walk; + u8 iv[AES_BLOCK_SIZE]; + u8 ks[AES_BLOCK_SIZE]; + u8 tag[AES_BLOCK_SIZE]; + u64 dg[2] = {}; + int err; + + if (req->assoclen) + gcm_calculate_auth_mac(req, dg); + + memcpy(iv, req->iv, GCM_IV_SIZE); + put_unaligned_be32(1, iv + GCM_IV_SIZE); + + if (likely(may_use_simd())) { + kernel_neon_begin(); + + pmull_gcm_encrypt_block(tag, iv, ctx->aes_key.key_enc, + num_rounds(&ctx->aes_key)); + put_unaligned_be32(2, iv + GCM_IV_SIZE); + pmull_gcm_encrypt_block(ks, iv, NULL, + num_rounds(&ctx->aes_key)); + put_unaligned_be32(3, iv + GCM_IV_SIZE); + + err = skcipher_walk_aead_encrypt(&walk, req, true); + + while (walk.nbytes >= AES_BLOCK_SIZE) { + int blocks = walk.nbytes / AES_BLOCK_SIZE; + + pmull_gcm_encrypt(blocks, dg, walk.dst.virt.addr, + walk.src.virt.addr, &ctx->ghash_key, + iv, num_rounds(&ctx->aes_key), ks); + + err = skcipher_walk_done(&walk, + walk.nbytes % AES_BLOCK_SIZE); + } + kernel_neon_end(); + } else { + __aes_arm64_encrypt(ctx->aes_key.key_enc, tag, iv, + num_rounds(&ctx->aes_key)); + put_unaligned_be32(2, iv + GCM_IV_SIZE); + + err = skcipher_walk_aead_encrypt(&walk, req, true); + + while (walk.nbytes >= AES_BLOCK_SIZE) { + int blocks = walk.nbytes / AES_BLOCK_SIZE; + u8 *dst = walk.dst.virt.addr; + u8 *src = walk.src.virt.addr; + + do { + __aes_arm64_encrypt(ctx->aes_key.key_enc, + ks, iv, + 
num_rounds(&ctx->aes_key)); + crypto_xor_cpy(dst, src, ks, AES_BLOCK_SIZE); + crypto_inc(iv, AES_BLOCK_SIZE); + + dst += AES_BLOCK_SIZE; + src += AES_BLOCK_SIZE; + } while (--blocks > 0); + + ghash_do_update(walk.nbytes / AES_BLOCK_SIZE, dg, + walk.dst.virt.addr, &ctx->ghash_key, + NULL); + + err = skcipher_walk_done(&walk, + walk.nbytes % AES_BLOCK_SIZE); + } + if (walk.nbytes) + __aes_arm64_encrypt(ctx->aes_key.key_enc, ks, iv, + num_rounds(&ctx->aes_key)); + } + + /* handle the tail */ + if (walk.nbytes) { + u8 buf[GHASH_BLOCK_SIZE]; + + crypto_xor_cpy(walk.dst.virt.addr, walk.src.virt.addr, ks, + walk.nbytes); + + memcpy(buf, walk.dst.virt.addr, walk.nbytes); + memset(buf + walk.nbytes, 0, GHASH_BLOCK_SIZE - walk.nbytes); + ghash_do_update(1, dg, buf, &ctx->ghash_key, NULL); + + err = skcipher_walk_done(&walk, 0); + } + + if (err) + return err; + + gcm_final(req, ctx, dg, tag, req->cryptlen); + + /* copy authtag to end of dst */ + scatterwalk_map_and_copy(tag, req->dst, req->assoclen + req->cryptlen, + crypto_aead_authsize(aead), 1); + + return 0; +} + +static int gcm_decrypt(struct aead_request *req) +{ + struct crypto_aead *aead = crypto_aead_reqtfm(req); + struct gcm_aes_ctx *ctx = crypto_aead_ctx(aead); + unsigned int authsize = crypto_aead_authsize(aead); + struct skcipher_walk walk; + u8 iv[AES_BLOCK_SIZE]; + u8 tag[AES_BLOCK_SIZE]; + u8 buf[GHASH_BLOCK_SIZE]; + u64 dg[2] = {}; + int err; + + if (req->assoclen) + gcm_calculate_auth_mac(req, dg); + + memcpy(iv, req->iv, GCM_IV_SIZE); + put_unaligned_be32(1, iv + GCM_IV_SIZE); + + if (likely(may_use_simd())) { + kernel_neon_begin(); + + pmull_gcm_encrypt_block(tag, iv, ctx->aes_key.key_enc, + num_rounds(&ctx->aes_key)); + put_unaligned_be32(2, iv + GCM_IV_SIZE); + + err = skcipher_walk_aead_decrypt(&walk, req, true); + + while (walk.nbytes >= AES_BLOCK_SIZE) { + int blocks = walk.nbytes / AES_BLOCK_SIZE; + + pmull_gcm_decrypt(blocks, dg, walk.dst.virt.addr, + walk.src.virt.addr, &ctx->ghash_key, + iv, num_rounds(&ctx->aes_key)); + + err = skcipher_walk_done(&walk, + walk.nbytes % AES_BLOCK_SIZE); + } + if (walk.nbytes) + pmull_gcm_encrypt_block(iv, iv, NULL, + num_rounds(&ctx->aes_key)); + + kernel_neon_end(); + } else { + __aes_arm64_encrypt(ctx->aes_key.key_enc, tag, iv, + num_rounds(&ctx->aes_key)); + put_unaligned_be32(2, iv + GCM_IV_SIZE); + + err = skcipher_walk_aead_decrypt(&walk, req, true); + + while (walk.nbytes >= AES_BLOCK_SIZE) { + int blocks = walk.nbytes / AES_BLOCK_SIZE; + u8 *dst = walk.dst.virt.addr; + u8 *src = walk.src.virt.addr; + + ghash_do_update(blocks, dg, walk.src.virt.addr, + &ctx->ghash_key, NULL); + + do { + __aes_arm64_encrypt(ctx->aes_key.key_enc, + buf, iv, + num_rounds(&ctx->aes_key)); + crypto_xor_cpy(dst, src, buf, AES_BLOCK_SIZE); + crypto_inc(iv, AES_BLOCK_SIZE); + + dst += AES_BLOCK_SIZE; + src += AES_BLOCK_SIZE; + } while (--blocks > 0); + + err = skcipher_walk_done(&walk, + walk.nbytes % AES_BLOCK_SIZE); + } + if (walk.nbytes) + __aes_arm64_encrypt(ctx->aes_key.key_enc, iv, iv, + num_rounds(&ctx->aes_key)); + } + + /* handle the tail */ + if (walk.nbytes) { + memcpy(buf, walk.src.virt.addr, walk.nbytes); + memset(buf + walk.nbytes, 0, GHASH_BLOCK_SIZE - walk.nbytes); + ghash_do_update(1, dg, buf, &ctx->ghash_key, NULL); + + crypto_xor_cpy(walk.dst.virt.addr, walk.src.virt.addr, iv, + walk.nbytes); + + err = skcipher_walk_done(&walk, 0); + } + + if (err) + return err; + + gcm_final(req, ctx, dg, tag, req->cryptlen - authsize); + + /* compare calculated auth tag with the stored one */ + 
scatterwalk_map_and_copy(buf, req->src, + req->assoclen + req->cryptlen - authsize, + authsize, 0); + + if (crypto_memneq(tag, buf, authsize)) + return -EBADMSG; + return 0; +} + +static struct aead_alg gcm_aes_alg = { + .ivsize = GCM_IV_SIZE, + .chunksize = AES_BLOCK_SIZE, + .maxauthsize = AES_BLOCK_SIZE, + .setkey = gcm_setkey, + .setauthsize = gcm_setauthsize, + .encrypt = gcm_encrypt, + .decrypt = gcm_decrypt, + + .base.cra_name = "gcm(aes)", + .base.cra_driver_name = "gcm-aes-ce", + .base.cra_priority = 300, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct gcm_aes_ctx), + .base.cra_module = THIS_MODULE, }; static int __init ghash_ce_mod_init(void) { - return crypto_register_shash(&ghash_alg); + int ret; + + if (!(elf_hwcap & HWCAP_ASIMD)) + return -ENODEV; + + if (elf_hwcap & HWCAP_PMULL) + pmull_ghash_update = pmull_ghash_update_p64; + + else + pmull_ghash_update = pmull_ghash_update_p8; + + ret = crypto_register_shash(&ghash_alg); + if (ret) + return ret; + + if (elf_hwcap & HWCAP_PMULL) { + ret = crypto_register_aead(&gcm_aes_alg); + if (ret) + crypto_unregister_shash(&ghash_alg); + } + return ret; } static void __exit ghash_ce_mod_exit(void) { crypto_unregister_shash(&ghash_alg); + crypto_unregister_aead(&gcm_aes_alg); } -module_cpu_feature_match(PMULL, ghash_ce_mod_init); +static const struct cpu_feature ghash_cpu_feature[] = { + { cpu_feature(PMULL) }, { } +}; +MODULE_DEVICE_TABLE(cpu, ghash_cpu_feature); + +module_init(ghash_ce_mod_init); module_exit(ghash_ce_mod_exit); diff --git a/arch/arm64/crypto/sha1-ce-glue.c b/arch/arm64/crypto/sha1-ce-glue.c index ea319c055f5d..efbeb3e0dcfb 100644 --- a/arch/arm64/crypto/sha1-ce-glue.c +++ b/arch/arm64/crypto/sha1-ce-glue.c @@ -1,7 +1,7 @@ /* * sha1-ce-glue.c - SHA-1 secure hash using ARMv8 Crypto Extensions * - * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org> + * Copyright (C) 2014 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -9,6 +9,7 @@ */ #include <asm/neon.h> +#include <asm/simd.h> #include <asm/unaligned.h> #include <crypto/internal/hash.h> #include <crypto/sha.h> @@ -37,8 +38,11 @@ static int sha1_ce_update(struct shash_desc *desc, const u8 *data, { struct sha1_ce_state *sctx = shash_desc_ctx(desc); + if (!may_use_simd()) + return crypto_sha1_update(desc, data, len); + sctx->finalize = 0; - kernel_neon_begin_partial(16); + kernel_neon_begin(); sha1_base_do_update(desc, data, len, (sha1_block_fn *)sha1_ce_transform); kernel_neon_end(); @@ -52,13 +56,16 @@ static int sha1_ce_finup(struct shash_desc *desc, const u8 *data, struct sha1_ce_state *sctx = shash_desc_ctx(desc); bool finalize = !sctx->sst.count && !(len % SHA1_BLOCK_SIZE); + if (!may_use_simd()) + return crypto_sha1_finup(desc, data, len, out); + /* * Allow the asm code to perform the finalization if there is no * partial data and the input is a round multiple of the block size. 
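As a quick sanity check of the num_rounds() helper added earlier in ghash-ce-glue.c (the "6 + key_length / 4" formula from its comment), the three AES key sizes map to the expected round counts; the snippet below is purely illustrative and not part of the patch:

    #include <linux/build_bug.h>

    /* Illustrative compile-time check of the round-count formula;
     * key_length is in bytes, so AES-128/192/256 use 16/24/32. */
    static inline void check_aes_rounds(void)
    {
            BUILD_BUG_ON(6 + 16 / 4 != 10);     /* AES-128: 10 rounds */
            BUILD_BUG_ON(6 + 24 / 4 != 12);     /* AES-192: 12 rounds */
            BUILD_BUG_ON(6 + 32 / 4 != 14);     /* AES-256: 14 rounds */
    }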
*/ sctx->finalize = finalize; - kernel_neon_begin_partial(16); + kernel_neon_begin(); sha1_base_do_update(desc, data, len, (sha1_block_fn *)sha1_ce_transform); if (!finalize) @@ -71,8 +78,11 @@ static int sha1_ce_final(struct shash_desc *desc, u8 *out) { struct sha1_ce_state *sctx = shash_desc_ctx(desc); + if (!may_use_simd()) + return crypto_sha1_finup(desc, NULL, 0, out); + sctx->finalize = 0; - kernel_neon_begin_partial(16); + kernel_neon_begin(); sha1_base_do_finalize(desc, (sha1_block_fn *)sha1_ce_transform); kernel_neon_end(); return sha1_base_finish(desc, out); diff --git a/arch/arm64/crypto/sha2-ce-glue.c b/arch/arm64/crypto/sha2-ce-glue.c index 0ed9486f75dd..fd1ff2b13dfa 100644 --- a/arch/arm64/crypto/sha2-ce-glue.c +++ b/arch/arm64/crypto/sha2-ce-glue.c @@ -1,7 +1,7 @@ /* * sha2-ce-glue.c - SHA-224/SHA-256 using ARMv8 Crypto Extensions * - * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org> + * Copyright (C) 2014 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -9,6 +9,7 @@ */ #include <asm/neon.h> +#include <asm/simd.h> #include <asm/unaligned.h> #include <crypto/internal/hash.h> #include <crypto/sha.h> @@ -34,13 +35,19 @@ const u32 sha256_ce_offsetof_count = offsetof(struct sha256_ce_state, const u32 sha256_ce_offsetof_finalize = offsetof(struct sha256_ce_state, finalize); +asmlinkage void sha256_block_data_order(u32 *digest, u8 const *src, int blocks); + static int sha256_ce_update(struct shash_desc *desc, const u8 *data, unsigned int len) { struct sha256_ce_state *sctx = shash_desc_ctx(desc); + if (!may_use_simd()) + return sha256_base_do_update(desc, data, len, + (sha256_block_fn *)sha256_block_data_order); + sctx->finalize = 0; - kernel_neon_begin_partial(28); + kernel_neon_begin(); sha256_base_do_update(desc, data, len, (sha256_block_fn *)sha2_ce_transform); kernel_neon_end(); @@ -54,13 +61,22 @@ static int sha256_ce_finup(struct shash_desc *desc, const u8 *data, struct sha256_ce_state *sctx = shash_desc_ctx(desc); bool finalize = !sctx->sst.count && !(len % SHA256_BLOCK_SIZE); + if (!may_use_simd()) { + if (len) + sha256_base_do_update(desc, data, len, + (sha256_block_fn *)sha256_block_data_order); + sha256_base_do_finalize(desc, + (sha256_block_fn *)sha256_block_data_order); + return sha256_base_finish(desc, out); + } + /* * Allow the asm code to perform the finalization if there is no * partial data and the input is a round multiple of the block size. 
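For context, the gcm-aes-ce AEAD registered above is reached through the normal in-kernel AEAD API rather than called directly; an abbreviated, illustrative sketch follows (error handling and completion waiting omitted, throw-away key and nonce, none of this is taken from the patch):

    #include <crypto/aead.h>
    #include <linux/scatterlist.h>

    static int demo_gcm_encrypt(struct scatterlist *sg, unsigned int ptext_len)
    {
            u8 key[16] = {}, iv[12] = {};           /* illustration only */
            struct crypto_aead *tfm = crypto_alloc_aead("gcm(aes)", 0, 0);
            struct aead_request *req = aead_request_alloc(tfm, GFP_KERNEL);
            int err;

            crypto_aead_setkey(tfm, key, sizeof(key));
            crypto_aead_setauthsize(tfm, 16);
            /* sg must cover the plaintext plus 16 spare bytes for the tag */
            aead_request_set_crypt(req, sg, sg, ptext_len, iv);
            aead_request_set_ad(req, 0);
            err = crypto_aead_encrypt(req);         /* may return -EINPROGRESS
                                                       for async providers */
            aead_request_free(req);
            crypto_free_aead(tfm);
            return err;
    }

A real caller would check IS_ERR(tfm), handle allocation failures, and wait for completion when an asynchronous "gcm(aes)" implementation wins the priority selection; the point here is only that the new driver slots in behind the generic "gcm(aes)" name.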
*/ sctx->finalize = finalize; - kernel_neon_begin_partial(28); + kernel_neon_begin(); sha256_base_do_update(desc, data, len, (sha256_block_fn *)sha2_ce_transform); if (!finalize) @@ -74,8 +90,14 @@ static int sha256_ce_final(struct shash_desc *desc, u8 *out) { struct sha256_ce_state *sctx = shash_desc_ctx(desc); + if (!may_use_simd()) { + sha256_base_do_finalize(desc, + (sha256_block_fn *)sha256_block_data_order); + return sha256_base_finish(desc, out); + } + sctx->finalize = 0; - kernel_neon_begin_partial(28); + kernel_neon_begin(); sha256_base_do_finalize(desc, (sha256_block_fn *)sha2_ce_transform); kernel_neon_end(); return sha256_base_finish(desc, out); diff --git a/arch/arm64/crypto/sha256-glue.c b/arch/arm64/crypto/sha256-glue.c index a2226f841960..b064d925fe2a 100644 --- a/arch/arm64/crypto/sha256-glue.c +++ b/arch/arm64/crypto/sha256-glue.c @@ -29,6 +29,7 @@ MODULE_ALIAS_CRYPTO("sha256"); asmlinkage void sha256_block_data_order(u32 *digest, const void *data, unsigned int num_blks); +EXPORT_SYMBOL(sha256_block_data_order); asmlinkage void sha256_block_neon(u32 *digest, const void *data, unsigned int num_blks); diff --git a/arch/arm64/include/asm/Kbuild b/arch/arm64/include/asm/Kbuild index f81c7b685fc6..2326e39d5892 100644 --- a/arch/arm64/include/asm/Kbuild +++ b/arch/arm64/include/asm/Kbuild @@ -20,7 +20,6 @@ generic-y += rwsem.h generic-y += segment.h generic-y += serial.h generic-y += set_memory.h -generic-y += simd.h generic-y += sizes.h generic-y += switch_to.h generic-y += trace_clock.h diff --git a/arch/arm64/include/asm/arch_gicv3.h b/arch/arm64/include/asm/arch_gicv3.h index 8cef47fa2218..b7e3f74822da 100644 --- a/arch/arm64/include/asm/arch_gicv3.h +++ b/arch/arm64/include/asm/arch_gicv3.h @@ -116,6 +116,8 @@ static inline void gic_write_bpr1(u32 val) #define gic_read_typer(c) readq_relaxed(c) #define gic_write_irouter(v, c) writeq_relaxed(v, c) +#define gic_read_lpir(c) readq_relaxed(c) +#define gic_write_lpir(v, c) writeq_relaxed(v, c) #define gic_flush_dcache_to_poc(a,l) __flush_dcache_area((a), (l)) @@ -133,5 +135,10 @@ static inline void gic_write_bpr1(u32 val) #define gicr_write_pendbaser(v, c) writeq_relaxed(v, c) #define gicr_read_pendbaser(c) readq_relaxed(c) +#define gits_write_vpropbaser(v, c) writeq_relaxed(v, c) + +#define gits_write_vpendbaser(v, c) writeq_relaxed(v, c) +#define gits_read_vpendbaser(c) readq_relaxed(c) + #endif /* __ASSEMBLY__ */ #endif /* __ASM_ARCH_GICV3_H */ diff --git a/arch/arm64/include/asm/asm-bug.h b/arch/arm64/include/asm/asm-bug.h new file mode 100644 index 000000000000..636e755bcdca --- /dev/null +++ b/arch/arm64/include/asm/asm-bug.h @@ -0,0 +1,54 @@ +#ifndef __ASM_ASM_BUG_H +/* + * Copyright (C) 2017 ARM Limited + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ +#define __ASM_ASM_BUG_H + +#include <asm/brk-imm.h> + +#ifdef CONFIG_DEBUG_BUGVERBOSE +#define _BUGVERBOSE_LOCATION(file, line) __BUGVERBOSE_LOCATION(file, line) +#define __BUGVERBOSE_LOCATION(file, line) \ + .pushsection .rodata.str,"aMS",@progbits,1; \ + 2: .string file; \ + .popsection; \ + \ + .long 2b - 0b; \ + .short line; +#else +#define _BUGVERBOSE_LOCATION(file, line) +#endif + +#ifdef CONFIG_GENERIC_BUG + +#define __BUG_ENTRY(flags) \ + .pushsection __bug_table,"aw"; \ + .align 2; \ + 0: .long 1f - 0b; \ +_BUGVERBOSE_LOCATION(__FILE__, __LINE__) \ + .short flags; \ + .popsection; \ + 1: +#else +#define __BUG_ENTRY(flags) +#endif + +#define ASM_BUG_FLAGS(flags) \ + __BUG_ENTRY(flags) \ + brk BUG_BRK_IMM + +#define ASM_BUG() ASM_BUG_FLAGS(0) + +#endif /* __ASM_ASM_BUG_H */ diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 1b67c3782d00..d58a6253c6ab 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -230,12 +230,18 @@ lr .req x30 // link register .endm /* - * @dst: Result of per_cpu(sym, smp_processor_id()) + * @dst: Result of per_cpu(sym, smp_processor_id()), can be SP for + * non-module code * @sym: The name of the per-cpu variable * @tmp: scratch register */ .macro adr_this_cpu, dst, sym, tmp +#ifndef MODULE + adrp \tmp, \sym + add \dst, \tmp, #:lo12:\sym +#else adr_l \dst, \sym +#endif mrs \tmp, tpidr_el1 add \dst, \dst, \tmp .endm @@ -353,6 +359,12 @@ alternative_if_not ARM64_WORKAROUND_CLEAN_CACHE alternative_else dc civac, \kaddr alternative_endif + .elseif (\op == cvap) +alternative_if ARM64_HAS_DCPOP + sys 3, c7, c12, 1, \kaddr // dc cvap +alternative_else + dc cvac, \kaddr +alternative_endif .else dc \op, \kaddr .endif @@ -403,6 +415,17 @@ alternative_endif .size __pi_##x, . - x; \ ENDPROC(x) +/* + * Annotate a function as being unsuitable for kprobes. 
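The NOKPROBE() macro whose definition follows gives assembly functions the same treatment that NOKPROBE_SYMBOL() gives C functions: the symbol's address is recorded in the _kprobe_blacklist section so kprobes refuses to instrument it. For comparison, a hypothetical C-side use (not from the patch):

    #include <linux/kprobes.h>

    /* demo_el1_helper() is a made-up example function */
    static int demo_el1_helper(struct pt_regs *regs)
    {
            return 0;
    }
    NOKPROBE_SYMBOL(demo_el1_helper);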
+ */ +#ifdef CONFIG_KPROBES +#define NOKPROBE(x) \ + .pushsection "_kprobe_blacklist", "aw"; \ + .quad x; \ + .popsection; +#else +#define NOKPROBE(x) +#endif /* * Emit a 64-bit absolute little endian symbol reference in a way that * ensures that it will be resolved at build time, even when building a diff --git a/arch/arm64/include/asm/bug.h b/arch/arm64/include/asm/bug.h index a02a57186f56..d7dc43752705 100644 --- a/arch/arm64/include/asm/bug.h +++ b/arch/arm64/include/asm/bug.h @@ -18,41 +18,12 @@ #ifndef _ARCH_ARM64_ASM_BUG_H #define _ARCH_ARM64_ASM_BUG_H -#include <asm/brk-imm.h> +#include <linux/stringify.h> -#ifdef CONFIG_DEBUG_BUGVERBOSE -#define _BUGVERBOSE_LOCATION(file, line) __BUGVERBOSE_LOCATION(file, line) -#define __BUGVERBOSE_LOCATION(file, line) \ - ".pushsection .rodata.str,\"aMS\",@progbits,1\n" \ - "2: .string \"" file "\"\n\t" \ - ".popsection\n\t" \ - \ - ".long 2b - 0b\n\t" \ - ".short " #line "\n\t" -#else -#define _BUGVERBOSE_LOCATION(file, line) -#endif - -#ifdef CONFIG_GENERIC_BUG - -#define __BUG_ENTRY(flags) \ - ".pushsection __bug_table,\"aw\"\n\t" \ - ".align 2\n\t" \ - "0: .long 1f - 0b\n\t" \ -_BUGVERBOSE_LOCATION(__FILE__, __LINE__) \ - ".short " #flags "\n\t" \ - ".popsection\n" \ - "1: " -#else -#define __BUG_ENTRY(flags) "" -#endif +#include <asm/asm-bug.h> #define __BUG_FLAGS(flags) \ - asm volatile ( \ - __BUG_ENTRY(flags) \ - "brk %[imm]" :: [imm] "i" (BUG_BRK_IMM) \ - ); - + asm volatile (__stringify(ASM_BUG_FLAGS(flags))); #define BUG() do { \ __BUG_FLAGS(0); \ diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h index d74a284abdc2..76d1cc85d5b1 100644 --- a/arch/arm64/include/asm/cacheflush.h +++ b/arch/arm64/include/asm/cacheflush.h @@ -67,7 +67,9 @@ */ extern void flush_icache_range(unsigned long start, unsigned long end); extern void __flush_dcache_area(void *addr, size_t len); +extern void __inval_dcache_area(void *addr, size_t len); extern void __clean_dcache_area_poc(void *addr, size_t len); +extern void __clean_dcache_area_pop(void *addr, size_t len); extern void __clean_dcache_area_pou(void *addr, size_t len); extern long __flush_cache_user_range(unsigned long start, unsigned long end); extern void sync_icache_aliases(void *kaddr, unsigned long len); @@ -150,6 +152,6 @@ static inline void flush_cache_vunmap(unsigned long start, unsigned long end) { } -int set_memory_valid(unsigned long addr, unsigned long size, int enable); +int set_memory_valid(unsigned long addr, int numpages, int enable); #endif diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h index 8d2272c6822c..8da621627d7c 100644 --- a/arch/arm64/include/asm/cpucaps.h +++ b/arch/arm64/include/asm/cpucaps.h @@ -39,7 +39,8 @@ #define ARM64_WORKAROUND_QCOM_FALKOR_E1003 18 #define ARM64_WORKAROUND_858921 19 #define ARM64_WORKAROUND_CAVIUM_30115 20 +#define ARM64_HAS_DCPOP 21 -#define ARM64_NCAPS 21 +#define ARM64_NCAPS 22 #endif /* __ASM_CPUCAPS_H */ diff --git a/arch/arm64/include/asm/efi.h b/arch/arm64/include/asm/efi.h index 464ac850c5e2..b93904b16fc2 100644 --- a/arch/arm64/include/asm/efi.h +++ b/arch/arm64/include/asm/efi.h @@ -3,7 +3,9 @@ #include <asm/boot.h> #include <asm/cpufeature.h> +#include <asm/fpsimd.h> #include <asm/io.h> +#include <asm/memory.h> #include <asm/mmu_context.h> #include <asm/neon.h> #include <asm/ptrace.h> @@ -20,8 +22,8 @@ int efi_set_mapping_permissions(struct mm_struct *mm, efi_memory_desc_t *md); #define arch_efi_call_virt_setup() \ ({ \ - kernel_neon_begin(); \ efi_virtmap_load(); \ + 
__efi_fpsimd_begin(); \ }) #define arch_efi_call_virt(p, f, args...) \ @@ -33,8 +35,8 @@ int efi_set_mapping_permissions(struct mm_struct *mm, efi_memory_desc_t *md); #define arch_efi_call_virt_teardown() \ ({ \ + __efi_fpsimd_end(); \ efi_virtmap_unload(); \ - kernel_neon_end(); \ }) #define ARCH_EFI_IRQ_FLAGS_MASK (PSR_D_BIT | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT) @@ -48,6 +50,13 @@ int efi_set_mapping_permissions(struct mm_struct *mm, efi_memory_desc_t *md); */ #define EFI_FDT_ALIGN SZ_2M /* used by allocate_new_fdt_and_exit_boot() */ +/* + * In some configurations (e.g. VMAP_STACK && 64K pages), stacks built into the + * kernel need greater alignment than we require the segments to be padded to. + */ +#define EFI_KIMG_ALIGN \ + (SEGMENT_ALIGN > THREAD_ALIGN ? SEGMENT_ALIGN : THREAD_ALIGN) + /* on arm64, the FDT may be located anywhere in system RAM */ static inline unsigned long efi_get_max_fdt_addr(unsigned long dram_base) { diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h index 3288c2b36731..33be513ef24c 100644 --- a/arch/arm64/include/asm/elf.h +++ b/arch/arm64/include/asm/elf.h @@ -139,7 +139,6 @@ typedef struct user_fpsimd_state elf_fpregset_t; #define SET_PERSONALITY(ex) \ ({ \ - clear_bit(TIF_32BIT, ¤t->mm->context.flags); \ clear_thread_flag(TIF_32BIT); \ current->personality &= ~READ_IMPLIES_EXEC; \ }) @@ -195,7 +194,6 @@ typedef compat_elf_greg_t compat_elf_gregset_t[COMPAT_ELF_NGREG]; */ #define COMPAT_SET_PERSONALITY(ex) \ ({ \ - set_bit(TIF_32BIT, ¤t->mm->context.flags); \ set_thread_flag(TIF_32BIT); \ }) #define COMPAT_ARCH_DLINFO diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h index 8cabd57b6348..66ed8b6b9976 100644 --- a/arch/arm64/include/asm/esr.h +++ b/arch/arm64/include/asm/esr.h @@ -77,16 +77,23 @@ #define ESR_ELx_EC_MASK (UL(0x3F) << ESR_ELx_EC_SHIFT) #define ESR_ELx_EC(esr) (((esr) & ESR_ELx_EC_MASK) >> ESR_ELx_EC_SHIFT) -#define ESR_ELx_IL (UL(1) << 25) +#define ESR_ELx_IL_SHIFT (25) +#define ESR_ELx_IL (UL(1) << ESR_ELx_IL_SHIFT) #define ESR_ELx_ISS_MASK (ESR_ELx_IL - 1) /* ISS field definitions shared by different classes */ -#define ESR_ELx_WNR (UL(1) << 6) +#define ESR_ELx_WNR_SHIFT (6) +#define ESR_ELx_WNR (UL(1) << ESR_ELx_WNR_SHIFT) /* Shared ISS field definitions for Data/Instruction aborts */ -#define ESR_ELx_FnV (UL(1) << 10) -#define ESR_ELx_EA (UL(1) << 9) -#define ESR_ELx_S1PTW (UL(1) << 7) +#define ESR_ELx_SET_SHIFT (11) +#define ESR_ELx_SET_MASK (UL(3) << ESR_ELx_SET_SHIFT) +#define ESR_ELx_FnV_SHIFT (10) +#define ESR_ELx_FnV (UL(1) << ESR_ELx_FnV_SHIFT) +#define ESR_ELx_EA_SHIFT (9) +#define ESR_ELx_EA (UL(1) << ESR_ELx_EA_SHIFT) +#define ESR_ELx_S1PTW_SHIFT (7) +#define ESR_ELx_S1PTW (UL(1) << ESR_ELx_S1PTW_SHIFT) /* Shared ISS fault status code(IFSC/DFSC) for Data/Instruction aborts */ #define ESR_ELx_FSC (0x3F) @@ -97,15 +104,20 @@ #define ESR_ELx_FSC_PERM (0x0C) /* ISS field definitions for Data Aborts */ -#define ESR_ELx_ISV (UL(1) << 24) +#define ESR_ELx_ISV_SHIFT (24) +#define ESR_ELx_ISV (UL(1) << ESR_ELx_ISV_SHIFT) #define ESR_ELx_SAS_SHIFT (22) #define ESR_ELx_SAS (UL(3) << ESR_ELx_SAS_SHIFT) -#define ESR_ELx_SSE (UL(1) << 21) +#define ESR_ELx_SSE_SHIFT (21) +#define ESR_ELx_SSE (UL(1) << ESR_ELx_SSE_SHIFT) #define ESR_ELx_SRT_SHIFT (16) #define ESR_ELx_SRT_MASK (UL(0x1F) << ESR_ELx_SRT_SHIFT) -#define ESR_ELx_SF (UL(1) << 15) -#define ESR_ELx_AR (UL(1) << 14) -#define ESR_ELx_CM (UL(1) << 8) +#define ESR_ELx_SF_SHIFT (15) +#define ESR_ELx_SF (UL(1) << ESR_ELx_SF_SHIFT) +#define 
ESR_ELx_AR_SHIFT (14) +#define ESR_ELx_AR (UL(1) << ESR_ELx_AR_SHIFT) +#define ESR_ELx_CM_SHIFT (8) +#define ESR_ELx_CM (UL(1) << ESR_ELx_CM_SHIFT) /* ISS field definitions for exceptions taken in to Hyp */ #define ESR_ELx_CV (UL(1) << 24) @@ -157,9 +169,10 @@ /* * User space cache operations have the following sysreg encoding * in System instructions. - * op0=1, op1=3, op2=1, crn=7, crm={ 5, 10, 11, 14 }, WRITE (L=0) + * op0=1, op1=3, op2=1, crn=7, crm={ 5, 10, 11, 12, 14 }, WRITE (L=0) */ #define ESR_ELx_SYS64_ISS_CRM_DC_CIVAC 14 +#define ESR_ELx_SYS64_ISS_CRM_DC_CVAP 12 #define ESR_ELx_SYS64_ISS_CRM_DC_CVAU 11 #define ESR_ELx_SYS64_ISS_CRM_DC_CVAC 10 #define ESR_ELx_SYS64_ISS_CRM_IC_IVAU 5 @@ -209,6 +222,13 @@ #ifndef __ASSEMBLY__ #include <asm/types.h> +static inline bool esr_is_data_abort(u32 esr) +{ + const u32 ec = ESR_ELx_EC(esr); + + return ec == ESR_ELx_EC_DABT_LOW || ec == ESR_ELx_EC_DABT_CUR; +} + const char *esr_get_class_string(u32 esr); #endif /* __ASSEMBLY */ diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h index 50f559f574fe..410c48163c6a 100644 --- a/arch/arm64/include/asm/fpsimd.h +++ b/arch/arm64/include/asm/fpsimd.h @@ -41,16 +41,6 @@ struct fpsimd_state { unsigned int cpu; }; -/* - * Struct for stacking the bottom 'n' FP/SIMD registers. - */ -struct fpsimd_partial_state { - u32 fpsr; - u32 fpcr; - u32 num_regs; - __uint128_t vregs[32]; -}; - #if defined(__KERNEL__) && defined(CONFIG_COMPAT) /* Masks for extracting the FPSR and FPCR from the FPSCR */ @@ -77,9 +67,9 @@ extern void fpsimd_update_current_state(struct fpsimd_state *state); extern void fpsimd_flush_task_state(struct task_struct *target); -extern void fpsimd_save_partial_state(struct fpsimd_partial_state *state, - u32 num_regs); -extern void fpsimd_load_partial_state(struct fpsimd_partial_state *state); +/* For use by EFI runtime services calls only */ +extern void __efi_fpsimd_begin(void); +extern void __efi_fpsimd_end(void); #endif diff --git a/arch/arm64/include/asm/fpsimdmacros.h b/arch/arm64/include/asm/fpsimdmacros.h index a2daf1293028..0f5fdd388b0d 100644 --- a/arch/arm64/include/asm/fpsimdmacros.h +++ b/arch/arm64/include/asm/fpsimdmacros.h @@ -75,59 +75,3 @@ ldr w\tmpnr, [\state, #16 * 2 + 4] fpsimd_restore_fpcr x\tmpnr, \state .endm - -.macro fpsimd_save_partial state, numnr, tmpnr1, tmpnr2 - mrs x\tmpnr1, fpsr - str w\numnr, [\state, #8] - mrs x\tmpnr2, fpcr - stp w\tmpnr1, w\tmpnr2, [\state] - adr x\tmpnr1, 0f - add \state, \state, x\numnr, lsl #4 - sub x\tmpnr1, x\tmpnr1, x\numnr, lsl #1 - br x\tmpnr1 - stp q30, q31, [\state, #-16 * 30 - 16] - stp q28, q29, [\state, #-16 * 28 - 16] - stp q26, q27, [\state, #-16 * 26 - 16] - stp q24, q25, [\state, #-16 * 24 - 16] - stp q22, q23, [\state, #-16 * 22 - 16] - stp q20, q21, [\state, #-16 * 20 - 16] - stp q18, q19, [\state, #-16 * 18 - 16] - stp q16, q17, [\state, #-16 * 16 - 16] - stp q14, q15, [\state, #-16 * 14 - 16] - stp q12, q13, [\state, #-16 * 12 - 16] - stp q10, q11, [\state, #-16 * 10 - 16] - stp q8, q9, [\state, #-16 * 8 - 16] - stp q6, q7, [\state, #-16 * 6 - 16] - stp q4, q5, [\state, #-16 * 4 - 16] - stp q2, q3, [\state, #-16 * 2 - 16] - stp q0, q1, [\state, #-16 * 0 - 16] -0: -.endm - -.macro fpsimd_restore_partial state, tmpnr1, tmpnr2 - ldp w\tmpnr1, w\tmpnr2, [\state] - msr fpsr, x\tmpnr1 - fpsimd_restore_fpcr x\tmpnr2, x\tmpnr1 - adr x\tmpnr1, 0f - ldr w\tmpnr2, [\state, #8] - add \state, \state, x\tmpnr2, lsl #4 - sub x\tmpnr1, x\tmpnr1, x\tmpnr2, lsl #1 - br x\tmpnr1 - ldp q30, q31, [\state, #-16 * 30 
- 16] - ldp q28, q29, [\state, #-16 * 28 - 16] - ldp q26, q27, [\state, #-16 * 26 - 16] - ldp q24, q25, [\state, #-16 * 24 - 16] - ldp q22, q23, [\state, #-16 * 22 - 16] - ldp q20, q21, [\state, #-16 * 20 - 16] - ldp q18, q19, [\state, #-16 * 18 - 16] - ldp q16, q17, [\state, #-16 * 16 - 16] - ldp q14, q15, [\state, #-16 * 14 - 16] - ldp q12, q13, [\state, #-16 * 12 - 16] - ldp q10, q11, [\state, #-16 * 10 - 16] - ldp q8, q9, [\state, #-16 * 8 - 16] - ldp q6, q7, [\state, #-16 * 6 - 16] - ldp q4, q5, [\state, #-16 * 4 - 16] - ldp q2, q3, [\state, #-16 * 2 - 16] - ldp q0, q1, [\state, #-16 * 0 - 16] -0: -.endm diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h index f32b42e8725d..5bb2fd4674e7 100644 --- a/arch/arm64/include/asm/futex.h +++ b/arch/arm64/include/asm/futex.h @@ -48,20 +48,10 @@ do { \ } while (0) static inline int -futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr) +arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (int)(encoded_op << 8) >> 20; - int cmparg = (int)(encoded_op << 20) >> 20; int oldval = 0, ret, tmp; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1U << (oparg & 0x1f); - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; - pagefault_disable(); switch (op) { @@ -91,17 +81,9 @@ futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr) pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index 793bd73b0d07..1dca41bea16a 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -18,7 +18,6 @@ #ifndef __ASM_HUGETLB_H #define __ASM_HUGETLB_H -#include <asm-generic/hugetlb.h> #include <asm/page.h> static inline pte_t huge_ptep_get(pte_t *ptep) @@ -82,6 +81,14 @@ extern void huge_ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep); extern void huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep); +extern void huge_pte_clear(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned long sz); +#define huge_pte_clear huge_pte_clear +extern void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, unsigned long sz); +#define set_huge_swap_pte_at set_huge_swap_pte_at + +#include <asm-generic/hugetlb.h> #ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE static inline bool gigantic_page_supported(void) { return true; } diff --git a/arch/arm64/include/asm/irq.h b/arch/arm64/include/asm/irq.h index b77197d941fc..5e6f77239064 100644 --- a/arch/arm64/include/asm/irq.h +++ b/arch/arm64/include/asm/irq.h @@ -1,45 +1,12 @@ #ifndef __ASM_IRQ_H #define __ASM_IRQ_H -#define IRQ_STACK_SIZE THREAD_SIZE -#define IRQ_STACK_START_SP THREAD_START_SP - #ifndef __ASSEMBLER__ -#include <linux/percpu.h> - #include <asm-generic/irq.h> -#include <asm/thread_info.h> struct pt_regs; -DECLARE_PER_CPU(unsigned long [IRQ_STACK_SIZE/sizeof(long)], irq_stack); - -/* - * The highest address on the stack, and the 
first to be used. Used to - * find the dummy-stack frame put down by el?_irq() in entry.S, which - * is structured as follows: - * - * ------------ - * | | <- irq_stack_ptr - * top ------------ - * | x19 | <- irq_stack_ptr - 0x08 - * ------------ - * | x29 | <- irq_stack_ptr - 0x10 - * ------------ - * - * where x19 holds a copy of the task stack pointer where the struct pt_regs - * from kernel_entry can be found. - * - */ -#define IRQ_STACK_PTR(cpu) ((unsigned long)per_cpu(irq_stack, cpu) + IRQ_STACK_START_SP) - -/* - * The offset from irq_stack_ptr where entry.S will store the original - * stack pointer. Used by unwind_frame() and dump_backtrace(). - */ -#define IRQ_STACK_TO_TASK_STACK(ptr) (*((unsigned long *)((ptr) - 0x08))) - extern void set_handle_irq(void (*handle_irq)(struct pt_regs *)); static inline int nr_legacy_irqs(void) @@ -47,14 +14,5 @@ static inline int nr_legacy_irqs(void) return 0; } -static inline bool on_irq_stack(unsigned long sp, int cpu) -{ - /* variable names the same as kernel/stacktrace.c */ - unsigned long low = (unsigned long)per_cpu(irq_stack, cpu); - unsigned long high = low + IRQ_STACK_START_SP; - - return (low <= sp && sp <= high); -} - #endif /* !__ASSEMBLER__ */ #endif diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index d68630007b14..e923b58606e2 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -326,12 +326,6 @@ void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); -/* We do not have shadow page tables, hence the empty hooks */ -static inline void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm, - unsigned long address) -{ -} - struct kvm_vcpu *kvm_arm_get_running_vcpu(void); struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void); void kvm_arm_halt_guest(struct kvm *kvm); diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index a89cc22abadc..672c8684d5c2 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -175,18 +175,15 @@ static inline pmd_t kvm_s2pmd_mkwrite(pmd_t pmd) static inline void kvm_set_s2pte_readonly(pte_t *pte) { - pteval_t pteval; - unsigned long tmp; - - asm volatile("// kvm_set_s2pte_readonly\n" - " prfm pstl1strm, %2\n" - "1: ldxr %0, %2\n" - " and %0, %0, %3 // clear PTE_S2_RDWR\n" - " orr %0, %0, %4 // set PTE_S2_RDONLY\n" - " stxr %w1, %0, %2\n" - " cbnz %w1, 1b\n" - : "=&r" (pteval), "=&r" (tmp), "+Q" (pte_val(*pte)) - : "L" (~PTE_S2_RDWR), "L" (PTE_S2_RDONLY)); + pteval_t old_pteval, pteval; + + pteval = READ_ONCE(pte_val(*pte)); + do { + old_pteval = pteval; + pteval &= ~PTE_S2_RDWR; + pteval |= PTE_S2_RDONLY; + pteval = cmpxchg_relaxed(&pte_val(*pte), old_pteval, pteval); + } while (pteval != old_pteval); } static inline bool kvm_s2pte_readonly(pte_t *pte) diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index ef39dcb9ca6a..3585a5e26151 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -25,6 +25,7 @@ #include <linux/const.h> #include <linux/types.h> #include <asm/bug.h> +#include <asm/page-def.h> #include <asm/sizes.h> /* @@ -103,6 +104,58 @@ #define KASAN_SHADOW_SIZE (0) #endif +#define MIN_THREAD_SHIFT 14 + +/* + * VMAP'd stacks are allocated at page granularity, so we must ensure that such + * stacks are a multiple of page size. 
+ */ +#if defined(CONFIG_VMAP_STACK) && (MIN_THREAD_SHIFT < PAGE_SHIFT) +#define THREAD_SHIFT PAGE_SHIFT +#else +#define THREAD_SHIFT MIN_THREAD_SHIFT +#endif + +#if THREAD_SHIFT >= PAGE_SHIFT +#define THREAD_SIZE_ORDER (THREAD_SHIFT - PAGE_SHIFT) +#endif + +#define THREAD_SIZE (UL(1) << THREAD_SHIFT) + +/* + * By aligning VMAP'd stacks to 2 * THREAD_SIZE, we can detect overflow by + * checking sp & (1 << THREAD_SHIFT), which we can do cheaply in the entry + * assembly. + */ +#ifdef CONFIG_VMAP_STACK +#define THREAD_ALIGN (2 * THREAD_SIZE) +#else +#define THREAD_ALIGN THREAD_SIZE +#endif + +#define IRQ_STACK_SIZE THREAD_SIZE + +#define OVERFLOW_STACK_SIZE SZ_4K + +/* + * Alignment of kernel segments (e.g. .text, .data). + */ +#if defined(CONFIG_DEBUG_ALIGN_RODATA) +/* + * 4 KB granule: 1 level 2 entry + * 16 KB granule: 128 level 3 entries, with contiguous bit + * 64 KB granule: 32 level 3 entries, with contiguous bit + */ +#define SEGMENT_ALIGN SZ_2M +#else +/* + * 4 KB granule: 16 level 3 entries, with contiguous bit + * 16 KB granule: 4 level 3 entries, without contiguous bit + * 64 KB granule: 1 level 3 entry + */ +#define SEGMENT_ALIGN SZ_64K +#endif + /* * Memory types available. */ diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h index 5468c834b072..0d34bf0a89c7 100644 --- a/arch/arm64/include/asm/mmu.h +++ b/arch/arm64/include/asm/mmu.h @@ -16,6 +16,8 @@ #ifndef __ASM_MMU_H #define __ASM_MMU_H +#define MMCF_AARCH32 0x1 /* mm context flag for AArch32 executables */ + typedef struct { atomic64_t id; void *vdso; diff --git a/arch/arm64/include/asm/neon.h b/arch/arm64/include/asm/neon.h index ad4cdc966c0f..f922eaf780f9 100644 --- a/arch/arm64/include/asm/neon.h +++ b/arch/arm64/include/asm/neon.h @@ -8,12 +8,22 @@ * published by the Free Software Foundation. */ +#ifndef __ASM_NEON_H +#define __ASM_NEON_H + #include <linux/types.h> #include <asm/fpsimd.h> #define cpu_has_neon() system_supports_fpsimd() -#define kernel_neon_begin() kernel_neon_begin_partial(32) - -void kernel_neon_begin_partial(u32 num_regs); +void kernel_neon_begin(void); void kernel_neon_end(void); + +/* + * Temporary macro to allow the crypto code to compile. Note that the + * semantics of kernel_neon_begin_partial() are now different from the + * original as it does not allow being called in an interrupt context. + */ +#define kernel_neon_begin_partial(num_regs) kernel_neon_begin() + +#endif /* ! __ASM_NEON_H */ diff --git a/arch/arm64/include/asm/numa.h b/arch/arm64/include/asm/numa.h index bf466d1876e3..ef7b23863a7c 100644 --- a/arch/arm64/include/asm/numa.h +++ b/arch/arm64/include/asm/numa.h @@ -7,9 +7,6 @@ #define NR_NODE_MEMBLKS (MAX_NUMNODES * 2) -/* currently, arm64 implements flat NUMA topology */ -#define parent_node(node) (node) - int __node_distance(int from, int to); #define node_distance(a, b) __node_distance(a, b) diff --git a/arch/arm64/include/asm/page-def.h b/arch/arm64/include/asm/page-def.h new file mode 100644 index 000000000000..01591a29dc2e --- /dev/null +++ b/arch/arm64/include/asm/page-def.h @@ -0,0 +1,34 @@ +/* + * Based on arch/arm/include/asm/page.h + * + * Copyright (C) 1995-2003 Russell King + * Copyright (C) 2017 ARM Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
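The THREAD_ALIGN comment above deserves a concrete illustration: a VMAP'd stack is THREAD_SIZE bytes placed on a 2 * THREAD_SIZE boundary, so any stack pointer that still points into the stack has bit THREAD_SHIFT clear, while one that has run off the bottom into the region below has it set. A sketch of the check with the default 16 KB stacks (THREAD_SHIFT = 14); the helper name is made up and only mirrors what the entry assembly tests:

    /* Hypothetical C mirror of the entry-path overflow test. */
    static inline bool stack_overflowed(unsigned long sp)
    {
            return sp & (1UL << THREAD_SHIFT);      /* bit 14 for 16 KB stacks */
    }

    /*
     * For a stack occupying [0x20000, 0x24000) (32 KB-aligned base):
     *   stack_overflowed(0x21000) == false   - sp still inside the stack
     *   stack_overflowed(0x1fff0) == true    - sp fell below the stack base
     */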
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +#ifndef __ASM_PAGE_DEF_H +#define __ASM_PAGE_DEF_H + +#include <linux/const.h> + +/* PAGE_SHIFT determines the page size */ +/* CONT_SHIFT determines the number of pages which can be tracked together */ +#define PAGE_SHIFT CONFIG_ARM64_PAGE_SHIFT +#define CONT_SHIFT CONFIG_ARM64_CONT_SHIFT +#define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT) +#define PAGE_MASK (~(PAGE_SIZE-1)) + +#define CONT_SIZE (_AC(1, UL) << (CONT_SHIFT + PAGE_SHIFT)) +#define CONT_MASK (~(CONT_SIZE-1)) + +#endif /* __ASM_PAGE_DEF_H */ diff --git a/arch/arm64/include/asm/page.h b/arch/arm64/include/asm/page.h index 8472c6def5ef..60d02c81a3a2 100644 --- a/arch/arm64/include/asm/page.h +++ b/arch/arm64/include/asm/page.h @@ -19,17 +19,7 @@ #ifndef __ASM_PAGE_H #define __ASM_PAGE_H -#include <linux/const.h> - -/* PAGE_SHIFT determines the page size */ -/* CONT_SHIFT determines the number of pages which can be tracked together */ -#define PAGE_SHIFT CONFIG_ARM64_PAGE_SHIFT -#define CONT_SHIFT CONFIG_ARM64_CONT_SHIFT -#define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT) -#define PAGE_MASK (~(PAGE_SIZE-1)) - -#define CONT_SIZE (_AC(1, UL) << (CONT_SHIFT + PAGE_SHIFT)) -#define CONT_MASK (~(CONT_SIZE-1)) +#include <asm/page-def.h> #ifndef __ASSEMBLY__ diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h index 2142c7726e76..0a5635fb0ef9 100644 --- a/arch/arm64/include/asm/pgtable-prot.h +++ b/arch/arm64/include/asm/pgtable-prot.h @@ -63,23 +63,21 @@ #define PAGE_S2 __pgprot(PROT_DEFAULT | PTE_S2_MEMATTR(MT_S2_NORMAL) | PTE_S2_RDONLY) #define PAGE_S2_DEVICE __pgprot(PROT_DEFAULT | PTE_S2_MEMATTR(MT_S2_DEVICE_nGnRE) | PTE_S2_RDONLY | PTE_UXN) -#define PAGE_NONE __pgprot(((_PAGE_DEFAULT) & ~PTE_VALID) | PTE_PROT_NONE | PTE_PXN | PTE_UXN) +#define PAGE_NONE __pgprot(((_PAGE_DEFAULT) & ~PTE_VALID) | PTE_PROT_NONE | PTE_RDONLY | PTE_PXN | PTE_UXN) #define PAGE_SHARED __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN | PTE_UXN | PTE_WRITE) #define PAGE_SHARED_EXEC __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN | PTE_WRITE) -#define PAGE_COPY __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN | PTE_UXN) -#define PAGE_COPY_EXEC __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN) -#define PAGE_READONLY __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN | PTE_UXN) -#define PAGE_READONLY_EXEC __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN) -#define PAGE_EXECONLY __pgprot(_PAGE_DEFAULT | PTE_NG | PTE_PXN) +#define PAGE_READONLY __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_UXN) +#define PAGE_READONLY_EXEC __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_RDONLY | PTE_NG | PTE_PXN) +#define PAGE_EXECONLY __pgprot(_PAGE_DEFAULT | PTE_RDONLY | PTE_NG | PTE_PXN) #define __P000 PAGE_NONE #define __P001 PAGE_READONLY -#define __P010 PAGE_COPY -#define __P011 PAGE_COPY +#define __P010 PAGE_READONLY +#define __P011 PAGE_READONLY #define __P100 PAGE_EXECONLY #define __P101 PAGE_READONLY_EXEC -#define __P110 PAGE_COPY_EXEC -#define __P111 PAGE_COPY_EXEC +#define __P110 PAGE_READONLY_EXEC +#define __P111 PAGE_READONLY_EXEC #define __S000 PAGE_NONE #define __S001 PAGE_READONLY diff --git 
a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 6eae342ced6b..bc4e92337d16 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -39,6 +39,7 @@ #ifndef __ASSEMBLY__ +#include <asm/cmpxchg.h> #include <asm/fixmap.h> #include <linux/mmdebug.h> @@ -84,11 +85,7 @@ extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; (__boundary - 1 < (end) - 1) ? __boundary : (end); \ }) -#ifdef CONFIG_ARM64_HW_AFDBM #define pte_hw_dirty(pte) (pte_write(pte) && !(pte_val(pte) & PTE_RDONLY)) -#else -#define pte_hw_dirty(pte) (0) -#endif #define pte_sw_dirty(pte) (!!(pte_val(pte) & PTE_DIRTY)) #define pte_dirty(pte) (pte_sw_dirty(pte) || pte_hw_dirty(pte)) @@ -124,12 +121,16 @@ static inline pte_t set_pte_bit(pte_t pte, pgprot_t prot) static inline pte_t pte_wrprotect(pte_t pte) { - return clear_pte_bit(pte, __pgprot(PTE_WRITE)); + pte = clear_pte_bit(pte, __pgprot(PTE_WRITE)); + pte = set_pte_bit(pte, __pgprot(PTE_RDONLY)); + return pte; } static inline pte_t pte_mkwrite(pte_t pte) { - return set_pte_bit(pte, __pgprot(PTE_WRITE)); + pte = set_pte_bit(pte, __pgprot(PTE_WRITE)); + pte = clear_pte_bit(pte, __pgprot(PTE_RDONLY)); + return pte; } static inline pte_t pte_mkclean(pte_t pte) @@ -168,11 +169,6 @@ static inline pte_t pte_mknoncont(pte_t pte) return clear_pte_bit(pte, __pgprot(PTE_CONT)); } -static inline pte_t pte_clear_rdonly(pte_t pte) -{ - return clear_pte_bit(pte, __pgprot(PTE_RDONLY)); -} - static inline pte_t pte_mkpresent(pte_t pte) { return set_pte_bit(pte, __pgprot(PTE_VALID)); @@ -220,22 +216,15 @@ extern void __sync_icache_dcache(pte_t pteval, unsigned long addr); static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { - if (pte_present(pte)) { - if (pte_sw_dirty(pte) && pte_write(pte)) - pte_val(pte) &= ~PTE_RDONLY; - else - pte_val(pte) |= PTE_RDONLY; - if (pte_user_exec(pte) && !pte_special(pte)) - __sync_icache_dcache(pte, addr); - } + if (pte_present(pte) && pte_user_exec(pte) && !pte_special(pte)) + __sync_icache_dcache(pte, addr); /* * If the existing pte is valid, check for potential race with * hardware updates of the pte (ptep_set_access_flags safely changes * valid ptes without going through an invalid entry). 
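
For readers following the pte_wrprotect()/pte_mkwrite() hunks above: with PTE_RDONLY handling moved out of set_pte_at(), a page's dirty state is carried entirely by the PTE_WRITE (DBM) and PTE_RDONLY bits plus the software PTE_DIRTY bit. A minimal user-space sketch of that encoding follows; the bit positions are invented for illustration, only the relationships mirror the hunks above.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative bit positions, not the real arm64 layout. */
#define PTE_WRITE	(1u << 0)	/* doubles as the hardware DBM bit */
#define PTE_RDONLY	(1u << 1)	/* cleared by hardware on the first write */
#define PTE_DIRTY	(1u << 2)	/* software dirty bit */

static int pte_hw_dirty(uint32_t pte) { return (pte & PTE_WRITE) && !(pte & PTE_RDONLY); }
static int pte_dirty(uint32_t pte)    { return (pte & PTE_DIRTY) || pte_hw_dirty(pte); }

static uint32_t pte_wrprotect(uint32_t pte)
{
	/* Same shape as the new helper: drop write/DBM, force read-only. */
	return (pte & ~PTE_WRITE) | PTE_RDONLY;
}

int main(void)
{
	uint32_t pte = PTE_WRITE | PTE_RDONLY;	/* writable mapping, still clean */
	assert(!pte_dirty(pte));

	pte &= ~PTE_RDONLY;			/* what hardware DBM does on the first write */
	assert(pte_hw_dirty(pte) && pte_dirty(pte));

	assert(!pte_hw_dirty(pte_wrprotect(pte)));	/* write-protected: hw-dirty encoding gone */
	puts("dirty-state encoding behaves as described");
	return 0;
}

Note that write-protecting a hardware-dirty pte loses the hardware dirty encoding, which is why the reworked ptep_set_wrprotect() further below warns unless it is operating on a CoW mapping whose state is already known.
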
*/ - if (IS_ENABLED(CONFIG_ARM64_HW_AFDBM) && - pte_valid(*ptep) && pte_valid(pte)) { + if (pte_valid(*ptep) && pte_valid(pte)) { VM_WARN_ONCE(!pte_young(pte), "%s: racy access flag clearing: 0x%016llx -> 0x%016llx", __func__, pte_val(*ptep), pte_val(pte)); @@ -571,7 +560,6 @@ static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) return pte_pmd(pte_modify(pmd_pte(pmd), newprot)); } -#ifdef CONFIG_ARM64_HW_AFDBM #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS extern int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, @@ -593,20 +581,17 @@ static inline int pmdp_set_access_flags(struct vm_area_struct *vma, #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG static inline int __ptep_test_and_clear_young(pte_t *ptep) { - pteval_t pteval; - unsigned int tmp, res; + pte_t old_pte, pte; - asm volatile("// __ptep_test_and_clear_young\n" - " prfm pstl1strm, %2\n" - "1: ldxr %0, %2\n" - " ubfx %w3, %w0, %5, #1 // extract PTE_AF (young)\n" - " and %0, %0, %4 // clear PTE_AF\n" - " stxr %w1, %0, %2\n" - " cbnz %w1, 1b\n" - : "=&r" (pteval), "=&r" (tmp), "+Q" (pte_val(*ptep)), "=&r" (res) - : "L" (~PTE_AF), "I" (ilog2(PTE_AF))); + pte = READ_ONCE(*ptep); + do { + old_pte = pte; + pte = pte_mkold(pte); + pte_val(pte) = cmpxchg_relaxed(&pte_val(*ptep), + pte_val(old_pte), pte_val(pte)); + } while (pte_val(pte) != pte_val(old_pte)); - return res; + return pte_young(pte); } static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, @@ -630,17 +615,7 @@ static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long address, pte_t *ptep) { - pteval_t old_pteval; - unsigned int tmp; - - asm volatile("// ptep_get_and_clear\n" - " prfm pstl1strm, %2\n" - "1: ldxr %0, %2\n" - " stxr %w1, xzr, %2\n" - " cbnz %w1, 1b\n" - : "=&r" (old_pteval), "=&r" (tmp), "+Q" (pte_val(*ptep))); - - return __pte(old_pteval); + return __pte(xchg_relaxed(&pte_val(*ptep), 0)); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -653,27 +628,32 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ /* - * ptep_set_wrprotect - mark read-only while trasferring potential hardware - * dirty status (PTE_DBM && !PTE_RDONLY) to the software PTE_DIRTY bit. + * ptep_set_wrprotect - mark read-only while preserving the hardware update of + * the Access Flag. */ #define __HAVE_ARCH_PTEP_SET_WRPROTECT static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long address, pte_t *ptep) { - pteval_t pteval; - unsigned long tmp; + pte_t old_pte, pte; - asm volatile("// ptep_set_wrprotect\n" - " prfm pstl1strm, %2\n" - "1: ldxr %0, %2\n" - " tst %0, %4 // check for hw dirty (!PTE_RDONLY)\n" - " csel %1, %3, xzr, eq // set PTE_DIRTY|PTE_RDONLY if dirty\n" - " orr %0, %0, %1 // if !dirty, PTE_RDONLY is already set\n" - " and %0, %0, %5 // clear PTE_WRITE/PTE_DBM\n" - " stxr %w1, %0, %2\n" - " cbnz %w1, 1b\n" - : "=&r" (pteval), "=&r" (tmp), "+Q" (pte_val(*ptep)) - : "r" (PTE_DIRTY|PTE_RDONLY), "L" (PTE_RDONLY), "L" (~PTE_WRITE) - : "cc"); + /* + * ptep_set_wrprotect() is only called on CoW mappings which are + * private (!VM_SHARED) with the pte either read-only (!PTE_WRITE && + * PTE_RDONLY) or writable and software-dirty (PTE_WRITE && + * !PTE_RDONLY && PTE_DIRTY); see is_cow_mapping() and + * protection_map[]. There is no race with the hardware update of the + * dirty state: clearing of PTE_RDONLY when PTE_WRITE (a.k.a. PTE_DBM) + * is set. 
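
The rewritten __ptep_test_and_clear_young() and ptep_set_wrprotect() above replace hand-coded LL/SC assembly with an ordinary read/modify/compare-and-swap retry loop around cmpxchg_relaxed(). A stand-alone sketch of the same pattern using C11 atomics; AF_BIT is an illustrative stand-in for PTE_AF:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define AF_BIT	(1u << 10)	/* stand-in for PTE_AF */

/* Read the word, compute the new value, and retry the compare-and-exchange
 * until no concurrent updater (e.g. the hardware table walker) raced with us. */
static bool test_and_clear_flag(_Atomic uint64_t *word, uint64_t flag)
{
	uint64_t old = atomic_load_explicit(word, memory_order_relaxed);

	while (!atomic_compare_exchange_weak_explicit(word, &old, old & ~flag,
						      memory_order_relaxed,
						      memory_order_relaxed))
		;	/* 'old' has been reloaded with the current value; retry */

	return old & flag;
}

int main(void)
{
	_Atomic uint64_t pte = AF_BIT | 0x3;

	printf("was young: %d\n", test_and_clear_flag(&pte, AF_BIT));	/* 1 */
	printf("was young: %d\n", test_and_clear_flag(&pte, AF_BIT));	/* 0 */
	return 0;
}

The kernel helpers use cmpxchg_relaxed()/xchg_relaxed() for the same reason the sketch uses relaxed ordering: whatever ordering the callers need is provided by the surrounding code (page table locks, TLB maintenance) rather than by the update itself.
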
+ */ + VM_WARN_ONCE(pte_write(*ptep) && !pte_dirty(*ptep), + "%s: potential race with hardware DBM", __func__); + pte = READ_ONCE(*ptep); + do { + old_pte = pte; + pte = pte_wrprotect(pte); + pte_val(pte) = cmpxchg_relaxed(&pte_val(*ptep), + pte_val(old_pte), pte_val(pte)); + } while (pte_val(pte) != pte_val(old_pte)); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -684,7 +664,6 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, ptep_set_wrprotect(mm, address, (pte_t *)pmdp); } #endif -#endif /* CONFIG_ARM64_HW_AFDBM */ extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; extern pgd_t idmap_pg_dir[PTRS_PER_PGD]; diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index 64c9e78f9882..29adab8138c3 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -112,7 +112,7 @@ void tls_preserve_current_state(void); static inline void start_thread_common(struct pt_regs *regs, unsigned long pc) { memset(regs, 0, sizeof(*regs)); - regs->syscallno = ~0UL; + forget_syscall(regs); regs->pc = pc; } @@ -159,7 +159,7 @@ extern struct task_struct *cpu_switch_to(struct task_struct *prev, struct task_struct *next); #define task_pt_regs(p) \ - ((struct pt_regs *)(THREAD_START_SP + task_stack_page(p)) - 1) + ((struct pt_regs *)(THREAD_SIZE + task_stack_page(p)) - 1) #define KSTK_EIP(tsk) ((unsigned long)task_pt_regs(tsk)->pc) #define KSTK_ESP(tsk) user_stack_pointer(task_pt_regs(tsk)) diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h index 11403fdd0a50..6069d66e0bc2 100644 --- a/arch/arm64/include/asm/ptrace.h +++ b/arch/arm64/include/asm/ptrace.h @@ -72,8 +72,19 @@ #define COMPAT_PT_TEXT_ADDR 0x10000 #define COMPAT_PT_DATA_ADDR 0x10004 #define COMPAT_PT_TEXT_END_ADDR 0x10008 + +/* + * If pt_regs.syscallno == NO_SYSCALL, then the thread is not executing + * a syscall -- i.e., its most recent entry into the kernel from + * userspace was not via SVC, or otherwise a tracer cancelled the syscall. + * + * This must have the value -1, for ABI compatibility with ptrace etc. + */ +#define NO_SYSCALL (-1) + #ifndef __ASSEMBLY__ #include <linux/bug.h> +#include <linux/types.h> /* sizeof(struct user) for AArch32 */ #define COMPAT_USER_SZ 296 @@ -116,11 +127,29 @@ struct pt_regs { }; }; u64 orig_x0; - u64 syscallno; +#ifdef __AARCH64EB__ + u32 unused2; + s32 syscallno; +#else + s32 syscallno; + u32 unused2; +#endif + u64 orig_addr_limit; u64 unused; // maintain 16 byte alignment + u64 stackframe[2]; }; +static inline bool in_syscall(struct pt_regs const *regs) +{ + return regs->syscallno != NO_SYSCALL; +} + +static inline void forget_syscall(struct pt_regs *regs) +{ + regs->syscallno = NO_SYSCALL; +} + #define MAX_REG_OFFSET offsetof(struct pt_regs, pstate) #define arch_has_single_step() (1) diff --git a/arch/arm64/include/asm/signal32.h b/arch/arm64/include/asm/signal32.h index eeaa97559bab..81abea0b7650 100644 --- a/arch/arm64/include/asm/signal32.h +++ b/arch/arm64/include/asm/signal32.h @@ -22,8 +22,6 @@ #define AARCH32_KERN_SIGRET_CODE_OFFSET 0x500 -extern const compat_ulong_t aarch32_sigret_code[6]; - int compat_setup_frame(int usig, struct ksignal *ksig, sigset_t *set, struct pt_regs *regs); int compat_setup_rt_frame(int usig, struct ksignal *ksig, sigset_t *set, diff --git a/arch/arm64/include/asm/simd.h b/arch/arm64/include/asm/simd.h new file mode 100644 index 000000000000..fa8b3fe932e6 --- /dev/null +++ b/arch/arm64/include/asm/simd.h @@ -0,0 +1,56 @@ +/* + * Copyright (C) 2017 Linaro Ltd. 
<ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + */ + +#ifndef __ASM_SIMD_H +#define __ASM_SIMD_H + +#include <linux/compiler.h> +#include <linux/irqflags.h> +#include <linux/percpu.h> +#include <linux/preempt.h> +#include <linux/types.h> + +#ifdef CONFIG_KERNEL_MODE_NEON + +DECLARE_PER_CPU(bool, kernel_neon_busy); + +/* + * may_use_simd - whether it is allowable at this time to issue SIMD + * instructions or access the SIMD register file + * + * Callers must not assume that the result remains true beyond the next + * preempt_enable() or return from softirq context. + */ +static __must_check inline bool may_use_simd(void) +{ + /* + * The raw_cpu_read() is racy if called with preemption enabled. + * This is not a bug: kernel_neon_busy is only set when + * preemption is disabled, so we cannot migrate to another CPU + * while it is set, nor can we migrate to a CPU where it is set. + * So, if we find it clear on some CPU then we're guaranteed to + * find it clear on any CPU we could migrate to. + * + * If we are in between kernel_neon_begin()...kernel_neon_end(), + * the flag will be set, but preemption is also disabled, so we + * can't migrate to another CPU and spuriously see it become + * false. + */ + return !in_irq() && !irqs_disabled() && !in_nmi() && + !raw_cpu_read(kernel_neon_busy); +} + +#else /* ! CONFIG_KERNEL_MODE_NEON */ + +static __must_check inline bool may_use_simd(void) { + return false; +} + +#endif /* ! CONFIG_KERNEL_MODE_NEON */ + +#endif diff --git a/arch/arm64/include/asm/smp.h b/arch/arm64/include/asm/smp.h index 55f08c5acfad..f82b447bd34f 100644 --- a/arch/arm64/include/asm/smp.h +++ b/arch/arm64/include/asm/smp.h @@ -148,7 +148,7 @@ static inline void cpu_panic_kernel(void) */ bool cpus_are_stuck_in_kernel(void); -extern void smp_send_crash_stop(void); +extern void crash_smp_send_stop(void); extern bool smp_crash_stop_failed(void); #endif /* ifndef __ASSEMBLY__ */ diff --git a/arch/arm64/include/asm/spinlock.h b/arch/arm64/include/asm/spinlock.h index cae331d553f8..95ad7102b63c 100644 --- a/arch/arm64/include/asm/spinlock.h +++ b/arch/arm64/include/asm/spinlock.h @@ -26,58 +26,6 @@ * The memory barriers are implicit with the load-acquire and store-release * instructions. */ -static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - unsigned int tmp; - arch_spinlock_t lockval; - u32 owner; - - /* - * Ensure prior spin_lock operations to other locks have completed - * on this CPU before we test whether "lock" is locked. - */ - smp_mb(); - owner = READ_ONCE(lock->owner) << 16; - - asm volatile( -" sevl\n" -"1: wfe\n" -"2: ldaxr %w0, %2\n" - /* Is the lock free? */ -" eor %w1, %w0, %w0, ror #16\n" -" cbz %w1, 3f\n" - /* Lock taken -- has there been a subsequent unlock->lock transition? */ -" eor %w1, %w3, %w0, lsl #16\n" -" cbz %w1, 1b\n" - /* - * The owner has been updated, so there was an unlock->lock - * transition that we missed. That means we can rely on the - * store-release of the unlock operation paired with the - * load-acquire of the lock operation to publish any of our - * previous stores to the new lock owner and therefore don't - * need to bother with the writeback below. 
- */ -" b 4f\n" -"3:\n" - /* - * Serialise against any concurrent lockers by writing back the - * unlocked lock value - */ - ARM64_LSE_ATOMIC_INSN( - /* LL/SC */ -" stxr %w1, %w0, %2\n" - __nops(2), - /* LSE atomics */ -" mov %w1, %w0\n" -" cas %w0, %w0, %2\n" -" eor %w1, %w1, %w0\n") - /* Somebody else wrote to the lock, GOTO 10 and reload the value */ -" cbnz %w1, 2b\n" -"4:" - : "=&r" (lockval), "=&r" (tmp), "+Q" (*lock) - : "r" (owner) - : "memory"); -} #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) @@ -176,7 +124,11 @@ static inline int arch_spin_value_unlocked(arch_spinlock_t lock) static inline int arch_spin_is_locked(arch_spinlock_t *lock) { - smp_mb(); /* See arch_spin_unlock_wait */ + /* + * Ensure prior spin_lock operations to other locks have completed + * on this CPU before we test whether "lock" is locked. + */ + smp_mb(); /* ^^^ */ return !arch_spin_value_unlocked(READ_ONCE(*lock)); } @@ -358,14 +310,7 @@ static inline int arch_read_trylock(arch_rwlock_t *rw) #define arch_read_relax(lock) cpu_relax() #define arch_write_relax(lock) cpu_relax() -/* - * Accesses appearing in program order before a spin_lock() operation - * can be reordered with accesses inside the critical section, by virtue - * of arch_spin_lock being constructed using acquire semantics. - * - * In cases where this is problematic (e.g. try_to_wake_up), an - * smp_mb__before_spinlock() can restore the required ordering. - */ -#define smp_mb__before_spinlock() smp_mb() +/* See include/linux/spinlock.h */ +#define smp_mb__after_spinlock() smp_mb() #endif /* __ASM_SPINLOCK_H */ diff --git a/arch/arm64/include/asm/stacktrace.h b/arch/arm64/include/asm/stacktrace.h index 5b6eafccc5d8..6ad30776e984 100644 --- a/arch/arm64/include/asm/stacktrace.h +++ b/arch/arm64/include/asm/stacktrace.h @@ -16,11 +16,15 @@ #ifndef __ASM_STACKTRACE_H #define __ASM_STACKTRACE_H -struct task_struct; +#include <linux/percpu.h> +#include <linux/sched.h> +#include <linux/sched/task_stack.h> + +#include <asm/memory.h> +#include <asm/ptrace.h> struct stackframe { unsigned long fp; - unsigned long sp; unsigned long pc; #ifdef CONFIG_FUNCTION_GRAPH_TRACER unsigned int graph; @@ -32,4 +36,57 @@ extern void walk_stackframe(struct task_struct *tsk, struct stackframe *frame, int (*fn)(struct stackframe *, void *), void *data); extern void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk); +DECLARE_PER_CPU(unsigned long *, irq_stack_ptr); + +static inline bool on_irq_stack(unsigned long sp) +{ + unsigned long low = (unsigned long)raw_cpu_read(irq_stack_ptr); + unsigned long high = low + IRQ_STACK_SIZE; + + if (!low) + return false; + + return (low <= sp && sp < high); +} + +static inline bool on_task_stack(struct task_struct *tsk, unsigned long sp) +{ + unsigned long low = (unsigned long)task_stack_page(tsk); + unsigned long high = low + THREAD_SIZE; + + return (low <= sp && sp < high); +} + +#ifdef CONFIG_VMAP_STACK +DECLARE_PER_CPU(unsigned long [OVERFLOW_STACK_SIZE/sizeof(long)], overflow_stack); + +static inline bool on_overflow_stack(unsigned long sp) +{ + unsigned long low = (unsigned long)raw_cpu_ptr(overflow_stack); + unsigned long high = low + OVERFLOW_STACK_SIZE; + + return (low <= sp && sp < high); +} +#else +static inline bool on_overflow_stack(unsigned long sp) { return false; } +#endif + +/* + * We can only safely access per-cpu stacks from current in a non-preemptible + * context. 
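
The on_task_stack()/on_irq_stack()/on_overflow_stack() predicates above let the unwinder refuse to follow a frame pointer that points outside a stack it is allowed to read, rather than dereferencing a corrupt chain. A user-space analogue of that bounded walk; the frame layout mirrors the AArch64 frame record (saved fp, saved pc) and the bounds stand in for the ones the real predicates compute:

#include <stdint.h>
#include <stdio.h>

struct frame {
	uintptr_t fp;	/* saved frame pointer of the caller */
	uintptr_t pc;	/* saved return address */
};

static int on_stack(uintptr_t addr, uintptr_t low, uintptr_t high)
{
	return low <= addr && addr < high;
}

int main(void)
{
	struct frame frames[8] = { 0 };
	uintptr_t low = (uintptr_t)frames;
	uintptr_t high = low + sizeof(frames);

	/* Chain three records; a zero saved fp terminates the walk, in the spirit
	 * of the zeroed stackframe[] record that kernel_entry plants in pt_regs. */
	frames[0].fp = (uintptr_t)&frames[2];	frames[0].pc = 0x1111;
	frames[2].fp = (uintptr_t)&frames[5];	frames[2].pc = 0x2222;
	frames[5].fp = 0;			frames[5].pc = 0x3333;

	for (uintptr_t fp = (uintptr_t)&frames[0];
	     fp && on_stack(fp, low, high);
	     fp = ((struct frame *)fp)->fp)
		printf("pc = %#lx\n", (unsigned long)((struct frame *)fp)->pc);

	return 0;
}
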
+ */ +static inline bool on_accessible_stack(struct task_struct *tsk, unsigned long sp) +{ + if (on_task_stack(tsk, sp)) + return true; + if (tsk != current || preemptible()) + return false; + if (on_irq_stack(sp)) + return true; + if (on_overflow_stack(sp)) + return true; + + return false; +} + #endif /* __ASM_STACKTRACE_H */ diff --git a/arch/arm64/include/asm/string.h b/arch/arm64/include/asm/string.h index d0aa42907569..dd95d33a5bd5 100644 --- a/arch/arm64/include/asm/string.h +++ b/arch/arm64/include/asm/string.h @@ -52,6 +52,10 @@ extern void *__memset(void *, int, __kernel_size_t); #define __HAVE_ARCH_MEMCMP extern int memcmp(const void *, const void *, size_t); +#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE +#define __HAVE_ARCH_MEMCPY_FLUSHCACHE +void memcpy_flushcache(void *dst, const void *src, size_t cnt); +#endif #if defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__) diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 248339e4aaf5..f707fed5886f 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -329,6 +329,7 @@ #define ID_AA64ISAR1_LRCPC_SHIFT 20 #define ID_AA64ISAR1_FCMA_SHIFT 16 #define ID_AA64ISAR1_JSCVT_SHIFT 12 +#define ID_AA64ISAR1_DPB_SHIFT 0 /* id_aa64pfr0 */ #define ID_AA64PFR0_GIC_SHIFT 24 diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index 46c3b93cf865..ddded6497a8a 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -23,19 +23,11 @@ #include <linux/compiler.h> -#ifdef CONFIG_ARM64_4K_PAGES -#define THREAD_SIZE_ORDER 2 -#elif defined(CONFIG_ARM64_16K_PAGES) -#define THREAD_SIZE_ORDER 0 -#endif - -#define THREAD_SIZE 16384 -#define THREAD_START_SP (THREAD_SIZE - 16) - #ifndef __ASSEMBLY__ struct task_struct; +#include <asm/memory.h> #include <asm/stack_pointer.h> #include <asm/types.h> @@ -68,6 +60,9 @@ struct thread_info { #define thread_saved_fp(tsk) \ ((unsigned long)(tsk->thread.cpu_context.fp)) +void arch_setup_new_exec(void); +#define arch_setup_new_exec arch_setup_new_exec + #endif /* @@ -86,6 +81,7 @@ struct thread_info { #define TIF_NOTIFY_RESUME 2 /* callback before returning to user */ #define TIF_FOREIGN_FPSTATE 3 /* CPU's FP state is not current's */ #define TIF_UPROBE 4 /* uprobe breakpoint or singlestep */ +#define TIF_FSCHECK 5 /* Check FS is USER_DS on return */ #define TIF_NOHZ 7 #define TIF_SYSCALL_TRACE 8 #define TIF_SYSCALL_AUDIT 9 @@ -107,11 +103,12 @@ struct thread_info { #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) #define _TIF_SECCOMP (1 << TIF_SECCOMP) #define _TIF_UPROBE (1 << TIF_UPROBE) +#define _TIF_FSCHECK (1 << TIF_FSCHECK) #define _TIF_32BIT (1 << TIF_32BIT) #define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \ - _TIF_UPROBE) + _TIF_UPROBE | _TIF_FSCHECK) #define _TIF_SYSCALL_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \ diff --git a/arch/arm64/include/asm/traps.h b/arch/arm64/include/asm/traps.h index 02e9035b0685..d131501c6222 100644 --- a/arch/arm64/include/asm/traps.h +++ b/arch/arm64/include/asm/traps.h @@ -37,18 +37,11 @@ void unregister_undef_hook(struct undef_hook *hook); void arm64_notify_segfault(struct pt_regs *regs, unsigned long addr); -#ifdef CONFIG_FUNCTION_GRAPH_TRACER static inline int __in_irqentry_text(unsigned long ptr) { return ptr >= (unsigned long)&__irqentry_text_start && ptr < (unsigned long)&__irqentry_text_end; } -#else -static inline int 
__in_irqentry_text(unsigned long ptr) -{ - return 0; -} -#endif static inline int in_exception_text(unsigned long ptr) { @@ -60,4 +53,9 @@ static inline int in_exception_text(unsigned long ptr) return in ? : __in_irqentry_text(ptr); } +static inline int in_entry_text(unsigned long ptr) +{ + return ptr >= (unsigned long)&__entry_text_start && + ptr < (unsigned long)&__entry_text_end; +} #endif diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index fab46a0ea223..fc0f9eb66039 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -45,6 +45,9 @@ static inline void set_fs(mm_segment_t fs) { current_thread_info()->addr_limit = fs; + /* On user-mode return, check fs is correct */ + set_thread_flag(TIF_FSCHECK); + /* * Enable/disable UAO so that copy_to_user() etc can access * kernel memory with the unprivileged instructions. @@ -347,4 +350,16 @@ extern long strncpy_from_user(char *dest, const char __user *src, long count); extern __must_check long strnlen_user(const char __user *str, long n); +#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE +struct page; +void memcpy_page_flushcache(char *to, struct page *page, size_t offset, size_t len); +extern unsigned long __must_check __copy_user_flushcache(void *to, const void __user *from, unsigned long n); + +static inline int __copy_from_user_flushcache(void *dst, const void __user *src, unsigned size) +{ + kasan_check_write(dst, size); + return __copy_user_flushcache(dst, src, size); +} +#endif + #endif /* __ASM_UACCESS_H */ diff --git a/arch/arm64/include/uapi/asm/hwcap.h b/arch/arm64/include/uapi/asm/hwcap.h index 4e187ce2a811..4b9344cba83a 100644 --- a/arch/arm64/include/uapi/asm/hwcap.h +++ b/arch/arm64/include/uapi/asm/hwcap.h @@ -35,5 +35,6 @@ #define HWCAP_JSCVT (1 << 13) #define HWCAP_FCMA (1 << 14) #define HWCAP_LRCPC (1 << 15) +#define HWCAP_DCPOP (1 << 16) #endif /* _UAPI__ASM_HWCAP_H */ diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c index e25c11e727fe..b3162715ed78 100644 --- a/arch/arm64/kernel/acpi.c +++ b/arch/arm64/kernel/acpi.c @@ -95,7 +95,7 @@ static int __init dt_scan_depth1_nodes(unsigned long node, * __acpi_map_table() will be called before page_init(), so early_ioremap() * or early_memremap() should be called here to for ACPI table mapping. 
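
The new HWCAP_DCPOP bit above is exported to user space via the auxiliary vector, so applications can probe for DC CVAP (data cache clean to the point of persistence) support before selecting a persistent-memory code path. A small probe, assuming an AArch64 system with a libc that provides getauxval():

#include <stdio.h>
#include <sys/auxv.h>

#ifndef HWCAP_DCPOP
#define HWCAP_DCPOP	(1 << 16)	/* value added to uapi/asm/hwcap.h above */
#endif

int main(void)
{
	unsigned long hwcaps = getauxval(AT_HWCAP);

	printf("DC CVAP (dcpop): %s\n",
	       (hwcaps & HWCAP_DCPOP) ? "supported" : "not supported");
	return 0;
}
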
*/ -char *__init __acpi_map_table(unsigned long phys, unsigned long size) +void __init __iomem *__acpi_map_table(unsigned long phys, unsigned long size) { if (!size) return NULL; @@ -103,7 +103,7 @@ char *__init __acpi_map_table(unsigned long phys, unsigned long size) return early_memremap(phys, size); } -void __init __acpi_unmap_table(char *map, unsigned long size) +void __init __acpi_unmap_table(void __iomem *map, unsigned long size) { if (!map || !size) return; diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index b3bb7ef97bc8..71bf088f1e4b 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -75,6 +75,7 @@ int main(void) DEFINE(S_ORIG_X0, offsetof(struct pt_regs, orig_x0)); DEFINE(S_SYSCALLNO, offsetof(struct pt_regs, syscallno)); DEFINE(S_ORIG_ADDR_LIMIT, offsetof(struct pt_regs, orig_addr_limit)); + DEFINE(S_STACKFRAME, offsetof(struct pt_regs, stackframe)); DEFINE(S_FRAME_SIZE, sizeof(struct pt_regs)); BLANK(); DEFINE(MM_CONTEXT_ID, offsetof(struct mm_struct, context.id.counter)); diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 9f9e0064c8c1..cd52d365d1f0 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -120,6 +120,7 @@ static const struct arm64_ftr_bits ftr_id_aa64isar1[] = { ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64ISAR1_LRCPC_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64ISAR1_FCMA_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64ISAR1_JSCVT_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64ISAR1_DPB_SHIFT, 4, 0), ARM64_FTR_END, }; @@ -888,6 +889,17 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .min_field_value = 0, .matches = has_no_fpsimd, }, +#ifdef CONFIG_ARM64_PMEM + { + .desc = "Data cache clean to Point of Persistence", + .capability = ARM64_HAS_DCPOP, + .def_scope = SCOPE_SYSTEM, + .matches = has_cpuid_feature, + .sys_reg = SYS_ID_AA64ISAR1_EL1, + .field_pos = ID_AA64ISAR1_DPB_SHIFT, + .min_field_value = 1, + }, +#endif {}, }; @@ -916,6 +928,7 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_FP_SHIFT, FTR_SIGNED, 1, CAP_HWCAP, HWCAP_FPHP), HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_ASIMD_SHIFT, FTR_SIGNED, 0, CAP_HWCAP, HWCAP_ASIMD), HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_ASIMD_SHIFT, FTR_SIGNED, 1, CAP_HWCAP, HWCAP_ASIMDHP), + HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_DPB_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_DCPOP), HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_JSCVT_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_JSCVT), HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_FCMA_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_FCMA), HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_LRCPC_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_LRCPC), diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index f495ee5049fd..311885962830 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -68,6 +68,7 @@ static const char *const hwcap_str[] = { "jscvt", "fcma", "lrcpc", + "dcpop", NULL }; diff --git a/arch/arm64/kernel/entry-fpsimd.S b/arch/arm64/kernel/entry-fpsimd.S index c44a82f146b1..6a27cd6dbfa6 100644 --- a/arch/arm64/kernel/entry-fpsimd.S +++ b/arch/arm64/kernel/entry-fpsimd.S @@ -41,27 +41,3 @@ ENTRY(fpsimd_load_state) fpsimd_restore x0, 8 ret ENDPROC(fpsimd_load_state) - -#ifdef CONFIG_KERNEL_MODE_NEON - -/* - * Save the bottom n FP registers. 
- * - * x0 - pointer to struct fpsimd_partial_state - */ -ENTRY(fpsimd_save_partial_state) - fpsimd_save_partial x0, 1, 8, 9 - ret -ENDPROC(fpsimd_save_partial_state) - -/* - * Load the bottom n FP registers. - * - * x0 - pointer to struct fpsimd_partial_state - */ -ENTRY(fpsimd_load_partial_state) - fpsimd_restore_partial x0, 8, 9 - ret -ENDPROC(fpsimd_load_partial_state) - -#endif diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index b738880350f9..e1c59d4008a8 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -69,8 +69,55 @@ #define BAD_FIQ 2 #define BAD_ERROR 3 - .macro kernel_entry, el, regsize = 64 + .macro kernel_ventry label + .align 7 sub sp, sp, #S_FRAME_SIZE +#ifdef CONFIG_VMAP_STACK + /* + * Test whether the SP has overflowed, without corrupting a GPR. + * Task and IRQ stacks are aligned to (1 << THREAD_SHIFT). + */ + add sp, sp, x0 // sp' = sp + x0 + sub x0, sp, x0 // x0' = sp' - x0 = (sp + x0) - x0 = sp + tbnz x0, #THREAD_SHIFT, 0f + sub x0, sp, x0 // x0'' = sp' - x0' = (sp + x0) - sp = x0 + sub sp, sp, x0 // sp'' = sp' - x0 = (sp + x0) - x0 = sp + b \label + +0: + /* + * Either we've just detected an overflow, or we've taken an exception + * while on the overflow stack. Either way, we won't return to + * userspace, and can clobber EL0 registers to free up GPRs. + */ + + /* Stash the original SP (minus S_FRAME_SIZE) in tpidr_el0. */ + msr tpidr_el0, x0 + + /* Recover the original x0 value and stash it in tpidrro_el0 */ + sub x0, sp, x0 + msr tpidrro_el0, x0 + + /* Switch to the overflow stack */ + adr_this_cpu sp, overflow_stack + OVERFLOW_STACK_SIZE, x0 + + /* + * Check whether we were already on the overflow stack. This may happen + * after panic() re-enables interrupts. + */ + mrs x0, tpidr_el0 // sp of interrupted context + sub x0, sp, x0 // delta with top of overflow stack + tst x0, #~(OVERFLOW_STACK_SIZE - 1) // within range? + b.ne __bad_stack // no? -> bad stack pointer + + /* We were already on the overflow stack. Restore sp/x0 and carry on. */ + sub sp, sp, x0 + mrs x0, tpidrro_el0 +#endif + b \label + .endm + + .macro kernel_entry, el, regsize = 64 .if \regsize == 32 mov w0, w0 // zero upper 32 bits of x0 .endif @@ -111,6 +158,18 @@ mrs x23, spsr_el1 stp lr, x21, [sp, #S_LR] + /* + * In order to be able to dump the contents of struct pt_regs at the + * time the exception was taken (in case we attempt to walk the call + * stack later), chain it together with the stack frames. + */ + .if \el == 0 + stp xzr, xzr, [sp, #S_STACKFRAME] + .else + stp x29, x22, [sp, #S_STACKFRAME] + .endif + add x29, sp, #S_STACKFRAME + #ifdef CONFIG_ARM64_SW_TTBR0_PAN /* * Set the TTBR0 PAN bit in SPSR. When the exception is taken from @@ -138,12 +197,10 @@ alternative_else_nop_endif stp x22, x23, [sp, #S_PC] - /* - * Set syscallno to -1 by default (overridden later if real syscall). - */ + /* Not in a syscall by default (el0_svc overwrites for real syscall) */ .if \el == 0 - mvn x21, xzr - str x21, [sp, #S_SYSCALLNO] + mov w21, #NO_SYSCALL + str w21, [sp, #S_SYSCALLNO] .endif /* @@ -259,20 +316,12 @@ alternative_else_nop_endif and x25, x25, #~(THREAD_SIZE - 1) cbnz x25, 9998f - adr_this_cpu x25, irq_stack, x26 - mov x26, #IRQ_STACK_START_SP + ldr_this_cpu x25, irq_stack_ptr, x26 + mov x26, #IRQ_STACK_SIZE add x26, x25, x26 /* switch to the irq stack */ mov sp, x26 - - /* - * Add a dummy stack frame, this non-standard format is fixed up - * by unwind_frame() - */ - stp x29, x19, [sp, #-16]! 
- mov x29, sp - 9998: .endm @@ -290,8 +339,9 @@ alternative_else_nop_endif * * x7 is reserved for the system call number in 32-bit mode. */ -sc_nr .req x25 // number of system calls -scno .req x26 // syscall number +wsc_nr .req w25 // number of system calls +wscno .req w26 // syscall number +xscno .req x26 // syscall number (zero-extended) stbl .req x27 // syscall table pointer tsk .req x28 // current thread_info @@ -315,34 +365,62 @@ tsk .req x28 // current thread_info .align 11 ENTRY(vectors) - ventry el1_sync_invalid // Synchronous EL1t - ventry el1_irq_invalid // IRQ EL1t - ventry el1_fiq_invalid // FIQ EL1t - ventry el1_error_invalid // Error EL1t + kernel_ventry el1_sync_invalid // Synchronous EL1t + kernel_ventry el1_irq_invalid // IRQ EL1t + kernel_ventry el1_fiq_invalid // FIQ EL1t + kernel_ventry el1_error_invalid // Error EL1t - ventry el1_sync // Synchronous EL1h - ventry el1_irq // IRQ EL1h - ventry el1_fiq_invalid // FIQ EL1h - ventry el1_error_invalid // Error EL1h + kernel_ventry el1_sync // Synchronous EL1h + kernel_ventry el1_irq // IRQ EL1h + kernel_ventry el1_fiq_invalid // FIQ EL1h + kernel_ventry el1_error_invalid // Error EL1h - ventry el0_sync // Synchronous 64-bit EL0 - ventry el0_irq // IRQ 64-bit EL0 - ventry el0_fiq_invalid // FIQ 64-bit EL0 - ventry el0_error_invalid // Error 64-bit EL0 + kernel_ventry el0_sync // Synchronous 64-bit EL0 + kernel_ventry el0_irq // IRQ 64-bit EL0 + kernel_ventry el0_fiq_invalid // FIQ 64-bit EL0 + kernel_ventry el0_error_invalid // Error 64-bit EL0 #ifdef CONFIG_COMPAT - ventry el0_sync_compat // Synchronous 32-bit EL0 - ventry el0_irq_compat // IRQ 32-bit EL0 - ventry el0_fiq_invalid_compat // FIQ 32-bit EL0 - ventry el0_error_invalid_compat // Error 32-bit EL0 + kernel_ventry el0_sync_compat // Synchronous 32-bit EL0 + kernel_ventry el0_irq_compat // IRQ 32-bit EL0 + kernel_ventry el0_fiq_invalid_compat // FIQ 32-bit EL0 + kernel_ventry el0_error_invalid_compat // Error 32-bit EL0 #else - ventry el0_sync_invalid // Synchronous 32-bit EL0 - ventry el0_irq_invalid // IRQ 32-bit EL0 - ventry el0_fiq_invalid // FIQ 32-bit EL0 - ventry el0_error_invalid // Error 32-bit EL0 + kernel_ventry el0_sync_invalid // Synchronous 32-bit EL0 + kernel_ventry el0_irq_invalid // IRQ 32-bit EL0 + kernel_ventry el0_fiq_invalid // FIQ 32-bit EL0 + kernel_ventry el0_error_invalid // Error 32-bit EL0 #endif END(vectors) +#ifdef CONFIG_VMAP_STACK + /* + * We detected an overflow in kernel_ventry, which switched to the + * overflow stack. Stash the exception regs, and head to our overflow + * handler. + */ +__bad_stack: + /* Restore the original x0 value */ + mrs x0, tpidrro_el0 + + /* + * Store the original GPRs to the new stack. The orginal SP (minus + * S_FRAME_SIZE) was stashed in tpidr_el0 by kernel_ventry. 
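
The kernel_ventry overflow test above works because VMAP'd task and IRQ stacks are THREAD_SIZE in size but THREAD_ALIGN (2 * THREAD_SIZE) aligned: every SP inside the stack proper has bit THREAD_SHIFT clear, while any SP in the THREAD_SIZE region immediately below it has that bit set, so a single tbnz suffices. A stand-alone check of the arithmetic, using an illustrative THREAD_SHIFT of 14 (16 KiB stacks):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define THREAD_SHIFT	14UL			/* illustrative: 16 KiB stacks */
#define THREAD_SIZE	(1UL << THREAD_SHIFT)
#define THREAD_ALIGN	(2 * THREAD_SIZE)

static int sp_overflowed(uintptr_t sp)
{
	/* The single-bit test the entry code performs with tbnz. */
	return (sp & (1UL << THREAD_SHIFT)) != 0;
}

int main(void)
{
	uintptr_t base = 10 * THREAD_ALIGN;	/* any THREAD_ALIGN-aligned stack base */

	/* Inside the stack: bit THREAD_SHIFT is always clear... */
	for (uintptr_t sp = base; sp < base + THREAD_SIZE; sp += 64)
		assert(!sp_overflowed(sp));

	/* ...in the THREAD_SIZE region below it: always set. */
	for (uintptr_t sp = base - THREAD_SIZE; sp < base; sp += 64)
		assert(sp_overflowed(sp));

	puts("single-bit overflow test behaves as described");
	return 0;
}

Overruns larger than THREAD_SIZE are not caught by this bit test; they should instead fault on the unmapped vmalloc space surrounding the stack.
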
+ */ + sub sp, sp, #S_FRAME_SIZE + kernel_entry 1 + mrs x0, tpidr_el0 + add x0, x0, #S_FRAME_SIZE + str x0, [sp, #S_SP] + + /* Stash the regs for handle_bad_stack */ + mov x0, sp + + /* Time to die */ + bl handle_bad_stack + ASM_BUG() +#endif /* CONFIG_VMAP_STACK */ + /* * Invalid mode handlers */ @@ -351,7 +429,8 @@ END(vectors) mov x0, sp mov x1, #\reason mrs x2, esr_el1 - b bad_mode + bl bad_mode + ASM_BUG() .endm el0_sync_invalid: @@ -448,14 +527,16 @@ el1_sp_pc: mrs x0, far_el1 enable_dbg mov x2, sp - b do_sp_pc_abort + bl do_sp_pc_abort + ASM_BUG() el1_undef: /* * Undefined instruction */ enable_dbg mov x0, sp - b do_undefinstr + bl do_undefinstr + ASM_BUG() el1_dbg: /* * Debug exception handling @@ -473,7 +554,8 @@ el1_inv: mov x0, sp mov x2, x1 mov x1, #BAD_SYNC - b bad_mode + bl bad_mode + ASM_BUG() ENDPROC(el1_sync) .align 6 @@ -577,8 +659,8 @@ el0_svc_compat: * AArch32 syscall handling */ adrp stbl, compat_sys_call_table // load compat syscall table pointer - uxtw scno, w7 // syscall number in w7 (r7) - mov sc_nr, #__NR_compat_syscalls + mov wscno, w7 // syscall number in w7 (r7) + mov wsc_nr, #__NR_compat_syscalls b el0_svc_naked .align 6 @@ -707,38 +789,6 @@ el0_irq_naked: ENDPROC(el0_irq) /* - * Register switch for AArch64. The callee-saved registers need to be saved - * and restored. On entry: - * x0 = previous task_struct (must be preserved across the switch) - * x1 = next task_struct - * Previous and next are guaranteed not to be the same. - * - */ -ENTRY(cpu_switch_to) - mov x10, #THREAD_CPU_CONTEXT - add x8, x0, x10 - mov x9, sp - stp x19, x20, [x8], #16 // store callee-saved registers - stp x21, x22, [x8], #16 - stp x23, x24, [x8], #16 - stp x25, x26, [x8], #16 - stp x27, x28, [x8], #16 - stp x29, x9, [x8], #16 - str lr, [x8] - add x8, x1, x10 - ldp x19, x20, [x8], #16 // restore callee-saved registers - ldp x21, x22, [x8], #16 - ldp x23, x24, [x8], #16 - ldp x25, x26, [x8], #16 - ldp x27, x28, [x8], #16 - ldp x29, x9, [x8], #16 - ldr lr, [x8] - mov sp, x9 - msr sp_el0, x1 - ret -ENDPROC(cpu_switch_to) - -/* * This is the fast syscall return path. We do as little as possible here, * and this includes saving x0 back into the kernel stack. */ @@ -781,36 +831,24 @@ finish_ret_to_user: ENDPROC(ret_to_user) /* - * This is how we return from a fork. - */ -ENTRY(ret_from_fork) - bl schedule_tail - cbz x19, 1f // not a kernel thread - mov x0, x20 - blr x19 -1: get_thread_info tsk - b ret_to_user -ENDPROC(ret_from_fork) - -/* * SVC handler. */ .align 6 el0_svc: adrp stbl, sys_call_table // load syscall table pointer - uxtw scno, w8 // syscall number in w8 - mov sc_nr, #__NR_syscalls + mov wscno, w8 // syscall number in w8 + mov wsc_nr, #__NR_syscalls el0_svc_naked: // compat entry point - stp x0, scno, [sp, #S_ORIG_X0] // save the original x0 and syscall number + stp x0, xscno, [sp, #S_ORIG_X0] // save the original x0 and syscall number enable_dbg_and_irq ct_user_exit 1 ldr x16, [tsk, #TSK_TI_FLAGS] // check for syscall hooks tst x16, #_TIF_SYSCALL_WORK b.ne __sys_trace - cmp scno, sc_nr // check upper syscall limit + cmp wscno, wsc_nr // check upper syscall limit b.hs ni_sys - ldr x16, [stbl, scno, lsl #3] // address in the syscall table + ldr x16, [stbl, xscno, lsl #3] // address in the syscall table blr x16 // call sys_* routine b ret_fast_syscall ni_sys: @@ -824,24 +862,23 @@ ENDPROC(el0_svc) * switches, and waiting for our parent to respond. 
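
The syscall-number rework above (wscno/NO_SYSCALL) preserves the long-standing user-visible convention that a syscall number of -1 is never dispatched and is reported back as -ENOSYS. That convention can be observed directly from user space:

#define _GNU_SOURCE
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	errno = 0;
	long ret = syscall(-1);

	/* Expected: ret == -1 with errno == ENOSYS. */
	printf("syscall(-1) = %ld, errno = %d (%s)\n", ret, errno, strerror(errno));
	return 0;
}
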
*/ __sys_trace: - mov w0, #-1 // set default errno for - cmp scno, x0 // user-issued syscall(-1) + cmp wscno, #NO_SYSCALL // user-issued syscall(-1)? b.ne 1f - mov x0, #-ENOSYS + mov x0, #-ENOSYS // set default errno if so str x0, [sp, #S_X0] 1: mov x0, sp bl syscall_trace_enter - cmp w0, #-1 // skip the syscall? + cmp w0, #NO_SYSCALL // skip the syscall? b.eq __sys_trace_return_skipped - uxtw scno, w0 // syscall number (possibly new) + mov wscno, w0 // syscall number (possibly new) mov x1, sp // pointer to regs - cmp scno, sc_nr // check upper syscall limit + cmp wscno, wsc_nr // check upper syscall limit b.hs __ni_sys_trace ldp x0, x1, [sp] // restore the syscall args ldp x2, x3, [sp, #S_X2] ldp x4, x5, [sp, #S_X4] ldp x6, x7, [sp, #S_X6] - ldr x16, [stbl, scno, lsl #3] // address in the syscall table + ldr x16, [stbl, xscno, lsl #3] // address in the syscall table blr x16 // call sys_* routine __sys_trace_return: @@ -865,3 +902,49 @@ ENTRY(sys_rt_sigreturn_wrapper) mov x0, sp b sys_rt_sigreturn ENDPROC(sys_rt_sigreturn_wrapper) + +/* + * Register switch for AArch64. The callee-saved registers need to be saved + * and restored. On entry: + * x0 = previous task_struct (must be preserved across the switch) + * x1 = next task_struct + * Previous and next are guaranteed not to be the same. + * + */ +ENTRY(cpu_switch_to) + mov x10, #THREAD_CPU_CONTEXT + add x8, x0, x10 + mov x9, sp + stp x19, x20, [x8], #16 // store callee-saved registers + stp x21, x22, [x8], #16 + stp x23, x24, [x8], #16 + stp x25, x26, [x8], #16 + stp x27, x28, [x8], #16 + stp x29, x9, [x8], #16 + str lr, [x8] + add x8, x1, x10 + ldp x19, x20, [x8], #16 // restore callee-saved registers + ldp x21, x22, [x8], #16 + ldp x23, x24, [x8], #16 + ldp x25, x26, [x8], #16 + ldp x27, x28, [x8], #16 + ldp x29, x9, [x8], #16 + ldr lr, [x8] + mov sp, x9 + msr sp_el0, x1 + ret +ENDPROC(cpu_switch_to) +NOKPROBE(cpu_switch_to) + +/* + * This is how we return from a fork. + */ +ENTRY(ret_from_fork) + bl schedule_tail + cbz x19, 1f // not a kernel thread + mov x0, x20 + blr x19 +1: get_thread_info tsk + b ret_to_user +ENDPROC(ret_from_fork) +NOKPROBE(ret_from_fork) diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index c7b4995868e1..3a68cf38a6b3 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -17,16 +17,19 @@ * along with this program. If not, see <http://www.gnu.org/licenses/>. */ +#include <linux/bottom_half.h> #include <linux/cpu.h> #include <linux/cpu_pm.h> #include <linux/kernel.h> #include <linux/init.h> +#include <linux/percpu.h> +#include <linux/preempt.h> #include <linux/sched/signal.h> #include <linux/signal.h> -#include <linux/hardirq.h> #include <asm/fpsimd.h> #include <asm/cputype.h> +#include <asm/simd.h> #define FPEXC_IOF (1 << 0) #define FPEXC_DZF (1 << 1) @@ -62,6 +65,13 @@ * CPU currently contain the most recent userland FPSIMD state of the current * task. * + * In order to allow softirq handlers to use FPSIMD, kernel_neon_begin() may + * save the task's FPSIMD context back to task_struct from softirq context. + * To prevent this from racing with the manipulation of the task's FPSIMD state + * from task context and thereby corrupting the state, it is necessary to + * protect any manipulation of a task's fpsimd_state or TIF_FOREIGN_FPSTATE + * flag with local_bh_disable() unless softirqs are already masked. 
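
Given that kernel_neon_begin() can now save the current task's FPSIMD state itself and may be invoked from softirq context, in-kernel users are expected to gate NEON use on may_use_simd() and keep a scalar fallback for contexts where that returns false. A sketch of the expected calling pattern, assuming kernel (module) context; do_block_neon()/do_block_scalar() are hypothetical workers:

#include <asm/neon.h>
#include <asm/simd.h>
#include <linux/types.h>

/* Hypothetical NEON and scalar implementations of the same transform. */
static void do_block_neon(u8 *dst, const u8 *src, int len) { }
static void do_block_scalar(u8 *dst, const u8 *src, int len) { }

static void do_block(u8 *dst, const u8 *src, int len)
{
	if (may_use_simd()) {
		/*
		 * Pairs with kernel_neon_end(); preemption stays disabled in
		 * between, so keep the NEON section short.
		 */
		kernel_neon_begin();
		do_block_neon(dst, src, len);
		kernel_neon_end();
	} else {
		/* Hard IRQ context, or this CPU's NEON is already claimed. */
		do_block_scalar(dst, src, len);
	}
}
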
+ * * For a certain task, the sequence may look something like this: * - the task gets scheduled in; if both the task's fpsimd_state.cpu field * contains the id of the current CPU, and the CPU's fpsimd_last_state per-cpu @@ -161,11 +171,14 @@ void fpsimd_flush_thread(void) { if (!system_supports_fpsimd()) return; - preempt_disable(); + + local_bh_disable(); + memset(¤t->thread.fpsimd_state, 0, sizeof(struct fpsimd_state)); fpsimd_flush_task_state(current); set_thread_flag(TIF_FOREIGN_FPSTATE); - preempt_enable(); + + local_bh_enable(); } /* @@ -176,10 +189,13 @@ void fpsimd_preserve_current_state(void) { if (!system_supports_fpsimd()) return; - preempt_disable(); + + local_bh_disable(); + if (!test_thread_flag(TIF_FOREIGN_FPSTATE)) fpsimd_save_state(¤t->thread.fpsimd_state); - preempt_enable(); + + local_bh_enable(); } /* @@ -191,15 +207,18 @@ void fpsimd_restore_current_state(void) { if (!system_supports_fpsimd()) return; - preempt_disable(); + + local_bh_disable(); + if (test_and_clear_thread_flag(TIF_FOREIGN_FPSTATE)) { struct fpsimd_state *st = ¤t->thread.fpsimd_state; fpsimd_load_state(st); - this_cpu_write(fpsimd_last_state, st); + __this_cpu_write(fpsimd_last_state, st); st->cpu = smp_processor_id(); } - preempt_enable(); + + local_bh_enable(); } /* @@ -211,15 +230,18 @@ void fpsimd_update_current_state(struct fpsimd_state *state) { if (!system_supports_fpsimd()) return; - preempt_disable(); + + local_bh_disable(); + fpsimd_load_state(state); if (test_and_clear_thread_flag(TIF_FOREIGN_FPSTATE)) { struct fpsimd_state *st = ¤t->thread.fpsimd_state; - this_cpu_write(fpsimd_last_state, st); + __this_cpu_write(fpsimd_last_state, st); st->cpu = smp_processor_id(); } - preempt_enable(); + + local_bh_enable(); } /* @@ -232,52 +254,122 @@ void fpsimd_flush_task_state(struct task_struct *t) #ifdef CONFIG_KERNEL_MODE_NEON -static DEFINE_PER_CPU(struct fpsimd_partial_state, hardirq_fpsimdstate); -static DEFINE_PER_CPU(struct fpsimd_partial_state, softirq_fpsimdstate); +DEFINE_PER_CPU(bool, kernel_neon_busy); +EXPORT_PER_CPU_SYMBOL(kernel_neon_busy); /* * Kernel-side NEON support functions */ -void kernel_neon_begin_partial(u32 num_regs) + +/* + * kernel_neon_begin(): obtain the CPU FPSIMD registers for use by the calling + * context + * + * Must not be called unless may_use_simd() returns true. + * Task context in the FPSIMD registers is saved back to memory as necessary. + * + * A matching call to kernel_neon_end() must be made before returning from the + * calling context. + * + * The caller may freely use the FPSIMD registers until kernel_neon_end() is + * called. + */ +void kernel_neon_begin(void) { if (WARN_ON(!system_supports_fpsimd())) return; - if (in_interrupt()) { - struct fpsimd_partial_state *s = this_cpu_ptr( - in_irq() ? &hardirq_fpsimdstate : &softirq_fpsimdstate); - BUG_ON(num_regs > 32); - fpsimd_save_partial_state(s, roundup(num_regs, 2)); - } else { - /* - * Save the userland FPSIMD state if we have one and if we - * haven't done so already. Clear fpsimd_last_state to indicate - * that there is no longer userland FPSIMD state in the - * registers. 
- */ - preempt_disable(); - if (current->mm && - !test_and_set_thread_flag(TIF_FOREIGN_FPSTATE)) - fpsimd_save_state(¤t->thread.fpsimd_state); - this_cpu_write(fpsimd_last_state, NULL); - } + BUG_ON(!may_use_simd()); + + local_bh_disable(); + + __this_cpu_write(kernel_neon_busy, true); + + /* Save unsaved task fpsimd state, if any: */ + if (current->mm && !test_and_set_thread_flag(TIF_FOREIGN_FPSTATE)) + fpsimd_save_state(¤t->thread.fpsimd_state); + + /* Invalidate any task state remaining in the fpsimd regs: */ + __this_cpu_write(fpsimd_last_state, NULL); + + preempt_disable(); + + local_bh_enable(); } -EXPORT_SYMBOL(kernel_neon_begin_partial); +EXPORT_SYMBOL(kernel_neon_begin); +/* + * kernel_neon_end(): give the CPU FPSIMD registers back to the current task + * + * Must be called from a context in which kernel_neon_begin() was previously + * called, with no call to kernel_neon_end() in the meantime. + * + * The caller must not use the FPSIMD registers after this function is called, + * unless kernel_neon_begin() is called again in the meantime. + */ void kernel_neon_end(void) { + bool busy; + if (!system_supports_fpsimd()) return; - if (in_interrupt()) { - struct fpsimd_partial_state *s = this_cpu_ptr( - in_irq() ? &hardirq_fpsimdstate : &softirq_fpsimdstate); - fpsimd_load_partial_state(s); - } else { - preempt_enable(); - } + + busy = __this_cpu_xchg(kernel_neon_busy, false); + WARN_ON(!busy); /* No matching kernel_neon_begin()? */ + + preempt_enable(); } EXPORT_SYMBOL(kernel_neon_end); +static DEFINE_PER_CPU(struct fpsimd_state, efi_fpsimd_state); +static DEFINE_PER_CPU(bool, efi_fpsimd_state_used); + +/* + * EFI runtime services support functions + * + * The ABI for EFI runtime services allows EFI to use FPSIMD during the call. + * This means that for EFI (and only for EFI), we have to assume that FPSIMD + * is always used rather than being an optional accelerator. + * + * These functions provide the necessary support for ensuring FPSIMD + * save/restore in the contexts from which EFI is used. + * + * Do not use them for any other purpose -- if tempted to do so, you are + * either doing something wrong or you need to propose some refactoring. + */ + +/* + * __efi_fpsimd_begin(): prepare FPSIMD for making an EFI runtime services call + */ +void __efi_fpsimd_begin(void) +{ + if (!system_supports_fpsimd()) + return; + + WARN_ON(preemptible()); + + if (may_use_simd()) + kernel_neon_begin(); + else { + fpsimd_save_state(this_cpu_ptr(&efi_fpsimd_state)); + __this_cpu_write(efi_fpsimd_state_used, true); + } +} + +/* + * __efi_fpsimd_end(): clean up FPSIMD after an EFI runtime services call + */ +void __efi_fpsimd_end(void) +{ + if (!system_supports_fpsimd()) + return; + + if (__this_cpu_xchg(efi_fpsimd_state_used, false)) + fpsimd_load_state(this_cpu_ptr(&efi_fpsimd_state)); + else + kernel_neon_end(); +} + #endif /* CONFIG_KERNEL_MODE_NEON */ #ifdef CONFIG_CPU_PM diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index adb0910b88f5..7434ec0c7a27 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -143,8 +143,8 @@ preserve_boot_args: dmb sy // needed before dc ivac with // MMU off - add x1, x0, #0x20 // 4 x 8 bytes - b __inval_cache_range // tail call + mov x1, #0x20 // 4 x 8 bytes + b __inval_dcache_area // tail call ENDPROC(preserve_boot_args) /* @@ -221,20 +221,20 @@ __create_page_tables: * dirty cache lines being evicted. 
*/ adrp x0, idmap_pg_dir - adrp x1, swapper_pg_dir + SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE - bl __inval_cache_range + ldr x1, =(IDMAP_DIR_SIZE + SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE) + bl __inval_dcache_area /* * Clear the idmap and swapper page tables. */ adrp x0, idmap_pg_dir - adrp x6, swapper_pg_dir + SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE + ldr x1, =(IDMAP_DIR_SIZE + SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE) 1: stp xzr, xzr, [x0], #16 stp xzr, xzr, [x0], #16 stp xzr, xzr, [x0], #16 stp xzr, xzr, [x0], #16 - cmp x0, x6 - b.lo 1b + subs x1, x1, #64 + b.ne 1b mov x7, SWAPPER_MM_MMUFLAGS @@ -307,9 +307,9 @@ __create_page_tables: * tables again to remove any speculatively loaded cache lines. */ adrp x0, idmap_pg_dir - adrp x1, swapper_pg_dir + SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE + ldr x1, =(IDMAP_DIR_SIZE + SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE) dmb sy - bl __inval_cache_range + bl __inval_dcache_area ret x28 ENDPROC(__create_page_tables) @@ -361,6 +361,9 @@ __primary_switched: ret // to __primary_switch() 0: #endif + add sp, sp, #16 + mov x29, #0 + mov x30, #0 b start_kernel ENDPROC(__primary_switched) @@ -616,6 +619,7 @@ __secondary_switched: ldr x2, [x0, #CPU_BOOT_TASK] msr sp_el0, x2 mov x29, #0 + mov x30, #0 b secondary_start_kernel ENDPROC(__secondary_switched) diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c index a44e13942d30..095d3c170f5d 100644 --- a/arch/arm64/kernel/hibernate.c +++ b/arch/arm64/kernel/hibernate.c @@ -330,7 +330,7 @@ static void _copy_pte(pte_t *dst_pte, pte_t *src_pte, unsigned long addr) * read only (code, rodata). Clear the RDONLY bit from * the temporary mappings we use during restore. */ - set_pte(dst_pte, pte_clear_rdonly(pte)); + set_pte(dst_pte, pte_mkwrite(pte)); } else if (debug_pagealloc_enabled() && !pte_none(pte)) { /* * debug_pagealloc will removed the PTE_VALID bit if @@ -343,7 +343,7 @@ static void _copy_pte(pte_t *dst_pte, pte_t *src_pte, unsigned long addr) */ BUG_ON(!pfn_valid(pte_pfn(pte))); - set_pte(dst_pte, pte_mkpresent(pte_clear_rdonly(pte))); + set_pte(dst_pte, pte_mkpresent(pte_mkwrite(pte))); } } diff --git a/arch/arm64/kernel/irq.c b/arch/arm64/kernel/irq.c index 2386b26c0712..713561e5bcab 100644 --- a/arch/arm64/kernel/irq.c +++ b/arch/arm64/kernel/irq.c @@ -23,15 +23,16 @@ #include <linux/kernel_stat.h> #include <linux/irq.h> +#include <linux/memory.h> #include <linux/smp.h> #include <linux/init.h> #include <linux/irqchip.h> #include <linux/seq_file.h> +#include <linux/vmalloc.h> unsigned long irq_err_count; -/* irq stack only needs to be 16 byte aligned - not IRQ_STACK_SIZE aligned. */ -DEFINE_PER_CPU(unsigned long [IRQ_STACK_SIZE/sizeof(long)], irq_stack) __aligned(16); +DEFINE_PER_CPU(unsigned long *, irq_stack_ptr); int arch_show_interrupts(struct seq_file *p, int prec) { @@ -50,8 +51,43 @@ void __init set_handle_irq(void (*handle_irq)(struct pt_regs *)) handle_arch_irq = handle_irq; } +#ifdef CONFIG_VMAP_STACK +static void init_irq_stacks(void) +{ + int cpu; + unsigned long *p; + + for_each_possible_cpu(cpu) { + /* + * To ensure that VMAP'd stack overflow detection works + * correctly, the IRQ stacks need to have the same + * alignment as other stacks. + */ + p = __vmalloc_node_range(IRQ_STACK_SIZE, THREAD_ALIGN, + VMALLOC_START, VMALLOC_END, + THREADINFO_GFP, PAGE_KERNEL, + 0, cpu_to_node(cpu), + __builtin_return_address(0)); + + per_cpu(irq_stack_ptr, cpu) = p; + } +} +#else +/* irq stack only needs to be 16 byte aligned - not IRQ_STACK_SIZE aligned. 
*/ +DEFINE_PER_CPU_ALIGNED(unsigned long [IRQ_STACK_SIZE/sizeof(long)], irq_stack); + +static void init_irq_stacks(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + per_cpu(irq_stack_ptr, cpu) = per_cpu(irq_stack, cpu); +} +#endif + void __init init_IRQ(void) { + init_irq_stacks(); irqchip_init(); if (!handle_arch_irq) panic("No interrupt controller found."); diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c index 481f54a866c5..11121f608eb5 100644 --- a/arch/arm64/kernel/machine_kexec.c +++ b/arch/arm64/kernel/machine_kexec.c @@ -252,7 +252,7 @@ void machine_crash_shutdown(struct pt_regs *regs) local_irq_disable(); /* shutdown non-crashing cpus */ - smp_send_crash_stop(); + crash_smp_send_stop(); /* for crashing cpu */ crash_save_cpu(regs, smp_processor_id()); diff --git a/arch/arm64/kernel/perf_callchain.c b/arch/arm64/kernel/perf_callchain.c index 713ca824f266..bcafd7dcfe8b 100644 --- a/arch/arm64/kernel/perf_callchain.c +++ b/arch/arm64/kernel/perf_callchain.c @@ -162,7 +162,6 @@ void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, } frame.fp = regs->regs[29]; - frame.sp = regs->sp; frame.pc = regs->pc; #ifdef CONFIG_FUNCTION_GRAPH_TRACER frame.graph = current->curr_ret_stack; diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c index b5798ba21189..9eaef51f83ff 100644 --- a/arch/arm64/kernel/perf_event.c +++ b/arch/arm64/kernel/perf_event.c @@ -202,55 +202,6 @@ static const unsigned armv8_pmuv3_perf_map[PERF_COUNT_HW_MAX] = { [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = ARMV8_PMUV3_PERFCTR_STALL_BACKEND, }; -/* ARM Cortex-A53 HW events mapping. */ -static const unsigned armv8_a53_perf_map[PERF_COUNT_HW_MAX] = { - PERF_MAP_ALL_UNSUPPORTED, - [PERF_COUNT_HW_CPU_CYCLES] = ARMV8_PMUV3_PERFCTR_CPU_CYCLES, - [PERF_COUNT_HW_INSTRUCTIONS] = ARMV8_PMUV3_PERFCTR_INST_RETIRED, - [PERF_COUNT_HW_CACHE_REFERENCES] = ARMV8_PMUV3_PERFCTR_L1D_CACHE, - [PERF_COUNT_HW_CACHE_MISSES] = ARMV8_PMUV3_PERFCTR_L1D_CACHE_REFILL, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = ARMV8_PMUV3_PERFCTR_PC_WRITE_RETIRED, - [PERF_COUNT_HW_BRANCH_MISSES] = ARMV8_PMUV3_PERFCTR_BR_MIS_PRED, - [PERF_COUNT_HW_BUS_CYCLES] = ARMV8_PMUV3_PERFCTR_BUS_CYCLES, -}; - -/* ARM Cortex-A57 and Cortex-A72 events mapping. 
*/ -static const unsigned armv8_a57_perf_map[PERF_COUNT_HW_MAX] = { - PERF_MAP_ALL_UNSUPPORTED, - [PERF_COUNT_HW_CPU_CYCLES] = ARMV8_PMUV3_PERFCTR_CPU_CYCLES, - [PERF_COUNT_HW_INSTRUCTIONS] = ARMV8_PMUV3_PERFCTR_INST_RETIRED, - [PERF_COUNT_HW_CACHE_REFERENCES] = ARMV8_PMUV3_PERFCTR_L1D_CACHE, - [PERF_COUNT_HW_CACHE_MISSES] = ARMV8_PMUV3_PERFCTR_L1D_CACHE_REFILL, - [PERF_COUNT_HW_BRANCH_MISSES] = ARMV8_PMUV3_PERFCTR_BR_MIS_PRED, - [PERF_COUNT_HW_BUS_CYCLES] = ARMV8_PMUV3_PERFCTR_BUS_CYCLES, -}; - -static const unsigned armv8_thunder_perf_map[PERF_COUNT_HW_MAX] = { - PERF_MAP_ALL_UNSUPPORTED, - [PERF_COUNT_HW_CPU_CYCLES] = ARMV8_PMUV3_PERFCTR_CPU_CYCLES, - [PERF_COUNT_HW_INSTRUCTIONS] = ARMV8_PMUV3_PERFCTR_INST_RETIRED, - [PERF_COUNT_HW_CACHE_REFERENCES] = ARMV8_PMUV3_PERFCTR_L1D_CACHE, - [PERF_COUNT_HW_CACHE_MISSES] = ARMV8_PMUV3_PERFCTR_L1D_CACHE_REFILL, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = ARMV8_PMUV3_PERFCTR_PC_WRITE_RETIRED, - [PERF_COUNT_HW_BRANCH_MISSES] = ARMV8_PMUV3_PERFCTR_BR_MIS_PRED, - [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = ARMV8_PMUV3_PERFCTR_STALL_FRONTEND, - [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = ARMV8_PMUV3_PERFCTR_STALL_BACKEND, -}; - -/* Broadcom Vulcan events mapping */ -static const unsigned armv8_vulcan_perf_map[PERF_COUNT_HW_MAX] = { - PERF_MAP_ALL_UNSUPPORTED, - [PERF_COUNT_HW_CPU_CYCLES] = ARMV8_PMUV3_PERFCTR_CPU_CYCLES, - [PERF_COUNT_HW_INSTRUCTIONS] = ARMV8_PMUV3_PERFCTR_INST_RETIRED, - [PERF_COUNT_HW_CACHE_REFERENCES] = ARMV8_PMUV3_PERFCTR_L1D_CACHE, - [PERF_COUNT_HW_CACHE_MISSES] = ARMV8_PMUV3_PERFCTR_L1D_CACHE_REFILL, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = ARMV8_PMUV3_PERFCTR_BR_RETIRED, - [PERF_COUNT_HW_BRANCH_MISSES] = ARMV8_PMUV3_PERFCTR_BR_MIS_PRED, - [PERF_COUNT_HW_BUS_CYCLES] = ARMV8_PMUV3_PERFCTR_BUS_CYCLES, - [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = ARMV8_PMUV3_PERFCTR_STALL_FRONTEND, - [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = ARMV8_PMUV3_PERFCTR_STALL_BACKEND, -}; - static const unsigned armv8_pmuv3_perf_cache_map[PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX] = { @@ -281,27 +232,10 @@ static const unsigned armv8_a53_perf_cache_map[PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX] = { PERF_CACHE_MAP_ALL_UNSUPPORTED, - [C(L1D)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_PMUV3_PERFCTR_L1D_CACHE, - [C(L1D)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_L1D_CACHE_REFILL, - [C(L1D)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_PMUV3_PERFCTR_L1D_CACHE, - [C(L1D)][C(OP_WRITE)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_L1D_CACHE_REFILL, [C(L1D)][C(OP_PREFETCH)][C(RESULT_MISS)] = ARMV8_A53_PERFCTR_PREF_LINEFILL, - [C(L1I)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_PMUV3_PERFCTR_L1I_CACHE, - [C(L1I)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_L1I_CACHE_REFILL, - - [C(LL)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_PMUV3_PERFCTR_L2D_CACHE, - [C(LL)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_L2D_CACHE_REFILL, - [C(LL)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_PMUV3_PERFCTR_L2D_CACHE, - [C(LL)][C(OP_WRITE)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_L2D_CACHE_REFILL, - - [C(DTLB)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_L1D_TLB_REFILL, - [C(ITLB)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_L1I_TLB_REFILL, - - [C(BPU)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_PMUV3_PERFCTR_BR_PRED, - [C(BPU)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_BR_MIS_PRED, - [C(BPU)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_PMUV3_PERFCTR_BR_PRED, - [C(BPU)][C(OP_WRITE)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_BR_MIS_PRED, + 
[C(NODE)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_RD, + [C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_WR, }; static const unsigned armv8_a57_perf_cache_map[PERF_COUNT_HW_CACHE_MAX] @@ -314,18 +248,26 @@ static const unsigned armv8_a57_perf_cache_map[PERF_COUNT_HW_CACHE_MAX] [C(L1D)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_L1D_CACHE_WR, [C(L1D)][C(OP_WRITE)][C(RESULT_MISS)] = ARMV8_IMPDEF_PERFCTR_L1D_CACHE_REFILL_WR, - [C(L1I)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_PMUV3_PERFCTR_L1I_CACHE, - [C(L1I)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_L1I_CACHE_REFILL, - [C(DTLB)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_IMPDEF_PERFCTR_L1D_TLB_REFILL_RD, [C(DTLB)][C(OP_WRITE)][C(RESULT_MISS)] = ARMV8_IMPDEF_PERFCTR_L1D_TLB_REFILL_WR, - [C(ITLB)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_L1I_TLB_REFILL, + [C(NODE)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_RD, + [C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_WR, +}; - [C(BPU)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_PMUV3_PERFCTR_BR_PRED, - [C(BPU)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_BR_MIS_PRED, - [C(BPU)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_PMUV3_PERFCTR_BR_PRED, - [C(BPU)][C(OP_WRITE)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_BR_MIS_PRED, +static const unsigned armv8_a73_perf_cache_map[PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = { + PERF_CACHE_MAP_ALL_UNSUPPORTED, + + [C(L1D)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_L1D_CACHE_RD, + [C(L1D)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_L1D_CACHE_WR, + + [C(NODE)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_RD, + [C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_WR, + + [C(NODE)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_RD, + [C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_WR, }; static const unsigned armv8_thunder_perf_cache_map[PERF_COUNT_HW_CACHE_MAX] @@ -340,8 +282,6 @@ static const unsigned armv8_thunder_perf_cache_map[PERF_COUNT_HW_CACHE_MAX] [C(L1D)][C(OP_PREFETCH)][C(RESULT_ACCESS)] = ARMV8_THUNDER_PERFCTR_L1D_CACHE_PREF_ACCESS, [C(L1D)][C(OP_PREFETCH)][C(RESULT_MISS)] = ARMV8_THUNDER_PERFCTR_L1D_CACHE_PREF_MISS, - [C(L1I)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_PMUV3_PERFCTR_L1I_CACHE, - [C(L1I)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_L1I_CACHE_REFILL, [C(L1I)][C(OP_PREFETCH)][C(RESULT_ACCESS)] = ARMV8_THUNDER_PERFCTR_L1I_CACHE_PREF_ACCESS, [C(L1I)][C(OP_PREFETCH)][C(RESULT_MISS)] = ARMV8_THUNDER_PERFCTR_L1I_CACHE_PREF_MISS, @@ -349,13 +289,6 @@ static const unsigned armv8_thunder_perf_cache_map[PERF_COUNT_HW_CACHE_MAX] [C(DTLB)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_IMPDEF_PERFCTR_L1D_TLB_REFILL_RD, [C(DTLB)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_L1D_TLB_WR, [C(DTLB)][C(OP_WRITE)][C(RESULT_MISS)] = ARMV8_IMPDEF_PERFCTR_L1D_TLB_REFILL_WR, - - [C(ITLB)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_L1I_TLB_REFILL, - - [C(BPU)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_PMUV3_PERFCTR_BR_PRED, - [C(BPU)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_BR_MIS_PRED, - [C(BPU)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_PMUV3_PERFCTR_BR_PRED, - [C(BPU)][C(OP_WRITE)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_BR_MIS_PRED, }; static const unsigned armv8_vulcan_perf_cache_map[PERF_COUNT_HW_CACHE_MAX] @@ -368,22 +301,11 @@ static const unsigned 
armv8_vulcan_perf_cache_map[PERF_COUNT_HW_CACHE_MAX] [C(L1D)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_L1D_CACHE_WR, [C(L1D)][C(OP_WRITE)][C(RESULT_MISS)] = ARMV8_IMPDEF_PERFCTR_L1D_CACHE_REFILL_WR, - [C(L1I)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_PMUV3_PERFCTR_L1I_CACHE, - [C(L1I)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_L1I_CACHE_REFILL, - - [C(ITLB)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_L1I_TLB_REFILL, - [C(ITLB)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_PMUV3_PERFCTR_L1I_TLB, - [C(DTLB)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_L1D_TLB_RD, [C(DTLB)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_L1D_TLB_WR, [C(DTLB)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_IMPDEF_PERFCTR_L1D_TLB_REFILL_RD, [C(DTLB)][C(OP_WRITE)][C(RESULT_MISS)] = ARMV8_IMPDEF_PERFCTR_L1D_TLB_REFILL_WR, - [C(BPU)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_PMUV3_PERFCTR_BR_PRED, - [C(BPU)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_BR_MIS_PRED, - [C(BPU)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_PMUV3_PERFCTR_BR_PRED, - [C(BPU)][C(OP_WRITE)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_BR_MIS_PRED, - [C(NODE)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_RD, [C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_WR, }; @@ -846,17 +768,14 @@ static int armv8pmu_get_event_idx(struct pmu_hw_events *cpuc, struct hw_perf_event *hwc = &event->hw; unsigned long evtype = hwc->config_base & ARMV8_PMU_EVTYPE_EVENT; - /* Always place a cycle counter into the cycle counter. */ + /* Always prefer to place a cycle counter into the cycle counter. */ if (evtype == ARMV8_PMUV3_PERFCTR_CPU_CYCLES) { - if (test_and_set_bit(ARMV8_IDX_CYCLE_COUNTER, cpuc->used_mask)) - return -EAGAIN; - - return ARMV8_IDX_CYCLE_COUNTER; + if (!test_and_set_bit(ARMV8_IDX_CYCLE_COUNTER, cpuc->used_mask)) + return ARMV8_IDX_CYCLE_COUNTER; } /* - * For anything other than a cycle counter, try and use - * the events counters + * Otherwise use events counters */ for (idx = ARMV8_IDX_COUNTER0; idx < cpu_pmu->num_events; ++idx) { if (!test_and_set_bit(idx, cpuc->used_mask)) @@ -924,7 +843,13 @@ static void armv8pmu_reset(void *info) ARMV8_PMU_PMCR_LC); } -static int armv8_pmuv3_map_event(struct perf_event *event) +static int __armv8_pmuv3_map_event(struct perf_event *event, + const unsigned (*extra_event_map) + [PERF_COUNT_HW_MAX], + const unsigned (*extra_cache_map) + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX]) { int hw_event_id; struct arm_pmu *armpmu = to_arm_pmu(event->pmu); @@ -932,44 +857,47 @@ static int armv8_pmuv3_map_event(struct perf_event *event) hw_event_id = armpmu_map_event(event, &armv8_pmuv3_perf_map, &armv8_pmuv3_perf_cache_map, ARMV8_PMU_EVTYPE_EVENT); - if (hw_event_id < 0) - return hw_event_id; - /* disable micro/arch events not supported by this PMU */ - if ((hw_event_id < ARMV8_PMUV3_MAX_COMMON_EVENTS) && - !test_bit(hw_event_id, armpmu->pmceid_bitmap)) { - return -EOPNOTSUPP; + /* Onl expose micro/arch events supported by this PMU */ + if ((hw_event_id > 0) && (hw_event_id < ARMV8_PMUV3_MAX_COMMON_EVENTS) + && test_bit(hw_event_id, armpmu->pmceid_bitmap)) { + return hw_event_id; } - return hw_event_id; + return armpmu_map_event(event, extra_event_map, extra_cache_map, + ARMV8_PMU_EVTYPE_EVENT); +} + +static int armv8_pmuv3_map_event(struct perf_event *event) +{ + return __armv8_pmuv3_map_event(event, NULL, NULL); } static int armv8_a53_map_event(struct perf_event *event) { - return armpmu_map_event(event, 
&armv8_a53_perf_map, - &armv8_a53_perf_cache_map, - ARMV8_PMU_EVTYPE_EVENT); + return __armv8_pmuv3_map_event(event, NULL, &armv8_a53_perf_cache_map); } static int armv8_a57_map_event(struct perf_event *event) { - return armpmu_map_event(event, &armv8_a57_perf_map, - &armv8_a57_perf_cache_map, - ARMV8_PMU_EVTYPE_EVENT); + return __armv8_pmuv3_map_event(event, NULL, &armv8_a57_perf_cache_map); +} + +static int armv8_a73_map_event(struct perf_event *event) +{ + return __armv8_pmuv3_map_event(event, NULL, &armv8_a73_perf_cache_map); } static int armv8_thunder_map_event(struct perf_event *event) { - return armpmu_map_event(event, &armv8_thunder_perf_map, - &armv8_thunder_perf_cache_map, - ARMV8_PMU_EVTYPE_EVENT); + return __armv8_pmuv3_map_event(event, NULL, + &armv8_thunder_perf_cache_map); } static int armv8_vulcan_map_event(struct perf_event *event) { - return armpmu_map_event(event, &armv8_vulcan_perf_map, - &armv8_vulcan_perf_cache_map, - ARMV8_PMU_EVTYPE_EVENT); + return __armv8_pmuv3_map_event(event, NULL, + &armv8_vulcan_perf_cache_map); } struct armv8pmu_probe_info { @@ -1062,6 +990,22 @@ static int armv8_pmuv3_init(struct arm_pmu *cpu_pmu) return 0; } +static int armv8_a35_pmu_init(struct arm_pmu *cpu_pmu) +{ + int ret = armv8_pmu_init(cpu_pmu); + if (ret) + return ret; + + cpu_pmu->name = "armv8_cortex_a35"; + cpu_pmu->map_event = armv8_a53_map_event; + cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_EVENTS] = + &armv8_pmuv3_events_attr_group; + cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_FORMATS] = + &armv8_pmuv3_format_attr_group; + + return 0; +} + static int armv8_a53_pmu_init(struct arm_pmu *cpu_pmu) { int ret = armv8_pmu_init(cpu_pmu); @@ -1110,6 +1054,22 @@ static int armv8_a72_pmu_init(struct arm_pmu *cpu_pmu) return 0; } +static int armv8_a73_pmu_init(struct arm_pmu *cpu_pmu) +{ + int ret = armv8_pmu_init(cpu_pmu); + if (ret) + return ret; + + cpu_pmu->name = "armv8_cortex_a73"; + cpu_pmu->map_event = armv8_a73_map_event; + cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_EVENTS] = + &armv8_pmuv3_events_attr_group; + cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_FORMATS] = + &armv8_pmuv3_format_attr_group; + + return 0; +} + static int armv8_thunder_pmu_init(struct arm_pmu *cpu_pmu) { int ret = armv8_pmu_init(cpu_pmu); @@ -1144,9 +1104,11 @@ static int armv8_vulcan_pmu_init(struct arm_pmu *cpu_pmu) static const struct of_device_id armv8_pmu_of_device_ids[] = { {.compatible = "arm,armv8-pmuv3", .data = armv8_pmuv3_init}, + {.compatible = "arm,cortex-a35-pmu", .data = armv8_a35_pmu_init}, {.compatible = "arm,cortex-a53-pmu", .data = armv8_a53_pmu_init}, {.compatible = "arm,cortex-a57-pmu", .data = armv8_a57_pmu_init}, {.compatible = "arm,cortex-a72-pmu", .data = armv8_a72_pmu_init}, + {.compatible = "arm,cortex-a73-pmu", .data = armv8_a73_pmu_init}, {.compatible = "cavium,thunder-pmu", .data = armv8_thunder_pmu_init}, {.compatible = "brcm,vulcan-pmu", .data = armv8_vulcan_pmu_init}, {}, diff --git a/arch/arm64/kernel/probes/uprobes.c b/arch/arm64/kernel/probes/uprobes.c index 26c998534dca..636ca0119c0e 100644 --- a/arch/arm64/kernel/probes/uprobes.c +++ b/arch/arm64/kernel/probes/uprobes.c @@ -40,7 +40,7 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, probe_opcode_t insn; /* TODO: Currently we do not support AARCH32 instruction probing */ - if (test_bit(TIF_32BIT, &mm->context.flags)) + if (mm->context.flags & MMCF_AARCH32) return -ENOTSUPP; else if (!IS_ALIGNED(addr, AARCH64_INSN_SIZE)) return -EINVAL; diff --git a/arch/arm64/kernel/process.c 
b/arch/arm64/kernel/process.c index 659ae8094ed5..2dc0f8482210 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -360,6 +360,8 @@ __notrace_funcgraph struct task_struct *__switch_to(struct task_struct *prev, /* * Complete any pending TLB or cache maintenance on this CPU in case * the thread migrates to a different CPU. + * This full barrier is also required by the membarrier system + * call. */ dsb(ish); @@ -382,15 +384,12 @@ unsigned long get_wchan(struct task_struct *p) return 0; frame.fp = thread_saved_fp(p); - frame.sp = thread_saved_sp(p); frame.pc = thread_saved_pc(p); #ifdef CONFIG_FUNCTION_GRAPH_TRACER frame.graph = p->curr_ret_stack; #endif do { - if (frame.sp < stack_page || - frame.sp >= stack_page + THREAD_SIZE || - unwind_frame(p, &frame)) + if (unwind_frame(p, &frame)) goto out; if (!in_sched_functions(frame.pc)) { ret = frame.pc; @@ -417,3 +416,11 @@ unsigned long arch_randomize_brk(struct mm_struct *mm) else return randomize_page(mm->brk, SZ_1G); } + +/* + * Called from setup_new_exec() after (COMPAT_)SET_PERSONALITY. + */ +void arch_setup_new_exec(void) +{ + current->mm->context.flags = is_compat_task() ? MMCF_AARCH32 : 0; +} diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index 1b38c0150aec..9cbb6123208f 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -42,6 +42,7 @@ #include <asm/compat.h> #include <asm/debug-monitors.h> #include <asm/pgtable.h> +#include <asm/stacktrace.h> #include <asm/syscall.h> #include <asm/traps.h> #include <asm/system_misc.h> @@ -127,7 +128,7 @@ static bool regs_within_kernel_stack(struct pt_regs *regs, unsigned long addr) { return ((addr & ~(THREAD_SIZE - 1)) == (kernel_stack_pointer(regs) & ~(THREAD_SIZE - 1))) || - on_irq_stack(addr, raw_smp_processor_id()); + on_irq_stack(addr); } /** @@ -1363,7 +1364,7 @@ static void tracehook_report_syscall(struct pt_regs *regs, if (dir == PTRACE_SYSCALL_EXIT) tracehook_report_syscall_exit(regs, 0); else if (tracehook_report_syscall_entry(regs)) - regs->syscallno = ~0UL; + forget_syscall(regs); regs->regs[regno] = saved_reg; } diff --git a/arch/arm64/kernel/return_address.c b/arch/arm64/kernel/return_address.c index 12a87f2600f2..933adbc0f654 100644 --- a/arch/arm64/kernel/return_address.c +++ b/arch/arm64/kernel/return_address.c @@ -42,7 +42,6 @@ void *return_address(unsigned int level) data.addr = NULL; frame.fp = (unsigned long)__builtin_frame_address(0); - frame.sp = current_stack_pointer; frame.pc = (unsigned long)return_address; /* dummy */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER frame.graph = current->curr_ret_stack; diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index 089c3747995d..c45214f8fb54 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -29,6 +29,7 @@ #include <linux/string.h> #include <linux/tracehook.h> #include <linux/ratelimit.h> +#include <linux/syscalls.h> #include <asm/debug-monitors.h> #include <asm/elf.h> @@ -36,6 +37,7 @@ #include <asm/ucontext.h> #include <asm/unistd.h> #include <asm/fpsimd.h> +#include <asm/ptrace.h> #include <asm/signal32.h> #include <asm/vdso.h> @@ -387,7 +389,7 @@ static int restore_sigframe(struct pt_regs *regs, /* * Avoid sys_rt_sigreturn() restarting. 
*/ - regs->syscallno = ~0UL; + forget_syscall(regs); err |= !valid_user_regs(®s->user_regs, current); if (err == 0) @@ -673,13 +675,12 @@ static void do_signal(struct pt_regs *regs) { unsigned long continue_addr = 0, restart_addr = 0; int retval = 0; - int syscall = (int)regs->syscallno; struct ksignal ksig; /* * If we were from a system call, check for system call restarting... */ - if (syscall >= 0) { + if (in_syscall(regs)) { continue_addr = regs->pc; restart_addr = continue_addr - (compat_thumb_mode(regs) ? 2 : 4); retval = regs->regs[0]; @@ -687,7 +688,7 @@ static void do_signal(struct pt_regs *regs) /* * Avoid additional syscall restarting via ret_to_user. */ - regs->syscallno = ~0UL; + forget_syscall(regs); /* * Prepare for system call restart. We do this here so that a @@ -731,7 +732,7 @@ static void do_signal(struct pt_regs *regs) * Handle restarting a different system call. As above, if a debugger * has chosen to restart at a different PC, ignore the restart. */ - if (syscall >= 0 && regs->pc == restart_addr) { + if (in_syscall(regs) && regs->pc == restart_addr) { if (retval == -ERESTART_RESTARTBLOCK) setup_restart_syscall(regs); user_rewind_single_step(current); @@ -749,6 +750,10 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, * Update the trace code with the current status. */ trace_hardirqs_off(); + + /* Check valid user FS if needed */ + addr_limit_user_check(); + do { if (thread_flags & _TIF_NEED_RESCHED) { schedule(); diff --git a/arch/arm64/kernel/signal32.c b/arch/arm64/kernel/signal32.c index c747a0fc5d7d..4e5a664be04b 100644 --- a/arch/arm64/kernel/signal32.c +++ b/arch/arm64/kernel/signal32.c @@ -354,7 +354,7 @@ static int compat_restore_sigframe(struct pt_regs *regs, /* * Avoid compat_sys_sigreturn() restarting. */ - regs->syscallno = ~0UL; + forget_syscall(regs); err |= !valid_user_regs(®s->user_regs, current); diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index dc66e6ec3a99..ffe089942ac4 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -154,7 +154,7 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle) * page tables. */ secondary_data.task = idle; - secondary_data.stack = task_stack_page(idle) + THREAD_START_SP; + secondary_data.stack = task_stack_page(idle) + THREAD_SIZE; update_cpu_boot_status(CPU_MMU_OFF); __flush_dcache_area(&secondary_data, sizeof(secondary_data)); @@ -977,11 +977,21 @@ void smp_send_stop(void) } #ifdef CONFIG_KEXEC_CORE -void smp_send_crash_stop(void) +void crash_smp_send_stop(void) { + static int cpus_stopped; cpumask_t mask; unsigned long timeout; + /* + * This function can be called twice in panic path, but obviously + * we execute this only once. + */ + if (cpus_stopped) + return; + + cpus_stopped = 1; + if (num_online_cpus() == 1) return; diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index 09d37d66b630..3144584617e7 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -42,33 +42,17 @@ */ int notrace unwind_frame(struct task_struct *tsk, struct stackframe *frame) { - unsigned long high, low; unsigned long fp = frame->fp; - unsigned long irq_stack_ptr; + + if (fp & 0xf) + return -EINVAL; if (!tsk) tsk = current; - /* - * Switching between stacks is valid when tracing current and in - * non-preemptible context. 
- */ - if (tsk == current && !preemptible()) - irq_stack_ptr = IRQ_STACK_PTR(smp_processor_id()); - else - irq_stack_ptr = 0; - - low = frame->sp; - /* irq stacks are not THREAD_SIZE aligned */ - if (on_irq_stack(frame->sp, raw_smp_processor_id())) - high = irq_stack_ptr; - else - high = ALIGN(low, THREAD_SIZE) - 0x20; - - if (fp < low || fp > high || fp & 0xf) + if (!on_accessible_stack(tsk, fp)) return -EINVAL; - frame->sp = fp + 0x10; frame->fp = READ_ONCE_NOCHECK(*(unsigned long *)(fp)); frame->pc = READ_ONCE_NOCHECK(*(unsigned long *)(fp + 8)); @@ -86,34 +70,13 @@ int notrace unwind_frame(struct task_struct *tsk, struct stackframe *frame) #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ /* - * Check whether we are going to walk through from interrupt stack - * to task stack. - * If we reach the end of the stack - and its an interrupt stack, - * unpack the dummy frame to find the original elr. - * - * Check the frame->fp we read from the bottom of the irq_stack, - * and the original task stack pointer are both in current->stack. + * Frames created upon entry from EL0 have NULL FP and PC values, so + * don't bother reporting these. Frames created by __noreturn functions + * might have a valid FP even if PC is bogus, so only terminate where + * both are NULL. */ - if (frame->sp == irq_stack_ptr) { - struct pt_regs *irq_args; - unsigned long orig_sp = IRQ_STACK_TO_TASK_STACK(irq_stack_ptr); - - if (object_is_on_stack((void *)orig_sp) && - object_is_on_stack((void *)frame->fp)) { - frame->sp = orig_sp; - - /* orig_sp is the saved pt_regs, find the elr */ - irq_args = (struct pt_regs *)orig_sp; - frame->pc = irq_args->pc; - } else { - /* - * This frame has a non-standard format, and we - * didn't fix it, because the data looked wrong. - * Refuse to output this frame. 
- */ - return -EINVAL; - } - } + if (!frame->fp && !frame->pc) + return -EINVAL; return 0; } @@ -167,7 +130,6 @@ void save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace) data.no_sched_functions = 0; frame.fp = regs->regs[29]; - frame.sp = regs->sp; frame.pc = regs->pc; #ifdef CONFIG_FUNCTION_GRAPH_TRACER frame.graph = current->curr_ret_stack; @@ -192,12 +154,10 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) if (tsk != current) { data.no_sched_functions = 1; frame.fp = thread_saved_fp(tsk); - frame.sp = thread_saved_sp(tsk); frame.pc = thread_saved_pc(tsk); } else { data.no_sched_functions = 0; frame.fp = (unsigned long)__builtin_frame_address(0); - frame.sp = current_stack_pointer; frame.pc = (unsigned long)save_stack_trace_tsk; } #ifdef CONFIG_FUNCTION_GRAPH_TRACER diff --git a/arch/arm64/kernel/time.c b/arch/arm64/kernel/time.c index da33c90248e9..a4391280fba9 100644 --- a/arch/arm64/kernel/time.c +++ b/arch/arm64/kernel/time.c @@ -50,7 +50,6 @@ unsigned long profile_pc(struct pt_regs *regs) return regs->pc; frame.fp = regs->regs[29]; - frame.sp = regs->sp; frame.pc = regs->pc; #ifdef CONFIG_FUNCTION_GRAPH_TRACER frame.graph = -1; /* no task info */ diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 8a62648848e5..5ea4b85aee0e 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -32,6 +32,7 @@ #include <linux/sched/signal.h> #include <linux/sched/debug.h> #include <linux/sched/task_stack.h> +#include <linux/sizes.h> #include <linux/syscalls.h> #include <linux/mm_types.h> @@ -41,6 +42,7 @@ #include <asm/esr.h> #include <asm/insn.h> #include <asm/traps.h> +#include <asm/smp.h> #include <asm/stack_pointer.h> #include <asm/stacktrace.h> #include <asm/exception.h> @@ -143,7 +145,6 @@ static void dump_instr(const char *lvl, struct pt_regs *regs) void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) { struct stackframe frame; - unsigned long irq_stack_ptr; int skip; pr_debug("%s(regs = %p tsk = %p)\n", __func__, regs, tsk); @@ -154,25 +155,14 @@ void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) if (!try_get_task_stack(tsk)) return; - /* - * Switching between stacks is valid when tracing current and in - * non-preemptible context. - */ - if (tsk == current && !preemptible()) - irq_stack_ptr = IRQ_STACK_PTR(smp_processor_id()); - else - irq_stack_ptr = 0; - if (tsk == current) { frame.fp = (unsigned long)__builtin_frame_address(0); - frame.sp = current_stack_pointer; frame.pc = (unsigned long)dump_backtrace; } else { /* * task blocked in __switch_to */ frame.fp = thread_saved_fp(tsk); - frame.sp = thread_saved_sp(tsk); frame.pc = thread_saved_pc(tsk); } #ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -182,13 +172,12 @@ void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) skip = !!regs; printk("Call trace:\n"); while (1) { - unsigned long where = frame.pc; unsigned long stack; int ret; /* skip until specified stack frame */ if (!skip) { - dump_backtrace_entry(where); + dump_backtrace_entry(frame.pc); } else if (frame.fp == regs->regs[29]) { skip = 0; /* @@ -203,20 +192,12 @@ void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) ret = unwind_frame(tsk, &frame); if (ret < 0) break; - stack = frame.sp; - if (in_exception_text(where)) { - /* - * If we switched to the irq_stack before calling this - * exception handler, then the pt_regs will be on the - * task stack. The easiest way to tell is if the large - * pt_regs would overlap with the end of the irq_stack. 
- */ - if (stack < irq_stack_ptr && - (stack + sizeof(struct pt_regs)) > irq_stack_ptr) - stack = IRQ_STACK_TO_TASK_STACK(irq_stack_ptr); + if (in_entry_text(frame.pc)) { + stack = frame.fp - offsetof(struct pt_regs, stackframe); - dump_mem("", "Exception stack", stack, - stack + sizeof(struct pt_regs)); + if (on_accessible_stack(tsk, stack)) + dump_mem("", "Exception stack", stack, + stack + sizeof(struct pt_regs)); } } @@ -257,8 +238,6 @@ static int __die(const char *str, int err, struct pt_regs *regs) end_of_stack(tsk)); if (!user_mode(regs)) { - dump_mem(KERN_EMERG, "Stack: ", regs->sp, - THREAD_SIZE + (unsigned long)task_stack_page(tsk)); dump_backtrace(regs, tsk); dump_instr(KERN_EMERG, regs); } @@ -484,6 +463,9 @@ static void user_cache_maint_handler(unsigned int esr, struct pt_regs *regs) case ESR_ELx_SYS64_ISS_CRM_DC_CVAC: /* DC CVAC, gets promoted */ __user_cache_maint("dc civac", address, ret); break; + case ESR_ELx_SYS64_ISS_CRM_DC_CVAP: /* DC CVAP */ + __user_cache_maint("sys 3, c7, c12, 1", address, ret); + break; case ESR_ELx_SYS64_ISS_CRM_DC_CIVAC: /* DC CIVAC */ __user_cache_maint("dc civac", address, ret); break; @@ -593,7 +575,7 @@ asmlinkage long do_ni_syscall(struct pt_regs *regs) if (show_unhandled_signals_ratelimited()) { pr_info("%s[%d]: syscall %d\n", current->comm, - task_pid_nr(current), (int)regs->syscallno); + task_pid_nr(current), regs->syscallno); dump_instr("", regs); if (user_mode(regs)) __show_regs(regs); @@ -689,6 +671,43 @@ asmlinkage void bad_el0_sync(struct pt_regs *regs, int reason, unsigned int esr) force_sig_info(info.si_signo, &info, current); } +#ifdef CONFIG_VMAP_STACK + +DEFINE_PER_CPU(unsigned long [OVERFLOW_STACK_SIZE/sizeof(long)], overflow_stack) + __aligned(16); + +asmlinkage void handle_bad_stack(struct pt_regs *regs) +{ + unsigned long tsk_stk = (unsigned long)current->stack; + unsigned long irq_stk = (unsigned long)this_cpu_read(irq_stack_ptr); + unsigned long ovf_stk = (unsigned long)this_cpu_ptr(overflow_stack); + unsigned int esr = read_sysreg(esr_el1); + unsigned long far = read_sysreg(far_el1); + + console_verbose(); + pr_emerg("Insufficient stack space to handle exception!"); + + pr_emerg("ESR: 0x%08x -- %s\n", esr, esr_get_class_string(esr)); + pr_emerg("FAR: 0x%016lx\n", far); + + pr_emerg("Task stack: [0x%016lx..0x%016lx]\n", + tsk_stk, tsk_stk + THREAD_SIZE); + pr_emerg("IRQ stack: [0x%016lx..0x%016lx]\n", + irq_stk, irq_stk + THREAD_SIZE); + pr_emerg("Overflow stack: [0x%016lx..0x%016lx]\n", + ovf_stk, ovf_stk + OVERFLOW_STACK_SIZE); + + __show_regs(regs); + + /* + * We use nmi_panic to limit the potential for recusive overflows, and + * to get a better stack trace. 
+ */ + nmi_panic(NULL, "kernel stack overflow"); + cpu_park_loop(); +} +#endif + void __pte_error(const char *file, int line, unsigned long val) { pr_err("%s:%d: bad pte %016lx.\n", file, line, val); diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index e8f759f764f2..2d419006ad43 100644 --- a/arch/arm64/kernel/vdso.c +++ b/arch/arm64/kernel/vdso.c @@ -110,12 +110,27 @@ int aarch32_setup_vectors_page(struct linux_binprm *bprm, int uses_interp) } #endif /* CONFIG_COMPAT */ +static int vdso_mremap(const struct vm_special_mapping *sm, + struct vm_area_struct *new_vma) +{ + unsigned long new_size = new_vma->vm_end - new_vma->vm_start; + unsigned long vdso_size = vdso_end - vdso_start; + + if (vdso_size != new_size) + return -EINVAL; + + current->mm->context.vdso = (void *)new_vma->vm_start; + + return 0; +} + static struct vm_special_mapping vdso_spec[2] __ro_after_init = { { .name = "[vvar]", }, { .name = "[vdso]", + .mremap = vdso_mremap, }, }; diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 987a00ee446c..fe56c268a7d9 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -72,22 +72,6 @@ PECOFF_FILE_ALIGNMENT = 0x200; #define PECOFF_EDATA_PADDING #endif -#if defined(CONFIG_DEBUG_ALIGN_RODATA) -/* - * 4 KB granule: 1 level 2 entry - * 16 KB granule: 128 level 3 entries, with contiguous bit - * 64 KB granule: 32 level 3 entries, with contiguous bit - */ -#define SEGMENT_ALIGN SZ_2M -#else -/* - * 4 KB granule: 16 level 3 entries, with contiguous bit - * 16 KB granule: 4 level 3 entries, without contiguous bit - * 64 KB granule: 1 level 3 entry - */ -#define SEGMENT_ALIGN SZ_64K -#endif - SECTIONS { /* @@ -192,7 +176,7 @@ SECTIONS _data = .; _sdata = .; - RW_DATA_SECTION(L1_CACHE_BYTES, PAGE_SIZE, THREAD_SIZE) + RW_DATA_SECTION(L1_CACHE_BYTES, PAGE_SIZE, THREAD_ALIGN) /* * Data written with the MMU off but read with the MMU on requires diff --git a/arch/arm64/kvm/hyp/s2-setup.c b/arch/arm64/kvm/hyp/s2-setup.c index b81f4091c909..a81f5e10fc8c 100644 --- a/arch/arm64/kvm/hyp/s2-setup.c +++ b/arch/arm64/kvm/hyp/s2-setup.c @@ -70,7 +70,7 @@ u32 __hyp_text __init_stage2_translation(void) * Management in ID_AA64MMFR1_EL1 and enable the feature in VTCR_EL2. */ tmp = (read_sysreg(id_aa64mmfr1_el1) >> ID_AA64MMFR1_HADBS_SHIFT) & 0xf; - if (IS_ENABLED(CONFIG_ARM64_HW_AFDBM) && tmp) + if (tmp) val |= VTCR_EL2_HA; /* diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile index c86b7909ef31..a0abc142c92b 100644 --- a/arch/arm64/lib/Makefile +++ b/arch/arm64/lib/Makefile @@ -17,3 +17,5 @@ CFLAGS_atomic_ll_sc.o := -fcall-used-x0 -ffixed-x1 -ffixed-x2 \ -fcall-saved-x10 -fcall-saved-x11 -fcall-saved-x12 \ -fcall-saved-x13 -fcall-saved-x14 -fcall-saved-x15 \ -fcall-saved-x18 + +lib-$(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE) += uaccess_flushcache.o diff --git a/arch/arm64/lib/uaccess_flushcache.c b/arch/arm64/lib/uaccess_flushcache.c new file mode 100644 index 000000000000..b6ceafdb8b72 --- /dev/null +++ b/arch/arm64/lib/uaccess_flushcache.c @@ -0,0 +1,47 @@ +/* + * Copyright (C) 2017 ARM Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <linux/uaccess.h> +#include <asm/barrier.h> +#include <asm/cacheflush.h> + +void memcpy_flushcache(void *dst, const void *src, size_t cnt) +{ + /* + * We assume this should not be called with @dst pointing to + * non-cacheable memory, such that we don't need an explicit + * barrier to order the cache maintenance against the memcpy. + */ + memcpy(dst, src, cnt); + __clean_dcache_area_pop(dst, cnt); +} +EXPORT_SYMBOL_GPL(memcpy_flushcache); + +void memcpy_page_flushcache(char *to, struct page *page, size_t offset, + size_t len) +{ + memcpy_flushcache(to, page_address(page) + offset, len); +} + +unsigned long __copy_user_flushcache(void *to, const void __user *from, + unsigned long n) +{ + unsigned long rc = __arch_copy_from_user(to, from, n); + + /* See above */ + __clean_dcache_area_pop(to, n - rc); + return rc; +} diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S index 83c27b6e6dca..7f1dbe962cf5 100644 --- a/arch/arm64/mm/cache.S +++ b/arch/arm64/mm/cache.S @@ -109,20 +109,25 @@ ENTRY(__clean_dcache_area_pou) ENDPROC(__clean_dcache_area_pou) /* - * __dma_inv_area(start, size) - * - start - virtual start address of region + * __inval_dcache_area(kaddr, size) + * + * Ensure that any D-cache lines for the interval [kaddr, kaddr+size) + * are invalidated. Any partial lines at the ends of the interval are + * also cleaned to PoC to prevent data loss. + * + * - kaddr - kernel address * - size - size in question */ -__dma_inv_area: - add x1, x1, x0 +ENTRY(__inval_dcache_area) /* FALLTHROUGH */ /* - * __inval_cache_range(start, end) - * - start - start address of region - * - end - end address of region + * __dma_inv_area(start, size) + * - start - virtual start address of region + * - size - size in question */ -ENTRY(__inval_cache_range) +__dma_inv_area: + add x1, x1, x0 dcache_line_size x2, x3 sub x3, x2, #1 tst x1, x3 // end cache line aligned? @@ -140,7 +145,7 @@ ENTRY(__inval_cache_range) b.lo 2b dsb sy ret -ENDPIPROC(__inval_cache_range) +ENDPIPROC(__inval_dcache_area) ENDPROC(__dma_inv_area) /* @@ -167,6 +172,20 @@ ENDPIPROC(__clean_dcache_area_poc) ENDPROC(__dma_clean_area) /* + * __clean_dcache_area_pop(kaddr, size) + * + * Ensure that any D-cache lines for the interval [kaddr, kaddr+size) + * are cleaned to the PoP. 
+ * + * - kaddr - kernel address + * - size - size in question + */ +ENTRY(__clean_dcache_area_pop) + dcache_by_line_op cvap, sy, x0, x1, x2, x3 + ret +ENDPIPROC(__clean_dcache_area_pop) + +/* * __dma_flush_area(start, size) * * clean & invalidate D / U line diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c index f27d4dd04384..614af886b7ef 100644 --- a/arch/arm64/mm/dma-mapping.c +++ b/arch/arm64/mm/dma-mapping.c @@ -42,7 +42,7 @@ static pgprot_t __get_dma_pgprot(unsigned long attrs, pgprot_t prot, return prot; } -static struct gen_pool *atomic_pool; +static struct gen_pool *atomic_pool __ro_after_init; #define DEFAULT_DMA_COHERENT_POOL_SIZE SZ_256K static size_t atomic_pool_size __initdata = DEFAULT_DMA_COHERENT_POOL_SIZE; @@ -425,7 +425,7 @@ static int __init atomic_pool_init(void) gen_pool_set_algo(atomic_pool, gen_pool_first_fit_order_align, - (void *)PAGE_SHIFT); + NULL); pr_info("DMA: preallocated %zu KiB pool for atomic allocations\n", atomic_pool_size / 1024); diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 1f22a41565a3..89993c4be1be 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -34,6 +34,7 @@ #include <linux/hugetlb.h> #include <asm/bug.h> +#include <asm/cmpxchg.h> #include <asm/cpufeature.h> #include <asm/exception.h> #include <asm/debug-monitors.h> @@ -82,6 +83,49 @@ static inline int notify_page_fault(struct pt_regs *regs, unsigned int esr) } #endif +static void data_abort_decode(unsigned int esr) +{ + pr_alert("Data abort info:\n"); + + if (esr & ESR_ELx_ISV) { + pr_alert(" Access size = %u byte(s)\n", + 1U << ((esr & ESR_ELx_SAS) >> ESR_ELx_SAS_SHIFT)); + pr_alert(" SSE = %lu, SRT = %lu\n", + (esr & ESR_ELx_SSE) >> ESR_ELx_SSE_SHIFT, + (esr & ESR_ELx_SRT_MASK) >> ESR_ELx_SRT_SHIFT); + pr_alert(" SF = %lu, AR = %lu\n", + (esr & ESR_ELx_SF) >> ESR_ELx_SF_SHIFT, + (esr & ESR_ELx_AR) >> ESR_ELx_AR_SHIFT); + } else { + pr_alert(" ISV = 0, ISS = 0x%08lu\n", esr & ESR_ELx_ISS_MASK); + } + + pr_alert(" CM = %lu, WnR = %lu\n", + (esr & ESR_ELx_CM) >> ESR_ELx_CM_SHIFT, + (esr & ESR_ELx_WNR) >> ESR_ELx_WNR_SHIFT); +} + +/* + * Decode mem abort information + */ +static void mem_abort_decode(unsigned int esr) +{ + pr_alert("Mem abort info:\n"); + + pr_alert(" Exception class = %s, IL = %u bits\n", + esr_get_class_string(esr), + (esr & ESR_ELx_IL) ? 32 : 16); + pr_alert(" SET = %lu, FnV = %lu\n", + (esr & ESR_ELx_SET_MASK) >> ESR_ELx_SET_SHIFT, + (esr & ESR_ELx_FnV) >> ESR_ELx_FnV_SHIFT); + pr_alert(" EA = %lu, S1PTW = %lu\n", + (esr & ESR_ELx_EA) >> ESR_ELx_EA_SHIFT, + (esr & ESR_ELx_S1PTW) >> ESR_ELx_S1PTW_SHIFT); + + if (esr_is_data_abort(esr)) + data_abort_decode(esr); +} + /* * Dump out the page tables associated with 'addr' in the currently active mm. */ @@ -139,7 +183,6 @@ void show_pte(unsigned long addr) pr_cont("\n"); } -#ifdef CONFIG_ARM64_HW_AFDBM /* * This function sets the access flags (dirty, accessed), as well as write * permission, and only to a more permissive setting. 
@@ -154,18 +197,13 @@ int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t entry, int dirty) { - pteval_t old_pteval; - unsigned int tmp; + pteval_t old_pteval, pteval; if (pte_same(*ptep, entry)) return 0; /* only preserve the access flags and write permission */ - pte_val(entry) &= PTE_AF | PTE_WRITE | PTE_DIRTY; - - /* set PTE_RDONLY if actual read-only or clean PTE */ - if (!pte_write(entry) || !pte_sw_dirty(entry)) - pte_val(entry) |= PTE_RDONLY; + pte_val(entry) &= PTE_RDONLY | PTE_AF | PTE_WRITE | PTE_DIRTY; /* * Setting the flags must be done atomically to avoid racing with the @@ -174,21 +212,18 @@ int ptep_set_access_flags(struct vm_area_struct *vma, * (calculated as: a & b == ~(~a | ~b)). */ pte_val(entry) ^= PTE_RDONLY; - asm volatile("// ptep_set_access_flags\n" - " prfm pstl1strm, %2\n" - "1: ldxr %0, %2\n" - " eor %0, %0, %3 // negate PTE_RDONLY in *ptep\n" - " orr %0, %0, %4 // set flags\n" - " eor %0, %0, %3 // negate final PTE_RDONLY\n" - " stxr %w1, %0, %2\n" - " cbnz %w1, 1b\n" - : "=&r" (old_pteval), "=&r" (tmp), "+Q" (pte_val(*ptep)) - : "L" (PTE_RDONLY), "r" (pte_val(entry))); + pteval = READ_ONCE(pte_val(*ptep)); + do { + old_pteval = pteval; + pteval ^= PTE_RDONLY; + pteval |= pte_val(entry); + pteval ^= PTE_RDONLY; + pteval = cmpxchg_relaxed(&pte_val(*ptep), old_pteval, pteval); + } while (pteval != old_pteval); flush_tlb_fix_spurious_fault(vma, address); return 1; } -#endif static bool is_el1_instruction_abort(unsigned int esr) { @@ -248,6 +283,8 @@ static void __do_kernel_fault(unsigned long addr, unsigned int esr, pr_alert("Unable to handle kernel %s at virtual address %08lx\n", msg, addr); + mem_abort_decode(esr); + show_pte(addr); die("Oops", regs, esr); bust_spinlocks(0); @@ -705,6 +742,8 @@ asmlinkage void __exception do_mem_abort(unsigned long addr, unsigned int esr, pr_alert("Unhandled fault: %s (0x%08x) at 0x%016lx\n", inf->name, esr, addr); + mem_abort_decode(esr); + info.si_signo = inf->sig; info.si_errno = 0; info.si_code = inf->code; diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c index 21a8d828cbf4..e36ed5087b5c 100644 --- a/arch/arm64/mm/flush.c +++ b/arch/arm64/mm/flush.c @@ -83,3 +83,19 @@ EXPORT_SYMBOL(flush_dcache_page); * Additional functions defined in assembly. 
*/ EXPORT_SYMBOL(flush_icache_range); + +#ifdef CONFIG_ARCH_HAS_PMEM_API +void arch_wb_cache_pmem(void *addr, size_t size) +{ + /* Ensure order against any prior non-cacheable writes */ + dmb(osh); + __clean_dcache_area_pop(addr, size); +} +EXPORT_SYMBOL_GPL(arch_wb_cache_pmem); + +void arch_invalidate_pmem(void *addr, size_t size) +{ + __inval_dcache_area(addr, size); +} +EXPORT_SYMBOL_GPL(arch_invalidate_pmem); +#endif diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 656e0ece2289..6cb0fa92a651 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -41,6 +41,16 @@ int pud_huge(pud_t pud) #endif } +/* + * Select all bits except the pfn + */ +static inline pgprot_t pte_pgprot(pte_t pte) +{ + unsigned long pfn = pte_pfn(pte); + + return __pgprot(pte_val(pfn_pte(pfn, __pgprot(0))) ^ pte_val(pte)); +} + static int find_num_contig(struct mm_struct *mm, unsigned long addr, pte_t *ptep, size_t *pgsize) { @@ -58,15 +68,107 @@ static int find_num_contig(struct mm_struct *mm, unsigned long addr, return CONT_PTES; } +static inline int num_contig_ptes(unsigned long size, size_t *pgsize) +{ + int contig_ptes = 0; + + *pgsize = size; + + switch (size) { +#ifdef CONFIG_ARM64_4K_PAGES + case PUD_SIZE: +#endif + case PMD_SIZE: + contig_ptes = 1; + break; + case CONT_PMD_SIZE: + *pgsize = PMD_SIZE; + contig_ptes = CONT_PMDS; + break; + case CONT_PTE_SIZE: + *pgsize = PAGE_SIZE; + contig_ptes = CONT_PTES; + break; + } + + return contig_ptes; +} + +/* + * Changing some bits of contiguous entries requires us to follow a + * Break-Before-Make approach, breaking the whole contiguous set + * before we can change any entries. See ARM DDI 0487A.k_iss10775, + * "Misprogramming of the Contiguous bit", page D4-1762. + * + * This helper performs the break step. + */ +static pte_t get_clear_flush(struct mm_struct *mm, + unsigned long addr, + pte_t *ptep, + unsigned long pgsize, + unsigned long ncontig) +{ + struct vm_area_struct vma = { .vm_mm = mm }; + pte_t orig_pte = huge_ptep_get(ptep); + bool valid = pte_valid(orig_pte); + unsigned long i, saddr = addr; + + for (i = 0; i < ncontig; i++, addr += pgsize, ptep++) { + pte_t pte = ptep_get_and_clear(mm, addr, ptep); + + /* + * If HW_AFDBM is enabled, then the HW could turn on + * the dirty bit for any page in the set, so check + * them all. All hugetlb entries are already young. + */ + if (pte_dirty(pte)) + orig_pte = pte_mkdirty(orig_pte); + } + + if (valid) + flush_tlb_range(&vma, saddr, addr); + return orig_pte; +} + +/* + * Changing some bits of contiguous entries requires us to follow a + * Break-Before-Make approach, breaking the whole contiguous set + * before we can change any entries. See ARM DDI 0487A.k_iss10775, + * "Misprogramming of the Contiguous bit", page D4-1762. + * + * This helper performs the break step for use cases where the + * original pte is not needed. + */ +static void clear_flush(struct mm_struct *mm, + unsigned long addr, + pte_t *ptep, + unsigned long pgsize, + unsigned long ncontig) +{ + struct vm_area_struct vma = { .vm_mm = mm }; + unsigned long i, saddr = addr; + + for (i = 0; i < ncontig; i++, addr += pgsize, ptep++) + pte_clear(mm, addr, ptep); + + flush_tlb_range(&vma, saddr, addr); +} + void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { size_t pgsize; int i; int ncontig; - unsigned long pfn; + unsigned long pfn, dpfn; pgprot_t hugeprot; + /* + * Code needs to be expanded to handle huge swap and migration + * entries. 
Needed for HUGETLB and MEMORY_FAILURE. + */ + WARN_ON(!pte_present(pte)); + if (!pte_cont(pte)) { set_pte_at(mm, addr, ptep, pte); return; @@ -74,17 +176,30 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, ncontig = find_num_contig(mm, addr, ptep, &pgsize); pfn = pte_pfn(pte); - hugeprot = __pgprot(pte_val(pfn_pte(pfn, __pgprot(0))) ^ pte_val(pte)); - for (i = 0; i < ncontig; i++) { + dpfn = pgsize >> PAGE_SHIFT; + hugeprot = pte_pgprot(pte); + + clear_flush(mm, addr, ptep, pgsize, ncontig); + + for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn) { pr_debug("%s: set pte %p to 0x%llx\n", __func__, ptep, pte_val(pfn_pte(pfn, hugeprot))); set_pte_at(mm, addr, ptep, pfn_pte(pfn, hugeprot)); - ptep++; - pfn += pgsize >> PAGE_SHIFT; - addr += pgsize; } } +void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, unsigned long sz) +{ + int i, ncontig; + size_t pgsize; + + ncontig = num_contig_ptes(sz, &pgsize); + + for (i = 0; i < ncontig; i++, ptep++) + set_pte(ptep, pte); +} + pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz) { @@ -144,19 +259,28 @@ pte_t *huge_pte_offset(struct mm_struct *mm, return NULL; pud = pud_offset(pgd, addr); - if (pud_none(*pud)) + if (sz != PUD_SIZE && pud_none(*pud)) return NULL; - /* swap or huge page */ - if (!pud_present(*pud) || pud_huge(*pud)) + /* hugepage or swap? */ + if (pud_huge(*pud) || !pud_present(*pud)) return (pte_t *)pud; /* table; check the next level */ + if (sz == CONT_PMD_SIZE) + addr &= CONT_PMD_MASK; + pmd = pmd_offset(pud, addr); - if (pmd_none(*pmd)) + if (!(sz == PMD_SIZE || sz == CONT_PMD_SIZE) && + pmd_none(*pmd)) return NULL; - if (!pmd_present(*pmd) || pmd_huge(*pmd)) + if (pmd_huge(*pmd) || !pmd_present(*pmd)) return (pte_t *)pmd; + if (sz == CONT_PTE_SIZE) { + pte_t *pte = pte_offset_kernel(pmd, (addr & CONT_PTE_MASK)); + return pte; + } + return NULL; } @@ -176,111 +300,133 @@ pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma, return entry; } +void huge_pte_clear(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned long sz) +{ + int i, ncontig; + size_t pgsize; + + ncontig = num_contig_ptes(sz, &pgsize); + + for (i = 0; i < ncontig; i++, addr += pgsize, ptep++) + pte_clear(mm, addr, ptep); +} + pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - pte_t pte; + int ncontig; + size_t pgsize; + pte_t orig_pte = huge_ptep_get(ptep); - if (pte_cont(*ptep)) { - int ncontig, i; - size_t pgsize; - bool is_dirty = false; - - ncontig = find_num_contig(mm, addr, ptep, &pgsize); - /* save the 1st pte to return */ - pte = ptep_get_and_clear(mm, addr, ptep); - for (i = 1, addr += pgsize; i < ncontig; ++i, addr += pgsize) { - /* - * If HW_AFDBM is enabled, then the HW could - * turn on the dirty bit for any of the page - * in the set, so check them all. 
- */ - ++ptep; - if (pte_dirty(ptep_get_and_clear(mm, addr, ptep))) - is_dirty = true; - } - if (is_dirty) - return pte_mkdirty(pte); - else - return pte; - } else { + if (!pte_cont(orig_pte)) return ptep_get_and_clear(mm, addr, ptep); - } + + ncontig = find_num_contig(mm, addr, ptep, &pgsize); + + return get_clear_flush(mm, addr, ptep, pgsize, ncontig); } int huge_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte, int dirty) { - if (pte_cont(pte)) { - int ncontig, i, changed = 0; - size_t pgsize = 0; - unsigned long pfn = pte_pfn(pte); - /* Select all bits except the pfn */ - pgprot_t hugeprot = - __pgprot(pte_val(pfn_pte(pfn, __pgprot(0))) ^ - pte_val(pte)); - - pfn = pte_pfn(pte); - ncontig = find_num_contig(vma->vm_mm, addr, ptep, - &pgsize); - for (i = 0; i < ncontig; ++i, ++ptep, addr += pgsize) { - changed |= ptep_set_access_flags(vma, addr, ptep, - pfn_pte(pfn, - hugeprot), - dirty); - pfn += pgsize >> PAGE_SHIFT; - } - return changed; - } else { + int ncontig, i, changed = 0; + size_t pgsize = 0; + unsigned long pfn = pte_pfn(pte), dpfn; + pgprot_t hugeprot; + pte_t orig_pte; + + if (!pte_cont(pte)) return ptep_set_access_flags(vma, addr, ptep, pte, dirty); - } + + ncontig = find_num_contig(vma->vm_mm, addr, ptep, &pgsize); + dpfn = pgsize >> PAGE_SHIFT; + + orig_pte = get_clear_flush(vma->vm_mm, addr, ptep, pgsize, ncontig); + if (!pte_same(orig_pte, pte)) + changed = 1; + + /* Make sure we don't lose the dirty state */ + if (pte_dirty(orig_pte)) + pte = pte_mkdirty(pte); + + hugeprot = pte_pgprot(pte); + for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn) + set_pte_at(vma->vm_mm, addr, ptep, pfn_pte(pfn, hugeprot)); + + return changed; } void huge_ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - if (pte_cont(*ptep)) { - int ncontig, i; - size_t pgsize = 0; - - ncontig = find_num_contig(mm, addr, ptep, &pgsize); - for (i = 0; i < ncontig; ++i, ++ptep, addr += pgsize) - ptep_set_wrprotect(mm, addr, ptep); - } else { + unsigned long pfn, dpfn; + pgprot_t hugeprot; + int ncontig, i; + size_t pgsize; + pte_t pte; + + if (!pte_cont(*ptep)) { ptep_set_wrprotect(mm, addr, ptep); + return; } + + ncontig = find_num_contig(mm, addr, ptep, &pgsize); + dpfn = pgsize >> PAGE_SHIFT; + + pte = get_clear_flush(mm, addr, ptep, pgsize, ncontig); + pte = pte_wrprotect(pte); + + hugeprot = pte_pgprot(pte); + pfn = pte_pfn(pte); + + for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn) + set_pte_at(mm, addr, ptep, pfn_pte(pfn, hugeprot)); } void huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { - if (pte_cont(*ptep)) { - int ncontig, i; - size_t pgsize = 0; - - ncontig = find_num_contig(vma->vm_mm, addr, ptep, - &pgsize); - for (i = 0; i < ncontig; ++i, ++ptep, addr += pgsize) - ptep_clear_flush(vma, addr, ptep); - } else { + size_t pgsize; + int ncontig; + + if (!pte_cont(*ptep)) { ptep_clear_flush(vma, addr, ptep); + return; } + + ncontig = find_num_contig(vma->vm_mm, addr, ptep, &pgsize); + clear_flush(vma->vm_mm, addr, ptep, pgsize, ncontig); } static __init int setup_hugepagesz(char *opt) { unsigned long ps = memparse(opt, &opt); - if (ps == PMD_SIZE) { - hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT); - } else if (ps == PUD_SIZE) { - hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT); - } else { - hugetlb_bad_size(); - pr_err("hugepagesz: Unsupported page size %lu K\n", ps >> 10); - return 0; + switch (ps) { +#ifdef CONFIG_ARM64_4K_PAGES + case PUD_SIZE: +#endif + case 
PMD_SIZE * CONT_PMDS: + case PMD_SIZE: + case PAGE_SIZE * CONT_PTES: + hugetlb_add_hstate(ilog2(ps) - PAGE_SHIFT); + return 1; } - return 1; + + hugetlb_bad_size(); + pr_err("hugepagesz: Unsupported page size %lu K\n", ps >> 10); + return 0; } __setup("hugepagesz=", setup_hugepagesz); + +#ifdef CONFIG_ARM64_64K_PAGES +static __init int add_default_hugepagesz(void) +{ + if (size_to_hstate(CONT_PTES * PAGE_SIZE) == NULL) + hugetlb_add_hstate(CONT_PTE_SHIFT); + return 0; +} +arch_initcall(add_default_hugepagesz); +#endif diff --git a/arch/arm64/net/bpf_jit.h b/arch/arm64/net/bpf_jit.h index b02a9268dfbf..783de51a6c4e 100644 --- a/arch/arm64/net/bpf_jit.h +++ b/arch/arm64/net/bpf_jit.h @@ -44,8 +44,12 @@ #define A64_COND_NE AARCH64_INSN_COND_NE /* != */ #define A64_COND_CS AARCH64_INSN_COND_CS /* unsigned >= */ #define A64_COND_HI AARCH64_INSN_COND_HI /* unsigned > */ +#define A64_COND_LS AARCH64_INSN_COND_LS /* unsigned <= */ +#define A64_COND_CC AARCH64_INSN_COND_CC /* unsigned < */ #define A64_COND_GE AARCH64_INSN_COND_GE /* signed >= */ #define A64_COND_GT AARCH64_INSN_COND_GT /* signed > */ +#define A64_COND_LE AARCH64_INSN_COND_LE /* signed <= */ +#define A64_COND_LT AARCH64_INSN_COND_LT /* signed < */ #define A64_B_(cond, imm19) A64_COND_BRANCH(cond, (imm19) << 2) /* Unconditional branch (immediate) */ diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index f32144b2e07f..ba38d403abb2 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -527,10 +527,14 @@ emit_bswap_uxt: /* IF (dst COND src) JUMP off */ case BPF_JMP | BPF_JEQ | BPF_X: case BPF_JMP | BPF_JGT | BPF_X: + case BPF_JMP | BPF_JLT | BPF_X: case BPF_JMP | BPF_JGE | BPF_X: + case BPF_JMP | BPF_JLE | BPF_X: case BPF_JMP | BPF_JNE | BPF_X: case BPF_JMP | BPF_JSGT | BPF_X: + case BPF_JMP | BPF_JSLT | BPF_X: case BPF_JMP | BPF_JSGE | BPF_X: + case BPF_JMP | BPF_JSLE | BPF_X: emit(A64_CMP(1, dst, src), ctx); emit_cond_jmp: jmp_offset = bpf2a64_offset(i + off, i, ctx); @@ -542,9 +546,15 @@ emit_cond_jmp: case BPF_JGT: jmp_cond = A64_COND_HI; break; + case BPF_JLT: + jmp_cond = A64_COND_CC; + break; case BPF_JGE: jmp_cond = A64_COND_CS; break; + case BPF_JLE: + jmp_cond = A64_COND_LS; + break; case BPF_JSET: case BPF_JNE: jmp_cond = A64_COND_NE; @@ -552,9 +562,15 @@ emit_cond_jmp: case BPF_JSGT: jmp_cond = A64_COND_GT; break; + case BPF_JSLT: + jmp_cond = A64_COND_LT; + break; case BPF_JSGE: jmp_cond = A64_COND_GE; break; + case BPF_JSLE: + jmp_cond = A64_COND_LE; + break; default: return -EFAULT; } @@ -566,10 +582,14 @@ emit_cond_jmp: /* IF (dst COND imm) JUMP off */ case BPF_JMP | BPF_JEQ | BPF_K: case BPF_JMP | BPF_JGT | BPF_K: + case BPF_JMP | BPF_JLT | BPF_K: case BPF_JMP | BPF_JGE | BPF_K: + case BPF_JMP | BPF_JLE | BPF_K: case BPF_JMP | BPF_JNE | BPF_K: case BPF_JMP | BPF_JSGT | BPF_K: + case BPF_JMP | BPF_JSLT | BPF_K: case BPF_JMP | BPF_JSGE | BPF_K: + case BPF_JMP | BPF_JSLE | BPF_K: emit_a64_mov_i(1, tmp, imm, ctx); emit(A64_CMP(1, dst, tmp), ctx); goto emit_cond_jmp; diff --git a/arch/blackfin/include/asm/spinlock.h b/arch/blackfin/include/asm/spinlock.h index c58f4a83ed6f..f6431439d15d 100644 --- a/arch/blackfin/include/asm/spinlock.h +++ b/arch/blackfin/include/asm/spinlock.h @@ -48,11 +48,6 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock) __raw_spin_unlock_asm(&lock->lock); } -static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - smp_cond_load_acquire(&lock->lock, !VAL); -} - static inline int arch_read_can_lock(arch_rwlock_t *rw) { return 
__raw_uncached_fetch_asm(&rw->lock) > 0; diff --git a/arch/blackfin/kernel/module.c b/arch/blackfin/kernel/module.c index 0188c933b155..15af5768c403 100644 --- a/arch/blackfin/kernel/module.c +++ b/arch/blackfin/kernel/module.c @@ -4,8 +4,6 @@ * Licensed under the GPL-2 or later */ -#define pr_fmt(fmt) "module %s: " fmt, mod->name - #include <linux/moduleloader.h> #include <linux/elf.h> #include <linux/vmalloc.h> @@ -16,6 +14,11 @@ #include <asm/cacheflush.h> #include <linux/uaccess.h> +#define mod_err(mod, fmt, ...) \ + pr_err("module %s: " fmt, (mod)->name, ##__VA_ARGS__) +#define mod_debug(mod, fmt, ...) \ + pr_debug("module %s: " fmt, (mod)->name, ##__VA_ARGS__) + /* Transfer the section to the L1 memory */ int module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, @@ -44,7 +47,7 @@ module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, dest = l1_inst_sram_alloc(s->sh_size); mod->arch.text_l1 = dest; if (dest == NULL) { - pr_err("L1 inst memory allocation failed\n"); + mod_err(mod, "L1 inst memory allocation failed\n"); return -1; } dma_memcpy(dest, (void *)s->sh_addr, s->sh_size); @@ -56,7 +59,7 @@ module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, dest = l1_data_sram_alloc(s->sh_size); mod->arch.data_a_l1 = dest; if (dest == NULL) { - pr_err("L1 data memory allocation failed\n"); + mod_err(mod, "L1 data memory allocation failed\n"); return -1; } memcpy(dest, (void *)s->sh_addr, s->sh_size); @@ -68,7 +71,7 @@ module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, dest = l1_data_sram_zalloc(s->sh_size); mod->arch.bss_a_l1 = dest; if (dest == NULL) { - pr_err("L1 data memory allocation failed\n"); + mod_err(mod, "L1 data memory allocation failed\n"); return -1; } @@ -77,7 +80,7 @@ module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, dest = l1_data_B_sram_alloc(s->sh_size); mod->arch.data_b_l1 = dest; if (dest == NULL) { - pr_err("L1 data memory allocation failed\n"); + mod_err(mod, "L1 data memory allocation failed\n"); return -1; } memcpy(dest, (void *)s->sh_addr, s->sh_size); @@ -87,7 +90,7 @@ module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, dest = l1_data_B_sram_alloc(s->sh_size); mod->arch.bss_b_l1 = dest; if (dest == NULL) { - pr_err("L1 data memory allocation failed\n"); + mod_err(mod, "L1 data memory allocation failed\n"); return -1; } memset(dest, 0, s->sh_size); @@ -99,7 +102,7 @@ module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, dest = l2_sram_alloc(s->sh_size); mod->arch.text_l2 = dest; if (dest == NULL) { - pr_err("L2 SRAM allocation failed\n"); + mod_err(mod, "L2 SRAM allocation failed\n"); return -1; } memcpy(dest, (void *)s->sh_addr, s->sh_size); @@ -111,7 +114,7 @@ module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, dest = l2_sram_alloc(s->sh_size); mod->arch.data_l2 = dest; if (dest == NULL) { - pr_err("L2 SRAM allocation failed\n"); + mod_err(mod, "L2 SRAM allocation failed\n"); return -1; } memcpy(dest, (void *)s->sh_addr, s->sh_size); @@ -123,7 +126,7 @@ module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, dest = l2_sram_zalloc(s->sh_size); mod->arch.bss_l2 = dest; if (dest == NULL) { - pr_err("L2 SRAM allocation failed\n"); + mod_err(mod, "L2 SRAM allocation failed\n"); return -1; } @@ -157,8 +160,8 @@ apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab, Elf32_Sym *sym; unsigned long location, value, size; - pr_debug("applying relocate section %u to %u\n", - relsec, sechdrs[relsec].sh_info); + mod_debug(mod, "applying relocate section %u to %u\n", + relsec, sechdrs[relsec].sh_info); for (i = 0; 
i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { /* This is where to make the change */ @@ -174,14 +177,14 @@ apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab, #ifdef CONFIG_SMP if (location >= COREB_L1_DATA_A_START) { - pr_err("cannot relocate in L1: %u (SMP kernel)\n", + mod_err(mod, "cannot relocate in L1: %u (SMP kernel)\n", ELF32_R_TYPE(rel[i].r_info)); return -ENOEXEC; } #endif - pr_debug("location is %lx, value is %lx type is %d\n", - location, value, ELF32_R_TYPE(rel[i].r_info)); + mod_debug(mod, "location is %lx, value is %lx type is %d\n", + location, value, ELF32_R_TYPE(rel[i].r_info)); switch (ELF32_R_TYPE(rel[i].r_info)) { @@ -200,12 +203,12 @@ apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab, case R_BFIN_PCREL12_JUMP: case R_BFIN_PCREL12_JUMP_S: case R_BFIN_PCREL10: - pr_err("unsupported relocation: %u (no -mlong-calls?)\n", + mod_err(mod, "unsupported relocation: %u (no -mlong-calls?)\n", ELF32_R_TYPE(rel[i].r_info)); return -ENOEXEC; default: - pr_err("unknown relocation: %u\n", + mod_err(mod, "unknown relocation: %u\n", ELF32_R_TYPE(rel[i].r_info)); return -ENOEXEC; } @@ -222,7 +225,7 @@ apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab, isram_memcpy((void *)location, &value, size); break; default: - pr_err("invalid relocation for %#lx\n", location); + mod_err(mod, "invalid relocation for %#lx\n", location); return -ENOEXEC; } } diff --git a/arch/c6x/configs/dsk6455_defconfig b/arch/c6x/configs/dsk6455_defconfig index 4663487c67a1..d764ea4cce7f 100644 --- a/arch/c6x/configs/dsk6455_defconfig +++ b/arch/c6x/configs/dsk6455_defconfig @@ -1,5 +1,4 @@ CONFIG_SOC_TMS320C6455=y -CONFIG_EXPERIMENTAL=y # CONFIG_LOCALVERSION_AUTO is not set CONFIG_SYSVIPC=y CONFIG_SPARSE_IRQ=y @@ -25,7 +24,6 @@ CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_COUNT=2 CONFIG_BLK_DEV_RAM_SIZE=17000 -CONFIG_MISC_DEVICES=y # CONFIG_INPUT is not set # CONFIG_SERIO is not set # CONFIG_VT is not set diff --git a/arch/c6x/configs/evmc6457_defconfig b/arch/c6x/configs/evmc6457_defconfig index bba40e195ec4..05d0b4a25ab1 100644 --- a/arch/c6x/configs/evmc6457_defconfig +++ b/arch/c6x/configs/evmc6457_defconfig @@ -1,5 +1,4 @@ CONFIG_SOC_TMS320C6457=y -CONFIG_EXPERIMENTAL=y # CONFIG_LOCALVERSION_AUTO is not set CONFIG_SYSVIPC=y CONFIG_SPARSE_IRQ=y @@ -26,7 +25,6 @@ CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_COUNT=2 CONFIG_BLK_DEV_RAM_SIZE=17000 -CONFIG_MISC_DEVICES=y # CONFIG_INPUT is not set # CONFIG_SERIO is not set # CONFIG_VT is not set diff --git a/arch/c6x/configs/evmc6472_defconfig b/arch/c6x/configs/evmc6472_defconfig index 8c46155f6d31..8d81fcf86b0e 100644 --- a/arch/c6x/configs/evmc6472_defconfig +++ b/arch/c6x/configs/evmc6472_defconfig @@ -1,5 +1,4 @@ CONFIG_SOC_TMS320C6472=y -CONFIG_EXPERIMENTAL=y # CONFIG_LOCALVERSION_AUTO is not set CONFIG_SYSVIPC=y CONFIG_SPARSE_IRQ=y @@ -27,7 +26,6 @@ CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_COUNT=2 CONFIG_BLK_DEV_RAM_SIZE=17000 -CONFIG_MISC_DEVICES=y # CONFIG_INPUT is not set # CONFIG_SERIO is not set # CONFIG_VT is not set diff --git a/arch/c6x/configs/evmc6474_defconfig b/arch/c6x/configs/evmc6474_defconfig index 15533f632313..8156a98f3958 100644 --- a/arch/c6x/configs/evmc6474_defconfig +++ b/arch/c6x/configs/evmc6474_defconfig @@ -1,5 +1,4 @@ CONFIG_SOC_TMS320C6474=y -CONFIG_EXPERIMENTAL=y # CONFIG_LOCALVERSION_AUTO is not set CONFIG_SYSVIPC=y CONFIG_SPARSE_IRQ=y @@ -27,7 +26,6 @@ CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_COUNT=2 CONFIG_BLK_DEV_RAM_SIZE=17000 
-CONFIG_MISC_DEVICES=y # CONFIG_INPUT is not set # CONFIG_SERIO is not set # CONFIG_VT is not set diff --git a/arch/c6x/configs/evmc6678_defconfig b/arch/c6x/configs/evmc6678_defconfig index 5f126d4905b1..c4f433c25b69 100644 --- a/arch/c6x/configs/evmc6678_defconfig +++ b/arch/c6x/configs/evmc6678_defconfig @@ -1,5 +1,4 @@ CONFIG_SOC_TMS320C6678=y -CONFIG_EXPERIMENTAL=y # CONFIG_LOCALVERSION_AUTO is not set CONFIG_SYSVIPC=y CONFIG_SPARSE_IRQ=y @@ -27,7 +26,6 @@ CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_COUNT=2 CONFIG_BLK_DEV_RAM_SIZE=17000 -CONFIG_MISC_DEVICES=y # CONFIG_INPUT is not set # CONFIG_SERIO is not set # CONFIG_VT is not set diff --git a/arch/c6x/platforms/megamod-pic.c b/arch/c6x/platforms/megamod-pic.c index 43afc03e4125..9519fa5f97d0 100644 --- a/arch/c6x/platforms/megamod-pic.c +++ b/arch/c6x/platforms/megamod-pic.c @@ -208,14 +208,14 @@ static struct megamod_pic * __init init_megamod_pic(struct device_node *np) pic = kzalloc(sizeof(struct megamod_pic), GFP_KERNEL); if (!pic) { - pr_err("%s: Could not alloc PIC structure.\n", np->full_name); + pr_err("%pOF: Could not alloc PIC structure.\n", np); return NULL; } pic->irqhost = irq_domain_add_linear(np, NR_COMBINERS * 32, &megamod_domain_ops, pic); if (!pic->irqhost) { - pr_err("%s: Could not alloc host.\n", np->full_name); + pr_err("%pOF: Could not alloc host.\n", np); goto error_free; } @@ -225,7 +225,7 @@ static struct megamod_pic * __init init_megamod_pic(struct device_node *np) pic->regs = of_iomap(np, 0); if (!pic->regs) { - pr_err("%s: Could not map registers.\n", np->full_name); + pr_err("%pOF: Could not map registers.\n", np); goto error_free; } @@ -253,8 +253,8 @@ static struct megamod_pic * __init init_megamod_pic(struct device_node *np) irq_data = irq_get_irq_data(irq); if (!irq_data) { - pr_err("%s: combiner-%d no irq_data for virq %d!\n", - np->full_name, i, irq); + pr_err("%pOF: combiner-%d no irq_data for virq %d!\n", + np, i, irq); continue; } @@ -265,16 +265,16 @@ static struct megamod_pic * __init init_megamod_pic(struct device_node *np) * of the core priority interrupts (4 - 15). */ if (hwirq < 4 || hwirq >= NR_PRIORITY_IRQS) { - pr_err("%s: combiner-%d core irq %ld out of range!\n", - np->full_name, i, hwirq); + pr_err("%pOF: combiner-%d core irq %ld out of range!\n", + np, i, hwirq); continue; } /* record the mapping */ mapping[hwirq - 4] = i; - pr_debug("%s: combiner-%d cascading to hwirq %ld\n", - np->full_name, i, hwirq); + pr_debug("%pOF: combiner-%d cascading to hwirq %ld\n", + np, i, hwirq); cascade_data[i].pic = pic; cascade_data[i].index = i; @@ -290,8 +290,8 @@ static struct megamod_pic * __init init_megamod_pic(struct device_node *np) /* Finally, set up the MUX registers */ for (i = 0; i < NR_MUX_OUTPUTS; i++) { if (mapping[i] != IRQ_UNMAPPED) { - pr_debug("%s: setting mux %d to priority %d\n", - np->full_name, mapping[i], i + 4); + pr_debug("%pOF: setting mux %d to priority %d\n", + np, mapping[i], i + 4); set_megamod_mux(pic, mapping[i], i); } } diff --git a/arch/c6x/platforms/plldata.c b/arch/c6x/platforms/plldata.c index 755359eb6286..e8b6cc6a7b5a 100644 --- a/arch/c6x/platforms/plldata.c +++ b/arch/c6x/platforms/plldata.c @@ -436,8 +436,8 @@ void __init c64x_setup_clocks(void) err = of_property_read_u32(node, "clock-frequency", &val); if (err || val == 0) { - pr_err("%s: no clock-frequency found! Using %dMHz\n", - node->full_name, (int)val / 1000000); + pr_err("%pOF: no clock-frequency found! 
Using %dMHz\n", + node, (int)val / 1000000); val = 25000000; } clkin1.rate = val; diff --git a/arch/c6x/platforms/timer64.c b/arch/c6x/platforms/timer64.c index 0bd0452ded80..241a9a607193 100644 --- a/arch/c6x/platforms/timer64.c +++ b/arch/c6x/platforms/timer64.c @@ -204,14 +204,14 @@ void __init timer64_init(void) timer = of_iomap(np, 0); if (!timer) { - pr_debug("%s: Cannot map timer registers.\n", np->full_name); + pr_debug("%pOF: Cannot map timer registers.\n", np); goto out; } - pr_debug("%s: Timer registers=%p.\n", np->full_name, timer); + pr_debug("%pOF: Timer registers=%p.\n", np, timer); cd->irq = irq_of_parse_and_map(np, 0); if (cd->irq == NO_IRQ) { - pr_debug("%s: Cannot find interrupt.\n", np->full_name); + pr_debug("%pOF: Cannot find interrupt.\n", np); iounmap(timer); goto out; } @@ -229,7 +229,7 @@ void __init timer64_init(void) dscr_set_devstate(timer64_devstate_id, DSCR_DEVSTATE_ENABLED); } - pr_debug("%s: Timer irq=%d.\n", np->full_name, cd->irq); + pr_debug("%pOF: Timer irq=%d.\n", np, cd->irq); clockevents_calc_mult_shift(cd, c6x_core_freq / TIMER_DIVISOR, 5); diff --git a/arch/cris/arch-v32/mach-a3/arbiter.c b/arch/cris/arch-v32/mach-a3/arbiter.c index ab5c421a4de8..735a9b0abdb8 100644 --- a/arch/cris/arch-v32/mach-a3/arbiter.c +++ b/arch/cris/arch-v32/mach-a3/arbiter.c @@ -227,7 +227,7 @@ static void crisv32_arbiter_config(int arbiter, int region, int unused_slots) } } -extern char _stext, _etext; +extern char _stext[], _etext[]; static void crisv32_arbiter_init(void) { @@ -265,7 +265,7 @@ static void crisv32_arbiter_init(void) #ifndef CONFIG_ETRAX_KGDB /* Global watch for writes to kernel text segment. */ - crisv32_arbiter_watch(virt_to_phys(&_stext), &_etext - &_stext, + crisv32_arbiter_watch(virt_to_phys(_stext), _etext - _stext, MARB_CLIENTS(arbiter_all_clients, arbiter_bar_all_clients), arbiter_all_write, NULL); #endif diff --git a/arch/cris/arch-v32/mach-fs/arbiter.c b/arch/cris/arch-v32/mach-fs/arbiter.c index c97f4d8120f9..047c70bdbb23 100644 --- a/arch/cris/arch-v32/mach-fs/arbiter.c +++ b/arch/cris/arch-v32/mach-fs/arbiter.c @@ -158,7 +158,7 @@ static void crisv32_arbiter_config(int region, int unused_slots) } } -extern char _stext, _etext; +extern char _stext[], _etext[]; static void crisv32_arbiter_init(void) { @@ -190,7 +190,7 @@ static void crisv32_arbiter_init(void) #ifndef CONFIG_ETRAX_KGDB /* Global watch for writes to kernel text segment. */ - crisv32_arbiter_watch(virt_to_phys(&_stext), &_etext - &_stext, + crisv32_arbiter_watch(virt_to_phys(_stext), _etext - _stext, arbiter_all_clients, arbiter_all_write, NULL); #endif } diff --git a/arch/cris/kernel/traps.c b/arch/cris/kernel/traps.c index a01636a12a6e..d98131c45bb5 100644 --- a/arch/cris/kernel/traps.c +++ b/arch/cris/kernel/traps.c @@ -42,7 +42,7 @@ void (*nmi_handler)(struct pt_regs *); void show_trace(unsigned long *stack) { unsigned long addr, module_start, module_end; - extern char _stext, _etext; + extern char _stext[], _etext[]; int i; pr_err("\nCall Trace: "); @@ -69,8 +69,8 @@ void show_trace(unsigned long *stack) * down the cause of the crash will be able to figure * out the call path that was taken. 
*/ - if (((addr >= (unsigned long)&_stext) && - (addr <= (unsigned long)&_etext)) || + if (((addr >= (unsigned long)_stext) && + (addr <= (unsigned long)_etext)) || ((addr >= module_start) && (addr <= module_end))) { #ifdef CONFIG_KALLSYMS print_ip_sym(addr); diff --git a/arch/frv/include/asm/futex.h b/arch/frv/include/asm/futex.h index 2e1da71e27a4..ab346f5f8820 100644 --- a/arch/frv/include/asm/futex.h +++ b/arch/frv/include/asm/futex.h @@ -7,7 +7,8 @@ #include <asm/errno.h> #include <linux/uaccess.h> -extern int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr); +extern int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, + u32 __user *uaddr); static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, diff --git a/arch/frv/include/uapi/asm/socket.h b/arch/frv/include/uapi/asm/socket.h index f1e3b20dce9f..9abf02d6855a 100644 --- a/arch/frv/include/uapi/asm/socket.h +++ b/arch/frv/include/uapi/asm/socket.h @@ -102,5 +102,7 @@ #define SO_PEERGROUPS 59 +#define SO_ZEROCOPY 60 + #endif /* _ASM_SOCKET_H */ diff --git a/arch/frv/kernel/futex.c b/arch/frv/kernel/futex.c index d155ca9e5098..37f7b2bf7f73 100644 --- a/arch/frv/kernel/futex.c +++ b/arch/frv/kernel/futex.c @@ -186,20 +186,10 @@ static inline int atomic_futex_op_xchg_xor(int oparg, u32 __user *uaddr, int *_o /* * do the futex operations */ -int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) +int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; - pagefault_disable(); switch (op) { @@ -225,18 +215,9 @@ int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; break; - } - } + if (!ret) + *oval = oldval; return ret; -} /* end futex_atomic_op_inuser() */ +} /* end arch_futex_atomic_op_inuser() */ diff --git a/arch/h8300/include/asm/traps.h b/arch/h8300/include/asm/traps.h index 15e701130b27..1c5a30ec2df8 100644 --- a/arch/h8300/include/asm/traps.h +++ b/arch/h8300/include/asm/traps.h @@ -33,9 +33,9 @@ extern unsigned long *_interrupt_redirect_table; #define TRAP2_VEC 10 #define TRAP3_VEC 11 -extern char _start, _etext; +extern char _start[], _etext[]; #define check_kernel_text(addr) \ - ((addr >= (unsigned long)(&_start)) && \ - (addr < (unsigned long)(&_etext)) && !(addr & 1)) + ((addr >= (unsigned long)(_start)) && \ + (addr < (unsigned long)(_etext)) && !(addr & 1)) #endif /* _H8300_TRAPS_H */ diff --git a/arch/hexagon/include/asm/atomic.h b/arch/hexagon/include/asm/atomic.h index a62ba368b27d..fb3dfb2a667e 100644 --- a/arch/hexagon/include/asm/atomic.h +++ b/arch/hexagon/include/asm/atomic.h @@ -42,6 +42,8 @@ static inline void atomic_set(atomic_t *v, int new) ); } +#define atomic_set_release(v, i) atomic_set((v), (i)) + /** * atomic_read - reads a word, atomically * @v: pointer to atomic value diff --git 
a/arch/hexagon/include/asm/futex.h b/arch/hexagon/include/asm/futex.h index 7e597f8434da..c607b77c8215 100644 --- a/arch/hexagon/include/asm/futex.h +++ b/arch/hexagon/include/asm/futex.h @@ -31,18 +31,9 @@ static inline int -futex_atomic_op_inuser(int encoded_op, int __user *uaddr) +arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) - return -EFAULT; pagefault_disable(); @@ -72,30 +63,9 @@ futex_atomic_op_inuser(int encoded_op, int __user *uaddr) pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: - ret = (oldval == cmparg); - break; - case FUTEX_OP_CMP_NE: - ret = (oldval != cmparg); - break; - case FUTEX_OP_CMP_LT: - ret = (oldval < cmparg); - break; - case FUTEX_OP_CMP_GE: - ret = (oldval >= cmparg); - break; - case FUTEX_OP_CMP_LE: - ret = (oldval <= cmparg); - break; - case FUTEX_OP_CMP_GT: - ret = (oldval > cmparg); - break; - default: - ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } diff --git a/arch/hexagon/include/asm/spinlock.h b/arch/hexagon/include/asm/spinlock.h index a1c55788c5d6..53a8d5885887 100644 --- a/arch/hexagon/include/asm/spinlock.h +++ b/arch/hexagon/include/asm/spinlock.h @@ -179,11 +179,6 @@ static inline unsigned int arch_spin_trylock(arch_spinlock_t *lock) */ #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) -static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - smp_cond_load_acquire(&lock->lock, !VAL); -} - #define arch_spin_is_locked(x) ((x)->lock != 0) #define arch_read_lock_flags(lock, flags) arch_read_lock(lock) diff --git a/arch/ia64/include/asm/futex.h b/arch/ia64/include/asm/futex.h index 76acbcd5c060..6d67dc1eaf2b 100644 --- a/arch/ia64/include/asm/futex.h +++ b/arch/ia64/include/asm/futex.h @@ -45,18 +45,9 @@ do { \ } while (0) static inline int -futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) +arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (! 
access_ok (VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; pagefault_disable(); @@ -84,17 +75,9 @@ futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } diff --git a/arch/ia64/include/asm/spinlock.h b/arch/ia64/include/asm/spinlock.h index ca9e76149a4a..df2c121164b8 100644 --- a/arch/ia64/include/asm/spinlock.h +++ b/arch/ia64/include/asm/spinlock.h @@ -76,22 +76,6 @@ static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock) ACCESS_ONCE(*p) = (tmp + 2) & ~1; } -static __always_inline void __ticket_spin_unlock_wait(arch_spinlock_t *lock) -{ - int *p = (int *)&lock->lock, ticket; - - ia64_invala(); - - for (;;) { - asm volatile ("ld4.c.nc %0=[%1]" : "=r"(ticket) : "r"(p) : "memory"); - if (!(((ticket >> TICKET_SHIFT) ^ ticket) & TICKET_MASK)) - return; - cpu_relax(); - } - - smp_acquire__after_ctrl_dep(); -} - static inline int __ticket_spin_is_locked(arch_spinlock_t *lock) { long tmp = ACCESS_ONCE(lock->lock); @@ -143,11 +127,6 @@ static __always_inline void arch_spin_lock_flags(arch_spinlock_t *lock, arch_spin_lock(lock); } -static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - __ticket_spin_unlock_wait(lock); -} - #define arch_read_can_lock(rw) (*(volatile int *)(rw) >= 0) #define arch_write_can_lock(rw) (*(volatile int *)(rw) == 0) diff --git a/arch/ia64/include/uapi/asm/socket.h b/arch/ia64/include/uapi/asm/socket.h index 5dd5c5d0d642..002eb85a6941 100644 --- a/arch/ia64/include/uapi/asm/socket.h +++ b/arch/ia64/include/uapi/asm/socket.h @@ -111,4 +111,6 @@ #define SO_PEERGROUPS 59 +#define SO_ZEROCOPY 60 + #endif /* _ASM_IA64_SOCKET_H */ diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c index 7508c306aa9e..1d29b2f8726b 100644 --- a/arch/ia64/kernel/acpi.c +++ b/arch/ia64/kernel/acpi.c @@ -159,12 +159,12 @@ int acpi_request_vector(u32 int_type) return vector; } -char *__init __acpi_map_table(unsigned long phys_addr, unsigned long size) +void __init __iomem *__acpi_map_table(unsigned long phys, unsigned long size) { - return __va(phys_addr); + return __va(phys); } -void __init __acpi_unmap_table(char *map, unsigned long size) +void __init __acpi_unmap_table(void __iomem *map, unsigned long size) { } diff --git a/arch/m32r/include/asm/flat.h b/arch/m32r/include/asm/flat.h index 455ce7ddbf14..dfcb0e4eb256 100644 --- a/arch/m32r/include/asm/flat.h +++ b/arch/m32r/include/asm/flat.h @@ -95,7 +95,7 @@ static inline unsigned long m32r_flat_get_addr_from_rp (u32 *rp, return ~0; /* bogus value */ } -static inline void flat_put_addr_at_rp(u32 *rp, u32 addr, u32 relval) +static inline int flat_put_addr_at_rp(u32 *rp, u32 addr, u32 relval) { unsigned int reloc = flat_m32r_get_reloc_type (relval); if (reloc & 0xf0) { @@ -133,6 +133,7 @@ static inline void flat_put_addr_at_rp(u32 *rp, u32 addr, u32 relval) break; } } + return 0; } // kludge - text_len is a local variable in the only user. 
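The futex hunks in this series (frv, hexagon and ia64 above, and microblaze, mips and openrisc further down) all delete the same boilerplate from the per-architecture futex_atomic_op_inuser(): the encoded_op decoding, the access_ok() check, and the trailing cmp switch. Each architecture keeps only a narrower arch_futex_atomic_op_inuser(op, oparg, &oldval, uaddr) hook that performs the atomic operation and reports the old value. A minimal sketch of the single generic caller this implies is shown below; it is reconstructed from the code being removed from each arch, so the function name and its placement in the generic futex core are assumptions rather than something shown in this diff.

/*
 * Sketch only: the decode, access_ok() check and comparison are copied from
 * the per-arch code deleted above; the name futex_atomic_op_inuser() and its
 * new home in the generic futex core are assumed, not part of this diff.
 */
static int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
{
	int op = (encoded_op >> 28) & 7;
	int cmp = (encoded_op >> 24) & 15;
	int oparg = (encoded_op << 8) >> 20;
	int cmparg = (encoded_op << 20) >> 20;
	int oldval, ret;

	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
		oparg = 1 << oparg;

	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
		return -EFAULT;

	/* The arch hook now only does the atomic op and returns the old value. */
	ret = arch_futex_atomic_op_inuser(op, oparg, &oldval, uaddr);
	if (ret)
		return ret;

	switch (cmp) {
	case FUTEX_OP_CMP_EQ: return oldval == cmparg;
	case FUTEX_OP_CMP_NE: return oldval != cmparg;
	case FUTEX_OP_CMP_LT: return oldval < cmparg;
	case FUTEX_OP_CMP_GE: return oldval >= cmparg;
	case FUTEX_OP_CMP_LE: return oldval <= cmparg;
	case FUTEX_OP_CMP_GT: return oldval > cmparg;
	default: return -ENOSYS;
	}
}

With the comparison handled once in the caller, the per-arch conversions in the remaining hunks reduce to the same mechanical change: drop the decode and switch, and write the old value through *oval when the operation succeeds.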
diff --git a/arch/m32r/include/asm/spinlock.h b/arch/m32r/include/asm/spinlock.h index 323c7fc953cd..a56825592b90 100644 --- a/arch/m32r/include/asm/spinlock.h +++ b/arch/m32r/include/asm/spinlock.h @@ -30,11 +30,6 @@ #define arch_spin_is_locked(x) (*(volatile int *)(&(x)->slock) <= 0) #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) -static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - smp_cond_load_acquire(&lock->slock, VAL > 0); -} - /** * arch_spin_trylock - Try spin lock and return a result * @lock: Pointer to the lock variable diff --git a/arch/m32r/include/uapi/asm/socket.h b/arch/m32r/include/uapi/asm/socket.h index f8f7b47e247f..e268e51a38d1 100644 --- a/arch/m32r/include/uapi/asm/socket.h +++ b/arch/m32r/include/uapi/asm/socket.h @@ -102,4 +102,6 @@ #define SO_PEERGROUPS 59 +#define SO_ZEROCOPY 60 + #endif /* _ASM_M32R_SOCKET_H */ diff --git a/arch/m68k/configs/amiga_defconfig b/arch/m68k/configs/amiga_defconfig index ddff1164aff0..54191f6fc715 100644 --- a/arch/m68k/configs/amiga_defconfig +++ b/arch/m68k/configs/amiga_defconfig @@ -49,6 +49,7 @@ CONFIG_PACKET=y CONFIG_PACKET_DIAG=m CONFIG_UNIX=y CONFIG_UNIX_DIAG=m +CONFIG_TLS=m CONFIG_XFRM_MIGRATE=y CONFIG_NET_KEY=y CONFIG_INET=y @@ -465,8 +466,10 @@ CONFIG_HID=m CONFIG_HIDRAW=y CONFIG_UHID=m # CONFIG_HID_GENERIC is not set +# CONFIG_HID_ITE is not set # CONFIG_USB_SUPPORT is not set CONFIG_RTC_CLASS=y +# CONFIG_RTC_NVMEM is not set CONFIG_RTC_DRV_MSM6242=m CONFIG_RTC_DRV_RP5C01=m # CONFIG_IOMMU_SUPPORT is not set @@ -477,7 +480,6 @@ CONFIG_SERIAL_CONSOLE=y CONFIG_EXT4_FS=y CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m -CONFIG_XFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set CONFIG_FS_ENCRYPTION=m @@ -587,12 +589,13 @@ CONFIG_TEST_BITMAP=m CONFIG_TEST_UUID=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m -CONFIG_TEST_LKM=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m CONFIG_TEST_FIRMWARE=m +CONFIG_TEST_SYSCTL=m CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m +CONFIG_TEST_KMOD=m CONFIG_EARLY_PRINTK=y CONFIG_ENCRYPTED_KEYS=m CONFIG_HARDENED_USERCOPY=y diff --git a/arch/m68k/configs/apollo_defconfig b/arch/m68k/configs/apollo_defconfig index 17384dc959a5..fb4663904428 100644 --- a/arch/m68k/configs/apollo_defconfig +++ b/arch/m68k/configs/apollo_defconfig @@ -47,6 +47,7 @@ CONFIG_PACKET=y CONFIG_PACKET_DIAG=m CONFIG_UNIX=y CONFIG_UNIX_DIAG=m +CONFIG_TLS=m CONFIG_XFRM_MIGRATE=y CONFIG_NET_KEY=y CONFIG_INET=y @@ -427,8 +428,10 @@ CONFIG_HID=m CONFIG_HIDRAW=y CONFIG_UHID=m # CONFIG_HID_GENERIC is not set +# CONFIG_HID_ITE is not set # CONFIG_USB_SUPPORT is not set CONFIG_RTC_CLASS=y +# CONFIG_RTC_NVMEM is not set CONFIG_RTC_DRV_GENERIC=m # CONFIG_IOMMU_SUPPORT is not set CONFIG_HEARTBEAT=y @@ -436,7 +439,6 @@ CONFIG_PROC_HARDWARE=y CONFIG_EXT4_FS=y CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m -CONFIG_XFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set CONFIG_FS_ENCRYPTION=m @@ -546,12 +548,13 @@ CONFIG_TEST_BITMAP=m CONFIG_TEST_UUID=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m -CONFIG_TEST_LKM=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m CONFIG_TEST_FIRMWARE=m +CONFIG_TEST_SYSCTL=m CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m +CONFIG_TEST_KMOD=m CONFIG_EARLY_PRINTK=y CONFIG_ENCRYPTED_KEYS=m CONFIG_HARDENED_USERCOPY=y diff --git a/arch/m68k/configs/atari_defconfig b/arch/m68k/configs/atari_defconfig index 53a641d62f85..4ab393e86e52 100644 --- a/arch/m68k/configs/atari_defconfig +++ b/arch/m68k/configs/atari_defconfig @@ -47,6 +47,7 @@ CONFIG_PACKET=y CONFIG_PACKET_DIAG=m CONFIG_UNIX=y 
CONFIG_UNIX_DIAG=m +CONFIG_TLS=m CONFIG_XFRM_MIGRATE=y CONFIG_NET_KEY=y CONFIG_INET=y @@ -442,7 +443,10 @@ CONFIG_DMASOUND_ATARI=m CONFIG_HID=m CONFIG_HIDRAW=y CONFIG_UHID=m +# CONFIG_HID_GENERIC is not set +# CONFIG_HID_ITE is not set CONFIG_RTC_CLASS=y +# CONFIG_RTC_NVMEM is not set CONFIG_RTC_DRV_GENERIC=m # CONFIG_IOMMU_SUPPORT is not set CONFIG_HEARTBEAT=y @@ -457,7 +461,6 @@ CONFIG_ATARI_DSP56K=m CONFIG_EXT4_FS=y CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m -CONFIG_XFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set CONFIG_FS_ENCRYPTION=m @@ -567,12 +570,13 @@ CONFIG_TEST_BITMAP=m CONFIG_TEST_UUID=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m -CONFIG_TEST_LKM=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m CONFIG_TEST_FIRMWARE=m +CONFIG_TEST_SYSCTL=m CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m +CONFIG_TEST_KMOD=m CONFIG_EARLY_PRINTK=y CONFIG_ENCRYPTED_KEYS=m CONFIG_HARDENED_USERCOPY=y diff --git a/arch/m68k/configs/bvme6000_defconfig b/arch/m68k/configs/bvme6000_defconfig index 3925ae3a5eb3..1dd8d697545b 100644 --- a/arch/m68k/configs/bvme6000_defconfig +++ b/arch/m68k/configs/bvme6000_defconfig @@ -45,6 +45,7 @@ CONFIG_PACKET=y CONFIG_PACKET_DIAG=m CONFIG_UNIX=y CONFIG_UNIX_DIAG=m +CONFIG_TLS=m CONFIG_XFRM_MIGRATE=y CONFIG_NET_KEY=y CONFIG_INET=y @@ -420,15 +421,16 @@ CONFIG_HID=m CONFIG_HIDRAW=y CONFIG_UHID=m # CONFIG_HID_GENERIC is not set +# CONFIG_HID_ITE is not set # CONFIG_USB_SUPPORT is not set CONFIG_RTC_CLASS=y +# CONFIG_RTC_NVMEM is not set CONFIG_RTC_DRV_GENERIC=m # CONFIG_IOMMU_SUPPORT is not set CONFIG_PROC_HARDWARE=y CONFIG_EXT4_FS=y CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m -CONFIG_XFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set CONFIG_FS_ENCRYPTION=m @@ -538,12 +540,13 @@ CONFIG_TEST_BITMAP=m CONFIG_TEST_UUID=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m -CONFIG_TEST_LKM=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m CONFIG_TEST_FIRMWARE=m +CONFIG_TEST_SYSCTL=m CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m +CONFIG_TEST_KMOD=m CONFIG_EARLY_PRINTK=y CONFIG_ENCRYPTED_KEYS=m CONFIG_HARDENED_USERCOPY=y diff --git a/arch/m68k/configs/hp300_defconfig b/arch/m68k/configs/hp300_defconfig index f4a134b390b4..02b39f50076e 100644 --- a/arch/m68k/configs/hp300_defconfig +++ b/arch/m68k/configs/hp300_defconfig @@ -47,6 +47,7 @@ CONFIG_PACKET=y CONFIG_PACKET_DIAG=m CONFIG_UNIX=y CONFIG_UNIX_DIAG=m +CONFIG_TLS=m CONFIG_XFRM_MIGRATE=y CONFIG_NET_KEY=y CONFIG_INET=y @@ -430,15 +431,16 @@ CONFIG_HID=m CONFIG_HIDRAW=y CONFIG_UHID=m # CONFIG_HID_GENERIC is not set +# CONFIG_HID_ITE is not set # CONFIG_USB_SUPPORT is not set CONFIG_RTC_CLASS=y +# CONFIG_RTC_NVMEM is not set CONFIG_RTC_DRV_GENERIC=m # CONFIG_IOMMU_SUPPORT is not set CONFIG_PROC_HARDWARE=y CONFIG_EXT4_FS=y CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m -CONFIG_XFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set CONFIG_FS_ENCRYPTION=m @@ -548,12 +550,13 @@ CONFIG_TEST_BITMAP=m CONFIG_TEST_UUID=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m -CONFIG_TEST_LKM=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m CONFIG_TEST_FIRMWARE=m +CONFIG_TEST_SYSCTL=m CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m +CONFIG_TEST_KMOD=m CONFIG_EARLY_PRINTK=y CONFIG_ENCRYPTED_KEYS=m CONFIG_HARDENED_USERCOPY=y diff --git a/arch/m68k/configs/mac_defconfig b/arch/m68k/configs/mac_defconfig index 9ed0cef632b7..044dcb2bf8fb 100644 --- a/arch/m68k/configs/mac_defconfig +++ b/arch/m68k/configs/mac_defconfig @@ -46,6 +46,7 @@ CONFIG_PACKET=y CONFIG_PACKET_DIAG=m CONFIG_UNIX=y CONFIG_UNIX_DIAG=m +CONFIG_TLS=m CONFIG_XFRM_MIGRATE=y 
CONFIG_NET_KEY=y CONFIG_INET=y @@ -452,15 +453,16 @@ CONFIG_HID=m CONFIG_HIDRAW=y CONFIG_UHID=m # CONFIG_HID_GENERIC is not set +# CONFIG_HID_ITE is not set # CONFIG_USB_SUPPORT is not set CONFIG_RTC_CLASS=y +# CONFIG_RTC_NVMEM is not set CONFIG_RTC_DRV_GENERIC=m # CONFIG_IOMMU_SUPPORT is not set CONFIG_PROC_HARDWARE=y CONFIG_EXT4_FS=y CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m -CONFIG_XFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set CONFIG_FS_ENCRYPTION=m @@ -570,12 +572,13 @@ CONFIG_TEST_BITMAP=m CONFIG_TEST_UUID=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m -CONFIG_TEST_LKM=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m CONFIG_TEST_FIRMWARE=m +CONFIG_TEST_SYSCTL=m CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m +CONFIG_TEST_KMOD=m CONFIG_EARLY_PRINTK=y CONFIG_ENCRYPTED_KEYS=m CONFIG_HARDENED_USERCOPY=y diff --git a/arch/m68k/configs/multi_defconfig b/arch/m68k/configs/multi_defconfig index efed0d48fd53..3ad04682077a 100644 --- a/arch/m68k/configs/multi_defconfig +++ b/arch/m68k/configs/multi_defconfig @@ -56,6 +56,7 @@ CONFIG_PACKET=y CONFIG_PACKET_DIAG=m CONFIG_UNIX=y CONFIG_UNIX_DIAG=m +CONFIG_TLS=m CONFIG_XFRM_MIGRATE=y CONFIG_NET_KEY=y CONFIG_INET=y @@ -520,8 +521,10 @@ CONFIG_HID=m CONFIG_HIDRAW=y CONFIG_UHID=m # CONFIG_HID_GENERIC is not set +# CONFIG_HID_ITE is not set # CONFIG_USB_SUPPORT is not set CONFIG_RTC_CLASS=y +# CONFIG_RTC_NVMEM is not set CONFIG_RTC_DRV_MSM6242=m CONFIG_RTC_DRV_RP5C01=m CONFIG_RTC_DRV_GENERIC=m @@ -540,7 +543,6 @@ CONFIG_SERIAL_CONSOLE=y CONFIG_EXT4_FS=y CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m -CONFIG_XFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set CONFIG_FS_ENCRYPTION=m @@ -650,12 +652,13 @@ CONFIG_TEST_BITMAP=m CONFIG_TEST_UUID=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m -CONFIG_TEST_LKM=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m CONFIG_TEST_FIRMWARE=m +CONFIG_TEST_SYSCTL=m CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m +CONFIG_TEST_KMOD=m CONFIG_EARLY_PRINTK=y CONFIG_ENCRYPTED_KEYS=m CONFIG_HARDENED_USERCOPY=y diff --git a/arch/m68k/configs/mvme147_defconfig b/arch/m68k/configs/mvme147_defconfig index 9040457c7f9c..dc2dd61948cd 100644 --- a/arch/m68k/configs/mvme147_defconfig +++ b/arch/m68k/configs/mvme147_defconfig @@ -44,6 +44,7 @@ CONFIG_PACKET=y CONFIG_PACKET_DIAG=m CONFIG_UNIX=y CONFIG_UNIX_DIAG=m +CONFIG_TLS=m CONFIG_XFRM_MIGRATE=y CONFIG_NET_KEY=y CONFIG_INET=y @@ -420,15 +421,16 @@ CONFIG_HID=m CONFIG_HIDRAW=y CONFIG_UHID=m # CONFIG_HID_GENERIC is not set +# CONFIG_HID_ITE is not set # CONFIG_USB_SUPPORT is not set CONFIG_RTC_CLASS=y +# CONFIG_RTC_NVMEM is not set CONFIG_RTC_DRV_GENERIC=m # CONFIG_IOMMU_SUPPORT is not set CONFIG_PROC_HARDWARE=y CONFIG_EXT4_FS=y CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m -CONFIG_XFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set CONFIG_FS_ENCRYPTION=m @@ -538,12 +540,13 @@ CONFIG_TEST_BITMAP=m CONFIG_TEST_UUID=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m -CONFIG_TEST_LKM=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m CONFIG_TEST_FIRMWARE=m +CONFIG_TEST_SYSCTL=m CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m +CONFIG_TEST_KMOD=m CONFIG_EARLY_PRINTK=y CONFIG_ENCRYPTED_KEYS=m CONFIG_HARDENED_USERCOPY=y diff --git a/arch/m68k/configs/mvme16x_defconfig b/arch/m68k/configs/mvme16x_defconfig index 8b17f00e0484..54e7b523fc3d 100644 --- a/arch/m68k/configs/mvme16x_defconfig +++ b/arch/m68k/configs/mvme16x_defconfig @@ -45,6 +45,7 @@ CONFIG_PACKET=y CONFIG_PACKET_DIAG=m CONFIG_UNIX=y CONFIG_UNIX_DIAG=m +CONFIG_TLS=m CONFIG_XFRM_MIGRATE=y CONFIG_NET_KEY=y CONFIG_INET=y @@ -420,15 
+421,16 @@ CONFIG_HID=m CONFIG_HIDRAW=y CONFIG_UHID=m # CONFIG_HID_GENERIC is not set +# CONFIG_HID_ITE is not set # CONFIG_USB_SUPPORT is not set CONFIG_RTC_CLASS=y +# CONFIG_RTC_NVMEM is not set CONFIG_RTC_DRV_GENERIC=m # CONFIG_IOMMU_SUPPORT is not set CONFIG_PROC_HARDWARE=y CONFIG_EXT4_FS=y CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m -CONFIG_XFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set CONFIG_FS_ENCRYPTION=m @@ -538,12 +540,13 @@ CONFIG_TEST_BITMAP=m CONFIG_TEST_UUID=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m -CONFIG_TEST_LKM=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m CONFIG_TEST_FIRMWARE=m +CONFIG_TEST_SYSCTL=m CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m +CONFIG_TEST_KMOD=m CONFIG_EARLY_PRINTK=y CONFIG_ENCRYPTED_KEYS=m CONFIG_HARDENED_USERCOPY=y diff --git a/arch/m68k/configs/q40_defconfig b/arch/m68k/configs/q40_defconfig index 5f3718c62c85..d63d8a15f6db 100644 --- a/arch/m68k/configs/q40_defconfig +++ b/arch/m68k/configs/q40_defconfig @@ -45,6 +45,7 @@ CONFIG_PACKET=y CONFIG_PACKET_DIAG=m CONFIG_UNIX=y CONFIG_UNIX_DIAG=m +CONFIG_TLS=m CONFIG_XFRM_MIGRATE=y CONFIG_NET_KEY=y CONFIG_INET=y @@ -442,8 +443,10 @@ CONFIG_HID=m CONFIG_HIDRAW=y CONFIG_UHID=m # CONFIG_HID_GENERIC is not set +# CONFIG_HID_ITE is not set # CONFIG_USB_SUPPORT is not set CONFIG_RTC_CLASS=y +# CONFIG_RTC_NVMEM is not set CONFIG_RTC_DRV_GENERIC=m # CONFIG_IOMMU_SUPPORT is not set CONFIG_HEARTBEAT=y @@ -451,7 +454,6 @@ CONFIG_PROC_HARDWARE=y CONFIG_EXT4_FS=y CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m -CONFIG_XFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set CONFIG_FS_ENCRYPTION=m @@ -561,12 +563,13 @@ CONFIG_TEST_BITMAP=m CONFIG_TEST_UUID=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m -CONFIG_TEST_LKM=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m CONFIG_TEST_FIRMWARE=m +CONFIG_TEST_SYSCTL=m CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m +CONFIG_TEST_KMOD=m CONFIG_EARLY_PRINTK=y CONFIG_ENCRYPTED_KEYS=m CONFIG_HARDENED_USERCOPY=y diff --git a/arch/m68k/configs/sun3_defconfig b/arch/m68k/configs/sun3_defconfig index 8c979a68fca5..d0924c22f52a 100644 --- a/arch/m68k/configs/sun3_defconfig +++ b/arch/m68k/configs/sun3_defconfig @@ -42,6 +42,7 @@ CONFIG_PACKET=y CONFIG_PACKET_DIAG=m CONFIG_UNIX=y CONFIG_UNIX_DIAG=m +CONFIG_TLS=m CONFIG_XFRM_MIGRATE=y CONFIG_NET_KEY=y CONFIG_INET=y @@ -422,15 +423,16 @@ CONFIG_HID=m CONFIG_HIDRAW=y CONFIG_UHID=m # CONFIG_HID_GENERIC is not set +# CONFIG_HID_ITE is not set # CONFIG_USB_SUPPORT is not set CONFIG_RTC_CLASS=y +# CONFIG_RTC_NVMEM is not set CONFIG_RTC_DRV_GENERIC=m # CONFIG_IOMMU_SUPPORT is not set CONFIG_PROC_HARDWARE=y CONFIG_EXT4_FS=y CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m -CONFIG_XFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set CONFIG_FS_ENCRYPTION=m @@ -540,12 +542,13 @@ CONFIG_TEST_BITMAP=m CONFIG_TEST_UUID=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m -CONFIG_TEST_LKM=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m CONFIG_TEST_FIRMWARE=m +CONFIG_TEST_SYSCTL=m CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m +CONFIG_TEST_KMOD=m CONFIG_ENCRYPTED_KEYS=m CONFIG_HARDENED_USERCOPY=y CONFIG_CRYPTO_RSA=m diff --git a/arch/m68k/configs/sun3x_defconfig b/arch/m68k/configs/sun3x_defconfig index a1e79530e806..3001ee1e5dc5 100644 --- a/arch/m68k/configs/sun3x_defconfig +++ b/arch/m68k/configs/sun3x_defconfig @@ -42,6 +42,7 @@ CONFIG_PACKET=y CONFIG_PACKET_DIAG=m CONFIG_UNIX=y CONFIG_UNIX_DIAG=m +CONFIG_TLS=m CONFIG_XFRM_MIGRATE=y CONFIG_NET_KEY=y CONFIG_INET=y @@ -422,15 +423,16 @@ CONFIG_HID=m CONFIG_HIDRAW=y CONFIG_UHID=m # CONFIG_HID_GENERIC 
is not set +# CONFIG_HID_ITE is not set # CONFIG_USB_SUPPORT is not set CONFIG_RTC_CLASS=y +# CONFIG_RTC_NVMEM is not set CONFIG_RTC_DRV_GENERIC=m # CONFIG_IOMMU_SUPPORT is not set CONFIG_PROC_HARDWARE=y CONFIG_EXT4_FS=y CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m -CONFIG_XFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set CONFIG_FS_ENCRYPTION=m @@ -540,12 +542,13 @@ CONFIG_TEST_BITMAP=m CONFIG_TEST_UUID=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m -CONFIG_TEST_LKM=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m CONFIG_TEST_FIRMWARE=m +CONFIG_TEST_SYSCTL=m CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m +CONFIG_TEST_KMOD=m CONFIG_EARLY_PRINTK=y CONFIG_ENCRYPTED_KEYS=m CONFIG_HARDENED_USERCOPY=y diff --git a/arch/m68k/include/asm/asm-prototypes.h b/arch/m68k/include/asm/asm-prototypes.h new file mode 100644 index 000000000000..22ccb9c97576 --- /dev/null +++ b/arch/m68k/include/asm/asm-prototypes.h @@ -0,0 +1,5 @@ +extern int __divsi3(int, int); +extern int __modsi3(int, int); +extern int __mulsi3(int, int); +extern unsigned int __udivsi3(unsigned int, unsigned int); +extern unsigned int __umodsi3(unsigned int, unsigned int); diff --git a/arch/m68k/mac/misc.c b/arch/m68k/mac/misc.c index 8aa8792e3174..d96348a52362 100644 --- a/arch/m68k/mac/misc.c +++ b/arch/m68k/mac/misc.c @@ -357,6 +357,17 @@ static void cuda_shutdown(void) struct adb_request req; if (cuda_request(&req, NULL, 2, CUDA_PACKET, CUDA_POWERDOWN) < 0) return; + + /* Avoid infinite polling loop when PSU is not under Cuda control */ + switch (macintosh_config->ident) { + case MAC_MODEL_C660: + case MAC_MODEL_Q605: + case MAC_MODEL_Q605_ACC: + case MAC_MODEL_P475: + case MAC_MODEL_P475F: + return; + } + while (!req.complete) cuda_poll(); } @@ -463,8 +474,9 @@ void mac_poweroff(void) pmu_shutdown(); #endif } - local_irq_enable(); + pr_crit("It is now safe to turn off your Macintosh.\n"); + local_irq_disable(); while(1); } @@ -554,8 +566,8 @@ void mac_reset(void) } /* should never get here */ - local_irq_enable(); pr_crit("Restart failed. Please restart manually.\n"); + local_irq_disable(); while(1); } diff --git a/arch/metag/Kconfig b/arch/metag/Kconfig index 5b7a45d99cfb..7d8b322e5101 100644 --- a/arch/metag/Kconfig +++ b/arch/metag/Kconfig @@ -26,6 +26,7 @@ config METAG select HAVE_SYSCALL_TRACEPOINTS select HAVE_UNDERSCORE_SYMBOL_PREFIX select IRQ_DOMAIN + select GENERIC_IRQ_EFFECTIVE_AFF_MASK select MODULES_USE_ELF_RELA select OF select OF_EARLY_FLATTREE diff --git a/arch/metag/include/asm/atomic_lock1.h b/arch/metag/include/asm/atomic_lock1.h index 6c1380a8a0d4..eee779f26cc4 100644 --- a/arch/metag/include/asm/atomic_lock1.h +++ b/arch/metag/include/asm/atomic_lock1.h @@ -37,6 +37,8 @@ static inline int atomic_set(atomic_t *v, int i) return i; } +#define atomic_set_release(v, i) atomic_set((v), (i)) + #define ATOMIC_OP(op, c_op) \ static inline void atomic_##op(int i, atomic_t *v) \ { \ diff --git a/arch/metag/include/asm/spinlock.h b/arch/metag/include/asm/spinlock.h index c0c7a22be1ae..ddf7fe5708a6 100644 --- a/arch/metag/include/asm/spinlock.h +++ b/arch/metag/include/asm/spinlock.h @@ -15,11 +15,6 @@ * locked. 
*/ -static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - smp_cond_load_acquire(&lock->lock, !VAL); -} - #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) #define arch_read_lock_flags(lock, flags) arch_read_lock(lock) diff --git a/arch/metag/include/asm/topology.h b/arch/metag/include/asm/topology.h index e95f874ded1b..707c7f7b6bea 100644 --- a/arch/metag/include/asm/topology.h +++ b/arch/metag/include/asm/topology.h @@ -4,7 +4,6 @@ #ifdef CONFIG_NUMA #define cpu_to_node(cpu) ((void)(cpu), 0) -#define parent_node(node) ((void)(node), 0) #define cpumask_of_node(node) ((void)node, cpu_online_mask) diff --git a/arch/microblaze/include/asm/flat.h b/arch/microblaze/include/asm/flat.h index f23c3d266bae..3d2747d4c967 100644 --- a/arch/microblaze/include/asm/flat.h +++ b/arch/microblaze/include/asm/flat.h @@ -60,7 +60,7 @@ static inline int flat_get_addr_from_rp(u32 __user *rp, u32 relval, u32 flags, * unaligned. */ -static inline void +static inline int flat_put_addr_at_rp(u32 __user *rp, u32 addr, u32 relval) { u32 *p = (__force u32 *)rp; diff --git a/arch/microblaze/include/asm/futex.h b/arch/microblaze/include/asm/futex.h index 01848f056f43..a9dad9e5e132 100644 --- a/arch/microblaze/include/asm/futex.h +++ b/arch/microblaze/include/asm/futex.h @@ -29,18 +29,9 @@ }) static inline int -futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) +arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; pagefault_disable(); @@ -66,30 +57,9 @@ futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: - ret = (oldval == cmparg); - break; - case FUTEX_OP_CMP_NE: - ret = (oldval != cmparg); - break; - case FUTEX_OP_CMP_LT: - ret = (oldval < cmparg); - break; - case FUTEX_OP_CMP_GE: - ret = (oldval >= cmparg); - break; - case FUTEX_OP_CMP_LE: - ret = (oldval <= cmparg); - break; - case FUTEX_OP_CMP_GT: - ret = (oldval > cmparg); - break; - default: - ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } diff --git a/arch/mips/include/asm/futex.h b/arch/mips/include/asm/futex.h index 1de190bdfb9c..a9e61ea54ca9 100644 --- a/arch/mips/include/asm/futex.h +++ b/arch/mips/include/asm/futex.h @@ -83,18 +83,9 @@ } static inline int -futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) +arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (! 
access_ok (VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; pagefault_disable(); @@ -125,17 +116,9 @@ futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 2998479fd4e8..a9af1d2dcd69 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -938,11 +938,6 @@ void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); -static inline void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm, - unsigned long address) -{ -} - /* Emulation */ int kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu, u32 *out); enum emulation_result update_pc(struct kvm_vcpu *vcpu, u32 cause); diff --git a/arch/mips/include/uapi/asm/mman.h b/arch/mips/include/uapi/asm/mman.h index 655e2fb5395b..da3216007fe0 100644 --- a/arch/mips/include/uapi/asm/mman.h +++ b/arch/mips/include/uapi/asm/mman.h @@ -91,20 +91,12 @@ overrides the coredump filter bits */ #define MADV_DODUMP 17 /* Clear the MADV_NODUMP flag */ +#define MADV_WIPEONFORK 18 /* Zero memory on fork, child only */ +#define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */ + /* compatibility flags */ #define MAP_FILE 0 -/* - * When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size. - * This gives us 6 bits, which is enough until someone invents 128 bit address - * spaces. - * - * Assume these are all power of twos. - * When 0 use the default page size. - */ -#define MAP_HUGE_SHIFT 26 -#define MAP_HUGE_MASK 0x3f - #define PKEY_DISABLE_ACCESS 0x1 #define PKEY_DISABLE_WRITE 0x2 #define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\ diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h index 882823bec153..6c755bc07975 100644 --- a/arch/mips/include/uapi/asm/socket.h +++ b/arch/mips/include/uapi/asm/socket.h @@ -120,4 +120,6 @@ #define SO_PEERGROUPS 59 +#define SO_ZEROCOPY 60 + #endif /* _UAPI_ASM_SOCKET_H */ diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c index 6dd13641a418..1395654cfc8d 100644 --- a/arch/mips/kernel/ptrace.c +++ b/arch/mips/kernel/ptrace.c @@ -872,15 +872,13 @@ asmlinkage long syscall_trace_enter(struct pt_regs *regs, long syscall) if (unlikely(test_thread_flag(TIF_SECCOMP))) { int ret, i; struct seccomp_data sd; + unsigned long args[6]; sd.nr = syscall; sd.arch = syscall_get_arch(); - for (i = 0; i < 6; i++) { - unsigned long v, r; - - r = mips_get_syscall_arg(&v, current, regs, i); - sd.args[i] = r ? 
0 : v; - } + syscall_get_arguments(current, regs, 0, 6, args); + for (i = 0; i < 6; i++) + sd.args[i] = args[i]; sd.instruction_pointer = KSTK_EIP(current); ret = __secure_computing(&sd); diff --git a/arch/mips/kernel/scall32-o32.S b/arch/mips/kernel/scall32-o32.S index 27c2f90eeb21..a9a7d78803cd 100644 --- a/arch/mips/kernel/scall32-o32.S +++ b/arch/mips/kernel/scall32-o32.S @@ -190,12 +190,6 @@ illegal_syscall: sll t1, t0, 2 beqz v0, einval lw t2, sys_call_table(t1) # syscall routine - sw a0, PT_R2(sp) # call routine directly on restart - - /* Some syscalls like execve get their arguments from struct pt_regs - and claim zero arguments in the syscall table. Thus we have to - assume the worst case and shuffle around all potential arguments. - If you want performance, don't use indirect syscalls. */ move a0, a1 # shift argument registers move a1, a2 @@ -207,11 +201,6 @@ illegal_syscall: sw t4, 16(sp) sw t5, 20(sp) sw t6, 24(sp) - sw a0, PT_R4(sp) # .. and push back a0 - a3, some - sw a1, PT_R5(sp) # syscalls expect them there - sw a2, PT_R6(sp) - sw a3, PT_R7(sp) - sw a3, PT_R26(sp) # update a3 for syscall restarting jr t2 /* Unreached */ diff --git a/arch/mips/kernel/scall64-o32.S b/arch/mips/kernel/scall64-o32.S index c30bc520885f..9ebe3e2403b1 100644 --- a/arch/mips/kernel/scall64-o32.S +++ b/arch/mips/kernel/scall64-o32.S @@ -198,7 +198,6 @@ LEAF(sys32_syscall) dsll t1, t0, 3 beqz v0, einval ld t2, sys32_call_table(t1) # syscall routine - sd a0, PT_R2(sp) # call routine directly on restart move a0, a1 # shift argument registers move a1, a2 @@ -207,11 +206,6 @@ LEAF(sys32_syscall) move a4, a5 move a5, a6 move a6, a7 - sd a0, PT_R4(sp) # ... and push back a0 - a3, some - sd a1, PT_R5(sp) # syscalls expect them there - sd a2, PT_R6(sp) - sd a3, PT_R7(sp) - sd a3, PT_R26(sp) # update a3 for syscall restarting jr t2 /* Unreached */ diff --git a/arch/mips/kernel/smp.c b/arch/mips/kernel/smp.c index 6bace7695788..c7cbddfcdc3b 100644 --- a/arch/mips/kernel/smp.c +++ b/arch/mips/kernel/smp.c @@ -648,12 +648,12 @@ EXPORT_SYMBOL(flush_tlb_one); #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST static DEFINE_PER_CPU(atomic_t, tick_broadcast_count); -static DEFINE_PER_CPU(struct call_single_data, tick_broadcast_csd); +static DEFINE_PER_CPU(call_single_data_t, tick_broadcast_csd); void tick_broadcast(const struct cpumask *mask) { atomic_t *count; - struct call_single_data *csd; + call_single_data_t *csd; int cpu; for_each_cpu(cpu, mask) { @@ -674,7 +674,7 @@ static void tick_broadcast_callee(void *info) static int __init tick_broadcast_init(void) { - struct call_single_data *csd; + call_single_data_t *csd; int cpu; for (cpu = 0; cpu < NR_CPUS; cpu++) { diff --git a/arch/mips/net/ebpf_jit.c b/arch/mips/net/ebpf_jit.c index 3f87b96da5c4..7646891c4e9b 100644 --- a/arch/mips/net/ebpf_jit.c +++ b/arch/mips/net/ebpf_jit.c @@ -113,6 +113,7 @@ struct jit_ctx { u64 *reg_val_types; unsigned int long_b_conversion:1; unsigned int gen_b_offsets:1; + unsigned int use_bbit_insns:1; }; static void set_reg_val_type(u64 *rvt, int reg, enum reg_val_type type) @@ -655,19 +656,6 @@ static int emit_bpf_tail_call(struct jit_ctx *ctx, int this_idx) return build_int_epilogue(ctx, MIPS_R_T9); } -static bool use_bbit_insns(void) -{ - switch (current_cpu_type()) { - case CPU_CAVIUM_OCTEON: - case CPU_CAVIUM_OCTEON_PLUS: - case CPU_CAVIUM_OCTEON2: - case CPU_CAVIUM_OCTEON3: - return true; - default: - return false; - } -} - static bool is_bad_offset(int b_off) { return b_off > 0x1ffff || b_off < -0x20000; @@ -682,6 +670,7 @@ static 
int build_one_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, unsigned int target; u64 t64; s64 t64s; + int bpf_op = BPF_OP(insn->code); switch (insn->code) { case BPF_ALU64 | BPF_ADD | BPF_K: /* ALU64_IMM */ @@ -770,13 +759,13 @@ static int build_one_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, emit_instr(ctx, sll, dst, dst, 0); if (insn->imm == 1) { /* div by 1 is a nop, mod by 1 is zero */ - if (BPF_OP(insn->code) == BPF_MOD) + if (bpf_op == BPF_MOD) emit_instr(ctx, addu, dst, MIPS_R_ZERO, MIPS_R_ZERO); break; } gen_imm_to_reg(insn, MIPS_R_AT, ctx); emit_instr(ctx, divu, dst, MIPS_R_AT); - if (BPF_OP(insn->code) == BPF_DIV) + if (bpf_op == BPF_DIV) emit_instr(ctx, mflo, dst); else emit_instr(ctx, mfhi, dst); @@ -798,13 +787,13 @@ static int build_one_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, if (insn->imm == 1) { /* div by 1 is a nop, mod by 1 is zero */ - if (BPF_OP(insn->code) == BPF_MOD) + if (bpf_op == BPF_MOD) emit_instr(ctx, addu, dst, MIPS_R_ZERO, MIPS_R_ZERO); break; } gen_imm_to_reg(insn, MIPS_R_AT, ctx); emit_instr(ctx, ddivu, dst, MIPS_R_AT); - if (BPF_OP(insn->code) == BPF_DIV) + if (bpf_op == BPF_DIV) emit_instr(ctx, mflo, dst); else emit_instr(ctx, mfhi, dst); @@ -829,7 +818,7 @@ static int build_one_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, emit_instr(ctx, dinsu, dst, MIPS_R_ZERO, 32, 32); did_move = false; if (insn->src_reg == BPF_REG_10) { - if (BPF_OP(insn->code) == BPF_MOV) { + if (bpf_op == BPF_MOV) { emit_instr(ctx, daddiu, dst, MIPS_R_SP, MAX_BPF_STACK); did_move = true; } else { @@ -839,7 +828,7 @@ static int build_one_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, } else if (get_reg_val_type(ctx, this_idx, insn->src_reg) == REG_32BIT) { int tmp_reg = MIPS_R_AT; - if (BPF_OP(insn->code) == BPF_MOV) { + if (bpf_op == BPF_MOV) { tmp_reg = dst; did_move = true; } @@ -847,7 +836,7 @@ static int build_one_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, emit_instr(ctx, dinsu, tmp_reg, MIPS_R_ZERO, 32, 32); src = MIPS_R_AT; } - switch (BPF_OP(insn->code)) { + switch (bpf_op) { case BPF_MOV: if (!did_move) emit_instr(ctx, daddu, dst, src, MIPS_R_ZERO); @@ -879,7 +868,7 @@ static int build_one_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, emit_instr(ctx, beq, src, MIPS_R_ZERO, b_off); emit_instr(ctx, movz, MIPS_R_V0, MIPS_R_ZERO, src); emit_instr(ctx, ddivu, dst, src); - if (BPF_OP(insn->code) == BPF_DIV) + if (bpf_op == BPF_DIV) emit_instr(ctx, mflo, dst); else emit_instr(ctx, mfhi, dst); @@ -923,7 +912,7 @@ static int build_one_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, if (ts == REG_64BIT || ts == REG_32BIT_ZERO_EX) { int tmp_reg = MIPS_R_AT; - if (BPF_OP(insn->code) == BPF_MOV) { + if (bpf_op == BPF_MOV) { tmp_reg = dst; did_move = true; } @@ -931,7 +920,7 @@ static int build_one_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, emit_instr(ctx, sll, tmp_reg, src, 0); src = MIPS_R_AT; } - switch (BPF_OP(insn->code)) { + switch (bpf_op) { case BPF_MOV: if (!did_move) emit_instr(ctx, addu, dst, src, MIPS_R_ZERO); @@ -962,7 +951,7 @@ static int build_one_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, emit_instr(ctx, beq, src, MIPS_R_ZERO, b_off); emit_instr(ctx, movz, MIPS_R_V0, MIPS_R_ZERO, src); emit_instr(ctx, divu, dst, src); - if (BPF_OP(insn->code) == BPF_DIV) + if (bpf_op == BPF_DIV) emit_instr(ctx, mflo, dst); else emit_instr(ctx, mfhi, dst); @@ -989,7 +978,7 @@ static int build_one_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, break; case BPF_JMP | BPF_JEQ | BPF_K: 
/* JMP_IMM */ case BPF_JMP | BPF_JNE | BPF_K: /* JMP_IMM */ - cmp_eq = (BPF_OP(insn->code) == BPF_JEQ); + cmp_eq = (bpf_op == BPF_JEQ); dst = ebpf_to_mips_reg(ctx, insn, dst_reg_fp_ok); if (dst < 0) return dst; @@ -1002,8 +991,12 @@ static int build_one_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, goto jeq_common; case BPF_JMP | BPF_JEQ | BPF_X: /* JMP_REG */ case BPF_JMP | BPF_JNE | BPF_X: + case BPF_JMP | BPF_JSLT | BPF_X: + case BPF_JMP | BPF_JSLE | BPF_X: case BPF_JMP | BPF_JSGT | BPF_X: case BPF_JMP | BPF_JSGE | BPF_X: + case BPF_JMP | BPF_JLT | BPF_X: + case BPF_JMP | BPF_JLE | BPF_X: case BPF_JMP | BPF_JGT | BPF_X: case BPF_JMP | BPF_JGE | BPF_X: case BPF_JMP | BPF_JSET | BPF_X: @@ -1020,33 +1013,39 @@ static int build_one_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, emit_instr(ctx, sll, MIPS_R_AT, dst, 0); dst = MIPS_R_AT; } - if (BPF_OP(insn->code) == BPF_JSET) { + if (bpf_op == BPF_JSET) { emit_instr(ctx, and, MIPS_R_AT, dst, src); cmp_eq = false; dst = MIPS_R_AT; src = MIPS_R_ZERO; - } else if (BPF_OP(insn->code) == BPF_JSGT) { + } else if (bpf_op == BPF_JSGT || bpf_op == BPF_JSLE) { emit_instr(ctx, dsubu, MIPS_R_AT, dst, src); if ((insn + 1)->code == (BPF_JMP | BPF_EXIT) && insn->off == 1) { b_off = b_imm(exit_idx, ctx); if (is_bad_offset(b_off)) return -E2BIG; - emit_instr(ctx, blez, MIPS_R_AT, b_off); + if (bpf_op == BPF_JSGT) + emit_instr(ctx, blez, MIPS_R_AT, b_off); + else + emit_instr(ctx, bgtz, MIPS_R_AT, b_off); emit_instr(ctx, nop); return 2; /* We consumed the exit. */ } b_off = b_imm(this_idx + insn->off + 1, ctx); if (is_bad_offset(b_off)) return -E2BIG; - emit_instr(ctx, bgtz, MIPS_R_AT, b_off); + if (bpf_op == BPF_JSGT) + emit_instr(ctx, bgtz, MIPS_R_AT, b_off); + else + emit_instr(ctx, blez, MIPS_R_AT, b_off); emit_instr(ctx, nop); break; - } else if (BPF_OP(insn->code) == BPF_JSGE) { + } else if (bpf_op == BPF_JSGE || bpf_op == BPF_JSLT) { emit_instr(ctx, slt, MIPS_R_AT, dst, src); - cmp_eq = true; + cmp_eq = bpf_op == BPF_JSGE; dst = MIPS_R_AT; src = MIPS_R_ZERO; - } else if (BPF_OP(insn->code) == BPF_JGT) { + } else if (bpf_op == BPF_JGT || bpf_op == BPF_JLE) { /* dst or src could be AT */ emit_instr(ctx, dsubu, MIPS_R_T8, dst, src); emit_instr(ctx, sltu, MIPS_R_AT, dst, src); @@ -1054,16 +1053,16 @@ static int build_one_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, emit_instr(ctx, movz, MIPS_R_T9, MIPS_R_SP, MIPS_R_T8); emit_instr(ctx, movn, MIPS_R_T9, MIPS_R_ZERO, MIPS_R_T8); emit_instr(ctx, or, MIPS_R_AT, MIPS_R_T9, MIPS_R_AT); - cmp_eq = true; + cmp_eq = bpf_op == BPF_JGT; dst = MIPS_R_AT; src = MIPS_R_ZERO; - } else if (BPF_OP(insn->code) == BPF_JGE) { + } else if (bpf_op == BPF_JGE || bpf_op == BPF_JLT) { emit_instr(ctx, sltu, MIPS_R_AT, dst, src); - cmp_eq = true; + cmp_eq = bpf_op == BPF_JGE; dst = MIPS_R_AT; src = MIPS_R_ZERO; } else { /* JNE/JEQ case */ - cmp_eq = (BPF_OP(insn->code) == BPF_JEQ); + cmp_eq = (bpf_op == BPF_JEQ); } jeq_common: /* @@ -1122,7 +1121,9 @@ jeq_common: break; case BPF_JMP | BPF_JSGT | BPF_K: /* JMP_IMM */ case BPF_JMP | BPF_JSGE | BPF_K: /* JMP_IMM */ - cmp_eq = (BPF_OP(insn->code) == BPF_JSGE); + case BPF_JMP | BPF_JSLT | BPF_K: /* JMP_IMM */ + case BPF_JMP | BPF_JSLE | BPF_K: /* JMP_IMM */ + cmp_eq = (bpf_op == BPF_JSGE); dst = ebpf_to_mips_reg(ctx, insn, dst_reg_fp_ok); if (dst < 0) return dst; @@ -1132,65 +1133,92 @@ jeq_common: b_off = b_imm(exit_idx, ctx); if (is_bad_offset(b_off)) return -E2BIG; - if (cmp_eq) - emit_instr(ctx, bltz, dst, b_off); - else + switch (bpf_op) { + case BPF_JSGT: 
emit_instr(ctx, blez, dst, b_off); + break; + case BPF_JSGE: + emit_instr(ctx, bltz, dst, b_off); + break; + case BPF_JSLT: + emit_instr(ctx, bgez, dst, b_off); + break; + case BPF_JSLE: + emit_instr(ctx, bgtz, dst, b_off); + break; + } emit_instr(ctx, nop); return 2; /* We consumed the exit. */ } b_off = b_imm(this_idx + insn->off + 1, ctx); if (is_bad_offset(b_off)) return -E2BIG; - if (cmp_eq) - emit_instr(ctx, bgez, dst, b_off); - else + switch (bpf_op) { + case BPF_JSGT: emit_instr(ctx, bgtz, dst, b_off); + break; + case BPF_JSGE: + emit_instr(ctx, bgez, dst, b_off); + break; + case BPF_JSLT: + emit_instr(ctx, bltz, dst, b_off); + break; + case BPF_JSLE: + emit_instr(ctx, blez, dst, b_off); + break; + } emit_instr(ctx, nop); break; } /* * only "LT" compare available, so we must use imm + 1 - * to generate "GT" + * to generate "GT" and imm -1 to generate LE */ - t64s = insn->imm + (cmp_eq ? 0 : 1); + if (bpf_op == BPF_JSGT) + t64s = insn->imm + 1; + else if (bpf_op == BPF_JSLE) + t64s = insn->imm + 1; + else + t64s = insn->imm; + + cmp_eq = bpf_op == BPF_JSGT || bpf_op == BPF_JSGE; if (t64s >= S16_MIN && t64s <= S16_MAX) { emit_instr(ctx, slti, MIPS_R_AT, dst, (int)t64s); src = MIPS_R_AT; dst = MIPS_R_ZERO; - cmp_eq = true; goto jeq_common; } emit_const_to_reg(ctx, MIPS_R_AT, (u64)t64s); emit_instr(ctx, slt, MIPS_R_AT, dst, MIPS_R_AT); src = MIPS_R_AT; dst = MIPS_R_ZERO; - cmp_eq = true; goto jeq_common; case BPF_JMP | BPF_JGT | BPF_K: case BPF_JMP | BPF_JGE | BPF_K: - cmp_eq = (BPF_OP(insn->code) == BPF_JGE); + case BPF_JMP | BPF_JLT | BPF_K: + case BPF_JMP | BPF_JLE | BPF_K: + cmp_eq = (bpf_op == BPF_JGE); dst = ebpf_to_mips_reg(ctx, insn, dst_reg_fp_ok); if (dst < 0) return dst; /* * only "LT" compare available, so we must use imm + 1 - * to generate "GT" + * to generate "GT" and imm -1 to generate LE */ - t64s = (u64)(u32)(insn->imm) + (cmp_eq ? 
0 : 1); - if (t64s >= 0 && t64s <= S16_MAX) { - emit_instr(ctx, sltiu, MIPS_R_AT, dst, (int)t64s); - src = MIPS_R_AT; - dst = MIPS_R_ZERO; - cmp_eq = true; - goto jeq_common; - } + if (bpf_op == BPF_JGT) + t64s = (u64)(u32)(insn->imm) + 1; + else if (bpf_op == BPF_JLE) + t64s = (u64)(u32)(insn->imm) + 1; + else + t64s = (u64)(u32)(insn->imm); + + cmp_eq = bpf_op == BPF_JGT || bpf_op == BPF_JGE; + emit_const_to_reg(ctx, MIPS_R_AT, (u64)t64s); emit_instr(ctx, sltu, MIPS_R_AT, dst, MIPS_R_AT); src = MIPS_R_AT; dst = MIPS_R_ZERO; - cmp_eq = true; goto jeq_common; case BPF_JMP | BPF_JSET | BPF_K: /* JMP_IMM */ @@ -1198,7 +1226,7 @@ jeq_common: if (dst < 0) return dst; - if (use_bbit_insns() && hweight32((u32)insn->imm) == 1) { + if (ctx->use_bbit_insns && hweight32((u32)insn->imm) == 1) { if ((insn + 1)->code == (BPF_JMP | BPF_EXIT) && insn->off == 1) { b_off = b_imm(exit_idx, ctx); if (is_bad_offset(b_off)) @@ -1724,10 +1752,14 @@ static int reg_val_propagate_range(struct jit_ctx *ctx, u64 initial_rvt, case BPF_JEQ: case BPF_JGT: case BPF_JGE: + case BPF_JLT: + case BPF_JLE: case BPF_JSET: case BPF_JNE: case BPF_JSGT: case BPF_JSGE: + case BPF_JSLT: + case BPF_JSLE: if (follow_taken) { rvt[idx] |= RVT_BRANCH_TAKEN; idx += insn->off; @@ -1853,6 +1885,19 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) memset(&ctx, 0, sizeof(ctx)); + preempt_disable(); + switch (current_cpu_type()) { + case CPU_CAVIUM_OCTEON: + case CPU_CAVIUM_OCTEON_PLUS: + case CPU_CAVIUM_OCTEON2: + case CPU_CAVIUM_OCTEON3: + ctx.use_bbit_insns = 1; + break; + default: + ctx.use_bbit_insns = 0; + } + preempt_enable(); + ctx.offsets = kcalloc(prog->len + 1, sizeof(*ctx.offsets), GFP_KERNEL); if (ctx.offsets == NULL) goto out_err; diff --git a/arch/mn10300/include/asm/spinlock.h b/arch/mn10300/include/asm/spinlock.h index 9c7b8f7942d8..fe413b41df6c 100644 --- a/arch/mn10300/include/asm/spinlock.h +++ b/arch/mn10300/include/asm/spinlock.h @@ -26,11 +26,6 @@ #define arch_spin_is_locked(x) (*(volatile signed char *)(&(x)->slock) != 0) -static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - smp_cond_load_acquire(&lock->slock, !VAL); -} - static inline void arch_spin_unlock(arch_spinlock_t *lock) { asm volatile( diff --git a/arch/mn10300/include/uapi/asm/socket.h b/arch/mn10300/include/uapi/asm/socket.h index c710db354ff2..ac82a3f26dbf 100644 --- a/arch/mn10300/include/uapi/asm/socket.h +++ b/arch/mn10300/include/uapi/asm/socket.h @@ -102,4 +102,6 @@ #define SO_PEERGROUPS 59 +#define SO_ZEROCOPY 60 + #endif /* _ASM_SOCKET_H */ diff --git a/arch/openrisc/include/asm/futex.h b/arch/openrisc/include/asm/futex.h index 778087341977..8fed278a24b8 100644 --- a/arch/openrisc/include/asm/futex.h +++ b/arch/openrisc/include/asm/futex.h @@ -30,20 +30,10 @@ }) static inline int -futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) +arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; - pagefault_disable(); switch (op) { @@ -68,30 +58,9 @@ futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: - ret = (oldval == cmparg); - break; - case FUTEX_OP_CMP_NE: - ret = (oldval != cmparg); - break; - case 
FUTEX_OP_CMP_LT: - ret = (oldval < cmparg); - break; - case FUTEX_OP_CMP_GE: - ret = (oldval >= cmparg); - break; - case FUTEX_OP_CMP_LE: - ret = (oldval <= cmparg); - break; - case FUTEX_OP_CMP_GT: - ret = (oldval > cmparg); - break; - default: - ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index dda1f558ef35..13648519bd41 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -9,6 +9,9 @@ config PARISC select ARCH_WANT_FRAME_POINTERS select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_STRICT_KERNEL_RWX + select ARCH_HAS_UBSAN_SANITIZE_ALL + select ARCH_WANTS_UBSAN_NO_NULL + select ARCH_SUPPORTS_MEMORY_FAILURE select RTC_CLASS select RTC_DRV_GENERIC select INIT_ALL_POSSIBLE @@ -17,6 +20,12 @@ config PARISC select BUG select BUILDTIME_EXTABLE_SORT select HAVE_PERF_EVENTS + select HAVE_KERNEL_BZIP2 + select HAVE_KERNEL_GZIP + select HAVE_KERNEL_LZ4 + select HAVE_KERNEL_LZMA + select HAVE_KERNEL_LZO + select HAVE_KERNEL_XZ select GENERIC_ATOMIC64 if !64BIT select GENERIC_IRQ_PROBE select GENERIC_PCI_IOMAP diff --git a/arch/parisc/Makefile b/arch/parisc/Makefile index 75cb451b1f03..58fae5d2449d 100644 --- a/arch/parisc/Makefile +++ b/arch/parisc/Makefile @@ -24,15 +24,20 @@ KBUILD_DEFCONFIG := default_defconfig NM = sh $(srctree)/arch/parisc/nm CHECKFLAGS += -D__hppa__=1 LIBGCC = $(shell $(CC) $(KBUILD_CFLAGS) -print-libgcc-file-name) +export LIBGCC ifdef CONFIG_64BIT UTS_MACHINE := parisc64 CHECKFLAGS += -D__LP64__=1 -m64 CC_ARCHES = hppa64 +LD_BFD := elf64-hppa-linux else # 32-bit CC_ARCHES = hppa hppa2.0 hppa1.1 +LD_BFD := elf32-hppa-linux endif +export LD_BFD + ifneq ($(SUBARCH),$(UTS_MACHINE)) ifeq ($(CROSS_COMPILE),) CC_SUFFIXES = linux linux-gnu unknown-linux-gnu @@ -88,6 +93,8 @@ libs-y += arch/parisc/lib/ $(LIBGCC) drivers-$(CONFIG_OPROFILE) += arch/parisc/oprofile/ +boot := arch/parisc/boot + PALO := $(shell if (which palo 2>&1); then : ; \ elif [ -x /sbin/palo ]; then echo /sbin/palo; \ fi) @@ -116,11 +123,14 @@ INSTALL_TARGETS = zinstall install PHONY += bzImage $(BOOT_TARGETS) $(INSTALL_TARGETS) -bzImage zImage: vmlinuz +zImage: vmlinuz Image: vmlinux -vmlinuz: vmlinux - @gzip -cf -9 $< > $@ +bzImage: vmlinux + $(Q)$(MAKE) $(build)=$(boot) $(boot)/$@ + +vmlinuz: bzImage + $(OBJCOPY) $(boot)/bzImage $@ install: $(CONFIG_SHELL) $(src)/arch/parisc/install.sh \ diff --git a/arch/parisc/boot/.gitignore b/arch/parisc/boot/.gitignore new file mode 100644 index 000000000000..017d5912ad2d --- /dev/null +++ b/arch/parisc/boot/.gitignore @@ -0,0 +1,2 @@ +image +bzImage diff --git a/arch/parisc/boot/Makefile b/arch/parisc/boot/Makefile new file mode 100644 index 000000000000..cad68a584884 --- /dev/null +++ b/arch/parisc/boot/Makefile @@ -0,0 +1,26 @@ +# +# Makefile for the linux parisc-specific parts of the boot image creator. +# + +COMPILE_VERSION := __linux_compile_version_id__`hostname | \ + tr -c '[0-9A-Za-z]' '_'`__`date | \ + tr -c '[0-9A-Za-z]' '_'`_t + +ccflags-y := -DCOMPILE_VERSION=$(COMPILE_VERSION) -gstabs -I. 
+ +targets := image +targets += bzImage +subdir- := compressed + +$(obj)/image: vmlinux FORCE + $(call if_changed,objcopy) + +$(obj)/bzImage: $(obj)/compressed/vmlinux FORCE + $(call if_changed,objcopy) + +$(obj)/compressed/vmlinux: FORCE + $(Q)$(MAKE) $(build)=$(obj)/compressed $@ + +install: $(CONFIGURE) $(obj)/bzImage + sh -x $(srctree)/$(obj)/install.sh $(KERNELRELEASE) $(obj)/bzImage \ + System.map "$(INSTALL_PATH)" diff --git a/arch/parisc/boot/compressed/.gitignore b/arch/parisc/boot/compressed/.gitignore new file mode 100644 index 000000000000..ae06b9b4c02f --- /dev/null +++ b/arch/parisc/boot/compressed/.gitignore @@ -0,0 +1,3 @@ +sizes.h +vmlinux +vmlinux.lds diff --git a/arch/parisc/boot/compressed/Makefile b/arch/parisc/boot/compressed/Makefile new file mode 100644 index 000000000000..5450a11c9d10 --- /dev/null +++ b/arch/parisc/boot/compressed/Makefile @@ -0,0 +1,86 @@ +# +# linux/arch/parisc/boot/compressed/Makefile +# +# create a compressed self-extracting vmlinux image from the original vmlinux +# + +KCOV_INSTRUMENT := n +GCOV_PROFILE := n +UBSAN_SANITIZE := n + +targets := vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 +targets += vmlinux.bin.xz vmlinux.bin.lzma vmlinux.bin.lzo vmlinux.bin.lz4 +targets += misc.o piggy.o sizes.h head.o real2.o firmware.o + +KBUILD_CFLAGS := -D__KERNEL__ -O2 -DBOOTLOADER +KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING +KBUILD_CFLAGS += $(cflags-y) -fno-delete-null-pointer-checks +KBUILD_CFLAGS += -fno-PIE -mno-space-regs -mdisable-fpregs +ifndef CONFIG_64BIT +KBUILD_CFLAGS += -mfast-indirect-calls +endif + +OBJECTS += $(obj)/head.o $(obj)/real2.o $(obj)/firmware.o $(obj)/misc.o $(obj)/piggy.o + +# LDFLAGS_vmlinux := -X --whole-archive -e startup -T +LDFLAGS_vmlinux := -X -e startup --as-needed -T +$(obj)/vmlinux: $(obj)/vmlinux.lds $(OBJECTS) $(LIBGCC) + $(call if_changed,ld) + +sed-sizes := -e 's/^\([0-9a-fA-F]*\) . 
\(__bss_start\|_end\|parisc_kernel_start\)$$/\#define SZ\2 0x\1/p' + +quiet_cmd_sizes = GEN $@ + cmd_sizes = $(NM) $< | sed -n $(sed-sizes) > $@ + +$(obj)/sizes.h: vmlinux + $(call if_changed,sizes) + +AFLAGS_head.o += -I$(objtree)/$(obj) -DBOOTLOADER +$(obj)/head.o: $(obj)/sizes.h + +CFLAGS_misc.o += -I$(objtree)/$(obj) +$(obj)/misc.o: $(obj)/sizes.h + +$(obj)/firmware.o: $(obj)/firmware.c +$(obj)/firmware.c: $(srctree)/arch/$(SRCARCH)/kernel/firmware.c + $(call cmd,shipped) + +AFLAGS_real2.o += -DBOOTLOADER +$(obj)/real2.o: $(obj)/real2.S +$(obj)/real2.S: $(srctree)/arch/$(SRCARCH)/kernel/real2.S + $(call cmd,shipped) + +$(obj)/misc.o: $(obj)/sizes.h + +CPPFLAGS_vmlinux.lds += -I$(objtree)/$(obj) -DBOOTLOADER +$(obj)/vmlinux.lds: $(obj)/sizes.h + +OBJCOPYFLAGS_vmlinux.bin := -O binary -R .comment -S +$(obj)/vmlinux.bin: vmlinux + $(call if_changed,objcopy) + +vmlinux.bin.all-y := $(obj)/vmlinux.bin + +suffix-$(CONFIG_KERNEL_GZIP) := gz +suffix-$(CONFIG_KERNEL_BZIP2) := bz2 +suffix-$(CONFIG_KERNEL_LZ4) := lz4 +suffix-$(CONFIG_KERNEL_LZMA) := lzma +suffix-$(CONFIG_KERNEL_LZO) := lzo +suffix-$(CONFIG_KERNEL_XZ) := xz + +$(obj)/vmlinux.bin.gz: $(vmlinux.bin.all-y) + $(call if_changed,gzip) +$(obj)/vmlinux.bin.bz2: $(vmlinux.bin.all-y) + $(call if_changed,bzip2) +$(obj)/vmlinux.bin.lz4: $(vmlinux.bin.all-y) + $(call if_changed,lz4) +$(obj)/vmlinux.bin.lzma: $(vmlinux.bin.all-y) + $(call if_changed,lzma) +$(obj)/vmlinux.bin.lzo: $(vmlinux.bin.all-y) + $(call if_changed,lzo) +$(obj)/vmlinux.bin.xz: $(vmlinux.bin.all-y) + $(call if_changed,xzkern) + +LDFLAGS_piggy.o := -r --format binary --oformat $(LD_BFD) -T +$(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.$(suffix-y) + $(call if_changed,ld) diff --git a/arch/parisc/boot/compressed/head.S b/arch/parisc/boot/compressed/head.S new file mode 100644 index 000000000000..5aba20fa48aa --- /dev/null +++ b/arch/parisc/boot/compressed/head.S @@ -0,0 +1,85 @@ +/* + * Startup glue code to uncompress the kernel + * + * (C) 2017 Helge Deller <deller@gmx.de> + */ + +#include <linux/init.h> +#include <linux/linkage.h> +#include <asm/asm-offsets.h> +#include <asm/page.h> +#include <asm/psw.h> +#include <asm/pdc.h> +#include <asm/assembly.h> +#include "sizes.h" + +#define BOOTADDR(x) (x) + +#ifndef CONFIG_64BIT + .import $global$ /* forward declaration */ +#endif /*!CONFIG_64BIT*/ + + __HEAD + +ENTRY(startup) + .level LEVEL + +#define PSW_W_SM 0x200 +#define PSW_W_BIT 36 + + ;! nuke the W bit, saving original value + .level 2.0 + rsm PSW_W_SM, %r1 + + .level 1.1 + extrw,u %r1, PSW_W_BIT-32, 1, %r1 + copy %r1, %arg0 + + /* Make sure sr4-sr7 are set to zero for the kernel address space */ + mtsp %r0,%sr4 + mtsp %r0,%sr5 + mtsp %r0,%sr6 + mtsp %r0,%sr7 + + /* Clear BSS */ + + .import _bss,data + .import _ebss,data + + load32 BOOTADDR(_bss),%r3 + load32 BOOTADDR(_ebss),%r4 + ldo FRAME_SIZE(%r4),%sp /* stack at end of bss */ +$bss_loop: + cmpb,<<,n %r3,%r4,$bss_loop + stw,ma %r0,4(%r3) + + /* Initialize the global data pointer */ + loadgp + + /* arg0..arg4 were set by palo. 
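The sizes.h rule above runs $(NM) on the uncompressed vmlinux and keeps only the __bss_start, _end and parisc_kernel_start symbols; head.S, misc.c and the compressed linker script then include the generated header. Purely for illustration, with invented addresses, the output looks roughly like:

/* sizes.h (generated) - the addresses below are made up for illustration */
#define SZ__bss_start 0x0000000000a40000
#define SZ_end 0x0000000000b00000
#define SZparisc_kernel_start 0x0000000000100000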
*/ + copy %arg1, %r6 /* command line */ + copy %arg2, %r7 /* rd-start */ + copy %arg3, %r8 /* rd-end */ + load32 BOOTADDR(decompress_kernel),%r3 + +#ifdef CONFIG_64BIT + .level LEVEL + ssm PSW_W_SM, %r0 /* set W-bit */ + depdi 0, 31, 32, %r3 +#endif + load32 BOOTADDR(startup_continue), %r2 + bv,n 0(%r3) + +startup_continue: +#ifdef CONFIG_64BIT + .level LEVEL + rsm PSW_W_SM, %r0 /* clear W-bit */ +#endif + + load32 KERNEL_BINARY_TEXT_START, %arg0 /* free mem */ + copy %r6, %arg1 /* command line */ + copy %r7, %arg2 /* rd-start */ + copy %r8, %arg3 /* rd-end */ + + bv,n 0(%ret0) +END(startup) diff --git a/arch/parisc/boot/compressed/misc.c b/arch/parisc/boot/compressed/misc.c new file mode 100644 index 000000000000..13a4bf9ac4da --- /dev/null +++ b/arch/parisc/boot/compressed/misc.c @@ -0,0 +1,301 @@ +/* + * Definitions and wrapper functions for kernel decompressor + * + * (C) 2017 Helge Deller <deller@gmx.de> + */ + +#include <linux/uaccess.h> +#include <asm/unaligned.h> +#include <asm/page.h> +#include "sizes.h" + +/* + * gzip declarations + */ +#define STATIC static + +#undef memmove +#define memmove memmove +#define memzero(s, n) memset((s), 0, (n)) + +#define malloc malloc_gzip +#define free free_gzip + +/* Symbols defined by linker scripts */ +extern char input_data[]; +extern int input_len; +extern __le32 output_len; /* at unaligned address, little-endian */ +extern char _text, _end; +extern char _bss, _ebss; +extern char _startcode_end; +extern void startup_continue(void *entry, unsigned long cmdline, + unsigned long rd_start, unsigned long rd_end) __noreturn; + +void error(char *m) __noreturn; + +static unsigned long free_mem_ptr; +static unsigned long free_mem_end_ptr; + +#ifdef CONFIG_KERNEL_GZIP +#include "../../../../lib/decompress_inflate.c" +#endif + +#ifdef CONFIG_KERNEL_BZIP2 +#include "../../../../lib/decompress_bunzip2.c" +#endif + +#ifdef CONFIG_KERNEL_LZ4 +#include "../../../../lib/decompress_unlz4.c" +#endif + +#ifdef CONFIG_KERNEL_LZMA +#include "../../../../lib/decompress_unlzma.c" +#endif + +#ifdef CONFIG_KERNEL_LZO +#include "../../../../lib/decompress_unlzo.c" +#endif + +#ifdef CONFIG_KERNEL_XZ +#include "../../../../lib/decompress_unxz.c" +#endif + +void *memmove(void *dest, const void *src, size_t n) +{ + const char *s = src; + char *d = dest; + + if (d <= s) { + while (n--) + *d++ = *s++; + } else { + d += n; + s += n; + while (n--) + *--d = *--s; + } + return dest; +} + +void *memset(void *s, int c, size_t count) +{ + char *xs = (char *)s; + + while (count--) + *xs++ = c; + return s; +} + +void *memcpy(void *d, const void *s, size_t len) +{ + char *dest = (char *)d; + const char *source = (const char *)s; + + while (len--) + *dest++ = *source++; + return d; +} + +size_t strlen(const char *s) +{ + const char *sc; + + for (sc = s; *sc != '\0'; ++sc) + ; + return sc - s; +} + +char *strchr(const char *s, int c) +{ + while (*s) { + if (*s == (char)c) + return (char *)s; + ++s; + } + return NULL; +} + +int puts(const char *s) +{ + const char *nuline = s; + + while ((nuline = strchr(s, '\n')) != NULL) { + if (nuline != s) + pdc_iodc_print(s, nuline - s); + pdc_iodc_print("\r\n", 2); + s = nuline + 1; + } + if (*s != '\0') + pdc_iodc_print(s, strlen(s)); + + return 0; +} + +static int putchar(int c) +{ + char buf[2]; + + buf[0] = c; + buf[1] = '\0'; + puts(buf); + return c; +} + +void __noreturn error(char *x) +{ + puts("\n\n"); + puts(x); + puts("\n\n -- System halted"); + while (1) /* wait forever */ + ; +} + +static int print_hex(unsigned long num) +{ + const char 
hex[] = "0123456789abcdef"; + char str[40]; + int i = sizeof(str)-1; + + str[i--] = '\0'; + do { + str[i--] = hex[num & 0x0f]; + num >>= 4; + } while (num); + + str[i--] = 'x'; + str[i] = '0'; + puts(&str[i]); + + return 0; +} + +int printf(const char *fmt, ...) +{ + va_list args; + int i = 0; + + va_start(args, fmt); + + while (fmt[i]) { + if (fmt[i] != '%') { +put: + putchar(fmt[i++]); + continue; + } + + if (fmt[++i] == '%') + goto put; + ++i; + print_hex(va_arg(args, unsigned long)); + } + + va_end(args); + return 0; +} + +/* helper functions for libgcc */ +void abort(void) +{ + error("aborted."); +} + +#undef malloc +void *malloc(size_t size) +{ + return malloc_gzip(size); +} + +#undef free +void free(void *ptr) +{ + return free_gzip(ptr); +} + + +static void flush_data_cache(char *start, unsigned long length) +{ + char *end = start + length; + + do { + asm volatile("fdc 0(%0)" : : "r" (start)); + asm volatile("fic 0(%%sr0,%0)" : : "r" (start)); + start += 16; + } while (start < end); + asm volatile("fdc 0(%0)" : : "r" (end)); + + asm ("sync"); +} + +unsigned long decompress_kernel(unsigned int started_wide, + unsigned int command_line, + const unsigned int rd_start, + const unsigned int rd_end) +{ + char *output; + unsigned long len, len_all; + +#ifdef CONFIG_64BIT + parisc_narrow_firmware = 0; +#endif + + set_firmware_width_unlocked(); + + putchar('U'); /* if you get this p and no more, string storage */ + /* in $GLOBAL$ is wrong or %dp is wrong */ + puts("ncompressing ...\n"); + + output = (char *) KERNEL_BINARY_TEXT_START; + len_all = __pa(SZ_end) - __pa(SZparisc_kernel_start); + + if ((unsigned long) &_startcode_end > (unsigned long) output) + error("Bootcode overlaps kernel code"); + + len = get_unaligned_le32(&output_len); + if (len > len_all) + error("Output len too big."); + else + memset(&output[len], 0, len_all - len); + + /* + * Initialize free_mem_ptr and free_mem_end_ptr. 
+ */ + free_mem_ptr = (unsigned long) &_ebss; + free_mem_ptr += 2*1024*1024; /* leave 2 MB for stack */ + + /* Limit memory for bootoader to 1GB */ + #define ARTIFICIAL_LIMIT (1*1024*1024*1024) + free_mem_end_ptr = PAGE0->imm_max_mem; + if (free_mem_end_ptr > ARTIFICIAL_LIMIT) + free_mem_end_ptr = ARTIFICIAL_LIMIT; + +#ifdef CONFIG_BLK_DEV_INITRD + /* if we have ramdisk this is at end of memory */ + if (rd_start && rd_start < free_mem_end_ptr) + free_mem_end_ptr = rd_start; +#endif + +#ifdef DEBUG + printf("startcode_end = %x\n", &_startcode_end); + printf("commandline = %x\n", command_line); + printf("rd_start = %x\n", rd_start); + printf("rd_end = %x\n", rd_end); + + printf("free_ptr = %x\n", free_mem_ptr); + printf("free_ptr_end = %x\n", free_mem_end_ptr); + + printf("input_data = %x\n", input_data); + printf("input_len = %x\n", input_len); + printf("output = %x\n", output); + printf("output_len = %x\n", len); + printf("output_max = %x\n", len_all); +#endif + + __decompress(input_data, input_len, NULL, NULL, + output, 0, NULL, error); + + flush_data_cache(output, len); + + printf("Booting kernel ...\n\n"); + + return (unsigned long) output; +} diff --git a/arch/parisc/boot/compressed/vmlinux.lds.S b/arch/parisc/boot/compressed/vmlinux.lds.S new file mode 100644 index 000000000000..a4ce3314e78e --- /dev/null +++ b/arch/parisc/boot/compressed/vmlinux.lds.S @@ -0,0 +1,101 @@ +#include <asm-generic/vmlinux.lds.h> +#include <asm/page.h> +#include "sizes.h" + +#ifndef CONFIG_64BIT +OUTPUT_FORMAT("elf32-hppa-linux") +OUTPUT_ARCH(hppa) +#else +OUTPUT_FORMAT("elf64-hppa-linux") +OUTPUT_ARCH(hppa:hppa2.0w) +#endif + +ENTRY(startup) + +SECTIONS +{ + /* palo loads at 0x60000 */ + /* loaded kernel will move to 0x10000 */ + . = 0xe0000; /* should not overwrite palo code */ + + .head.text : { + _head = . ; + HEAD_TEXT + _ehead = . ; + } + + /* keep __gp below 0x1000000 */ +#ifdef CONFIG_64BIT + . = ALIGN(16); + /* Linkage tables */ + .opd : { + *(.opd) + } PROVIDE (__gp = .); + .plt : { + *(.plt) + } + .dlt : { + *(.dlt) + } +#endif + _startcode_end = .; + + /* bootloader code and data starts behind area of extracted kernel */ + . = (SZ_end - SZparisc_kernel_start + KERNEL_BINARY_TEXT_START); + + /* align on next page boundary */ + . = ALIGN(4096); + .text : { + _text = .; /* Text */ + *(.text) + *(.text.*) + _etext = . ; + } + . = ALIGN(8); + .data : { + _data = . ; + *(.data) + *(.data.*) + _edata = . ; + } + . = ALIGN(8); + .rodata : { + _rodata = . ; + *(.rodata) /* read-only data */ + *(.rodata.*) + _erodata = . ; + } + . = ALIGN(8); + .rodata.compressed : { + *(.rodata.compressed) + } + . = ALIGN(8); + .bss : { + _bss = . ; + *(.bss) + *(.bss.*) + *(COMMON) + . = ALIGN(4096); + _ebss = .; + } + + STABS_DEBUG + .note 0 : { *(.note) } + + /* Sections to be discarded */ + DISCARDS + /DISCARD/ : { +#ifdef CONFIG_64BIT + /* temporary hack until binutils is fixed to not emit these + * for static binaries + */ + *(.PARISC.unwind) /* no unwind data */ + *(.interp) + *(.dynsym) + *(.dynstr) + *(.dynamic) + *(.hash) + *(.gnu.hash) +#endif + } +} diff --git a/arch/parisc/boot/compressed/vmlinux.scr b/arch/parisc/boot/compressed/vmlinux.scr new file mode 100644 index 000000000000..dac2d142bcfa --- /dev/null +++ b/arch/parisc/boot/compressed/vmlinux.scr @@ -0,0 +1,10 @@ +SECTIONS +{ + .rodata.compressed : { + input_len = .; + LONG(input_data_end - input_data) input_data = .; + *(.data) + output_len = . 
- 4; /* can be at unaligned address */ + input_data_end = .; + } +} diff --git a/arch/parisc/boot/install.sh b/arch/parisc/boot/install.sh new file mode 100644 index 000000000000..8f7c365fad83 --- /dev/null +++ b/arch/parisc/boot/install.sh @@ -0,0 +1,65 @@ +#!/bin/sh +# +# arch/parisc/install.sh, derived from arch/i386/boot/install.sh +# +# This file is subject to the terms and conditions of the GNU General Public +# License. See the file "COPYING" in the main directory of this archive +# for more details. +# +# Copyright (C) 1995 by Linus Torvalds +# +# Adapted from code in arch/i386/boot/Makefile by H. Peter Anvin +# +# "make install" script for i386 architecture +# +# Arguments: +# $1 - kernel version +# $2 - kernel image file +# $3 - kernel map file +# $4 - default install path (blank if root directory) +# + +verify () { + if [ ! -f "$1" ]; then + echo "" 1>&2 + echo " *** Missing file: $1" 1>&2 + echo ' *** You need to run "make" before "make install".' 1>&2 + echo "" 1>&2 + exit 1 + fi +} + +# Make sure the files actually exist + +verify "$2" +verify "$3" + +# User may have a custom install script + +if [ -n "${INSTALLKERNEL}" ]; then + if [ -x ~/bin/${INSTALLKERNEL} ]; then exec ~/bin/${INSTALLKERNEL} "$@"; fi + if [ -x /sbin/${INSTALLKERNEL} ]; then exec /sbin/${INSTALLKERNEL} "$@"; fi +fi + +# Default install + +if [ "$(basename $2)" = "zImage" ]; then +# Compressed install + echo "Installing compressed kernel" + base=vmlinuz +else +# Normal install + echo "Installing normal kernel" + base=vmlinux +fi + +if [ -f $4/$base-$1 ]; then + mv $4/$base-$1 $4/$base-$1.old +fi +cat $2 > $4/$base-$1 + +# Install system map file +if [ -f $4/System.map-$1 ]; then + mv $4/System.map-$1 $4/System.map-$1.old +fi +cp $3 $4/System.map-$1 diff --git a/arch/parisc/configs/c3000_defconfig b/arch/parisc/configs/c3000_defconfig index 0764d3971cf6..8d41a73bd71b 100644 --- a/arch/parisc/configs/c3000_defconfig +++ b/arch/parisc/configs/c3000_defconfig @@ -31,7 +31,6 @@ CONFIG_IP_PNP_BOOTP=y CONFIG_INET6_IPCOMP=m CONFIG_IPV6_TUNNEL=m CONFIG_NETFILTER=y -CONFIG_NETFILTER_DEBUG=y CONFIG_NET_PKTGEN=m CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" CONFIG_DEVTMPFS=y diff --git a/arch/parisc/include/asm/atomic.h b/arch/parisc/include/asm/atomic.h index 5394b9c5f914..17b98a87e5e2 100644 --- a/arch/parisc/include/asm/atomic.h +++ b/arch/parisc/include/asm/atomic.h @@ -65,6 +65,8 @@ static __inline__ void atomic_set(atomic_t *v, int i) _atomic_spin_unlock_irqrestore(v, flags); } +#define atomic_set_release(v, i) atomic_set((v), (i)) + static __inline__ int atomic_read(const atomic_t *v) { return READ_ONCE((v)->counter); diff --git a/arch/parisc/include/asm/futex.h b/arch/parisc/include/asm/futex.h index 0ba14300cd8e..c601aab2fb36 100644 --- a/arch/parisc/include/asm/futex.h +++ b/arch/parisc/include/asm/futex.h @@ -32,22 +32,12 @@ _futex_spin_unlock_irqrestore(u32 __user *uaddr, unsigned long int *flags) } static inline int -futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) +arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr) { unsigned long int flags; - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval, ret; u32 tmp; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(*uaddr))) - return -EFAULT; - _futex_spin_lock_irqsave(uaddr, &flags); pagefault_disable(); @@ -85,17 +75,9 @@ out_pagefault_enable: 
pagefault_enable(); _futex_spin_unlock_irqrestore(uaddr, &flags); - if (ret == 0) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } diff --git a/arch/parisc/include/asm/mmu_context.h b/arch/parisc/include/asm/mmu_context.h index a81226257878..e4a657094058 100644 --- a/arch/parisc/include/asm/mmu_context.h +++ b/arch/parisc/include/asm/mmu_context.h @@ -63,6 +63,9 @@ static inline void switch_mm(struct mm_struct *prev, { unsigned long flags; + if (prev == next) + return; + local_irq_save(flags); switch_mm_irqs_off(prev, next, tsk); local_irq_restore(flags); diff --git a/arch/parisc/include/asm/page.h b/arch/parisc/include/asm/page.h index 80e742a1c162..bfed09d80bae 100644 --- a/arch/parisc/include/asm/page.h +++ b/arch/parisc/include/asm/page.h @@ -116,11 +116,15 @@ extern int npmem_ranges; /* This governs the relationship between virtual and physical addresses. * If you alter it, make sure to take care of our various fixed mapping * segments in fixmap.h */ +#if defined(BOOTLOADER) +#define __PAGE_OFFSET (0) /* bootloader uses physical addresses */ +#else #ifdef CONFIG_64BIT #define __PAGE_OFFSET (0x40000000) /* 1GB */ #else #define __PAGE_OFFSET (0x10000000) /* 256MB */ #endif +#endif /* BOOTLOADER */ #define PAGE_OFFSET ((unsigned long)__PAGE_OFFSET) diff --git a/arch/parisc/include/asm/pdc.h b/arch/parisc/include/asm/pdc.h index 7569627a032b..26b4455baa83 100644 --- a/arch/parisc/include/asm/pdc.h +++ b/arch/parisc/include/asm/pdc.h @@ -5,6 +5,8 @@ #if !defined(__ASSEMBLY__) +extern int parisc_narrow_firmware; + extern int pdc_type; extern unsigned long parisc_cell_num; /* cell number the CPU runs on (PAT) */ extern unsigned long parisc_cell_loc; /* cell location of CPU (PAT) */ diff --git a/arch/parisc/include/asm/pdcpat.h b/arch/parisc/include/asm/pdcpat.h index e3c0586260d8..a468a172ee33 100644 --- a/arch/parisc/include/asm/pdcpat.h +++ b/arch/parisc/include/asm/pdcpat.h @@ -223,6 +223,18 @@ struct pdc_pat_mem_retinfo { /* PDC_PAT_MEM/PDC_PAT_MEM_PD_INFO (return info) */ unsigned long clear_time; /* last PDT clear time (since Jan 1970) */ }; +struct pdc_pat_mem_cell_pdt_retinfo { /* PDC_PAT_MEM/PDC_PAT_MEM_CELL_INFO */ + u64 reserved:32; + u64 cs:1; /* clear status: cleared since the last call? */ + u64 current_pdt_entries:15; + u64 ic:1; /* interleaving had to be changed ? 
*/ + u64 max_pdt_entries:15; + unsigned long good_mem; + unsigned long first_dbe_loc; /* first location of double bit error */ + unsigned long clear_time; /* last PDT clear time (since Jan 1970) */ +}; + + struct pdc_pat_mem_read_pd_retinfo { /* PDC_PAT_MEM/PDC_PAT_MEM_PD_READ */ unsigned long actual_count_bytes; unsigned long pdt_entries; @@ -325,6 +337,8 @@ extern int pdc_pat_io_pci_cfg_read(unsigned long pci_addr, int pci_size, u32 *va extern int pdc_pat_io_pci_cfg_write(unsigned long pci_addr, int pci_size, u32 val); extern int pdc_pat_mem_pdt_info(struct pdc_pat_mem_retinfo *rinfo); +extern int pdc_pat_mem_pdt_cell_info(struct pdc_pat_mem_cell_pdt_retinfo *rinfo, + unsigned long cell); extern int pdc_pat_mem_read_cell_pdt(struct pdc_pat_mem_read_pd_retinfo *pret, unsigned long *pdt_entries_ptr, unsigned long max_entries); extern int pdc_pat_mem_read_pd_pdt(struct pdc_pat_mem_read_pd_retinfo *pret, diff --git a/arch/parisc/include/asm/spinlock.h b/arch/parisc/include/asm/spinlock.h index e32936cd7f10..55bfe4affca3 100644 --- a/arch/parisc/include/asm/spinlock.h +++ b/arch/parisc/include/asm/spinlock.h @@ -14,13 +14,6 @@ static inline int arch_spin_is_locked(arch_spinlock_t *x) #define arch_spin_lock(lock) arch_spin_lock_flags(lock, 0) -static inline void arch_spin_unlock_wait(arch_spinlock_t *x) -{ - volatile unsigned int *a = __ldcw_align(x); - - smp_cond_load_acquire(a, VAL); -} - static inline void arch_spin_lock_flags(arch_spinlock_t *x, unsigned long flags) { diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h index 5979745815a5..775b5d5e41a1 100644 --- a/arch/parisc/include/uapi/asm/mman.h +++ b/arch/parisc/include/uapi/asm/mman.h @@ -40,9 +40,6 @@ #define MADV_SEQUENTIAL 2 /* expect sequential page references */ #define MADV_WILLNEED 3 /* will need these pages */ #define MADV_DONTNEED 4 /* don't need these pages */ -#define MADV_SPACEAVAIL 5 /* insure that resources are reserved */ -#define MADV_VPS_PURGE 6 /* Purge pages from VM page cache */ -#define MADV_VPS_INHERIT 7 /* Inherit parents page size */ /* common/generic parameters */ #define MADV_FREE 8 /* free pages only if memory pressure */ @@ -60,21 +57,16 @@ overrides the coredump filter bits */ #define MADV_DODUMP 70 /* Clear the MADV_NODUMP flag */ +#define MADV_WIPEONFORK 71 /* Zero memory on fork, child only */ +#define MADV_KEEPONFORK 72 /* Undo MADV_WIPEONFORK */ + +#define MADV_HWPOISON 100 /* poison a page for testing */ +#define MADV_SOFT_OFFLINE 101 /* soft offline page for testing */ + /* compatibility flags */ #define MAP_FILE 0 #define MAP_VARIABLE 0 -/* - * When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size. - * This gives us 6 bits, which is enough until someone invents 128 bit address - * spaces. - * - * Assume these are all power of twos. - * When 0 use the default page size. 
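MADV_WIPEONFORK (71) and MADV_KEEPONFORK (72) added above give parisc the new generic madvise flags: a mapping marked MADV_WIPEONFORK reads back as zeroes in the child after fork(), which is intended for per-process secrets such as PRNG state. A minimal userspace sketch (error handling trimmed, and it assumes a libc that already exposes the MADV_WIPEONFORK constant):

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	size_t len = 4096;
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	strcpy(p, "parent secret");
	madvise(p, len, MADV_WIPEONFORK);	/* child will see zeroes */

	if (fork() == 0) {
		printf("child : '%s'\n", p);	/* prints '' */
		_exit(0);
	}
	wait(NULL);
	printf("parent: '%s'\n", p);		/* still 'parent secret' */
	return 0;
}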
- */ -#define MAP_HUGE_SHIFT 26 -#define MAP_HUGE_MASK 0x3f - #define PKEY_DISABLE_ACCESS 0x1 #define PKEY_DISABLE_WRITE 0x2 #define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\ diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h index a0d4dc9f4eb2..3b2bf7ae703b 100644 --- a/arch/parisc/include/uapi/asm/socket.h +++ b/arch/parisc/include/uapi/asm/socket.h @@ -101,4 +101,6 @@ #define SO_PEERGROUPS 0x4034 +#define SO_ZEROCOPY 0x4035 + #endif /* _UAPI_ASM_SOCKET_H */ diff --git a/arch/parisc/kernel/firmware.c b/arch/parisc/kernel/firmware.c index f622a311d04a..ab80e5c6f651 100644 --- a/arch/parisc/kernel/firmware.c +++ b/arch/parisc/kernel/firmware.c @@ -69,7 +69,15 @@ #include <asm/pdcpat.h> #include <asm/processor.h> /* for boot_cpu_data */ +#if defined(BOOTLOADER) +# undef spin_lock_irqsave +# define spin_lock_irqsave(a, b) { b = 1; } +# undef spin_unlock_irqrestore +# define spin_unlock_irqrestore(a, b) +#else static DEFINE_SPINLOCK(pdc_lock); +#endif + extern unsigned long pdc_result[NUM_PDC_RESULT]; extern unsigned long pdc_result2[NUM_PDC_RESULT]; @@ -142,8 +150,8 @@ static void convert_to_wide(unsigned long *addr) int i; unsigned int *p = (unsigned int *)addr; - if(unlikely(parisc_narrow_firmware)) { - for(i = 31; i >= 0; --i) + if (unlikely(parisc_narrow_firmware)) { + for (i = (NUM_PDC_RESULT-1); i >= 0; --i) addr[i] = p[i]; } #endif @@ -186,6 +194,8 @@ void set_firmware_width(void) } #endif /*CONFIG_64BIT*/ + +#if !defined(BOOTLOADER) /** * pdc_emergency_unlock - Unlock the linux pdc lock * @@ -979,16 +989,22 @@ int pdc_mem_pdt_read_entries(struct pdc_mem_read_pdt *pret, spin_lock_irqsave(&pdc_lock, flags); retval = mem_pdc_call(PDC_MEM, PDC_MEM_READ_PDT, __pa(pdc_result), - __pa(pdc_result2)); + __pa(pdt_entries_ptr)); if (retval == PDC_OK) { convert_to_wide(pdc_result); memcpy(pret, pdc_result, sizeof(*pret)); - convert_to_wide(pdc_result2); - memcpy(pdt_entries_ptr, pdc_result2, - pret->pdt_entries * sizeof(*pdt_entries_ptr)); } spin_unlock_irqrestore(&pdc_lock, flags); +#ifdef CONFIG_64BIT + /* + * 64-bit kernels should not call this PDT function in narrow mode. + * The pdt_entries_ptr array above will now contain 32-bit values + */ + if (WARN_ON_ONCE((retval == PDC_OK) && parisc_narrow_firmware)) + return PDC_ERROR; +#endif + return retval; } @@ -1143,6 +1159,8 @@ void pdc_io_reset_devices(void) spin_unlock_irqrestore(&pdc_lock, flags); } +#endif /* defined(BOOTLOADER) */ + /* locked by pdc_console_lock */ static int __attribute__((aligned(8))) iodc_retbuf[32]; static char __attribute__((aligned(64))) iodc_dbuf[4096]; @@ -1187,6 +1205,7 @@ print: return i; } +#if !defined(BOOTLOADER) /** * pdc_iodc_getc - Read a character (non-blocking) from the PDC console. 
* @@ -1440,6 +1459,29 @@ int pdc_pat_mem_pdt_info(struct pdc_pat_mem_retinfo *rinfo) } /** + * pdc_pat_mem_pdt_cell_info - Retrieve information about page deallocation + * table of a cell + * @rinfo: memory pdt information + * @cell: cell number + * + */ +int pdc_pat_mem_pdt_cell_info(struct pdc_pat_mem_cell_pdt_retinfo *rinfo, + unsigned long cell) +{ + int retval; + unsigned long flags; + + spin_lock_irqsave(&pdc_lock, flags); + retval = mem_pdc_call(PDC_PAT_MEM, PDC_PAT_MEM_CELL_INFO, + __pa(&pdc_result), cell); + if (retval == PDC_OK) + memcpy(rinfo, &pdc_result, sizeof(*rinfo)); + spin_unlock_irqrestore(&pdc_lock, flags); + + return retval; +} + +/** * pdc_pat_mem_read_cell_pdt - Read PDT entries from (old) PAT firmware * @pret: array of PDT entries * @pdt_entries_ptr: ptr to hold number of PDT entries @@ -1455,14 +1497,14 @@ int pdc_pat_mem_read_cell_pdt(struct pdc_pat_mem_read_pd_retinfo *pret, spin_lock_irqsave(&pdc_lock, flags); /* PDC_PAT_MEM_CELL_READ is available on early PAT machines only */ retval = mem_pdc_call(PDC_PAT_MEM, PDC_PAT_MEM_CELL_READ, - __pa(&pdc_result), parisc_cell_num, __pa(&pdc_result2)); + __pa(&pdc_result), parisc_cell_num, + __pa(pdt_entries_ptr)); if (retval == PDC_OK) { /* build up return value as for PDC_PAT_MEM_PD_READ */ entries = min(pdc_result[0], max_entries); pret->pdt_entries = entries; pret->actual_count_bytes = entries * sizeof(unsigned long); - memcpy(pdt_entries_ptr, &pdc_result2, pret->actual_count_bytes); } spin_unlock_irqrestore(&pdc_lock, flags); @@ -1474,6 +1516,8 @@ int pdc_pat_mem_read_cell_pdt(struct pdc_pat_mem_read_pd_retinfo *pret, * pdc_pat_mem_read_pd_pdt - Read PDT entries from (newer) PAT firmware * @pret: array of PDT entries * @pdt_entries_ptr: ptr to hold number of PDT entries + * @count: number of bytes to read + * @offset: offset to start (in bytes) * */ int pdc_pat_mem_read_pd_pdt(struct pdc_pat_mem_read_pd_retinfo *pret, @@ -1524,6 +1568,7 @@ int pdc_pat_mem_get_dimm_phys_location( return retval; } #endif /* CONFIG_64BIT */ +#endif /* defined(BOOTLOADER) */ /***************** 32-bit real-mode calls ***********/ @@ -1633,4 +1678,3 @@ long real64_call(unsigned long fn, ...) } #endif /* CONFIG_64BIT */ - diff --git a/arch/parisc/kernel/pci-dma.c b/arch/parisc/kernel/pci-dma.c index 5f0067a62738..bd4c0a7471d3 100644 --- a/arch/parisc/kernel/pci-dma.c +++ b/arch/parisc/kernel/pci-dma.c @@ -41,7 +41,7 @@ static unsigned long pcxl_used_bytes __read_mostly = 0; static unsigned long pcxl_used_pages __read_mostly = 0; extern unsigned long pcxl_dma_start; /* Start of pcxl dma mapping area */ -static spinlock_t pcxl_res_lock; +static DEFINE_SPINLOCK(pcxl_res_lock); static char *pcxl_res_map; static int pcxl_res_hint; static int pcxl_res_size; @@ -390,7 +390,6 @@ pcxl_dma_init(void) if (pcxl_dma_start == 0) return 0; - spin_lock_init(&pcxl_res_lock); pcxl_res_size = PCXL_DMA_MAP_SIZE >> (PAGE_SHIFT + 3); pcxl_res_hint = 0; pcxl_res_map = (char *)__get_free_pages(GFP_KERNEL, diff --git a/arch/parisc/kernel/pdt.c b/arch/parisc/kernel/pdt.c index d02874ecb94d..05730a83895c 100644 --- a/arch/parisc/kernel/pdt.c +++ b/arch/parisc/kernel/pdt.c @@ -1,19 +1,20 @@ /* * Page Deallocation Table (PDT) support * - * The Page Deallocation Table (PDT) holds a table with pointers to bad - * memory (broken RAM modules) which is maintained by firmware. + * The Page Deallocation Table (PDT) is maintained by firmware and holds a + * list of memory addresses in which memory errors were detected. 
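Several parisc locks in this series (pcxl_res_lock above, perf_lock and unwind_lock further down) move from a bare static spinlock_t plus a runtime spin_lock_init() call to a static DEFINE_SPINLOCK(), so the lock is valid before any init code has run. A kernel-side sketch of the resulting pattern (the foo_* names are made up):

#include <linux/spinlock.h>

/* Statically initialized lock: replaces the old pattern of a bare
 * "static spinlock_t foo_lock;" plus a spin_lock_init(&foo_lock) call
 * in some init function, and is safe even for very early callers.
 */
static DEFINE_SPINLOCK(foo_lock);
static unsigned long foo_counter;

void foo_account(unsigned long n)
{
	unsigned long flags;

	spin_lock_irqsave(&foo_lock, flags);
	foo_counter += n;
	spin_unlock_irqrestore(&foo_lock, flags);
}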
+ * The list contains both single-bit (correctable) and double-bit + * (uncorrectable) errors. * * Copyright 2017 by Helge Deller <deller@gmx.de> * - * TODO: - * - check regularily for new bad memory - * - add userspace interface with procfs or sysfs - * - increase number of PDT entries dynamically + * possible future enhancements: + * - add userspace interface via procfs or sysfs to clear PDT */ #include <linux/memblock.h> #include <linux/seq_file.h> +#include <linux/kthread.h> #include <asm/pdc.h> #include <asm/pdcpat.h> @@ -24,11 +25,16 @@ enum pdt_access_type { PDT_NONE, PDT_PDC, PDT_PAT_NEW, - PDT_PAT_OLD + PDT_PAT_CELL }; static enum pdt_access_type pdt_type; +/* PDT poll interval: 1 minute if errors, 5 minutes if everything OK. */ +#define PDT_POLL_INTERVAL_DEFAULT (5*60*HZ) +#define PDT_POLL_INTERVAL_SHORT (1*60*HZ) +static unsigned long pdt_poll_interval = PDT_POLL_INTERVAL_DEFAULT; + /* global PDT status information */ static struct pdc_mem_retinfo pdt_status; @@ -36,6 +42,21 @@ static struct pdc_mem_retinfo pdt_status; #define MAX_PDT_ENTRIES (MAX_PDT_TABLE_SIZE / sizeof(unsigned long)) static unsigned long pdt_entry[MAX_PDT_ENTRIES] __page_aligned_bss; +/* + * Constants for the pdt_entry format: + * A pdt_entry holds the physical address in bits 0-57, bits 58-61 are + * reserved, bit 62 is the perm bit and bit 63 is the error_type bit. + * The perm bit indicates whether the error have been verified as a permanent + * error (value of 1) or has not been verified, and may be transient (value + * of 0). The error_type bit indicates whether the error is a single bit error + * (value of 1) or a multiple bit error. + * On non-PAT machines phys_addr is encoded in bits 0-59 and error_type in bit + * 63. Those machines don't provide the perm bit. + */ + +#define PDT_ADDR_PHYS_MASK (pdt_type != PDT_PDC ? ~0x3f : ~0x0f) +#define PDT_ADDR_PERM_ERR (pdt_type != PDT_PDC ? 
2UL : 0UL) +#define PDT_ADDR_SINGLE_ERR 1UL /* report PDT entries via /proc/meminfo */ void arch_report_meminfo(struct seq_file *m) @@ -49,6 +70,68 @@ void arch_report_meminfo(struct seq_file *m) pdt_status.pdt_entries); } +static int get_info_pat_new(void) +{ + struct pdc_pat_mem_retinfo pat_rinfo; + int ret; + + /* newer PAT machines like C8000 report info for all cells */ + if (is_pdc_pat()) + ret = pdc_pat_mem_pdt_info(&pat_rinfo); + else + return PDC_BAD_PROC; + + pdt_status.pdt_size = pat_rinfo.max_pdt_entries; + pdt_status.pdt_entries = pat_rinfo.current_pdt_entries; + pdt_status.pdt_status = 0; + pdt_status.first_dbe_loc = pat_rinfo.first_dbe_loc; + pdt_status.good_mem = pat_rinfo.good_mem; + + return ret; +} + +static int get_info_pat_cell(void) +{ + struct pdc_pat_mem_cell_pdt_retinfo cell_rinfo; + int ret; + + /* older PAT machines like rp5470 report cell info only */ + if (is_pdc_pat()) + ret = pdc_pat_mem_pdt_cell_info(&cell_rinfo, parisc_cell_num); + else + return PDC_BAD_PROC; + + pdt_status.pdt_size = cell_rinfo.max_pdt_entries; + pdt_status.pdt_entries = cell_rinfo.current_pdt_entries; + pdt_status.pdt_status = 0; + pdt_status.first_dbe_loc = cell_rinfo.first_dbe_loc; + pdt_status.good_mem = cell_rinfo.good_mem; + + return ret; +} + +static void report_mem_err(unsigned long pde) +{ + struct pdc_pat_mem_phys_mem_location loc; + unsigned long addr; + char dimm_txt[32]; + + addr = pde & PDT_ADDR_PHYS_MASK; + + /* show DIMM slot description on PAT machines */ + if (is_pdc_pat()) { + pdc_pat_mem_get_dimm_phys_location(&loc, addr); + sprintf(dimm_txt, "DIMM slot %02x, ", loc.dimm_slot); + } else + dimm_txt[0] = 0; + + pr_warn("PDT: BAD MEMORY at 0x%08lx, %s%s%s-bit error.\n", + addr, dimm_txt, + pde & PDT_ADDR_PERM_ERR ? "permanent ":"", + pde & PDT_ADDR_SINGLE_ERR ? "single":"multi"); +} + + /* * pdc_pdt_init() * @@ -63,18 +146,17 @@ void __init pdc_pdt_init(void) unsigned long entries; struct pdc_mem_read_pdt pdt_read_ret; - if (is_pdc_pat()) { - struct pdc_pat_mem_retinfo pat_rinfo; + pdt_type = PDT_PAT_NEW; + ret = get_info_pat_new(); - pdt_type = PDT_PAT_NEW; - ret = pdc_pat_mem_pdt_info(&pat_rinfo); - pdt_status.pdt_size = pat_rinfo.max_pdt_entries; - pdt_status.pdt_entries = pat_rinfo.current_pdt_entries; - pdt_status.pdt_status = 0; - pdt_status.first_dbe_loc = pat_rinfo.first_dbe_loc; - pdt_status.good_mem = pat_rinfo.good_mem; - } else { + if (ret != PDC_OK) { + pdt_type = PDT_PAT_CELL; + ret = get_info_pat_cell(); + } + + if (ret != PDC_OK) { pdt_type = PDT_PDC; + /* non-PAT machines provide the standard PDC call */ ret = pdc_mem_pdt_info(&pdt_status); } @@ -86,13 +168,17 @@ void __init pdc_pdt_init(void) } entries = pdt_status.pdt_entries; - WARN_ON(entries > MAX_PDT_ENTRIES); + if (WARN_ON(entries > MAX_PDT_ENTRIES)) + entries = pdt_status.pdt_entries = MAX_PDT_ENTRIES; - pr_info("PDT: size %lu, entries %lu, status %lu, dbe_loc 0x%lx," - " good_mem %lu\n", + pr_info("PDT: type %s, size %lu, entries %lu, status %lu, dbe_loc 0x%lx," + " good_mem %lu MB\n", + pdt_type == PDT_PDC ? __stringify(PDT_PDC) : + pdt_type == PDT_PAT_CELL ? 
__stringify(PDT_PAT_CELL) + : __stringify(PDT_PAT_NEW), pdt_status.pdt_size, pdt_status.pdt_entries, pdt_status.pdt_status, pdt_status.first_dbe_loc, - pdt_status.good_mem); + pdt_status.good_mem / 1024 / 1024); if (entries == 0) { pr_info("PDT: Firmware reports all memory OK.\n"); @@ -112,15 +198,12 @@ void __init pdc_pdt_init(void) #ifdef CONFIG_64BIT struct pdc_pat_mem_read_pd_retinfo pat_pret; - /* try old obsolete PAT firmware function first */ - pdt_type = PDT_PAT_OLD; - ret = pdc_pat_mem_read_cell_pdt(&pat_pret, pdt_entry, - MAX_PDT_ENTRIES); - if (ret != PDC_OK) { - pdt_type = PDT_PAT_NEW; + if (pdt_type == PDT_PAT_CELL) + ret = pdc_pat_mem_read_cell_pdt(&pat_pret, pdt_entry, + MAX_PDT_ENTRIES); + else ret = pdc_pat_mem_read_pd_pdt(&pat_pret, pdt_entry, MAX_PDT_TABLE_SIZE, 0); - } #else ret = PDC_BAD_PROC; #endif @@ -128,27 +211,142 @@ void __init pdc_pdt_init(void) if (ret != PDC_OK) { pdt_type = PDT_NONE; - pr_debug("PDT type %d, retval = %d\n", pdt_type, ret); + pr_warn("PDT: Get PDT entries failed with %d\n", ret); return; } for (i = 0; i < pdt_status.pdt_entries; i++) { - struct pdc_pat_mem_phys_mem_location loc; + report_mem_err(pdt_entry[i]); + + /* mark memory page bad */ + memblock_reserve(pdt_entry[i] & PAGE_MASK, PAGE_SIZE); + } +} - /* get DIMM slot number */ - loc.dimm_slot = 0xff; + +/* + * This is the PDT kernel thread main loop. + */ + +static int pdt_mainloop(void *unused) +{ + struct pdc_mem_read_pdt pdt_read_ret; + struct pdc_pat_mem_read_pd_retinfo pat_pret __maybe_unused; + unsigned long old_num_entries; + unsigned long *bad_mem_ptr; + int num, ret; + + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + + old_num_entries = pdt_status.pdt_entries; + + schedule_timeout(pdt_poll_interval); + if (kthread_should_stop()) + break; + + /* Do we have new PDT entries? 
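The PDT_ADDR_* constants above encode the entry layout the new comment describes: the physical address with the low flag bits masked off, a perm bit on PAT machines, and a single/multi-bit error flag. A small host-buildable sketch that decodes an entry the same way report_mem_err() and pdt_mainloop() do (PAT layout shown, values invented):

#include <stdio.h>

#define PDT_ADDR_PHYS_MASK	(~0x3fUL)	/* physical address of the bad page */
#define PDT_ADDR_PERM_ERR	2UL		/* verified as a permanent error */
#define PDT_ADDR_SINGLE_ERR	1UL		/* single-bit (correctable) error */

static void decode_pdt_entry(unsigned long pde)
{
	unsigned long addr = pde & PDT_ADDR_PHYS_MASK;
	int permanent = !!(pde & PDT_ADDR_PERM_ERR);
	int single    = !!(pde & PDT_ADDR_SINGLE_ERR);

	printf("bad memory at 0x%08lx, %s%s-bit error -> %s\n",
	       addr,
	       permanent ? "permanent " : "",
	       single ? "single" : "multi",
	       /* mirrors the CONFIG_MEMORY_FAILURE policy in pdt_mainloop() */
	       (permanent || !single) ? "memory_failure()" : "soft_offline_page()");
}

int main(void)
{
	decode_pdt_entry(0x12345000UL | PDT_ADDR_SINGLE_ERR);
	decode_pdt_entry(0x23456000UL | PDT_ADDR_PERM_ERR);
	return 0;
}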
*/ + switch (pdt_type) { + case PDT_PAT_NEW: + ret = get_info_pat_new(); + break; + case PDT_PAT_CELL: + ret = get_info_pat_cell(); + break; + default: + ret = pdc_mem_pdt_info(&pdt_status); + break; + } + + if (ret != PDC_OK) { + pr_warn("PDT: unexpected failure %d\n", ret); + return -EINVAL; + } + + /* if no new PDT entries, just wait again */ + num = pdt_status.pdt_entries - old_num_entries; + if (num <= 0) + continue; + + /* decrease poll interval in case we found memory errors */ + if (pdt_status.pdt_entries && + pdt_poll_interval == PDT_POLL_INTERVAL_DEFAULT) + pdt_poll_interval = PDT_POLL_INTERVAL_SHORT; + + /* limit entries to get */ + if (num > MAX_PDT_ENTRIES) { + num = MAX_PDT_ENTRIES; + pdt_status.pdt_entries = old_num_entries + num; + } + + /* get new entries */ + switch (pdt_type) { #ifdef CONFIG_64BIT - pdc_pat_mem_get_dimm_phys_location(&loc, pdt_entry[i]); + case PDT_PAT_CELL: + if (pdt_status.pdt_entries > MAX_PDT_ENTRIES) { + pr_crit("PDT: too many entries.\n"); + return -ENOMEM; + } + ret = pdc_pat_mem_read_cell_pdt(&pat_pret, pdt_entry, + MAX_PDT_ENTRIES); + bad_mem_ptr = &pdt_entry[old_num_entries]; + break; + case PDT_PAT_NEW: + ret = pdc_pat_mem_read_pd_pdt(&pat_pret, + pdt_entry, + num * sizeof(unsigned long), + old_num_entries * sizeof(unsigned long)); + bad_mem_ptr = &pdt_entry[0]; + break; #endif + default: + ret = pdc_mem_pdt_read_entries(&pdt_read_ret, + pdt_entry); + bad_mem_ptr = &pdt_entry[old_num_entries]; + break; + } - pr_warn("PDT: BAD PAGE #%d at 0x%08lx, " - "DIMM slot %02x (error_type = %lu)\n", - i, - pdt_entry[i] & PAGE_MASK, - loc.dimm_slot, - pdt_entry[i] & 1); + /* report and mark memory broken */ + while (num--) { + unsigned long pde = *bad_mem_ptr++; - /* mark memory page bad */ - memblock_reserve(pdt_entry[i] & PAGE_MASK, PAGE_SIZE); + report_mem_err(pde); + +#ifdef CONFIG_MEMORY_FAILURE + if ((pde & PDT_ADDR_PERM_ERR) || + ((pde & PDT_ADDR_SINGLE_ERR) == 0)) + memory_failure(pde >> PAGE_SHIFT, 0, 0); + else + soft_offline_page( + pfn_to_page(pde >> PAGE_SHIFT), 0); +#else + pr_crit("PDT: memory error at 0x%lx ignored.\n" + "Rebuild kernel with CONFIG_MEMORY_FAILURE=y " + "for real handling.\n", + pde & PDT_ADDR_PHYS_MASK); +#endif + + } } + + return 0; } + + +static int __init pdt_initcall(void) +{ + struct task_struct *kpdtd_task; + + if (pdt_type == PDT_NONE) + return -ENODEV; + + kpdtd_task = kthread_create(pdt_mainloop, NULL, "kpdtd"); + if (IS_ERR(kpdtd_task)) + return PTR_ERR(kpdtd_task); + + wake_up_process(kpdtd_task); + + return 0; +} + +late_initcall(pdt_initcall); diff --git a/arch/parisc/kernel/perf.c b/arch/parisc/kernel/perf.c index 6017a5af2e6e..0813359049ae 100644 --- a/arch/parisc/kernel/perf.c +++ b/arch/parisc/kernel/perf.c @@ -69,7 +69,7 @@ struct rdr_tbl_ent { static int perf_processor_interface __read_mostly = UNKNOWN_INTF; static int perf_enabled __read_mostly; -static spinlock_t perf_lock; +static DEFINE_SPINLOCK(perf_lock); struct parisc_device *cpu_device __read_mostly; /* RDRs to write for PCX-W */ @@ -533,8 +533,6 @@ static int __init perf_init(void) /* Patch the images to match the system */ perf_patch_images(); - spin_lock_init(&perf_lock); - /* TODO: this only lets us access the first cpu.. what to do for SMP? 
*/ cpu_device = per_cpu(cpu_data, 0).dev; printk("Performance monitoring counters enabled for %s\n", diff --git a/arch/parisc/kernel/processor.c b/arch/parisc/kernel/processor.c index 0ab32779dfa7..a778bd3c107c 100644 --- a/arch/parisc/kernel/processor.c +++ b/arch/parisc/kernel/processor.c @@ -30,6 +30,7 @@ #include <linux/mm.h> #include <linux/module.h> #include <linux/seq_file.h> +#include <linux/random.h> #include <linux/slab.h> #include <linux/cpu.h> #include <asm/param.h> @@ -89,7 +90,7 @@ init_percpu_prof(unsigned long cpunum) * (return 1). If so, initialize the chip and tell other partners in crime * they have work to do. */ -static int processor_probe(struct parisc_device *dev) +static int __init processor_probe(struct parisc_device *dev) { unsigned long txn_addr; unsigned long cpuid; @@ -237,28 +238,45 @@ static int processor_probe(struct parisc_device *dev) */ void __init collect_boot_cpu_data(void) { + unsigned long cr16_seed; + memset(&boot_cpu_data, 0, sizeof(boot_cpu_data)); + cr16_seed = get_cycles(); + add_device_randomness(&cr16_seed, sizeof(cr16_seed)); + boot_cpu_data.cpu_hz = 100 * PAGE0->mem_10msec; /* Hz of this PARISC */ /* get CPU-Model Information... */ #define p ((unsigned long *)&boot_cpu_data.pdc.model) - if (pdc_model_info(&boot_cpu_data.pdc.model) == PDC_OK) + if (pdc_model_info(&boot_cpu_data.pdc.model) == PDC_OK) { printk(KERN_INFO "model %08lx %08lx %08lx %08lx %08lx %08lx %08lx %08lx %08lx\n", p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8]); + + add_device_randomness(&boot_cpu_data.pdc.model, + sizeof(boot_cpu_data.pdc.model)); + } #undef p - if (pdc_model_versions(&boot_cpu_data.pdc.versions, 0) == PDC_OK) + if (pdc_model_versions(&boot_cpu_data.pdc.versions, 0) == PDC_OK) { printk(KERN_INFO "vers %08lx\n", boot_cpu_data.pdc.versions); - if (pdc_model_cpuid(&boot_cpu_data.pdc.cpuid) == PDC_OK) + add_device_randomness(&boot_cpu_data.pdc.versions, + sizeof(boot_cpu_data.pdc.versions)); + } + + if (pdc_model_cpuid(&boot_cpu_data.pdc.cpuid) == PDC_OK) { printk(KERN_INFO "CPUID vers %ld rev %ld (0x%08lx)\n", (boot_cpu_data.pdc.cpuid >> 5) & 127, boot_cpu_data.pdc.cpuid & 31, boot_cpu_data.pdc.cpuid); + add_device_randomness(&boot_cpu_data.pdc.cpuid, + sizeof(boot_cpu_data.pdc.cpuid)); + } + if (pdc_model_capabilities(&boot_cpu_data.pdc.capabilities) == PDC_OK) printk(KERN_INFO "capabilities 0x%lx\n", boot_cpu_data.pdc.capabilities); @@ -414,12 +432,12 @@ show_cpuinfo (struct seq_file *m, void *v) return 0; } -static const struct parisc_device_id processor_tbl[] = { +static const struct parisc_device_id processor_tbl[] __initconst = { { HPHW_NPROC, HVERSION_REV_ANY_ID, HVERSION_ANY_ID, SVERSION_ANY_ID }, { 0, } }; -static struct parisc_driver cpu_driver = { +static struct parisc_driver cpu_driver __refdata = { .name = "CPU", .id_table = processor_tbl, .probe = processor_probe diff --git a/arch/parisc/kernel/real2.S b/arch/parisc/kernel/real2.S index 1db58e546230..cc9963421a19 100644 --- a/arch/parisc/kernel/real2.S +++ b/arch/parisc/kernel/real2.S @@ -162,6 +162,7 @@ ENDPROC_CFI(restore_control_regs) .text .align 128 ENTRY_CFI(rfi_virt2real) +#if !defined(BOOTLOADER) /* switch to real mode... 
*/ rsm PSW_SM_I,%r0 load32 PA(rfi_v2r_1), %r1 @@ -191,6 +192,7 @@ ENTRY_CFI(rfi_virt2real) nop rfi_v2r_1: tophys_r1 %r2 +#endif /* defined(BOOTLOADER) */ bv 0(%r2) nop ENDPROC_CFI(rfi_virt2real) @@ -198,6 +200,7 @@ ENDPROC_CFI(rfi_virt2real) .text .align 128 ENTRY_CFI(rfi_real2virt) +#if !defined(BOOTLOADER) rsm PSW_SM_I,%r0 load32 (rfi_r2v_1), %r1 nop @@ -226,6 +229,7 @@ ENTRY_CFI(rfi_real2virt) nop rfi_r2v_1: tovirt_r1 %r2 +#endif /* defined(BOOTLOADER) */ bv 0(%r2) nop ENDPROC_CFI(rfi_real2virt) diff --git a/arch/parisc/kernel/unwind.c b/arch/parisc/kernel/unwind.c index 1b73690477c5..48dc7d4d20bb 100644 --- a/arch/parisc/kernel/unwind.c +++ b/arch/parisc/kernel/unwind.c @@ -34,7 +34,7 @@ extern struct unwind_table_entry __start___unwind[]; extern struct unwind_table_entry __stop___unwind[]; -static spinlock_t unwind_lock; +static DEFINE_SPINLOCK(unwind_lock); /* * the kernel unwind block is not dynamically allocated so that * we can call unwind_init as early in the bootup process as @@ -181,8 +181,6 @@ int __init unwind_init(void) start = (long)&__start___unwind[0]; stop = (long)&__stop___unwind[0]; - spin_lock_init(&unwind_lock); - printk("unwind_init: start = 0x%lx, end = 0x%lx, entries = %lu\n", start, stop, (stop - start) / sizeof(struct unwind_table_entry)); diff --git a/arch/parisc/lib/memcpy.c b/arch/parisc/lib/memcpy.c index 99115cd9e790..865a7f796c7f 100644 --- a/arch/parisc/lib/memcpy.c +++ b/arch/parisc/lib/memcpy.c @@ -27,8 +27,6 @@ #include <linux/compiler.h> #include <linux/uaccess.h> -DECLARE_PER_CPU(struct exception_data, exception_data); - #define get_user_space() (uaccess_kernel() ? 0 : mfsp(3)) #define get_kernel_space() (0) diff --git a/arch/powerpc/include/asm/barrier.h b/arch/powerpc/include/asm/barrier.h index 25d42bd3f114..9c601adfc500 100644 --- a/arch/powerpc/include/asm/barrier.h +++ b/arch/powerpc/include/asm/barrier.h @@ -74,13 +74,6 @@ do { \ ___p1; \ }) -/* - * This must resolve to hwsync on SMP for the context switch path. - * See _switch, and core scheduler context switch memory ordering - * comments. - */ -#define smp_mb__before_spinlock() smp_mb() - #include <asm-generic/barrier.h> #endif /* _ASM_POWERPC_BARRIER_H */ diff --git a/arch/powerpc/include/asm/futex.h b/arch/powerpc/include/asm/futex.h index eaada6c92344..719ed9b61ea7 100644 --- a/arch/powerpc/include/asm/futex.h +++ b/arch/powerpc/include/asm/futex.h @@ -29,18 +29,10 @@ : "b" (uaddr), "i" (-EFAULT), "r" (oparg) \ : "cr0", "memory") -static inline int futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) +static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, + u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (! 
access_ok (VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; pagefault_disable(); @@ -66,17 +58,9 @@ static inline int futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 8b3f1238d07f..e372ed871c51 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -67,11 +67,6 @@ extern int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); extern int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); extern void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); -static inline void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm, - unsigned long address) -{ -} - #define HPTEG_CACHE_NUM (1 << 15) #define HPTEG_HASH_BITS_PTE 13 #define HPTEG_HASH_BITS_PTE_LONG 12 diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h index 0c76675394c5..35bec1c5bd5a 100644 --- a/arch/powerpc/include/asm/mmu_context.h +++ b/arch/powerpc/include/asm/mmu_context.h @@ -90,6 +90,24 @@ static inline void switch_mm_irqs_off(struct mm_struct *prev, /* Mark this context has been used on the new CPU */ if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(next))) { cpumask_set_cpu(smp_processor_id(), mm_cpumask(next)); + + /* + * This full barrier orders the store to the cpumask above vs + * a subsequent operation which allows this CPU to begin loading + * translations for next. + * + * When using the radix MMU that operation is the load of the + * MMU context id, which is then moved to SPRN_PID. + * + * For the hash MMU it is either the first load from slb_cache + * in switch_slb(), and/or the store of paca->mm_ctx_id in + * copy_mm_to_paca(). + * + * On the read side the barrier is in pte_xchg(), which orders + * the store to the PTE vs the load of mm_cpumask. 
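The new switch_mm_irqs_off() comment pairs the smp_mb() after the cpumask update with the full barrier implied by pte_xchg() on the invalidation side. A schematic sketch of that pairing, written out as the two sides (illustration only, not code from this diff):

#include <linux/mm.h>
#include <linux/smp.h>

/* Each side stores first, issues a full barrier, then loads what the other
 * side may have just stored, so at least one of them sees the other's store.
 */
static void switching_in_side(struct mm_struct *mm)
{
	cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm));
	smp_mb();	/* the barrier added in switch_mm_irqs_off() */
	/* ...now load the context id / SLB entries and start translating */
}

static void pte_update_side(struct mm_struct *mm, pte_t *ptep, pte_t old, pte_t new)
{
	pte_xchg(ptep, old, new);	/* the cmpxchg provides the full barrier */
	/* ...then read mm_cpumask(mm) to decide which CPUs need a flush */
}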
+ */ + smp_mb(); + new_on_cpu = true; } diff --git a/arch/powerpc/include/asm/pgtable-be-types.h b/arch/powerpc/include/asm/pgtable-be-types.h index 9c0f5db5cf46..67e7e3d990f4 100644 --- a/arch/powerpc/include/asm/pgtable-be-types.h +++ b/arch/powerpc/include/asm/pgtable-be-types.h @@ -87,6 +87,7 @@ static inline bool pte_xchg(pte_t *ptep, pte_t old, pte_t new) unsigned long *p = (unsigned long *)ptep; __be64 prev; + /* See comment in switch_mm_irqs_off() */ prev = (__force __be64)__cmpxchg_u64(p, (__force unsigned long)pte_raw(old), (__force unsigned long)pte_raw(new)); diff --git a/arch/powerpc/include/asm/pgtable-types.h b/arch/powerpc/include/asm/pgtable-types.h index 8bd3b13fe2fb..369a164b545c 100644 --- a/arch/powerpc/include/asm/pgtable-types.h +++ b/arch/powerpc/include/asm/pgtable-types.h @@ -62,6 +62,7 @@ static inline bool pte_xchg(pte_t *ptep, pte_t old, pte_t new) { unsigned long *p = (unsigned long *)ptep; + /* See comment in switch_mm_irqs_off() */ return pte_val(old) == __cmpxchg_u64(p, pte_val(old), pte_val(new)); } #endif diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h index 8c1b913de6d7..edbe571bcc54 100644 --- a/arch/powerpc/include/asm/spinlock.h +++ b/arch/powerpc/include/asm/spinlock.h @@ -170,39 +170,6 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock) lock->slock = 0; } -static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - arch_spinlock_t lock_val; - - smp_mb(); - - /* - * Atomically load and store back the lock value (unchanged). This - * ensures that our observation of the lock value is ordered with - * respect to other lock operations. - */ - __asm__ __volatile__( -"1: " PPC_LWARX(%0, 0, %2, 0) "\n" -" stwcx. %0, 0, %2\n" -" bne- 1b\n" - : "=&r" (lock_val), "+m" (*lock) - : "r" (lock) - : "cr0", "xer"); - - if (arch_spin_value_unlocked(lock_val)) - goto out; - - while (lock->slock) { - HMT_low(); - if (SHARED_PROCESSOR) - __spin_yield(lock); - } - HMT_medium(); - -out: - smp_mb(); -} - /* * Read-write spinlocks, allowing multiple readers * but only one writer. @@ -342,5 +309,8 @@ static inline void arch_write_unlock(arch_rwlock_t *rw) #define arch_read_relax(lock) __rw_yield(lock) #define arch_write_relax(lock) __rw_yield(lock) +/* See include/linux/spinlock.h */ +#define smp_mb__after_spinlock() smp_mb() + #endif /* __KERNEL__ */ #endif /* __ASM_SPINLOCK_H */ diff --git a/arch/powerpc/include/uapi/asm/mman.h b/arch/powerpc/include/uapi/asm/mman.h index ab45cc2f3101..03c06ba7464f 100644 --- a/arch/powerpc/include/uapi/asm/mman.h +++ b/arch/powerpc/include/uapi/asm/mman.h @@ -29,20 +29,4 @@ #define MAP_STACK 0x20000 /* give out an address that is best suited for process/thread stacks */ #define MAP_HUGETLB 0x40000 /* create a huge page mapping */ -/* - * When MAP_HUGETLB is set, bits [26:31] of the flags argument to mmap(2), - * encode the log2 of the huge page size. A value of zero indicates that the - * default huge page size should be used. To use a non-default huge page size, - * one of these defines can be used, or the size can be encoded by hand. Note - * that on most systems only a subset, or possibly none, of these sizes will be - * available. 
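The MAP_HUGE_SHIFT/MAP_HUGE_MASK definitions and the MAP_HUGE_* size helpers are dropped from the powerpc uapi header here, but the encoding the removed comment describes is unchanged: bits [26:31] of the mmap flags carry log2 of the requested huge page size, with 0 meaning the default size. A userspace sketch requesting an explicit 2 MB huge page (whether it succeeds depends on the hugetlb pool being configured):

#include <stdio.h>
#include <sys/mman.h>

#ifndef MAP_HUGE_SHIFT
#define MAP_HUGE_SHIFT	26	/* log2 of the page size goes in bits 26..31 */
#endif

int main(void)
{
	size_t len = 2 * 1024 * 1024;
	/* 2 MB == 1 << 21, so encode 21 in the MAP_HUGE_SHIFT field. */
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB |
		       (21 << MAP_HUGE_SHIFT), -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");	/* e.g. no 2 MB hugepages reserved */
		return 1;
	}
	printf("got a 2 MB huge mapping at %p\n", p);
	munmap(p, len);
	return 0;
}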
- */ -#define MAP_HUGE_512KB (19 << MAP_HUGE_SHIFT) /* 512KB HugeTLB Page */ -#define MAP_HUGE_1MB (20 << MAP_HUGE_SHIFT) /* 1MB HugeTLB Page */ -#define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT) /* 2MB HugeTLB Page */ -#define MAP_HUGE_8MB (23 << MAP_HUGE_SHIFT) /* 8MB HugeTLB Page */ -#define MAP_HUGE_16MB (24 << MAP_HUGE_SHIFT) /* 16MB HugeTLB Page */ -#define MAP_HUGE_1GB (30 << MAP_HUGE_SHIFT) /* 1GB HugeTLB Page */ -#define MAP_HUGE_16GB (34 << MAP_HUGE_SHIFT) /* 16GB HugeTLB Page */ - #endif /* _UAPI_ASM_POWERPC_MMAN_H */ diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index a160c14304eb..53766e2bc029 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -294,32 +294,26 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, struct kvm_create_spapr_tce_64 *args) { struct kvmppc_spapr_tce_table *stt = NULL; + struct kvmppc_spapr_tce_table *siter; unsigned long npages, size; int ret = -ENOMEM; int i; + int fd = -1; if (!args->size) return -EINVAL; - /* Check this LIOBN hasn't been previously allocated */ - list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) { - if (stt->liobn == args->liobn) - return -EBUSY; - } - size = _ALIGN_UP(args->size, PAGE_SIZE >> 3); npages = kvmppc_tce_pages(size); ret = kvmppc_account_memlimit(kvmppc_stt_pages(npages), true); - if (ret) { - stt = NULL; - goto fail; - } + if (ret) + return ret; ret = -ENOMEM; stt = kzalloc(sizeof(*stt) + npages * sizeof(struct page *), GFP_KERNEL); if (!stt) - goto fail; + goto fail_acct; stt->liobn = args->liobn; stt->page_shift = args->page_shift; @@ -334,24 +328,42 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, goto fail; } - kvm_get_kvm(kvm); + ret = fd = anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops, + stt, O_RDWR | O_CLOEXEC); + if (ret < 0) + goto fail; mutex_lock(&kvm->lock); - list_add_rcu(&stt->list, &kvm->arch.spapr_tce_tables); + + /* Check this LIOBN hasn't been previously allocated */ + ret = 0; + list_for_each_entry(siter, &kvm->arch.spapr_tce_tables, list) { + if (siter->liobn == args->liobn) { + ret = -EBUSY; + break; + } + } + + if (!ret) { + list_add_rcu(&stt->list, &kvm->arch.spapr_tce_tables); + kvm_get_kvm(kvm); + } mutex_unlock(&kvm->lock); - return anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops, - stt, O_RDWR | O_CLOEXEC); + if (!ret) + return fd; -fail: - if (stt) { - for (i = 0; i < npages; i++) - if (stt->pages[i]) - __free_page(stt->pages[i]); + put_unused_fd(fd); - kfree(stt); - } + fail: + for (i = 0; i < npages; i++) + if (stt->pages[i]) + __free_page(stt->pages[i]); + + kfree(stt); + fail_acct: + kvmppc_account_memlimit(kvmppc_stt_pages(npages), false); return ret; } diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index c52184a8efdf..9c9c983b864f 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -1291,6 +1291,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) /* Hypervisor doorbell - exit only if host IPI flag set */ cmpwi r12, BOOK3S_INTERRUPT_H_DOORBELL bne 3f +BEGIN_FTR_SECTION + PPC_MSGSYNC +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) lbz r0, HSTATE_HOST_IPI(r13) cmpwi r0, 0 beq 4f diff --git a/arch/powerpc/kvm/book3s_xive_template.c b/arch/powerpc/kvm/book3s_xive_template.c index 4636ca6e7d38..d1ed2c41b5d2 100644 --- a/arch/powerpc/kvm/book3s_xive_template.c +++ b/arch/powerpc/kvm/book3s_xive_template.c @@ -16,7 +16,22 @@ static void GLUE(X_PFX,ack_pending)(struct kvmppc_xive_vcpu *xc) u8 cppr; u16 ack; - /* XXX 
DD1 bug workaround: Check PIPR vs. CPPR first ! */ + /* + * Ensure any previous store to CPPR is ordered vs. + * the subsequent loads from PIPR or ACK. + */ + eieio(); + + /* + * DD1 bug workaround: If PIPR is less favored than CPPR + * ignore the interrupt or we might incorrectly lose an IPB + * bit. + */ + if (cpu_has_feature(CPU_FTR_POWER9_DD1)) { + u8 pipr = __x_readb(__x_tima + TM_QW1_OS + TM_PIPR); + if (pipr >= xc->hw_cppr) + return; + } /* Perform the acknowledge OS to register cycle. */ ack = be16_to_cpu(__x_readw(__x_tima + TM_SPC_ACK_OS_REG)); @@ -235,6 +250,11 @@ skip_ipi: /* * If we found an interrupt, adjust what the guest CPPR should * be as if we had just fetched that interrupt from HW. + * + * Note: This can only make xc->cppr smaller as the previous + * loop will only exit with hirq != 0 if prio is lower than + * the current xc->cppr. Thus we don't need to re-check xc->mfrr + * for pending IPIs. */ if (hirq) xc->cppr = prio; @@ -381,6 +401,12 @@ X_STATIC int GLUE(X_PFX,h_cppr)(struct kvm_vcpu *vcpu, unsigned long cppr) xc->cppr = cppr; /* + * Order the above update of xc->cppr with the subsequent + * read of xc->mfrr inside push_pending_to_hw() + */ + smp_mb(); + + /* * We are masking less, we need to look for pending things * to deliver and set VP pending bits accordingly to trigger * a new interrupt otherwise we might miss MFRR changes for @@ -420,21 +446,37 @@ X_STATIC int GLUE(X_PFX,h_eoi)(struct kvm_vcpu *vcpu, unsigned long xirr) * used to signal MFRR changes is EOId when fetched from * the queue. */ - if (irq == XICS_IPI || irq == 0) + if (irq == XICS_IPI || irq == 0) { + /* + * This barrier orders the setting of xc->cppr vs. + * subsquent test of xc->mfrr done inside + * scan_interrupts and push_pending_to_hw + */ + smp_mb(); goto bail; + } /* Find interrupt source */ sb = kvmppc_xive_find_source(xive, irq, &src); if (!sb) { pr_devel(" source not found !\n"); rc = H_PARAMETER; + /* Same as above */ + smp_mb(); goto bail; } state = &sb->irq_state[src]; kvmppc_xive_select_irq(state, &hw_num, &xd); state->in_eoi = true; - mb(); + + /* + * This barrier orders both setting of in_eoi above vs, + * subsequent test of guest_priority, and the setting + * of xc->cppr vs. subsquent test of xc->mfrr done inside + * scan_interrupts and push_pending_to_hw + */ + smp_mb(); again: if (state->guest_priority == MASKED) { @@ -461,6 +503,14 @@ again: } + /* + * This barrier orders the above guest_priority check + * and spin_lock/unlock with clearing in_eoi below. + * + * It also has to be a full mb() as it must ensure + * the MMIOs done in source_eoi() are completed before + * state->in_eoi is visible. + */ mb(); state->in_eoi = false; bail: @@ -495,6 +545,18 @@ X_STATIC int GLUE(X_PFX,h_ipi)(struct kvm_vcpu *vcpu, unsigned long server, /* Locklessly write over MFRR */ xc->mfrr = mfrr; + /* + * The load of xc->cppr below and the subsequent MMIO store + * to the IPI must happen after the above mfrr update is + * globally visible so that: + * + * - Synchronize with another CPU doing an H_EOI or a H_CPPR + * updating xc->cppr then reading xc->mfrr. 
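The barriers added to H_IPI, H_CPPR and H_EOI above all close the same store-buffering race: one side updates mfrr and then checks cppr, the other updates cppr and then scans mfrr, and without a full barrier on both sides each could read the other's stale value and the IPI would never be pushed to the HW. A schematic sketch of the two sides (struct and names invented for illustration):

#include <linux/types.h>
#include <asm/barrier.h>

struct xive_vcpu_sketch {
	u8 cppr;	/* current processor priority */
	u8 mfrr;	/* most favored request (IPI) priority */
};

static void h_ipi_side(struct xive_vcpu_sketch *xc, u8 mfrr)
{
	xc->mfrr = mfrr;
	mb();				/* make the new mfrr globally visible first */
	if (mfrr < xc->cppr)
		; /* shoot the IPI via the trigger MMIO page */
}

static void h_cppr_side(struct xive_vcpu_sketch *xc, u8 cppr)
{
	xc->cppr = cppr;
	smp_mb();			/* order vs. the mfrr check below */
	if (xc->mfrr < xc->cppr)
		; /* push the pending IPI to the HW */
}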
+ * + * - The target of the IPI sees the xc->mfrr update + */ + mb(); + /* Shoot the IPI if most favored than target cppr */ if (mfrr < xc->cppr) __x_writeq(0, __x_trig_page(&xc->vp_ipi_data)); diff --git a/arch/powerpc/net/bpf_jit.h b/arch/powerpc/net/bpf_jit.h index 30cf03f53428..47fc6660845d 100644 --- a/arch/powerpc/net/bpf_jit.h +++ b/arch/powerpc/net/bpf_jit.h @@ -263,6 +263,7 @@ static inline bool is_nearbranch(int offset) #define COND_EQ (CR0_EQ | COND_CMP_TRUE) #define COND_NE (CR0_EQ | COND_CMP_FALSE) #define COND_LT (CR0_LT | COND_CMP_TRUE) +#define COND_LE (CR0_GT | COND_CMP_FALSE) #endif diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index 861c5af1c9c4..faf20163bd4c 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -795,12 +795,24 @@ emit_clear: case BPF_JMP | BPF_JSGT | BPF_X: true_cond = COND_GT; goto cond_branch; + case BPF_JMP | BPF_JLT | BPF_K: + case BPF_JMP | BPF_JLT | BPF_X: + case BPF_JMP | BPF_JSLT | BPF_K: + case BPF_JMP | BPF_JSLT | BPF_X: + true_cond = COND_LT; + goto cond_branch; case BPF_JMP | BPF_JGE | BPF_K: case BPF_JMP | BPF_JGE | BPF_X: case BPF_JMP | BPF_JSGE | BPF_K: case BPF_JMP | BPF_JSGE | BPF_X: true_cond = COND_GE; goto cond_branch; + case BPF_JMP | BPF_JLE | BPF_K: + case BPF_JMP | BPF_JLE | BPF_X: + case BPF_JMP | BPF_JSLE | BPF_K: + case BPF_JMP | BPF_JSLE | BPF_X: + true_cond = COND_LE; + goto cond_branch; case BPF_JMP | BPF_JEQ | BPF_K: case BPF_JMP | BPF_JEQ | BPF_X: true_cond = COND_EQ; @@ -817,14 +829,18 @@ emit_clear: cond_branch: switch (code) { case BPF_JMP | BPF_JGT | BPF_X: + case BPF_JMP | BPF_JLT | BPF_X: case BPF_JMP | BPF_JGE | BPF_X: + case BPF_JMP | BPF_JLE | BPF_X: case BPF_JMP | BPF_JEQ | BPF_X: case BPF_JMP | BPF_JNE | BPF_X: /* unsigned comparison */ PPC_CMPLD(dst_reg, src_reg); break; case BPF_JMP | BPF_JSGT | BPF_X: + case BPF_JMP | BPF_JSLT | BPF_X: case BPF_JMP | BPF_JSGE | BPF_X: + case BPF_JMP | BPF_JSLE | BPF_X: /* signed comparison */ PPC_CMPD(dst_reg, src_reg); break; @@ -834,7 +850,9 @@ cond_branch: case BPF_JMP | BPF_JNE | BPF_K: case BPF_JMP | BPF_JEQ | BPF_K: case BPF_JMP | BPF_JGT | BPF_K: + case BPF_JMP | BPF_JLT | BPF_K: case BPF_JMP | BPF_JGE | BPF_K: + case BPF_JMP | BPF_JLE | BPF_K: /* * Need sign-extended load, so only positive * values can be used as imm in cmpldi @@ -849,7 +867,9 @@ cond_branch: } break; case BPF_JMP | BPF_JSGT | BPF_K: + case BPF_JMP | BPF_JSLT | BPF_K: case BPF_JMP | BPF_JSGE | BPF_K: + case BPF_JMP | BPF_JSLE | BPF_K: /* * signed comparison, so any 16-bit value * can be used in cmpdi diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index 6c2d4168daec..2e3eb7431571 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -2039,7 +2039,8 @@ static void record_and_restart(struct perf_event *event, unsigned long val, perf_sample_data_init(&data, ~0ULL, event->hw.last_period); - if (event->attr.sample_type & PERF_SAMPLE_ADDR) + if (event->attr.sample_type & + (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR)) perf_get_data_addr(regs, &data.addr); if (event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK) { diff --git a/arch/powerpc/platforms/cell/spufs/file.c b/arch/powerpc/platforms/cell/spufs/file.c index ae2f740a82f1..5ffcdeb1eb17 100644 --- a/arch/powerpc/platforms/cell/spufs/file.c +++ b/arch/powerpc/platforms/cell/spufs/file.c @@ -1749,7 +1749,7 @@ out: static int spufs_mfc_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct inode *inode = 
file_inode(file); - int err = filemap_write_and_wait_range(inode->i_mapping, start, end); + int err = file_write_and_wait_range(file, start, end); if (!err) { inode_lock(inode); err = spufs_mfc_flush(file, NULL); diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c index b5d960d6db3d..4c7b8591f737 100644 --- a/arch/powerpc/platforms/powernv/npu-dma.c +++ b/arch/powerpc/platforms/powernv/npu-dma.c @@ -614,15 +614,6 @@ static void pnv_npu2_mn_change_pte(struct mmu_notifier *mn, mmio_invalidate(npu_context, 1, address, true); } -static void pnv_npu2_mn_invalidate_page(struct mmu_notifier *mn, - struct mm_struct *mm, - unsigned long address) -{ - struct npu_context *npu_context = mn_to_npu_context(mn); - - mmio_invalidate(npu_context, 1, address, true); -} - static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long start, unsigned long end) @@ -640,7 +631,6 @@ static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn, static const struct mmu_notifier_ops nv_nmmu_notifier_ops = { .release = pnv_npu2_mn_release, .change_pte = pnv_npu2_mn_change_pte, - .invalidate_page = pnv_npu2_mn_invalidate_page, .invalidate_range = pnv_npu2_mn_invalidate_range, }; diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 7eeb75d758c1..48af970320cb 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -222,6 +222,10 @@ config HAVE_MARCH_Z13_FEATURES def_bool n select HAVE_MARCH_ZEC12_FEATURES +config HAVE_MARCH_Z14_FEATURES + def_bool n + select HAVE_MARCH_Z13_FEATURES + choice prompt "Processor type" default MARCH_Z196 @@ -282,6 +286,14 @@ config MARCH_Z13 2964 series). The kernel will be slightly faster but will not work on older machines. +config MARCH_Z14 + bool "IBM z14" + select HAVE_MARCH_Z14_FEATURES + help + Select this to enable optimizations for IBM z14 (3906 series). + The kernel will be slightly faster but will not work on older + machines. 
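The new MARCH_Z14/TUNE_Z14 options only change code generation; the Makefile hunk that follows maps them to -march=z14 and -mtune=z14. Run-time identification of the machine stays separate, as the setup.c hunk further down shows by mapping CPU type 0x3906 to the "z14" ELF platform string. A minimal sketch of that run-time check, assuming the existing get_cpu_id()/struct cpuid helpers; machine_name() is a hypothetical function for illustration, not part of the patch:

#include <asm/processor.h>

/* Hypothetical helper, illustration only. */
static const char *machine_name(void)
{
	struct cpuid id;

	get_cpu_id(&id);	/* STIDP: store CPU id */
	return id.machine == 0x3906 ? "z14" : "pre-z14";
}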
+ endchoice config MARCH_Z900_TUNE @@ -305,6 +317,9 @@ config MARCH_ZEC12_TUNE config MARCH_Z13_TUNE def_bool TUNE_Z13 || MARCH_Z13 && TUNE_DEFAULT +config MARCH_Z14_TUNE + def_bool TUNE_Z14 || MARCH_Z14 && TUNE_DEFAULT + choice prompt "Tune code generation" default TUNE_DEFAULT @@ -343,6 +358,9 @@ config TUNE_ZEC12 config TUNE_Z13 bool "IBM z13" +config TUNE_Z14 + bool "IBM z14" + endchoice config 64BIT diff --git a/arch/s390/Makefile b/arch/s390/Makefile index 54e00526b8df..dac821cfcd43 100644 --- a/arch/s390/Makefile +++ b/arch/s390/Makefile @@ -31,7 +31,8 @@ mflags-$(CONFIG_MARCH_Z9_109) := -march=z9-109 mflags-$(CONFIG_MARCH_Z10) := -march=z10 mflags-$(CONFIG_MARCH_Z196) := -march=z196 mflags-$(CONFIG_MARCH_ZEC12) := -march=zEC12 -mflags-$(CONFIG_MARCH_Z13) := -march=z13 +mflags-$(CONFIG_MARCH_Z13) := -march=z13 +mflags-$(CONFIG_MARCH_Z14) := -march=z14 export CC_FLAGS_MARCH := $(mflags-y) @@ -44,7 +45,8 @@ cflags-$(CONFIG_MARCH_Z9_109_TUNE) += -mtune=z9-109 cflags-$(CONFIG_MARCH_Z10_TUNE) += -mtune=z10 cflags-$(CONFIG_MARCH_Z196_TUNE) += -mtune=z196 cflags-$(CONFIG_MARCH_ZEC12_TUNE) += -mtune=zEC12 -cflags-$(CONFIG_MARCH_Z13_TUNE) += -mtune=z13 +cflags-$(CONFIG_MARCH_Z13_TUNE) += -mtune=z13 +cflags-$(CONFIG_MARCH_Z14_TUNE) += -mtune=z14 cflags-y += -Wa,-I$(srctree)/arch/$(ARCH)/include diff --git a/arch/s390/include/asm/Kbuild b/arch/s390/include/asm/Kbuild index b3c88479feba..6e2c9f7e47fa 100644 --- a/arch/s390/include/asm/Kbuild +++ b/arch/s390/include/asm/Kbuild @@ -16,4 +16,5 @@ generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h generic-y += preempt.h generic-y += trace_clock.h +generic-y += unaligned.h generic-y += word-at-a-time.h diff --git a/arch/s390/include/asm/compat.h b/arch/s390/include/asm/compat.h index b9300f8aee10..07a82bc933a7 100644 --- a/arch/s390/include/asm/compat.h +++ b/arch/s390/include/asm/compat.h @@ -8,11 +8,12 @@ #include <linux/sched/task_stack.h> #include <linux/thread_info.h> -#define __TYPE_IS_PTR(t) (!__builtin_types_compatible_p(typeof(0?(t)0:0ULL), u64)) +#define __TYPE_IS_PTR(t) (!__builtin_types_compatible_p( \ + typeof(0?(__force t)0:0ULL), u64)) #define __SC_DELOUSE(t,v) ({ \ BUILD_BUG_ON(sizeof(t) > 4 && !__TYPE_IS_PTR(t)); \ - (t)(__TYPE_IS_PTR(t) ? ((v) & 0x7fffffff) : (v)); \ + (__force t)(__TYPE_IS_PTR(t) ? ((v) & 0x7fffffff) : (v)); \ }) #define PSW32_MASK_PER 0x40000000UL diff --git a/arch/s390/include/asm/cpcmd.h b/arch/s390/include/asm/cpcmd.h index 3dfadb5d648f..ca2b0624ad46 100644 --- a/arch/s390/include/asm/cpcmd.h +++ b/arch/s390/include/asm/cpcmd.h @@ -10,9 +10,8 @@ /* * the lowlevel function for cpcmd - * the caller of __cpcmd has to ensure that the response buffer is below 2 GB */ -extern int __cpcmd(const char *cmd, char *response, int rlen, int *response_code); +int __cpcmd(const char *cmd, char *response, int rlen, int *response_code); /* * cpcmd is the in-kernel interface for issuing CP commands @@ -25,8 +24,8 @@ extern int __cpcmd(const char *cmd, char *response, int rlen, int *response_code * response_code: return pointer for VM's error code * return value: the size of the response. 
The caller can check if the buffer * was large enough by comparing the return value and rlen - * NOTE: If the response buffer is not below 2 GB, cpcmd can sleep + * NOTE: If the response buffer is not in real storage, cpcmd can sleep */ -extern int cpcmd(const char *cmd, char *response, int rlen, int *response_code); +int cpcmd(const char *cmd, char *response, int rlen, int *response_code); #endif /* _ASM_S390_CPCMD_H */ diff --git a/arch/s390/include/asm/ebcdic.h b/arch/s390/include/asm/ebcdic.h index c5befc5a3bf5..b71735eab23f 100644 --- a/arch/s390/include/asm/ebcdic.h +++ b/arch/s390/include/asm/ebcdic.h @@ -9,9 +9,7 @@ #ifndef _EBCDIC_H #define _EBCDIC_H -#ifndef _S390_TYPES_H -#include <types.h> -#endif +#include <linux/types.h> extern __u8 _ascebc_500[256]; /* ASCII -> EBCDIC 500 conversion table */ extern __u8 _ebcasc_500[256]; /* EBCDIC 500 -> ASCII conversion table */ diff --git a/arch/s390/include/asm/elf.h b/arch/s390/include/asm/elf.h index c92ed0170be2..65998a1f5d43 100644 --- a/arch/s390/include/asm/elf.h +++ b/arch/s390/include/asm/elf.h @@ -191,7 +191,7 @@ struct arch_elf_state { } while (0) #define CORE_DUMP_USE_REGSET -#define ELF_EXEC_PAGESIZE 4096 +#define ELF_EXEC_PAGESIZE PAGE_SIZE /* * This is the base location for PIE (ET_DYN with INTERP) loads. On diff --git a/arch/s390/include/asm/futex.h b/arch/s390/include/asm/futex.h index a4811aa0304d..8f8eec9e1198 100644 --- a/arch/s390/include/asm/futex.h +++ b/arch/s390/include/asm/futex.h @@ -21,17 +21,12 @@ : "0" (-EFAULT), "d" (oparg), "a" (uaddr), \ "m" (*uaddr) : "cc"); -static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) +static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, + u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, newval, ret; load_kernel_asce(); - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; pagefault_disable(); switch (op) { @@ -60,17 +55,9 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) } pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } diff --git a/arch/s390/include/asm/ipl.h b/arch/s390/include/asm/ipl.h index edb5161df7e2..6810bd757312 100644 --- a/arch/s390/include/asm/ipl.h +++ b/arch/s390/include/asm/ipl.h @@ -81,7 +81,7 @@ struct ipl_parameter_block { struct ipl_block_fcp fcp; struct ipl_block_ccw ccw; } ipl_info; -} __attribute__((packed,aligned(4096))); +} __packed __aligned(PAGE_SIZE); /* * IPL validity flags diff --git a/arch/s390/include/asm/lowcore.h b/arch/s390/include/asm/lowcore.h index 8a5b082797f8..a6870ea6ea8b 100644 --- a/arch/s390/include/asm/lowcore.h +++ b/arch/s390/include/asm/lowcore.h @@ -95,46 +95,46 @@ struct lowcore { __u64 int_clock; /* 0x0310 */ __u64 mcck_clock; /* 0x0318 */ __u64 clock_comparator; /* 0x0320 */ + __u64 boot_clock[2]; /* 0x0328 */ /* Current process. 
*/ - __u64 current_task; /* 0x0328 */ - __u8 pad_0x318[0x320-0x318]; /* 0x0330 */ - __u64 kernel_stack; /* 0x0338 */ + __u64 current_task; /* 0x0338 */ + __u64 kernel_stack; /* 0x0340 */ /* Interrupt, panic and restart stack. */ - __u64 async_stack; /* 0x0340 */ - __u64 panic_stack; /* 0x0348 */ - __u64 restart_stack; /* 0x0350 */ + __u64 async_stack; /* 0x0348 */ + __u64 panic_stack; /* 0x0350 */ + __u64 restart_stack; /* 0x0358 */ /* Restart function and parameter. */ - __u64 restart_fn; /* 0x0358 */ - __u64 restart_data; /* 0x0360 */ - __u64 restart_source; /* 0x0368 */ + __u64 restart_fn; /* 0x0360 */ + __u64 restart_data; /* 0x0368 */ + __u64 restart_source; /* 0x0370 */ /* Address space pointer. */ - __u64 kernel_asce; /* 0x0370 */ - __u64 user_asce; /* 0x0378 */ + __u64 kernel_asce; /* 0x0378 */ + __u64 user_asce; /* 0x0380 */ /* * The lpp and current_pid fields form a * 64-bit value that is set as program * parameter with the LPP instruction. */ - __u32 lpp; /* 0x0380 */ - __u32 current_pid; /* 0x0384 */ + __u32 lpp; /* 0x0388 */ + __u32 current_pid; /* 0x038c */ /* SMP info area */ - __u32 cpu_nr; /* 0x0388 */ - __u32 softirq_pending; /* 0x038c */ - __u64 percpu_offset; /* 0x0390 */ - __u64 vdso_per_cpu_data; /* 0x0398 */ - __u64 machine_flags; /* 0x03a0 */ - __u32 preempt_count; /* 0x03a8 */ - __u8 pad_0x03ac[0x03b0-0x03ac]; /* 0x03ac */ - __u64 gmap; /* 0x03b0 */ - __u32 spinlock_lockval; /* 0x03b8 */ - __u32 fpu_flags; /* 0x03bc */ - __u8 pad_0x03c0[0x0400-0x03c0]; /* 0x03c0 */ + __u32 cpu_nr; /* 0x0390 */ + __u32 softirq_pending; /* 0x0394 */ + __u64 percpu_offset; /* 0x0398 */ + __u64 vdso_per_cpu_data; /* 0x03a0 */ + __u64 machine_flags; /* 0x03a8 */ + __u32 preempt_count; /* 0x03b0 */ + __u8 pad_0x03b4[0x03b8-0x03b4]; /* 0x03b4 */ + __u64 gmap; /* 0x03b8 */ + __u32 spinlock_lockval; /* 0x03c0 */ + __u32 fpu_flags; /* 0x03c4 */ + __u8 pad_0x03c8[0x0400-0x03c8]; /* 0x03c8 */ /* Per cpu primary space access list */ __u32 paste[16]; /* 0x0400 */ diff --git a/arch/s390/include/asm/mman.h b/arch/s390/include/asm/mman.h deleted file mode 100644 index b79813d9cf68..000000000000 --- a/arch/s390/include/asm/mman.h +++ /dev/null @@ -1,11 +0,0 @@ -/* - * S390 version - * - * Derived from "include/asm-i386/mman.h" - */ -#ifndef __S390_MMAN_H__ -#define __S390_MMAN_H__ - -#include <uapi/asm/mman.h> - -#endif /* __S390_MMAN_H__ */ diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h index 4541ac44b35f..72e9ca83a668 100644 --- a/arch/s390/include/asm/mmu_context.h +++ b/arch/s390/include/asm/mmu_context.h @@ -12,6 +12,7 @@ #include <linux/mm_types.h> #include <asm/tlbflush.h> #include <asm/ctl_reg.h> +#include <asm-generic/mm_hooks.h> static inline int init_new_context(struct task_struct *tsk, struct mm_struct *mm) @@ -33,7 +34,7 @@ static inline int init_new_context(struct task_struct *tsk, mm->context.use_cmma = 0; #endif switch (mm->context.asce_limit) { - case 1UL << 42: + case _REGION2_SIZE: /* * forked 3-level task, fall through to set new asce with new * mm->pgd @@ -44,12 +45,17 @@ static inline int init_new_context(struct task_struct *tsk, mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | _ASCE_USER_BITS | _ASCE_TYPE_REGION3; break; - case 1UL << 53: + case -PAGE_SIZE: + /* forked 5-level task, set new asce with new_mm->pgd */ + mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | + _ASCE_USER_BITS | _ASCE_TYPE_REGION1; + break; + case _REGION1_SIZE: /* forked 4-level task, set new asce with new mm->pgd */ mm->context.asce = __pa(mm->pgd) 
| _ASCE_TABLE_LENGTH | _ASCE_USER_BITS | _ASCE_TYPE_REGION2; break; - case 1UL << 31: + case _REGION3_SIZE: /* forked 2-level compat task, set new asce with new mm->pgd */ mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | _ASCE_USER_BITS | _ASCE_TYPE_SEGMENT; @@ -133,30 +139,4 @@ static inline void activate_mm(struct mm_struct *prev, set_user_asce(next); } -static inline void arch_dup_mmap(struct mm_struct *oldmm, - struct mm_struct *mm) -{ -} - -static inline void arch_exit_mmap(struct mm_struct *mm) -{ -} - -static inline void arch_unmap(struct mm_struct *mm, - struct vm_area_struct *vma, - unsigned long start, unsigned long end) -{ -} - -static inline void arch_bprm_mm_init(struct mm_struct *mm, - struct vm_area_struct *vma) -{ -} - -static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, - bool write, bool execute, bool foreign) -{ - /* by default, allow everything */ - return true; -} #endif /* __S390_MMU_CONTEXT_H */ diff --git a/arch/s390/include/asm/nmi.h b/arch/s390/include/asm/nmi.h index 9d91cf3e427f..c8e211b9a002 100644 --- a/arch/s390/include/asm/nmi.h +++ b/arch/s390/include/asm/nmi.h @@ -72,7 +72,7 @@ union mci { u64 ar : 1; /* 33 access register validity */ u64 da : 1; /* 34 delayed access exception */ u64 : 1; /* 35 */ - u64 gs : 1; /* 36 guarded storage registers */ + u64 gs : 1; /* 36 guarded storage registers validity */ u64 : 5; /* 37-41 */ u64 pr : 1; /* 42 tod programmable register validity */ u64 fc : 1; /* 43 fp control register validity */ diff --git a/arch/s390/include/asm/page-states.h b/arch/s390/include/asm/page-states.h index 42267a2fe29e..ca21b28a7b17 100644 --- a/arch/s390/include/asm/page-states.h +++ b/arch/s390/include/asm/page-states.h @@ -13,6 +13,7 @@ #define ESSA_SET_POT_VOLATILE 4 #define ESSA_SET_STABLE_RESIDENT 5 #define ESSA_SET_STABLE_IF_RESIDENT 6 +#define ESSA_SET_STABLE_NODAT 7 #define ESSA_MAX ESSA_SET_STABLE_IF_RESIDENT diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h index 624deaa44230..5d5c2b3500a4 100644 --- a/arch/s390/include/asm/page.h +++ b/arch/s390/include/asm/page.h @@ -10,10 +10,14 @@ #include <linux/const.h> #include <asm/types.h> +#define _PAGE_SHIFT 12 +#define _PAGE_SIZE (_AC(1, UL) << _PAGE_SHIFT) +#define _PAGE_MASK (~(_PAGE_SIZE - 1)) + /* PAGE_SHIFT determines the page size */ -#define PAGE_SHIFT 12 -#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT) -#define PAGE_MASK (~(PAGE_SIZE-1)) +#define PAGE_SHIFT _PAGE_SHIFT +#define PAGE_SIZE _PAGE_SIZE +#define PAGE_MASK _PAGE_MASK #define PAGE_DEFAULT_ACC 0 #define PAGE_DEFAULT_KEY (PAGE_DEFAULT_ACC << 4) @@ -133,6 +137,9 @@ static inline int page_reset_referenced(unsigned long addr) struct page; void arch_free_page(struct page *page, int order); void arch_alloc_page(struct page *page, int order); +void arch_set_page_dat(struct page *page, int order); +void arch_set_page_nodat(struct page *page, int order); +int arch_test_page_nodat(struct page *page); void arch_set_page_states(int make_stable); static inline int devmem_is_allowed(unsigned long pfn) @@ -145,16 +152,26 @@ static inline int devmem_is_allowed(unsigned long pfn) #endif /* !__ASSEMBLY__ */ -#define __PAGE_OFFSET 0x0UL -#define PAGE_OFFSET 0x0UL -#define __pa(x) (unsigned long)(x) -#define __va(x) (void *)(unsigned long)(x) -#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) -#define page_to_phys(page) (page_to_pfn(page) << PAGE_SHIFT) -#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) +#define __PAGE_OFFSET 0x0UL +#define PAGE_OFFSET 0x0UL + 
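Because PAGE_OFFSET stays 0 on s390, __pa()/__va() remain plain casts and the pfn/phys helpers introduced just below are pure shifts by PAGE_SHIFT. A minimal sketch using only the macros from this hunk; page_helpers_roundtrip() is a hypothetical illustration, not part of the patch:

#include <asm/page.h>

/* Round-tripping a kernel address through pfn/phys space returns the
 * page-aligned address, because __pa() is the identity mapping here. */
static inline bool page_helpers_roundtrip(unsigned long kaddr)
{
	unsigned long pfn = virt_to_pfn(kaddr);

	return pfn_to_phys(pfn) == (kaddr & PAGE_MASK);
}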
+#define __pa(x) ((unsigned long)(x)) +#define __va(x) ((void *)(unsigned long)(x)) + +#define virt_to_pfn(kaddr) (__pa(kaddr) >> PAGE_SHIFT) #define pfn_to_virt(pfn) __va((pfn) << PAGE_SHIFT) + +#define virt_to_page(kaddr) pfn_to_page(virt_to_pfn(kaddr)) #define page_to_virt(page) pfn_to_virt(page_to_pfn(page)) +#define phys_to_pfn(kaddr) ((kaddr) >> PAGE_SHIFT) +#define pfn_to_phys(pfn) ((pfn) << PAGE_SHIFT) + +#define phys_to_page(kaddr) pfn_to_page(phys_to_pfn(kaddr)) +#define page_to_phys(page) (page_to_pfn(page) << PAGE_SHIFT) + +#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) + #define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h index bb0ff1bb0c4a..a0d9167519b1 100644 --- a/arch/s390/include/asm/pgalloc.h +++ b/arch/s390/include/asm/pgalloc.h @@ -15,6 +15,8 @@ #include <linux/gfp.h> #include <linux/mm.h> +#define CRST_ALLOC_ORDER 2 + unsigned long *crst_table_alloc(struct mm_struct *); void crst_table_free(struct mm_struct *, unsigned long *); @@ -42,16 +44,16 @@ static inline void clear_table(unsigned long *s, unsigned long val, size_t n) static inline void crst_table_init(unsigned long *crst, unsigned long entry) { - clear_table(crst, entry, sizeof(unsigned long)*2048); + clear_table(crst, entry, _CRST_TABLE_SIZE); } static inline unsigned long pgd_entry_type(struct mm_struct *mm) { - if (mm->context.asce_limit <= (1UL << 31)) + if (mm->context.asce_limit <= _REGION3_SIZE) return _SEGMENT_ENTRY_EMPTY; - if (mm->context.asce_limit <= (1UL << 42)) + if (mm->context.asce_limit <= _REGION2_SIZE) return _REGION3_ENTRY_EMPTY; - if (mm->context.asce_limit <= (1UL << 53)) + if (mm->context.asce_limit <= _REGION1_SIZE) return _REGION2_ENTRY_EMPTY; return _REGION1_ENTRY_EMPTY; } @@ -119,7 +121,7 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm) if (!table) return NULL; - if (mm->context.asce_limit == (1UL << 31)) { + if (mm->context.asce_limit == _REGION3_SIZE) { /* Forking a compat process with 2 page table levels */ if (!pgtable_pmd_page_ctor(virt_to_page(table))) { crst_table_free(mm, table); @@ -131,7 +133,7 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm) static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) { - if (mm->context.asce_limit == (1UL << 31)) + if (mm->context.asce_limit == _REGION3_SIZE) pgtable_pmd_page_dtor(virt_to_page(pgd)); crst_table_free(mm, (unsigned long *) pgd); } @@ -158,4 +160,8 @@ static inline void pmd_populate(struct mm_struct *mm, extern void rcu_table_freelist_finish(void); +void vmem_map_init(void); +void *vmem_crst_alloc(unsigned long val); +pte_t *vmem_pte_alloc(void); + #endif /* _S390_PGALLOC_H */ diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 57057fb1cc07..dce708e061ea 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -11,19 +11,6 @@ #ifndef _ASM_S390_PGTABLE_H #define _ASM_S390_PGTABLE_H -/* - * The Linux memory management assumes a three-level page table setup. - * For s390 64 bit we use up to four of the five levels the hardware - * provides (region first tables are not used). - * - * The "pgd_xxx()" functions are trivial for a folded two-level - * setup: the pgd is never bad, and a pmd always exists (as it's folded - * into the pgd entry) - * - * This file contains the functions and defines necessary to modify and use - * the S390 page table tree. 
- */ -#ifndef __ASSEMBLY__ #include <linux/sched.h> #include <linux/mm_types.h> #include <linux/page-flags.h> @@ -34,9 +21,6 @@ extern pgd_t swapper_pg_dir[]; extern void paging_init(void); -extern void vmem_map_init(void); -pmd_t *vmem_pmd_alloc(void); -pte_t *vmem_pte_alloc(void); enum { PG_DIRECT_MAP_4K = 0, @@ -77,38 +61,6 @@ extern unsigned long zero_page_mask; #define __HAVE_COLOR_ZERO_PAGE /* TODO: s390 cannot support io_remap_pfn_range... */ -#endif /* !__ASSEMBLY__ */ - -/* - * PMD_SHIFT determines the size of the area a second-level page - * table can map - * PGDIR_SHIFT determines what a third-level page table entry can map - */ -#define PMD_SHIFT 20 -#define PUD_SHIFT 31 -#define P4D_SHIFT 42 -#define PGDIR_SHIFT 53 - -#define PMD_SIZE (1UL << PMD_SHIFT) -#define PMD_MASK (~(PMD_SIZE-1)) -#define PUD_SIZE (1UL << PUD_SHIFT) -#define PUD_MASK (~(PUD_SIZE-1)) -#define P4D_SIZE (1UL << P4D_SHIFT) -#define P4D_MASK (~(P4D_SIZE-1)) -#define PGDIR_SIZE (1UL << PGDIR_SHIFT) -#define PGDIR_MASK (~(PGDIR_SIZE-1)) - -/* - * entries per page directory level: the S390 is two-level, so - * we don't really have any PMD directory physically. - * for S390 segment-table entries are combined to one PGD - * that leads to 1024 pte per pgd - */ -#define PTRS_PER_PTE 256 -#define PTRS_PER_PMD 2048 -#define PTRS_PER_PUD 2048 -#define PTRS_PER_P4D 2048 -#define PTRS_PER_PGD 2048 #define FIRST_USER_ADDRESS 0UL @@ -123,7 +75,6 @@ extern unsigned long zero_page_mask; #define pgd_ERROR(e) \ printk("%s:%d: bad pgd %p.\n", __FILE__, __LINE__, (void *) pgd_val(e)) -#ifndef __ASSEMBLY__ /* * The vmalloc and module area will always be on the topmost area of the * kernel mapping. We reserve 128GB (64bit) for vmalloc and modules. @@ -269,7 +220,7 @@ static inline int is_module_addr(void *addr) */ /* Bits in the segment/region table address-space-control-element */ -#define _ASCE_ORIGIN ~0xfffUL/* segment table origin */ +#define _ASCE_ORIGIN ~0xfffUL/* region/segment table origin */ #define _ASCE_PRIVATE_SPACE 0x100 /* private space control */ #define _ASCE_ALT_EVENT 0x80 /* storage alteration event control */ #define _ASCE_SPACE_SWITCH 0x40 /* space switch event */ @@ -320,9 +271,9 @@ static inline int is_module_addr(void *addr) #define _SEGMENT_ENTRY_BITS 0xfffffffffffffe33UL #define _SEGMENT_ENTRY_BITS_LARGE 0xfffffffffff0ff33UL #define _SEGMENT_ENTRY_ORIGIN_LARGE ~0xfffffUL /* large page address */ -#define _SEGMENT_ENTRY_ORIGIN ~0x7ffUL/* segment table origin */ -#define _SEGMENT_ENTRY_PROTECT 0x200 /* page protection bit */ -#define _SEGMENT_ENTRY_NOEXEC 0x100 /* region no-execute bit */ +#define _SEGMENT_ENTRY_ORIGIN ~0x7ffUL/* page table origin */ +#define _SEGMENT_ENTRY_PROTECT 0x200 /* segment protection bit */ +#define _SEGMENT_ENTRY_NOEXEC 0x100 /* segment no-execute bit */ #define _SEGMENT_ENTRY_INVALID 0x20 /* invalid segment table entry */ #define _SEGMENT_ENTRY (0) @@ -340,6 +291,54 @@ static inline int is_module_addr(void *addr) #define _SEGMENT_ENTRY_SOFT_DIRTY 0x0000 /* SW segment soft dirty bit */ #endif +#define _CRST_ENTRIES 2048 /* number of region/segment table entries */ +#define _PAGE_ENTRIES 256 /* number of page table entries */ + +#define _CRST_TABLE_SIZE (_CRST_ENTRIES * 8) +#define _PAGE_TABLE_SIZE (_PAGE_ENTRIES * 8) + +#define _REGION1_SHIFT 53 +#define _REGION2_SHIFT 42 +#define _REGION3_SHIFT 31 +#define _SEGMENT_SHIFT 20 + +#define _REGION1_INDEX (0x7ffUL << _REGION1_SHIFT) +#define _REGION2_INDEX (0x7ffUL << _REGION2_SHIFT) +#define _REGION3_INDEX (0x7ffUL << _REGION3_SHIFT) 
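The shifts above, together with the *_SIZE macros defined just below, encode 11 address bits per table level on top of the 12-bit page offset: _SEGMENT_SIZE (1 MB) is what one page table maps, _REGION3_SIZE (2 GB) what one segment table maps, _REGION2_SIZE (4 TB) what one region-3 table maps, and _REGION1_SIZE (8 PB) what one region-2 table maps. These are exactly the values the asce_limit switch in the mmu_context.h hunk above compares against. A compile-time sanity sketch under those definitions; region_size_sanity() is hypothetical, for illustration only:

#include <linux/bug.h>
#include <asm/pgtable.h>

/* Hypothetical compile-time check, illustrating the per-level sizes. */
static inline void region_size_sanity(void)
{
	BUILD_BUG_ON(_SEGMENT_SIZE != (1UL << 20));	/* 1 MB */
	BUILD_BUG_ON(_REGION3_SIZE != (1UL << 31));	/* 2 GB */
	BUILD_BUG_ON(_REGION2_SIZE != (1UL << 42));	/* 4 TB */
	BUILD_BUG_ON(_REGION1_SIZE != (1UL << 53));	/* 8 PB */
}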
+#define _SEGMENT_INDEX (0x7ffUL << _SEGMENT_SHIFT) +#define _PAGE_INDEX (0xffUL << _PAGE_SHIFT) + +#define _REGION1_SIZE (1UL << _REGION1_SHIFT) +#define _REGION2_SIZE (1UL << _REGION2_SHIFT) +#define _REGION3_SIZE (1UL << _REGION3_SHIFT) +#define _SEGMENT_SIZE (1UL << _SEGMENT_SHIFT) + +#define _REGION1_MASK (~(_REGION1_SIZE - 1)) +#define _REGION2_MASK (~(_REGION2_SIZE - 1)) +#define _REGION3_MASK (~(_REGION3_SIZE - 1)) +#define _SEGMENT_MASK (~(_SEGMENT_SIZE - 1)) + +#define PMD_SHIFT _SEGMENT_SHIFT +#define PUD_SHIFT _REGION3_SHIFT +#define P4D_SHIFT _REGION2_SHIFT +#define PGDIR_SHIFT _REGION1_SHIFT + +#define PMD_SIZE _SEGMENT_SIZE +#define PUD_SIZE _REGION3_SIZE +#define P4D_SIZE _REGION2_SIZE +#define PGDIR_SIZE _REGION1_SIZE + +#define PMD_MASK _SEGMENT_MASK +#define PUD_MASK _REGION3_MASK +#define P4D_MASK _REGION2_MASK +#define PGDIR_MASK _REGION1_MASK + +#define PTRS_PER_PTE _PAGE_ENTRIES +#define PTRS_PER_PMD _CRST_ENTRIES +#define PTRS_PER_PUD _CRST_ENTRIES +#define PTRS_PER_P4D _CRST_ENTRIES +#define PTRS_PER_PGD _CRST_ENTRIES + /* * Segment table and region3 table entry encoding * (R = read-only, I = invalid, y = young bit): @@ -376,6 +375,7 @@ static inline int is_module_addr(void *addr) /* Guest Page State used for virtualization */ #define _PGSTE_GPS_ZERO 0x0000000080000000UL +#define _PGSTE_GPS_NODAT 0x0000000040000000UL #define _PGSTE_GPS_USAGE_MASK 0x0000000003000000UL #define _PGSTE_GPS_USAGE_STABLE 0x0000000000000000UL #define _PGSTE_GPS_USAGE_UNUSED 0x0000000001000000UL @@ -505,7 +505,7 @@ static inline int mm_alloc_pgste(struct mm_struct *mm) * In the case that a guest uses storage keys * faults should no longer be backed by zero pages */ -#define mm_forbids_zeropage mm_use_skey +#define mm_forbids_zeropage mm_has_pgste static inline int mm_use_skey(struct mm_struct *mm) { #ifdef CONFIG_PGSTE @@ -952,15 +952,30 @@ static inline pte_t pte_mkhuge(pte_t pte) #define IPTE_GLOBAL 0 #define IPTE_LOCAL 1 -static inline void __ptep_ipte(unsigned long address, pte_t *ptep, int local) +#define IPTE_NODAT 0x400 +#define IPTE_GUEST_ASCE 0x800 + +static inline void __ptep_ipte(unsigned long address, pte_t *ptep, + unsigned long opt, unsigned long asce, + int local) { unsigned long pto = (unsigned long) ptep; - /* Invalidation + TLB flush for the pte */ + if (__builtin_constant_p(opt) && opt == 0) { + /* Invalidation + TLB flush for the pte */ + asm volatile( + " .insn rrf,0xb2210000,%[r1],%[r2],0,%[m4]" + : "+m" (*ptep) : [r1] "a" (pto), [r2] "a" (address), + [m4] "i" (local)); + return; + } + + /* Invalidate ptes with options + TLB flush of the ptes */ + opt = opt | (asce & _ASCE_ORIGIN); asm volatile( - " .insn rrf,0xb2210000,%[r1],%[r2],0,%[m4]" - : "+m" (*ptep) : [r1] "a" (pto), [r2] "a" (address), - [m4] "i" (local)); + " .insn rrf,0xb2210000,%[r1],%[r2],%[r3],%[m4]" + : [r2] "+a" (address), [r3] "+a" (opt) + : [r1] "a" (pto), [m4] "i" (local) : "memory"); } static inline void __ptep_ipte_range(unsigned long address, int nr, @@ -1341,31 +1356,61 @@ static inline void __pmdp_csp(pmd_t *pmdp) #define IDTE_GLOBAL 0 #define IDTE_LOCAL 1 -static inline void __pmdp_idte(unsigned long address, pmd_t *pmdp, int local) +#define IDTE_PTOA 0x0800 +#define IDTE_NODAT 0x1000 +#define IDTE_GUEST_ASCE 0x2000 + +static inline void __pmdp_idte(unsigned long addr, pmd_t *pmdp, + unsigned long opt, unsigned long asce, + int local) { unsigned long sto; - sto = (unsigned long) pmdp - pmd_index(address) * sizeof(pmd_t); - asm volatile( - " .insn rrf,0xb98e0000,%[r1],%[r2],0,%[m4]" - : "+m" 
(*pmdp) - : [r1] "a" (sto), [r2] "a" ((address & HPAGE_MASK)), - [m4] "i" (local) - : "cc" ); + sto = (unsigned long) pmdp - pmd_index(addr) * sizeof(pmd_t); + if (__builtin_constant_p(opt) && opt == 0) { + /* flush without guest asce */ + asm volatile( + " .insn rrf,0xb98e0000,%[r1],%[r2],0,%[m4]" + : "+m" (*pmdp) + : [r1] "a" (sto), [r2] "a" ((addr & HPAGE_MASK)), + [m4] "i" (local) + : "cc" ); + } else { + /* flush with guest asce */ + asm volatile( + " .insn rrf,0xb98e0000,%[r1],%[r2],%[r3],%[m4]" + : "+m" (*pmdp) + : [r1] "a" (sto), [r2] "a" ((addr & HPAGE_MASK) | opt), + [r3] "a" (asce), [m4] "i" (local) + : "cc" ); + } } -static inline void __pudp_idte(unsigned long address, pud_t *pudp, int local) +static inline void __pudp_idte(unsigned long addr, pud_t *pudp, + unsigned long opt, unsigned long asce, + int local) { unsigned long r3o; - r3o = (unsigned long) pudp - pud_index(address) * sizeof(pud_t); + r3o = (unsigned long) pudp - pud_index(addr) * sizeof(pud_t); r3o |= _ASCE_TYPE_REGION3; - asm volatile( - " .insn rrf,0xb98e0000,%[r1],%[r2],0,%[m4]" - : "+m" (*pudp) - : [r1] "a" (r3o), [r2] "a" ((address & PUD_MASK)), - [m4] "i" (local) - : "cc"); + if (__builtin_constant_p(opt) && opt == 0) { + /* flush without guest asce */ + asm volatile( + " .insn rrf,0xb98e0000,%[r1],%[r2],0,%[m4]" + : "+m" (*pudp) + : [r1] "a" (r3o), [r2] "a" ((addr & PUD_MASK)), + [m4] "i" (local) + : "cc"); + } else { + /* flush with guest asce */ + asm volatile( + " .insn rrf,0xb98e0000,%[r1],%[r2],%[r3],%[m4]" + : "+m" (*pudp) + : [r1] "a" (r3o), [r2] "a" ((addr & PUD_MASK) | opt), + [r3] "a" (asce), [m4] "i" (local) + : "cc" ); + } } pmd_t pmdp_xchg_direct(struct mm_struct *, unsigned long, pmd_t *, pmd_t); @@ -1548,8 +1593,6 @@ static inline swp_entry_t __swp_entry(unsigned long type, unsigned long offset) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -#endif /* !__ASSEMBLY__ */ - #define kern_addr_valid(addr) (1) extern int vmem_add_mapping(unsigned long start, unsigned long size); diff --git a/arch/s390/include/asm/qdio.h b/arch/s390/include/asm/qdio.h index 998b61cd0e56..eaee69e7c42a 100644 --- a/arch/s390/include/asm/qdio.h +++ b/arch/s390/include/asm/qdio.h @@ -80,7 +80,7 @@ struct qdr { u32 qkey : 4; u32 : 28; struct qdesfmt0 qdf0[126]; -} __attribute__ ((packed, aligned(4096))); +} __packed __aligned(PAGE_SIZE); #define QIB_AC_OUTBOUND_PCI_SUPPORTED 0x40 #define QIB_RFLAGS_ENABLE_QEBSM 0x80 diff --git a/arch/s390/include/asm/setup.h b/arch/s390/include/asm/setup.h index cd78155b1829..490e035b3716 100644 --- a/arch/s390/include/asm/setup.h +++ b/arch/s390/include/asm/setup.h @@ -29,8 +29,10 @@ #define MACHINE_FLAG_TE _BITUL(11) #define MACHINE_FLAG_TLB_LC _BITUL(12) #define MACHINE_FLAG_VX _BITUL(13) -#define MACHINE_FLAG_NX _BITUL(14) -#define MACHINE_FLAG_GS _BITUL(15) +#define MACHINE_FLAG_TLB_GUEST _BITUL(14) +#define MACHINE_FLAG_NX _BITUL(15) +#define MACHINE_FLAG_GS _BITUL(16) +#define MACHINE_FLAG_SCC _BITUL(17) #define LPP_MAGIC _BITUL(31) #define LPP_PFAULT_PID_MASK _AC(0xffffffff, UL) @@ -68,8 +70,10 @@ extern void detect_memory_memblock(void); #define MACHINE_HAS_TE (S390_lowcore.machine_flags & MACHINE_FLAG_TE) #define MACHINE_HAS_TLB_LC (S390_lowcore.machine_flags & MACHINE_FLAG_TLB_LC) #define MACHINE_HAS_VX (S390_lowcore.machine_flags & MACHINE_FLAG_VX) +#define MACHINE_HAS_TLB_GUEST (S390_lowcore.machine_flags & MACHINE_FLAG_TLB_GUEST) #define MACHINE_HAS_NX (S390_lowcore.machine_flags & MACHINE_FLAG_NX) 
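MACHINE_HAS_TLB_GUEST, defined just above, gates the guest-ASCE variants of IPTE/IDTE added in the pgtable.h hunk; the __tlb_flush_idte() change further down uses exactly this pattern. A hedged sketch of that pattern, using only names introduced in this series; idte_flush_options() itself is hypothetical:

#include <asm/pgtable.h>
#include <asm/setup.h>

/* Request the guest-ASCE IDTE variant only when the machine supports it. */
static inline unsigned long idte_flush_options(void)
{
	unsigned long opt = IDTE_PTOA;

	if (MACHINE_HAS_TLB_GUEST)
		opt |= IDTE_GUEST_ASCE;
	return opt;
}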
#define MACHINE_HAS_GS (S390_lowcore.machine_flags & MACHINE_FLAG_GS) +#define MACHINE_HAS_SCC (S390_lowcore.machine_flags & MACHINE_FLAG_SCC) /* * Console mode. Override with conmode= @@ -104,9 +108,16 @@ extern void pfault_fini(void); #define pfault_fini() do { } while (0) #endif /* CONFIG_PFAULT */ +#ifdef CONFIG_VMCP +void vmcp_cma_reserve(void); +#else +static inline void vmcp_cma_reserve(void) { } +#endif + void report_user_fault(struct pt_regs *regs, long signr, int is_mm_fault); -extern void cmma_init(void); +void cmma_init(void); +void cmma_init_nodat(void); extern void (*_machine_restart)(char *command); extern void (*_machine_halt)(void); diff --git a/arch/s390/include/asm/spinlock.h b/arch/s390/include/asm/spinlock.h index f7838ecd83c6..8182b521c42f 100644 --- a/arch/s390/include/asm/spinlock.h +++ b/arch/s390/include/asm/spinlock.h @@ -92,17 +92,11 @@ static inline void arch_spin_unlock(arch_spinlock_t *lp) { typecheck(int, lp->lock); asm volatile( - "st %1,%0\n" - : "+Q" (lp->lock) - : "d" (0) - : "cc", "memory"); -} - -static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - while (arch_spin_is_locked(lock)) - arch_spin_relax(lock); - smp_acquire__after_ctrl_dep(); +#ifdef CONFIG_HAVE_MARCH_ZEC12_FEATURES + " .long 0xb2fa0070\n" /* NIAI 7 */ +#endif + " st %1,%0\n" + : "=Q" (lp->lock) : "d" (0) : "cc", "memory"); } /* diff --git a/arch/s390/include/asm/timex.h b/arch/s390/include/asm/timex.h index 118535123f34..93f2eb3f277c 100644 --- a/arch/s390/include/asm/timex.h +++ b/arch/s390/include/asm/timex.h @@ -15,6 +15,8 @@ /* The value of the TOD clock for 1.1.1970. */ #define TOD_UNIX_EPOCH 0x7d91048bca000000ULL +extern u64 clock_comparator_max; + /* Inline functions for clock register access. */ static inline int set_tod_clock(__u64 time) { @@ -126,7 +128,7 @@ static inline unsigned long long local_tick_disable(void) unsigned long long old; old = S390_lowcore.clock_comparator; - S390_lowcore.clock_comparator = -1ULL; + S390_lowcore.clock_comparator = clock_comparator_max; set_clock_comparator(S390_lowcore.clock_comparator); return old; } @@ -174,24 +176,24 @@ static inline cycles_t get_cycles(void) return (cycles_t) get_tod_clock() >> 2; } -int get_phys_clock(unsigned long long *clock); +int get_phys_clock(unsigned long *clock); void init_cpu_timer(void); unsigned long long monotonic_clock(void); -extern u64 sched_clock_base_cc; +extern unsigned char tod_clock_base[16] __aligned(8); /** * get_clock_monotonic - returns current time in clock rate units * * The caller must ensure that preemption is disabled. - * The clock and sched_clock_base get changed via stop_machine. + * The clock and tod_clock_base get changed via stop_machine. * Therefore preemption must be disabled when calling this * function, otherwise the returned value is not guaranteed to * be monotonic. 
*/ static inline unsigned long long get_tod_clock_monotonic(void) { - return get_tod_clock() - sched_clock_base_cc; + return get_tod_clock() - *(unsigned long long *) &tod_clock_base[1]; } /** @@ -218,4 +220,32 @@ static inline unsigned long long tod_to_ns(unsigned long long todval) return ((todval >> 9) * 125) + (((todval & 0x1ff) * 125) >> 9); } +/** + * tod_after - compare two 64 bit TOD values + * @a: first 64 bit TOD timestamp + * @b: second 64 bit TOD timestamp + * + * Returns: true if a is later than b + */ +static inline int tod_after(unsigned long long a, unsigned long long b) +{ + if (MACHINE_HAS_SCC) + return (long long) a > (long long) b; + return a > b; +} + +/** + * tod_after_eq - compare two 64 bit TOD values + * @a: first 64 bit TOD timestamp + * @b: second 64 bit TOD timestamp + * + * Returns: true if a is later than b + */ +static inline int tod_after_eq(unsigned long long a, unsigned long long b) +{ + if (MACHINE_HAS_SCC) + return (long long) a >= (long long) b; + return a >= b; +} + #endif diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h index 2eb8ff0d6fca..3a14b864b2e3 100644 --- a/arch/s390/include/asm/tlb.h +++ b/arch/s390/include/asm/tlb.h @@ -135,7 +135,7 @@ static inline void pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, static inline void pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd, unsigned long address) { - if (tlb->mm->context.asce_limit <= (1UL << 31)) + if (tlb->mm->context.asce_limit <= _REGION3_SIZE) return; pgtable_pmd_page_dtor(virt_to_page(pmd)); tlb_remove_table(tlb, pmd); @@ -151,7 +151,7 @@ static inline void pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd, static inline void p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d, unsigned long address) { - if (tlb->mm->context.asce_limit <= (1UL << 53)) + if (tlb->mm->context.asce_limit <= _REGION1_SIZE) return; tlb_remove_table(tlb, p4d); } @@ -166,7 +166,7 @@ static inline void p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d, static inline void pud_free_tlb(struct mmu_gather *tlb, pud_t *pud, unsigned long address) { - if (tlb->mm->context.asce_limit <= (1UL << 42)) + if (tlb->mm->context.asce_limit <= _REGION2_SIZE) return; tlb_remove_table(tlb, pud); } diff --git a/arch/s390/include/asm/tlbflush.h b/arch/s390/include/asm/tlbflush.h index 39846100682a..4d759f8f4bc7 100644 --- a/arch/s390/include/asm/tlbflush.h +++ b/arch/s390/include/asm/tlbflush.h @@ -20,10 +20,15 @@ static inline void __tlb_flush_local(void) */ static inline void __tlb_flush_idte(unsigned long asce) { + unsigned long opt; + + opt = IDTE_PTOA; + if (MACHINE_HAS_TLB_GUEST) + opt |= IDTE_GUEST_ASCE; /* Global TLB flush for the mm */ asm volatile( " .insn rrf,0xb98e0000,0,%0,%1,0" - : : "a" (2048), "a" (asce) : "cc"); + : : "a" (opt), "a" (asce) : "cc"); } #ifdef CONFIG_SMP diff --git a/arch/s390/include/asm/topology.h b/arch/s390/include/asm/topology.h index fa1bfce10370..5222da162b69 100644 --- a/arch/s390/include/asm/topology.h +++ b/arch/s390/include/asm/topology.h @@ -77,12 +77,6 @@ static inline const struct cpumask *cpumask_of_node(int node) return &node_to_cpumask_map[node]; } -/* - * Returns the number of the node containing node 'node'. This - * architecture is flat, so it is a pretty simple function! 
- */ -#define parent_node(node) (node) - #define pcibus_to_node(bus) __pcibus_to_node(bus) #define node_distance(a, b) __node_distance(a, b) diff --git a/arch/s390/include/asm/types.h b/arch/s390/include/asm/types.h deleted file mode 100644 index 6740f4f9781f..000000000000 --- a/arch/s390/include/asm/types.h +++ /dev/null @@ -1,11 +0,0 @@ -/* - * S390 version - * - * Derived from "include/asm-i386/types.h" - */ -#ifndef _S390_TYPES_H -#define _S390_TYPES_H - -#include <uapi/asm/types.h> - -#endif /* _S390_TYPES_H */ diff --git a/arch/s390/include/asm/unaligned.h b/arch/s390/include/asm/unaligned.h deleted file mode 100644 index da9627afe5d8..000000000000 --- a/arch/s390/include/asm/unaligned.h +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef _ASM_S390_UNALIGNED_H -#define _ASM_S390_UNALIGNED_H - -/* - * The S390 can do unaligned accesses itself. - */ -#include <linux/unaligned/access_ok.h> -#include <linux/unaligned/generic.h> - -#define get_unaligned __get_unaligned_be -#define put_unaligned __put_unaligned_be - -#endif /* _ASM_S390_UNALIGNED_H */ diff --git a/arch/s390/include/uapi/asm/Kbuild b/arch/s390/include/uapi/asm/Kbuild index ca62066895e0..098f28778a13 100644 --- a/arch/s390/include/uapi/asm/Kbuild +++ b/arch/s390/include/uapi/asm/Kbuild @@ -9,4 +9,5 @@ generic-y += param.h generic-y += poll.h generic-y += resource.h generic-y += sockios.h +generic-y += swab.h generic-y += termbits.h diff --git a/arch/s390/include/uapi/asm/dasd.h b/arch/s390/include/uapi/asm/dasd.h index 1340311dab77..ab5797cdc1b7 100644 --- a/arch/s390/include/uapi/asm/dasd.h +++ b/arch/s390/include/uapi/asm/dasd.h @@ -72,7 +72,10 @@ typedef struct dasd_information2_t { * 0x02: use diag discipline (diag) * 0x04: set the device initially online (internal use only) * 0x08: enable ERP related logging - * 0x20: give access to raw eckd data + * 0x10: allow I/O to fail on lost paths + * 0x20: allow I/O to fail when a lock was stolen + * 0x40: give access to raw eckd data + * 0x80: enable discard support */ #define DASD_FEATURE_DEFAULT 0x00 #define DASD_FEATURE_READONLY 0x01 @@ -82,6 +85,7 @@ typedef struct dasd_information2_t { #define DASD_FEATURE_FAILFAST 0x10 #define DASD_FEATURE_FAILONSLCK 0x20 #define DASD_FEATURE_USERAW 0x40 +#define DASD_FEATURE_DISCARD 0x80 #define DASD_PARTN_BITS 2 diff --git a/arch/s390/include/uapi/asm/socket.h b/arch/s390/include/uapi/asm/socket.h index 52a63f4175cb..a56916c83565 100644 --- a/arch/s390/include/uapi/asm/socket.h +++ b/arch/s390/include/uapi/asm/socket.h @@ -108,4 +108,6 @@ #define SO_PEERGROUPS 59 +#define SO_ZEROCOPY 60 + #endif /* _ASM_SOCKET_H */ diff --git a/arch/s390/include/uapi/asm/swab.h b/arch/s390/include/uapi/asm/swab.h deleted file mode 100644 index da3bfe5cc161..000000000000 --- a/arch/s390/include/uapi/asm/swab.h +++ /dev/null @@ -1,89 +0,0 @@ -#ifndef _S390_SWAB_H -#define _S390_SWAB_H - -/* - * S390 version - * Copyright IBM Corp. 
1999 - * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com) - */ - -#include <linux/types.h> - -#ifndef __s390x__ -# define __SWAB_64_THRU_32__ -#endif - -#ifdef __s390x__ -static inline __u64 __arch_swab64p(const __u64 *x) -{ - __u64 result; - - asm volatile("lrvg %0,%1" : "=d" (result) : "m" (*x)); - return result; -} -#define __arch_swab64p __arch_swab64p - -static inline __u64 __arch_swab64(__u64 x) -{ - __u64 result; - - asm volatile("lrvgr %0,%1" : "=d" (result) : "d" (x)); - return result; -} -#define __arch_swab64 __arch_swab64 - -static inline void __arch_swab64s(__u64 *x) -{ - *x = __arch_swab64p(x); -} -#define __arch_swab64s __arch_swab64s -#endif /* __s390x__ */ - -static inline __u32 __arch_swab32p(const __u32 *x) -{ - __u32 result; - - asm volatile( -#ifndef __s390x__ - " icm %0,8,%O1+3(%R1)\n" - " icm %0,4,%O1+2(%R1)\n" - " icm %0,2,%O1+1(%R1)\n" - " ic %0,%1" - : "=&d" (result) : "Q" (*x) : "cc"); -#else /* __s390x__ */ - " lrv %0,%1" - : "=d" (result) : "m" (*x)); -#endif /* __s390x__ */ - return result; -} -#define __arch_swab32p __arch_swab32p - -#ifdef __s390x__ -static inline __u32 __arch_swab32(__u32 x) -{ - __u32 result; - - asm volatile("lrvr %0,%1" : "=d" (result) : "d" (x)); - return result; -} -#define __arch_swab32 __arch_swab32 -#endif /* __s390x__ */ - -static inline __u16 __arch_swab16p(const __u16 *x) -{ - __u16 result; - - asm volatile( -#ifndef __s390x__ - " icm %0,2,%O1+1(%R1)\n" - " ic %0,%1\n" - : "=&d" (result) : "Q" (*x) : "cc"); -#else /* __s390x__ */ - " lrvh %0,%1" - : "=d" (result) : "m" (*x)); -#endif /* __s390x__ */ - return result; -} -#define __arch_swab16p __arch_swab16p - -#endif /* _S390_SWAB_H */ diff --git a/arch/s390/include/uapi/asm/vmcp.h b/arch/s390/include/uapi/asm/vmcp.h new file mode 100644 index 000000000000..4caf71714a55 --- /dev/null +++ b/arch/s390/include/uapi/asm/vmcp.h @@ -0,0 +1,24 @@ +/* + * Copyright IBM Corp. 2004, 2005 + * Interface implementation for communication with the z/VM control program + * Version 1.0 + * Author(s): Christian Borntraeger <cborntra@de.ibm.com> + * + * + * z/VMs CP offers the possibility to issue commands via the diagnose code 8 + * this driver implements a character device that issues these commands and + * returns the answer of CP. 
+ * + * The idea of this driver is based on cpint from Neale Ferguson + */ + +#ifndef _UAPI_ASM_VMCP_H +#define _UAPI_ASM_VMCP_H + +#include <linux/ioctl.h> + +#define VMCP_GETCODE _IOR(0x10, 1, int) +#define VMCP_SETBUF _IOW(0x10, 2, int) +#define VMCP_GETSIZE _IOR(0x10, 3, int) + +#endif /* _UAPI_ASM_VMCP_H */ diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c index b65c414b6c0e..3d42f91c95fd 100644 --- a/arch/s390/kernel/asm-offsets.c +++ b/arch/s390/kernel/asm-offsets.c @@ -158,6 +158,7 @@ int main(void) OFFSET(__LC_LAST_UPDATE_CLOCK, lowcore, last_update_clock); OFFSET(__LC_INT_CLOCK, lowcore, int_clock); OFFSET(__LC_MCCK_CLOCK, lowcore, mcck_clock); + OFFSET(__LC_BOOT_CLOCK, lowcore, boot_clock); OFFSET(__LC_CURRENT, lowcore, current_task); OFFSET(__LC_KERNEL_STACK, lowcore, kernel_stack); OFFSET(__LC_ASYNC_STACK, lowcore, async_stack); diff --git a/arch/s390/kernel/cpcmd.c b/arch/s390/kernel/cpcmd.c index 9f0e4a2785f7..63bc6603e0ed 100644 --- a/arch/s390/kernel/cpcmd.c +++ b/arch/s390/kernel/cpcmd.c @@ -14,6 +14,7 @@ #include <linux/spinlock.h> #include <linux/stddef.h> #include <linux/string.h> +#include <linux/mm.h> #include <asm/diag.h> #include <asm/ebcdic.h> #include <asm/cpcmd.h> @@ -28,9 +29,7 @@ static int diag8_noresponse(int cmdlen) register unsigned long reg3 asm ("3") = cmdlen; asm volatile( - " sam31\n" " diag %1,%0,0x8\n" - " sam64\n" : "+d" (reg3) : "d" (reg2) : "cc"); return reg3; } @@ -43,9 +42,7 @@ static int diag8_response(int cmdlen, char *response, int *rlen) register unsigned long reg5 asm ("5") = *rlen; asm volatile( - " sam31\n" " diag %2,%0,0x8\n" - " sam64\n" " brc 8,1f\n" " agr %1,%4\n" "1:\n" @@ -57,7 +54,6 @@ static int diag8_response(int cmdlen, char *response, int *rlen) /* * __cpcmd has some restrictions over cpcmd - * - the response buffer must reside below 2GB (if any) * - __cpcmd is unlocked and therefore not SMP-safe */ int __cpcmd(const char *cmd, char *response, int rlen, int *response_code) @@ -88,13 +84,12 @@ EXPORT_SYMBOL(__cpcmd); int cpcmd(const char *cmd, char *response, int rlen, int *response_code) { + unsigned long flags; char *lowbuf; int len; - unsigned long flags; - if ((virt_to_phys(response) != (unsigned long) response) || - (((unsigned long)response + rlen) >> 31)) { - lowbuf = kmalloc(rlen, GFP_KERNEL | GFP_DMA); + if (is_vmalloc_or_module_addr(response)) { + lowbuf = kmalloc(rlen, GFP_KERNEL); if (!lowbuf) { pr_warn("The cpcmd kernel function failed to allocate a response buffer\n"); return -ENOMEM; diff --git a/arch/s390/kernel/debug.c b/arch/s390/kernel/debug.c index 86b3e74f569e..1d9e83c401fc 100644 --- a/arch/s390/kernel/debug.c +++ b/arch/s390/kernel/debug.c @@ -866,7 +866,8 @@ static inline void debug_finish_entry(debug_info_t * id, debug_entry_t* active, int level, int exception) { - active->id.stck = get_tod_clock_fast() - sched_clock_base_cc; + active->id.stck = get_tod_clock_fast() - + *(unsigned long long *) &tod_clock_base[1]; active->id.fields.cpuid = smp_processor_id(); active->caller = __builtin_return_address(0); active->id.fields.exception = exception; @@ -1455,15 +1456,15 @@ int debug_dflt_header_fn(debug_info_t * id, struct debug_view *view, int area, debug_entry_t * entry, char *out_buf) { - unsigned long sec, usec; + unsigned long base, sec, usec; char *except_str; unsigned long caller; int rc = 0; unsigned int level; level = entry->id.fields.level; - sec = (entry->id.stck >> 12) + (sched_clock_base_cc >> 12); - sec = sec - (TOD_UNIX_EPOCH >> 12); + base = (*(unsigned long *) 
&tod_clock_base[0]) >> 4; + sec = (entry->id.stck >> 12) + base - (TOD_UNIX_EPOCH >> 12); usec = do_div(sec, USEC_PER_SEC); if (entry->id.fields.exception) diff --git a/arch/s390/kernel/dumpstack.c b/arch/s390/kernel/dumpstack.c index dab78babfab6..2aa545dca4d5 100644 --- a/arch/s390/kernel/dumpstack.c +++ b/arch/s390/kernel/dumpstack.c @@ -76,7 +76,7 @@ void dump_trace(dump_trace_func_t func, void *data, struct task_struct *task, frame_size = STACK_FRAME_OVERHEAD + sizeof(struct pt_regs); #ifdef CONFIG_CHECK_STACK sp = __dump_trace(func, data, sp, - S390_lowcore.panic_stack + frame_size - 4096, + S390_lowcore.panic_stack + frame_size - PAGE_SIZE, S390_lowcore.panic_stack + frame_size); #endif sp = __dump_trace(func, data, sp, diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c index 5d20182ee8ae..ca8cd80e8feb 100644 --- a/arch/s390/kernel/early.c +++ b/arch/s390/kernel/early.c @@ -53,8 +53,9 @@ static void __init reset_tod_clock(void) if (set_tod_clock(TOD_UNIX_EPOCH) != 0 || store_tod_clock(&time) != 0) disabled_wait(0); - sched_clock_base_cc = TOD_UNIX_EPOCH; - S390_lowcore.last_update_clock = sched_clock_base_cc; + memset(tod_clock_base, 0, 16); + *(__u64 *) &tod_clock_base[1] = TOD_UNIX_EPOCH; + S390_lowcore.last_update_clock = TOD_UNIX_EPOCH; } #ifdef CONFIG_SHARED_KERNEL @@ -165,8 +166,8 @@ static noinline __init void create_kernel_nss(void) } /* re-initialize cputime accounting. */ - sched_clock_base_cc = get_tod_clock(); - S390_lowcore.last_update_clock = sched_clock_base_cc; + get_tod_clock_ext(tod_clock_base); + S390_lowcore.last_update_clock = *(__u64 *) &tod_clock_base[1]; S390_lowcore.last_update_timer = 0x7fffffffffffffffULL; S390_lowcore.user_timer = 0; S390_lowcore.system_timer = 0; @@ -387,6 +388,12 @@ static __init void detect_machine_facilities(void) } if (test_facility(133)) S390_lowcore.machine_flags |= MACHINE_FLAG_GS; + if (test_facility(139) && (tod_clock_base[1] & 0x80)) { + /* Enabled signed clock comparator comparisons */ + S390_lowcore.machine_flags |= MACHINE_FLAG_SCC; + clock_comparator_max = -1ULL >> 1; + __ctl_set_bit(0, 53); + } } static inline void save_vector_registers(void) @@ -413,7 +420,7 @@ static int __init disable_vector_extension(char *str) { S390_lowcore.machine_flags &= ~MACHINE_FLAG_VX; __ctl_clear_bit(0, 17); - return 1; + return 0; } early_param("novx", disable_vector_extension); diff --git a/arch/s390/kernel/head.S b/arch/s390/kernel/head.S index eff5b31671d4..8ed753c72d9b 100644 --- a/arch/s390/kernel/head.S +++ b/arch/s390/kernel/head.S @@ -302,7 +302,8 @@ ENTRY(startup_kdump) xc 0xe00(256),0xe00 xc 0xf00(256),0xf00 lctlg %c0,%c15,0x200(%r0) # initialize control registers - stck __LC_LAST_UPDATE_CLOCK + stcke __LC_BOOT_CLOCK + mvc __LC_LAST_UPDATE_CLOCK(8),__LC_BOOT_CLOCK+1 spt 6f-.LPG0(%r13) mvc __LC_LAST_UPDATE_TIMER(8),6f-.LPG0(%r13) l %r15,.Lstack-.LPG0(%r13) diff --git a/arch/s390/kernel/head64.S b/arch/s390/kernel/head64.S index 31c91f24e562..0d8f2a858ced 100644 --- a/arch/s390/kernel/head64.S +++ b/arch/s390/kernel/head64.S @@ -21,8 +21,8 @@ ENTRY(startup_continue) xc __LC_LPP+1(7,0),__LC_LPP+1 # clear lpp and current_pid mvi __LC_LPP,0x80 # and set LPP_MAGIC .insn s,0xb2800000,__LC_LPP # load program parameter -0: larl %r1,sched_clock_base_cc - mvc 0(8,%r1),__LC_LAST_UPDATE_CLOCK +0: larl %r1,tod_clock_base + mvc 0(16,%r1),__LC_BOOT_CLOCK larl %r13,.LPG1 # get base lctlg %c0,%c15,.Lctl-.LPG1(%r13) # load control registers lg %r12,.Lparmaddr-.LPG1(%r13) # pointer to parameter area diff --git a/arch/s390/kernel/irq.c 
b/arch/s390/kernel/irq.c index 6dca93b29bed..a2fdff0e730b 100644 --- a/arch/s390/kernel/irq.c +++ b/arch/s390/kernel/irq.c @@ -105,7 +105,8 @@ void do_IRQ(struct pt_regs *regs, int irq) old_regs = set_irq_regs(regs); irq_enter(); - if (S390_lowcore.int_clock >= S390_lowcore.clock_comparator) + if (tod_after_eq(S390_lowcore.int_clock, + S390_lowcore.clock_comparator)) /* Serve timer interrupts first. */ clock_comparator_work(); generic_handle_irq(irq); diff --git a/arch/s390/kernel/relocate_kernel.S b/arch/s390/kernel/relocate_kernel.S index cfac28330b03..4bdc65636603 100644 --- a/arch/s390/kernel/relocate_kernel.S +++ b/arch/s390/kernel/relocate_kernel.S @@ -7,6 +7,7 @@ */ #include <linux/linkage.h> +#include <asm/page.h> #include <asm/sigp.h> /* @@ -55,8 +56,8 @@ ENTRY(relocate_kernel) .back_pgm: lmg %r0,%r15,gprregs-.base(%r13) .top: - lghi %r7,4096 # load PAGE_SIZE in r7 - lghi %r9,4096 # load PAGE_SIZE in r9 + lghi %r7,PAGE_SIZE # load PAGE_SIZE in r7 + lghi %r9,PAGE_SIZE # load PAGE_SIZE in r9 lg %r5,0(%r2) # read another word for indirection page aghi %r2,8 # increment pointer tml %r5,0x1 # is it a destination page? diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 3d1d808ea8a9..164a1e16b53e 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -305,7 +305,7 @@ static void __init setup_lowcore(void) /* * Setup lowcore for boot cpu */ - BUILD_BUG_ON(sizeof(struct lowcore) != LC_PAGES * 4096); + BUILD_BUG_ON(sizeof(struct lowcore) != LC_PAGES * PAGE_SIZE); lc = memblock_virt_alloc_low(sizeof(*lc), sizeof(*lc)); lc->restart_psw.mask = PSW_KERNEL_BITS; lc->restart_psw.addr = (unsigned long) restart_int_handler; @@ -323,7 +323,7 @@ static void __init setup_lowcore(void) lc->io_new_psw.mask = PSW_KERNEL_BITS | PSW_MASK_DAT | PSW_MASK_MCHECK; lc->io_new_psw.addr = (unsigned long) io_int_handler; - lc->clock_comparator = -1ULL; + lc->clock_comparator = clock_comparator_max; lc->kernel_stack = ((unsigned long) &init_thread_union) + THREAD_SIZE - STACK_FRAME_OVERHEAD - sizeof(struct pt_regs); lc->async_stack = (unsigned long) @@ -469,10 +469,10 @@ static void __init setup_memory_end(void) vmalloc_size = VMALLOC_END ?: (128UL << 30) - MODULES_LEN; tmp = (memory_end ?: max_physmem_end) / PAGE_SIZE; tmp = tmp * (sizeof(struct page) + PAGE_SIZE); - if (tmp + vmalloc_size + MODULES_LEN <= (1UL << 42)) - vmax = 1UL << 42; /* 3-level kernel page table */ + if (tmp + vmalloc_size + MODULES_LEN <= _REGION2_SIZE) + vmax = _REGION2_SIZE; /* 3-level kernel page table */ else - vmax = 1UL << 53; /* 4-level kernel page table */ + vmax = _REGION1_SIZE; /* 4-level kernel page table */ /* module area is at the end of the kernel address space. */ MODULES_END = vmax; MODULES_VADDR = MODULES_END - MODULES_LEN; @@ -818,6 +818,9 @@ static int __init setup_hwcaps(void) case 0x2965: strcpy(elf_platform, "z13"); break; + case 0x3906: + strcpy(elf_platform, "z14"); + break; } /* @@ -922,6 +925,7 @@ void __init setup_arch(char **cmdline_p) setup_memory_end(); setup_memory(); dma_contiguous_reserve(memory_end); + vmcp_cma_reserve(); check_initrd(); reserve_crashkernel(); diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 1020a11a24e5..1cee6753d47a 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -1181,6 +1181,7 @@ static int __init s390_smp_init(void) rc = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "s390/smp:online", smp_cpu_online, smp_cpu_pre_down); + rc = rc <= 0 ? 
rc : 0; out: return rc; } diff --git a/arch/s390/kernel/suspend.c b/arch/s390/kernel/suspend.c index 39e2f41b6cf0..c8ea715bfe10 100644 --- a/arch/s390/kernel/suspend.c +++ b/arch/s390/kernel/suspend.c @@ -98,10 +98,16 @@ int page_key_alloc(unsigned long pages) */ void page_key_read(unsigned long *pfn) { + struct page *page; unsigned long addr; - - addr = (unsigned long) page_address(pfn_to_page(*pfn)); - *(unsigned char *) pfn = (unsigned char) page_get_storage_key(addr); + unsigned char key; + + page = pfn_to_page(*pfn); + addr = (unsigned long) page_address(page); + key = (unsigned char) page_get_storage_key(addr) & 0x7f; + if (arch_test_page_nodat(page)) + key |= 0x80; + *(unsigned char *) pfn = key; } /* @@ -126,8 +132,16 @@ void page_key_memorize(unsigned long *pfn) */ void page_key_write(void *address) { - page_set_storage_key((unsigned long) address, - page_key_rp->data[page_key_rx], 0); + struct page *page; + unsigned char key; + + key = page_key_rp->data[page_key_rx]; + page_set_storage_key((unsigned long) address, key & 0x7f, 0); + page = virt_to_page(address); + if (key & 0x80) + arch_set_page_nodat(page, 0); + else + arch_set_page_dat(page, 0); if (++page_key_rx >= PAGE_KEY_DATA_SIZE) return; page_key_rp = page_key_rp->next; diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index 192efdfac918..5cbd52169348 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -51,8 +51,15 @@ #include <asm/cio.h> #include "entry.h" -u64 sched_clock_base_cc = -1; /* Force to data section. */ -EXPORT_SYMBOL_GPL(sched_clock_base_cc); +unsigned char tod_clock_base[16] __aligned(8) = { + /* Force to data section. */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff +}; +EXPORT_SYMBOL_GPL(tod_clock_base); + +u64 clock_comparator_max = -1ULL; +EXPORT_SYMBOL_GPL(clock_comparator_max); static DEFINE_PER_CPU(struct clock_event_device, comparators); @@ -75,7 +82,7 @@ void __init time_early_init(void) struct ptff_qui qui; /* Initialize TOD steering parameters */ - tod_steering_end = sched_clock_base_cc; + tod_steering_end = *(unsigned long long *) &tod_clock_base[1]; vdso_data->ts_end = tod_steering_end; if (!test_facility(28)) @@ -111,22 +118,27 @@ unsigned long long monotonic_clock(void) } EXPORT_SYMBOL(monotonic_clock); -static void tod_to_timeval(__u64 todval, struct timespec64 *xt) +static void ext_to_timespec64(unsigned char *clk, struct timespec64 *xt) { - unsigned long long sec; + unsigned long long high, low, rem, sec, nsec; + + /* Split extendnd TOD clock to micro-seconds and sub-micro-seconds */ + high = (*(unsigned long long *) clk) >> 4; + low = (*(unsigned long long *)&clk[7]) << 4; + /* Calculate seconds and nano-seconds */ + sec = high; + rem = do_div(sec, 1000000); + nsec = (((low >> 32) + (rem << 32)) * 1000) >> 32; - sec = todval >> 12; - do_div(sec, 1000000); xt->tv_sec = sec; - todval -= (sec * 1000000) << 12; - xt->tv_nsec = ((todval * 1000) >> 12); + xt->tv_nsec = nsec; } void clock_comparator_work(void) { struct clock_event_device *cd; - S390_lowcore.clock_comparator = -1ULL; + S390_lowcore.clock_comparator = clock_comparator_max; cd = this_cpu_ptr(&comparators); cd->event_handler(cd); } @@ -148,7 +160,7 @@ void init_cpu_timer(void) struct clock_event_device *cd; int cpu; - S390_lowcore.clock_comparator = -1ULL; + S390_lowcore.clock_comparator = clock_comparator_max; set_clock_comparator(S390_lowcore.clock_comparator); cpu = smp_processor_id(); @@ -179,7 +191,7 @@ static void 
clock_comparator_interrupt(struct ext_code ext_code, unsigned long param64) { inc_irq_stat(IRQEXT_CLK); - if (S390_lowcore.clock_comparator == -1ULL) + if (S390_lowcore.clock_comparator == clock_comparator_max) set_clock_comparator(S390_lowcore.clock_comparator); } @@ -197,18 +209,28 @@ static void stp_reset(void); void read_persistent_clock64(struct timespec64 *ts) { - __u64 clock; + unsigned char clk[STORE_CLOCK_EXT_SIZE]; + __u64 delta; - clock = get_tod_clock() - initial_leap_seconds; - tod_to_timeval(clock - TOD_UNIX_EPOCH, ts); + delta = initial_leap_seconds + TOD_UNIX_EPOCH; + get_tod_clock_ext(clk); + *(__u64 *) &clk[1] -= delta; + if (*(__u64 *) &clk[1] > delta) + clk[0]--; + ext_to_timespec64(clk, ts); } void read_boot_clock64(struct timespec64 *ts) { - __u64 clock; + unsigned char clk[STORE_CLOCK_EXT_SIZE]; + __u64 delta; - clock = sched_clock_base_cc - initial_leap_seconds; - tod_to_timeval(clock - TOD_UNIX_EPOCH, ts); + delta = initial_leap_seconds + TOD_UNIX_EPOCH; + memcpy(clk, tod_clock_base, 16); + *(__u64 *) &clk[1] -= delta; + if (*(__u64 *) &clk[1] > delta) + clk[0]--; + ext_to_timespec64(clk, ts); } static u64 read_tod_clock(struct clocksource *cs) @@ -335,7 +357,7 @@ static unsigned long clock_sync_flags; * source. If the clock mode is local it will return -EOPNOTSUPP and * -EAGAIN if the clock is not in sync with the external reference. */ -int get_phys_clock(unsigned long long *clock) +int get_phys_clock(unsigned long *clock) { atomic_t *sw_ptr; unsigned int sw0, sw1; @@ -406,7 +428,10 @@ static void clock_sync_global(unsigned long long delta) struct ptff_qto qto; /* Fixup the monotonic sched clock. */ - sched_clock_base_cc += delta; + *(unsigned long long *) &tod_clock_base[1] += delta; + if (*(unsigned long long *) &tod_clock_base[1] < delta) + /* Epoch overflow */ + tod_clock_base[0]++; /* Adjust TOD steering parameters. */ vdso_data->tb_update_count++; now = get_tod_clock(); @@ -437,7 +462,7 @@ static void clock_sync_global(unsigned long long delta) static void clock_sync_local(unsigned long long delta) { /* Add the delta to the clock comparator. */ - if (S390_lowcore.clock_comparator != -1ULL) { + if (S390_lowcore.clock_comparator != clock_comparator_max) { S390_lowcore.clock_comparator += delta; set_clock_comparator(S390_lowcore.clock_comparator); } diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c index b89d19f6f2ab..eacda05b45d7 100644 --- a/arch/s390/kernel/vdso.c +++ b/arch/s390/kernel/vdso.c @@ -157,6 +157,8 @@ int vdso_alloc_per_cpu(struct lowcore *lowcore) page_frame = get_zeroed_page(GFP_KERNEL); if (!segment_table || !page_table || !page_frame) goto out; + arch_set_page_dat(virt_to_page(segment_table), SEGMENT_ORDER); + arch_set_page_dat(virt_to_page(page_table), 0); /* Initialize per-cpu vdso data page */ vd = (struct vdso_per_cpu_data *) page_frame; diff --git a/arch/s390/kernel/vdso32/vdso32.lds.S b/arch/s390/kernel/vdso32/vdso32.lds.S index 8f048c2d6d13..263a7f9eee1e 100644 --- a/arch/s390/kernel/vdso32/vdso32.lds.S +++ b/arch/s390/kernel/vdso32/vdso32.lds.S @@ -2,6 +2,8 @@ * This is the infamous ld script for the 32 bits vdso * library */ + +#include <asm/page.h> #include <asm/vdso.h> OUTPUT_FORMAT("elf32-s390", "elf32-s390", "elf32-s390") @@ -91,7 +93,7 @@ SECTIONS .debug_ranges 0 : { *(.debug_ranges) } .gnu.attributes 0 : { KEEP (*(.gnu.attributes)) } - . = ALIGN(4096); + . 
= ALIGN(PAGE_SIZE); PROVIDE(_vdso_data = .); /DISCARD/ : { diff --git a/arch/s390/kernel/vdso64/vdso64.lds.S b/arch/s390/kernel/vdso64/vdso64.lds.S index f35455d497fe..9e3dbbcc1cfc 100644 --- a/arch/s390/kernel/vdso64/vdso64.lds.S +++ b/arch/s390/kernel/vdso64/vdso64.lds.S @@ -2,6 +2,8 @@ * This is the infamous ld script for the 64 bits vdso * library */ + +#include <asm/page.h> #include <asm/vdso.h> OUTPUT_FORMAT("elf64-s390", "elf64-s390", "elf64-s390") @@ -91,7 +93,7 @@ SECTIONS .debug_ranges 0 : { *(.debug_ranges) } .gnu.attributes 0 : { KEEP (*(.gnu.attributes)) } - . = ALIGN(4096); + . = ALIGN(PAGE_SIZE); PROVIDE(_vdso_data = .); /DISCARD/ : { diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c index ce865bd4f81d..e4d36094aceb 100644 --- a/arch/s390/kvm/diag.c +++ b/arch/s390/kvm/diag.c @@ -27,7 +27,7 @@ static int diag_release_pages(struct kvm_vcpu *vcpu) unsigned long prefix = kvm_s390_get_prefix(vcpu); start = vcpu->run->s.regs.gprs[(vcpu->arch.sie_block->ipa & 0xf0) >> 4]; - end = vcpu->run->s.regs.gprs[vcpu->arch.sie_block->ipa & 0xf] + 4096; + end = vcpu->run->s.regs.gprs[vcpu->arch.sie_block->ipa & 0xf] + PAGE_SIZE; vcpu->stat.diagnose_10++; if (start & ~PAGE_MASK || end & ~PAGE_MASK || start >= end @@ -51,9 +51,9 @@ static int diag_release_pages(struct kvm_vcpu *vcpu) */ gmap_discard(vcpu->arch.gmap, start, prefix); if (start <= prefix) - gmap_discard(vcpu->arch.gmap, 0, 4096); - if (end > prefix + 4096) - gmap_discard(vcpu->arch.gmap, 4096, 8192); + gmap_discard(vcpu->arch.gmap, 0, PAGE_SIZE); + if (end > prefix + PAGE_SIZE) + gmap_discard(vcpu->arch.gmap, PAGE_SIZE, 2 * PAGE_SIZE); gmap_discard(vcpu->arch.gmap, prefix + 2 * PAGE_SIZE, end); } return 0; diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index 653cae5e1ee1..3cc77391a102 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -629,7 +629,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva, iep = ctlreg0.iep && test_kvm_facility(vcpu->kvm, 130); if (asce.r) goto real_address; - ptr = asce.origin * 4096; + ptr = asce.origin * PAGE_SIZE; switch (asce.dt) { case ASCE_TYPE_REGION1: if (vaddr.rfx01 > asce.tl) @@ -674,7 +674,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva, return PGM_REGION_SECOND_TRANS; if (edat1) dat_protection |= rfte.p; - ptr = rfte.rto * 4096 + vaddr.rsx * 8; + ptr = rfte.rto * PAGE_SIZE + vaddr.rsx * 8; } /* fallthrough */ case ASCE_TYPE_REGION2: { @@ -692,7 +692,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva, return PGM_REGION_THIRD_TRANS; if (edat1) dat_protection |= rste.p; - ptr = rste.rto * 4096 + vaddr.rtx * 8; + ptr = rste.rto * PAGE_SIZE + vaddr.rtx * 8; } /* fallthrough */ case ASCE_TYPE_REGION3: { @@ -720,7 +720,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva, return PGM_SEGMENT_TRANSLATION; if (edat1) dat_protection |= rtte.fc0.p; - ptr = rtte.fc0.sto * 4096 + vaddr.sx * 8; + ptr = rtte.fc0.sto * PAGE_SIZE + vaddr.sx * 8; } /* fallthrough */ case ASCE_TYPE_SEGMENT: { @@ -743,7 +743,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva, goto absolute_address; } dat_protection |= ste.fc0.p; - ptr = ste.fc0.pto * 2048 + vaddr.px * 8; + ptr = ste.fc0.pto * (PAGE_SIZE / 2) + vaddr.px * 8; } } if (kvm_is_error_gpa(vcpu->kvm, ptr)) @@ -993,7 +993,7 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr, parent = sg->parent; vaddr.addr = saddr; asce.val = sg->orig_asce; - ptr = 
asce.origin * 4096; + ptr = asce.origin * PAGE_SIZE; if (asce.r) { *fake = 1; ptr = 0; @@ -1029,7 +1029,7 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr, union region1_table_entry rfte; if (*fake) { - ptr += (unsigned long) vaddr.rfx << 53; + ptr += vaddr.rfx * _REGION1_SIZE; rfte.val = ptr; goto shadow_r2t; } @@ -1044,7 +1044,7 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr, return PGM_REGION_SECOND_TRANS; if (sg->edat_level >= 1) *dat_protection |= rfte.p; - ptr = rfte.rto << 12UL; + ptr = rfte.rto * PAGE_SIZE; shadow_r2t: rc = gmap_shadow_r2t(sg, saddr, rfte.val, *fake); if (rc) @@ -1055,7 +1055,7 @@ shadow_r2t: union region2_table_entry rste; if (*fake) { - ptr += (unsigned long) vaddr.rsx << 42; + ptr += vaddr.rsx * _REGION2_SIZE; rste.val = ptr; goto shadow_r3t; } @@ -1070,7 +1070,7 @@ shadow_r2t: return PGM_REGION_THIRD_TRANS; if (sg->edat_level >= 1) *dat_protection |= rste.p; - ptr = rste.rto << 12UL; + ptr = rste.rto * PAGE_SIZE; shadow_r3t: rste.p |= *dat_protection; rc = gmap_shadow_r3t(sg, saddr, rste.val, *fake); @@ -1082,7 +1082,7 @@ shadow_r3t: union region3_table_entry rtte; if (*fake) { - ptr += (unsigned long) vaddr.rtx << 31; + ptr += vaddr.rtx * _REGION3_SIZE; rtte.val = ptr; goto shadow_sgt; } @@ -1098,7 +1098,7 @@ shadow_r3t: if (rtte.fc && sg->edat_level >= 2) { *dat_protection |= rtte.fc0.p; *fake = 1; - ptr = rtte.fc1.rfaa << 31UL; + ptr = rtte.fc1.rfaa * _REGION3_SIZE; rtte.val = ptr; goto shadow_sgt; } @@ -1106,7 +1106,7 @@ shadow_r3t: return PGM_SEGMENT_TRANSLATION; if (sg->edat_level >= 1) *dat_protection |= rtte.fc0.p; - ptr = rtte.fc0.sto << 12UL; + ptr = rtte.fc0.sto * PAGE_SIZE; shadow_sgt: rtte.fc0.p |= *dat_protection; rc = gmap_shadow_sgt(sg, saddr, rtte.val, *fake); @@ -1118,7 +1118,7 @@ shadow_sgt: union segment_table_entry ste; if (*fake) { - ptr += (unsigned long) vaddr.sx << 20; + ptr += vaddr.sx * _SEGMENT_SIZE; ste.val = ptr; goto shadow_pgt; } @@ -1134,11 +1134,11 @@ shadow_sgt: *dat_protection |= ste.fc0.p; if (ste.fc && sg->edat_level >= 1) { *fake = 1; - ptr = ste.fc1.sfaa << 20UL; + ptr = ste.fc1.sfaa * _SEGMENT_SIZE; ste.val = ptr; goto shadow_pgt; } - ptr = ste.fc0.pto << 11UL; + ptr = ste.fc0.pto * (PAGE_SIZE / 2); shadow_pgt: ste.fc0.p |= *dat_protection; rc = gmap_shadow_pgt(sg, saddr, ste.val, *fake); @@ -1187,8 +1187,7 @@ int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg, vaddr.addr = saddr; if (fake) { - /* offset in 1MB guest memory block */ - pte.val = pgt + ((unsigned long) vaddr.px << 12UL); + pte.val = pgt + vaddr.px * PAGE_SIZE; goto shadow_page; } if (!rc) diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 8a1dac793d6b..785ad028bde6 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -329,7 +329,7 @@ static int handle_sske(struct kvm_vcpu *vcpu) start = kvm_s390_logical_to_effective(vcpu, start); if (m3 & SSKE_MB) { /* start already designates an absolute address */ - end = (start + (1UL << 20)) & ~((1UL << 20) - 1); + end = (start + _SEGMENT_SIZE) & ~(_SEGMENT_SIZE - 1); } else { start = kvm_s390_real_to_abs(vcpu, start); end = start + PAGE_SIZE; @@ -893,10 +893,10 @@ static int handle_pfmf(struct kvm_vcpu *vcpu) case 0x00000000: /* only 4k frames specify a real address */ start = kvm_s390_real_to_abs(vcpu, start); - end = (start + (1UL << 12)) & ~((1UL << 12) - 1); + end = (start + PAGE_SIZE) & ~(PAGE_SIZE - 1); break; case 0x00001000: - end = (start + (1UL << 20)) & ~((1UL << 20) - 1); + end = (start + _SEGMENT_SIZE) & 
~(_SEGMENT_SIZE - 1); break; case 0x00002000: /* only support 2G frame size if EDAT2 is available and we are @@ -904,7 +904,7 @@ static int handle_pfmf(struct kvm_vcpu *vcpu) if (!test_kvm_facility(vcpu->kvm, 78) || psw_bits(vcpu->arch.sie_block->gpsw).eaba == PSW_BITS_AMODE_24BIT) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - end = (start + (1UL << 31)) & ~((1UL << 31) - 1); + end = (start + _REGION3_SIZE) & ~(_REGION3_SIZE - 1); break; default: return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); diff --git a/arch/s390/kvm/sthyi.c b/arch/s390/kvm/sthyi.c index 926b5244263e..a2e5c24f47a7 100644 --- a/arch/s390/kvm/sthyi.c +++ b/arch/s390/kvm/sthyi.c @@ -394,7 +394,7 @@ static int sthyi(u64 vaddr) "srl %[cc],28\n" : [cc] "=d" (cc) : [code] "d" (code), [addr] "a" (addr) - : "memory", "cc"); + : "3", "memory", "cc"); return cc; } @@ -425,7 +425,7 @@ int handle_sthyi(struct kvm_vcpu *vcpu) VCPU_EVENT(vcpu, 3, "STHYI: fc: %llu addr: 0x%016llx", code, addr); trace_kvm_s390_handle_sthyi(vcpu, code, addr); - if (reg1 == reg2 || reg1 & 1 || reg2 & 1 || addr & ~PAGE_MASK) + if (reg1 == reg2 || reg1 & 1 || reg2 & 1) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); if (code & 0xffff) { @@ -433,6 +433,9 @@ int handle_sthyi(struct kvm_vcpu *vcpu) goto out; } + if (addr & ~PAGE_MASK) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + /* * If the page has not yet been faulted in, we want to do that * now and not after all the expensive calculations. diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index 715c19c45d9a..ba8203e4d516 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -1069,7 +1069,7 @@ int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu) if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); - BUILD_BUG_ON(sizeof(struct vsie_page) != 4096); + BUILD_BUG_ON(sizeof(struct vsie_page) != PAGE_SIZE); scb_addr = kvm_s390_get_base_disp_s(vcpu, NULL); /* 512 byte alignment */ diff --git a/arch/s390/lib/delay.c b/arch/s390/lib/delay.c index 92e90e40b6fb..7f17555ad4d5 100644 --- a/arch/s390/lib/delay.c +++ b/arch/s390/lib/delay.c @@ -57,7 +57,7 @@ static void __udelay_enabled(unsigned long long usecs) end = get_tod_clock_fast() + (usecs << 12); do { clock_saved = 0; - if (end < S390_lowcore.clock_comparator) { + if (tod_after(S390_lowcore.clock_comparator, end)) { clock_saved = local_tick_disable(); set_clock_comparator(end); } diff --git a/arch/s390/lib/spinlock.c b/arch/s390/lib/spinlock.c index ffb15bd4c593..b12663d653d8 100644 --- a/arch/s390/lib/spinlock.c +++ b/arch/s390/lib/spinlock.c @@ -32,42 +32,63 @@ static int __init spin_retry_setup(char *str) } __setup("spin_retry=", spin_retry_setup); +static inline int arch_load_niai4(int *lock) +{ + int owner; + + asm volatile( +#ifdef CONFIG_HAVE_MARCH_ZEC12_FEATURES + " .long 0xb2fa0040\n" /* NIAI 4 */ +#endif + " l %0,%1\n" + : "=d" (owner) : "Q" (*lock) : "memory"); + return owner; +} + +static inline int arch_cmpxchg_niai8(int *lock, int old, int new) +{ + int expected = old; + + asm volatile( +#ifdef CONFIG_HAVE_MARCH_ZEC12_FEATURES + " .long 0xb2fa0080\n" /* NIAI 8 */ +#endif + " cs %0,%3,%1\n" + : "=d" (old), "=Q" (*lock) + : "0" (old), "d" (new), "Q" (*lock) + : "cc", "memory"); + return expected == old; +} + void arch_spin_lock_wait(arch_spinlock_t *lp) { int cpu = SPINLOCK_LOCKVAL; - int owner, count, first_diag; + int owner, count; + + /* Pass the virtual CPU to the lock holder if it is not running */ + owner = 
arch_load_niai4(&lp->lock); + if (owner && arch_vcpu_is_preempted(~owner)) + smp_yield_cpu(~owner); - first_diag = 1; + count = spin_retry; while (1) { - owner = ACCESS_ONCE(lp->lock); + owner = arch_load_niai4(&lp->lock); /* Try to get the lock if it is free. */ if (!owner) { - if (__atomic_cmpxchg_bool(&lp->lock, 0, cpu)) + if (arch_cmpxchg_niai8(&lp->lock, 0, cpu)) return; continue; } - /* First iteration: check if the lock owner is running. */ - if (first_diag && arch_vcpu_is_preempted(~owner)) { - smp_yield_cpu(~owner); - first_diag = 0; + if (count-- >= 0) continue; - } - /* Loop for a while on the lock value. */ count = spin_retry; - do { - owner = ACCESS_ONCE(lp->lock); - } while (owner && count-- > 0); - if (!owner) - continue; /* * For multiple layers of hypervisors, e.g. z/VM + LPAR * yield the CPU unconditionally. For LPAR rely on the * sense running status. */ - if (!MACHINE_IS_LPAR || arch_vcpu_is_preempted(~owner)) { + if (!MACHINE_IS_LPAR || arch_vcpu_is_preempted(~owner)) smp_yield_cpu(~owner); - first_diag = 0; - } } } EXPORT_SYMBOL(arch_spin_lock_wait); @@ -75,42 +96,36 @@ EXPORT_SYMBOL(arch_spin_lock_wait); void arch_spin_lock_wait_flags(arch_spinlock_t *lp, unsigned long flags) { int cpu = SPINLOCK_LOCKVAL; - int owner, count, first_diag; + int owner, count; local_irq_restore(flags); - first_diag = 1; + + /* Pass the virtual CPU to the lock holder if it is not running */ + owner = arch_load_niai4(&lp->lock); + if (owner && arch_vcpu_is_preempted(~owner)) + smp_yield_cpu(~owner); + + count = spin_retry; while (1) { - owner = ACCESS_ONCE(lp->lock); + owner = arch_load_niai4(&lp->lock); /* Try to get the lock if it is free. */ if (!owner) { local_irq_disable(); - if (__atomic_cmpxchg_bool(&lp->lock, 0, cpu)) + if (arch_cmpxchg_niai8(&lp->lock, 0, cpu)) return; local_irq_restore(flags); continue; } - /* Check if the lock owner is running. */ - if (first_diag && arch_vcpu_is_preempted(~owner)) { - smp_yield_cpu(~owner); - first_diag = 0; + if (count-- >= 0) continue; - } - /* Loop for a while on the lock value. */ count = spin_retry; - do { - owner = ACCESS_ONCE(lp->lock); - } while (owner && count-- > 0); - if (!owner) - continue; /* * For multiple layers of hypervisors, e.g. z/VM + LPAR * yield the CPU unconditionally. For LPAR rely on the * sense running status. 
*/ - if (!MACHINE_IS_LPAR || arch_vcpu_is_preempted(~owner)) { + if (!MACHINE_IS_LPAR || arch_vcpu_is_preempted(~owner)) smp_yield_cpu(~owner); - first_diag = 0; - } } } EXPORT_SYMBOL(arch_spin_lock_wait_flags); diff --git a/arch/s390/lib/uaccess.c b/arch/s390/lib/uaccess.c index b3bd3f23b8e8..4ea9106417ee 100644 --- a/arch/s390/lib/uaccess.c +++ b/arch/s390/lib/uaccess.c @@ -15,8 +15,30 @@ #include <asm/mmu_context.h> #include <asm/facility.h> +#ifndef CONFIG_HAVE_MARCH_Z10_FEATURES static DEFINE_STATIC_KEY_FALSE(have_mvcos); +static int __init uaccess_init(void) +{ + if (test_facility(27)) + static_branch_enable(&have_mvcos); + return 0; +} +early_initcall(uaccess_init); + +static inline int copy_with_mvcos(void) +{ + if (static_branch_likely(&have_mvcos)) + return 1; + return 0; +} +#else +static inline int copy_with_mvcos(void) +{ + return 1; +} +#endif + static inline unsigned long copy_from_user_mvcos(void *x, const void __user *ptr, unsigned long size) { @@ -84,7 +106,7 @@ static inline unsigned long copy_from_user_mvcp(void *x, const void __user *ptr, unsigned long raw_copy_from_user(void *to, const void __user *from, unsigned long n) { - if (static_branch_likely(&have_mvcos)) + if (copy_with_mvcos()) return copy_from_user_mvcos(to, from, n); return copy_from_user_mvcp(to, from, n); } @@ -157,7 +179,7 @@ static inline unsigned long copy_to_user_mvcs(void __user *ptr, const void *x, unsigned long raw_copy_to_user(void __user *to, const void *from, unsigned long n) { - if (static_branch_likely(&have_mvcos)) + if (copy_with_mvcos()) return copy_to_user_mvcos(to, from, n); return copy_to_user_mvcs(to, from, n); } @@ -220,7 +242,7 @@ static inline unsigned long copy_in_user_mvc(void __user *to, const void __user unsigned long raw_copy_in_user(void __user *to, const void __user *from, unsigned long n) { - if (static_branch_likely(&have_mvcos)) + if (copy_with_mvcos()) return copy_in_user_mvcos(to, from, n); return copy_in_user_mvc(to, from, n); } @@ -292,7 +314,7 @@ static inline unsigned long clear_user_xc(void __user *to, unsigned long size) unsigned long __clear_user(void __user *to, unsigned long size) { - if (static_branch_likely(&have_mvcos)) + if (copy_with_mvcos()) return clear_user_mvcos(to, size); return clear_user_xc(to, size); } @@ -349,11 +371,3 @@ long __strncpy_from_user(char *dst, const char __user *src, long size) return done; } EXPORT_SYMBOL(__strncpy_from_user); - -static int __init uaccess_init(void) -{ - if (test_facility(27)) - static_branch_enable(&have_mvcos); - return 0; -} -early_initcall(uaccess_init); diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 14f25798b001..bdabb013537b 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -135,7 +135,7 @@ static void dump_pagetable(unsigned long asce, unsigned long address) pr_alert("AS:%016lx ", asce); switch (asce & _ASCE_TYPE_MASK) { case _ASCE_TYPE_REGION1: - table = table + ((address >> 53) & 0x7ff); + table += (address & _REGION1_INDEX) >> _REGION1_SHIFT; if (bad_address(table)) goto bad; pr_cont("R1:%016lx ", *table); @@ -144,7 +144,7 @@ static void dump_pagetable(unsigned long asce, unsigned long address) table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); /* fallthrough */ case _ASCE_TYPE_REGION2: - table = table + ((address >> 42) & 0x7ff); + table += (address & _REGION2_INDEX) >> _REGION2_SHIFT; if (bad_address(table)) goto bad; pr_cont("R2:%016lx ", *table); @@ -153,7 +153,7 @@ static void dump_pagetable(unsigned long asce, unsigned long address) table = (unsigned long *)(*table 
& _REGION_ENTRY_ORIGIN); /* fallthrough */ case _ASCE_TYPE_REGION3: - table = table + ((address >> 31) & 0x7ff); + table += (address & _REGION3_INDEX) >> _REGION3_SHIFT; if (bad_address(table)) goto bad; pr_cont("R3:%016lx ", *table); @@ -162,7 +162,7 @@ static void dump_pagetable(unsigned long asce, unsigned long address) table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); /* fallthrough */ case _ASCE_TYPE_SEGMENT: - table = table + ((address >> 20) & 0x7ff); + table += (address & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; if (bad_address(table)) goto bad; pr_cont("S:%016lx ", *table); @@ -170,7 +170,7 @@ static void dump_pagetable(unsigned long asce, unsigned long address) goto out; table = (unsigned long *)(*table & _SEGMENT_ENTRY_ORIGIN); } - table = table + ((address >> 12) & 0xff); + table += (address & _PAGE_INDEX) >> _PAGE_SHIFT; if (bad_address(table)) goto bad; pr_cont("P:%016lx ", *table); diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 4fb3d3cdb370..9e1494e3d849 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -36,16 +36,16 @@ static struct gmap *gmap_alloc(unsigned long limit) unsigned long *table; unsigned long etype, atype; - if (limit < (1UL << 31)) { - limit = (1UL << 31) - 1; + if (limit < _REGION3_SIZE) { + limit = _REGION3_SIZE - 1; atype = _ASCE_TYPE_SEGMENT; etype = _SEGMENT_ENTRY_EMPTY; - } else if (limit < (1UL << 42)) { - limit = (1UL << 42) - 1; + } else if (limit < _REGION2_SIZE) { + limit = _REGION2_SIZE - 1; atype = _ASCE_TYPE_REGION3; etype = _REGION3_ENTRY_EMPTY; - } else if (limit < (1UL << 53)) { - limit = (1UL << 53) - 1; + } else if (limit < _REGION1_SIZE) { + limit = _REGION1_SIZE - 1; atype = _ASCE_TYPE_REGION2; etype = _REGION2_ENTRY_EMPTY; } else { @@ -65,7 +65,7 @@ static struct gmap *gmap_alloc(unsigned long limit) spin_lock_init(&gmap->guest_table_lock); spin_lock_init(&gmap->shadow_lock); atomic_set(&gmap->ref_count, 1); - page = alloc_pages(GFP_KERNEL, 2); + page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER); if (!page) goto out_free; page->index = 0; @@ -186,7 +186,7 @@ static void gmap_free(struct gmap *gmap) gmap_flush_tlb(gmap); /* Free all segment & region tables. 
*/ list_for_each_entry_safe(page, next, &gmap->crst_list, lru) - __free_pages(page, 2); + __free_pages(page, CRST_ALLOC_ORDER); gmap_radix_tree_free(&gmap->guest_to_host); gmap_radix_tree_free(&gmap->host_to_guest); @@ -306,7 +306,7 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table, unsigned long *new; /* since we dont free the gmap table until gmap_free we can unlock */ - page = alloc_pages(GFP_KERNEL, 2); + page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER); if (!page) return -ENOMEM; new = (unsigned long *) page_to_phys(page); @@ -321,7 +321,7 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table, } spin_unlock(&gmap->guest_table_lock); if (page) - __free_pages(page, 2); + __free_pages(page, CRST_ALLOC_ORDER); return 0; } @@ -546,30 +546,30 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) /* Create higher level tables in the gmap page table */ table = gmap->table; if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION1) { - table += (gaddr >> 53) & 0x7ff; + table += (gaddr & _REGION1_INDEX) >> _REGION1_SHIFT; if ((*table & _REGION_ENTRY_INVALID) && gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY, - gaddr & 0xffe0000000000000UL)) + gaddr & _REGION1_MASK)) return -ENOMEM; table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); } if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION2) { - table += (gaddr >> 42) & 0x7ff; + table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT; if ((*table & _REGION_ENTRY_INVALID) && gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY, - gaddr & 0xfffffc0000000000UL)) + gaddr & _REGION2_MASK)) return -ENOMEM; table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); } if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION3) { - table += (gaddr >> 31) & 0x7ff; + table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT; if ((*table & _REGION_ENTRY_INVALID) && gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY, - gaddr & 0xffffffff80000000UL)) + gaddr & _REGION3_MASK)) return -ENOMEM; table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); } - table += (gaddr >> 20) & 0x7ff; + table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; /* Walk the parent mm page table */ mm = gmap->mm; pgd = pgd_offset(mm, vmaddr); @@ -771,7 +771,7 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap, table = gmap->table; switch (gmap->asce & _ASCE_TYPE_MASK) { case _ASCE_TYPE_REGION1: - table += (gaddr >> 53) & 0x7ff; + table += (gaddr & _REGION1_INDEX) >> _REGION1_SHIFT; if (level == 4) break; if (*table & _REGION_ENTRY_INVALID) @@ -779,7 +779,7 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap, table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); /* Fallthrough */ case _ASCE_TYPE_REGION2: - table += (gaddr >> 42) & 0x7ff; + table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT; if (level == 3) break; if (*table & _REGION_ENTRY_INVALID) @@ -787,7 +787,7 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap, table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); /* Fallthrough */ case _ASCE_TYPE_REGION3: - table += (gaddr >> 31) & 0x7ff; + table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT; if (level == 2) break; if (*table & _REGION_ENTRY_INVALID) @@ -795,13 +795,13 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap, table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); /* Fallthrough */ case _ASCE_TYPE_SEGMENT: - table += (gaddr >> 20) & 0x7ff; + table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; if (level == 1) break; if (*table & 
_REGION_ENTRY_INVALID) return NULL; table = (unsigned long *)(*table & _SEGMENT_ENTRY_ORIGIN); - table += (gaddr >> 12) & 0xff; + table += (gaddr & _PAGE_INDEX) >> _PAGE_SHIFT; } return table; } @@ -1126,7 +1126,7 @@ static void gmap_unshadow_page(struct gmap *sg, unsigned long raddr) table = gmap_table_walk(sg, raddr, 0); /* get page table pointer */ if (!table || *table & _PAGE_INVALID) return; - gmap_call_notifier(sg, raddr, raddr + (1UL << 12) - 1); + gmap_call_notifier(sg, raddr, raddr + _PAGE_SIZE - 1); ptep_unshadow_pte(sg->mm, raddr, (pte_t *) table); } @@ -1144,7 +1144,7 @@ static void __gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr, int i; BUG_ON(!gmap_is_shadow(sg)); - for (i = 0; i < 256; i++, raddr += 1UL << 12) + for (i = 0; i < _PAGE_ENTRIES; i++, raddr += _PAGE_SIZE) pgt[i] = _PAGE_INVALID; } @@ -1164,8 +1164,8 @@ static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr) ste = gmap_table_walk(sg, raddr, 1); /* get segment pointer */ if (!ste || !(*ste & _SEGMENT_ENTRY_ORIGIN)) return; - gmap_call_notifier(sg, raddr, raddr + (1UL << 20) - 1); - sto = (unsigned long) (ste - ((raddr >> 20) & 0x7ff)); + gmap_call_notifier(sg, raddr, raddr + _SEGMENT_SIZE - 1); + sto = (unsigned long) (ste - ((raddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT)); gmap_idte_one(sto | _ASCE_TYPE_SEGMENT, raddr); pgt = (unsigned long *)(*ste & _SEGMENT_ENTRY_ORIGIN); *ste = _SEGMENT_ENTRY_EMPTY; @@ -1193,7 +1193,7 @@ static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr, BUG_ON(!gmap_is_shadow(sg)); asce = (unsigned long) sgt | _ASCE_TYPE_SEGMENT; - for (i = 0; i < 2048; i++, raddr += 1UL << 20) { + for (i = 0; i < _CRST_ENTRIES; i++, raddr += _SEGMENT_SIZE) { if (!(sgt[i] & _SEGMENT_ENTRY_ORIGIN)) continue; pgt = (unsigned long *)(sgt[i] & _REGION_ENTRY_ORIGIN); @@ -1222,8 +1222,8 @@ static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr) r3e = gmap_table_walk(sg, raddr, 2); /* get region-3 pointer */ if (!r3e || !(*r3e & _REGION_ENTRY_ORIGIN)) return; - gmap_call_notifier(sg, raddr, raddr + (1UL << 31) - 1); - r3o = (unsigned long) (r3e - ((raddr >> 31) & 0x7ff)); + gmap_call_notifier(sg, raddr, raddr + _REGION3_SIZE - 1); + r3o = (unsigned long) (r3e - ((raddr & _REGION3_INDEX) >> _REGION3_SHIFT)); gmap_idte_one(r3o | _ASCE_TYPE_REGION3, raddr); sgt = (unsigned long *)(*r3e & _REGION_ENTRY_ORIGIN); *r3e = _REGION3_ENTRY_EMPTY; @@ -1231,7 +1231,7 @@ static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr) /* Free segment table */ page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT); list_del(&page->lru); - __free_pages(page, 2); + __free_pages(page, CRST_ALLOC_ORDER); } /** @@ -1251,7 +1251,7 @@ static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr, BUG_ON(!gmap_is_shadow(sg)); asce = (unsigned long) r3t | _ASCE_TYPE_REGION3; - for (i = 0; i < 2048; i++, raddr += 1UL << 31) { + for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION3_SIZE) { if (!(r3t[i] & _REGION_ENTRY_ORIGIN)) continue; sgt = (unsigned long *)(r3t[i] & _REGION_ENTRY_ORIGIN); @@ -1260,7 +1260,7 @@ static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr, /* Free segment table */ page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT); list_del(&page->lru); - __free_pages(page, 2); + __free_pages(page, CRST_ALLOC_ORDER); } } @@ -1280,8 +1280,8 @@ static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr) r2e = gmap_table_walk(sg, raddr, 3); /* get region-2 pointer */ if (!r2e || !(*r2e & _REGION_ENTRY_ORIGIN)) return; - gmap_call_notifier(sg, raddr, raddr + (1UL << 42) - 
1); - r2o = (unsigned long) (r2e - ((raddr >> 42) & 0x7ff)); + gmap_call_notifier(sg, raddr, raddr + _REGION2_SIZE - 1); + r2o = (unsigned long) (r2e - ((raddr & _REGION2_INDEX) >> _REGION2_SHIFT)); gmap_idte_one(r2o | _ASCE_TYPE_REGION2, raddr); r3t = (unsigned long *)(*r2e & _REGION_ENTRY_ORIGIN); *r2e = _REGION2_ENTRY_EMPTY; @@ -1289,7 +1289,7 @@ static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr) /* Free region 3 table */ page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT); list_del(&page->lru); - __free_pages(page, 2); + __free_pages(page, CRST_ALLOC_ORDER); } /** @@ -1309,7 +1309,7 @@ static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr, BUG_ON(!gmap_is_shadow(sg)); asce = (unsigned long) r2t | _ASCE_TYPE_REGION2; - for (i = 0; i < 2048; i++, raddr += 1UL << 42) { + for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION2_SIZE) { if (!(r2t[i] & _REGION_ENTRY_ORIGIN)) continue; r3t = (unsigned long *)(r2t[i] & _REGION_ENTRY_ORIGIN); @@ -1318,7 +1318,7 @@ static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr, /* Free region 3 table */ page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT); list_del(&page->lru); - __free_pages(page, 2); + __free_pages(page, CRST_ALLOC_ORDER); } } @@ -1338,8 +1338,8 @@ static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr) r1e = gmap_table_walk(sg, raddr, 4); /* get region-1 pointer */ if (!r1e || !(*r1e & _REGION_ENTRY_ORIGIN)) return; - gmap_call_notifier(sg, raddr, raddr + (1UL << 53) - 1); - r1o = (unsigned long) (r1e - ((raddr >> 53) & 0x7ff)); + gmap_call_notifier(sg, raddr, raddr + _REGION1_SIZE - 1); + r1o = (unsigned long) (r1e - ((raddr & _REGION1_INDEX) >> _REGION1_SHIFT)); gmap_idte_one(r1o | _ASCE_TYPE_REGION1, raddr); r2t = (unsigned long *)(*r1e & _REGION_ENTRY_ORIGIN); *r1e = _REGION1_ENTRY_EMPTY; @@ -1347,7 +1347,7 @@ static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr) /* Free region 2 table */ page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT); list_del(&page->lru); - __free_pages(page, 2); + __free_pages(page, CRST_ALLOC_ORDER); } /** @@ -1367,7 +1367,7 @@ static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr, BUG_ON(!gmap_is_shadow(sg)); asce = (unsigned long) r1t | _ASCE_TYPE_REGION1; - for (i = 0; i < 2048; i++, raddr += 1UL << 53) { + for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION1_SIZE) { if (!(r1t[i] & _REGION_ENTRY_ORIGIN)) continue; r2t = (unsigned long *)(r1t[i] & _REGION_ENTRY_ORIGIN); @@ -1378,7 +1378,7 @@ static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr, /* Free region 2 table */ page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT); list_del(&page->lru); - __free_pages(page, 2); + __free_pages(page, CRST_ALLOC_ORDER); } } @@ -1535,7 +1535,7 @@ struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, /* protect after insertion, so it will get properly invalidated */ down_read(&parent->mm->mmap_sem); rc = gmap_protect_range(parent, asce & _ASCE_ORIGIN, - ((asce & _ASCE_TABLE_LENGTH) + 1) * 4096, + ((asce & _ASCE_TABLE_LENGTH) + 1) * PAGE_SIZE, PROT_READ, PGSTE_VSIE_BIT); up_read(&parent->mm->mmap_sem); spin_lock(&parent->shadow_lock); @@ -1578,7 +1578,7 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, BUG_ON(!gmap_is_shadow(sg)); /* Allocate a shadow region second table */ - page = alloc_pages(GFP_KERNEL, 2); + page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER); if (!page) return -ENOMEM; page->index = r2t & _REGION_ENTRY_ORIGIN; @@ -1614,10 +1614,10 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, 
unsigned long r2t, } spin_unlock(&sg->guest_table_lock); /* Make r2t read-only in parent gmap page table */ - raddr = (saddr & 0xffe0000000000000UL) | _SHADOW_RMAP_REGION1; + raddr = (saddr & _REGION1_MASK) | _SHADOW_RMAP_REGION1; origin = r2t & _REGION_ENTRY_ORIGIN; - offset = ((r2t & _REGION_ENTRY_OFFSET) >> 6) * 4096; - len = ((r2t & _REGION_ENTRY_LENGTH) + 1) * 4096 - offset; + offset = ((r2t & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE; + len = ((r2t & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset; rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ); spin_lock(&sg->guest_table_lock); if (!rc) { @@ -1634,7 +1634,7 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, return rc; out_free: spin_unlock(&sg->guest_table_lock); - __free_pages(page, 2); + __free_pages(page, CRST_ALLOC_ORDER); return rc; } EXPORT_SYMBOL_GPL(gmap_shadow_r2t); @@ -1662,7 +1662,7 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, BUG_ON(!gmap_is_shadow(sg)); /* Allocate a shadow region second table */ - page = alloc_pages(GFP_KERNEL, 2); + page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER); if (!page) return -ENOMEM; page->index = r3t & _REGION_ENTRY_ORIGIN; @@ -1697,10 +1697,10 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, } spin_unlock(&sg->guest_table_lock); /* Make r3t read-only in parent gmap page table */ - raddr = (saddr & 0xfffffc0000000000UL) | _SHADOW_RMAP_REGION2; + raddr = (saddr & _REGION2_MASK) | _SHADOW_RMAP_REGION2; origin = r3t & _REGION_ENTRY_ORIGIN; - offset = ((r3t & _REGION_ENTRY_OFFSET) >> 6) * 4096; - len = ((r3t & _REGION_ENTRY_LENGTH) + 1) * 4096 - offset; + offset = ((r3t & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE; + len = ((r3t & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset; rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ); spin_lock(&sg->guest_table_lock); if (!rc) { @@ -1717,7 +1717,7 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, return rc; out_free: spin_unlock(&sg->guest_table_lock); - __free_pages(page, 2); + __free_pages(page, CRST_ALLOC_ORDER); return rc; } EXPORT_SYMBOL_GPL(gmap_shadow_r3t); @@ -1745,7 +1745,7 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, BUG_ON(!gmap_is_shadow(sg) || (sgt & _REGION3_ENTRY_LARGE)); /* Allocate a shadow segment table */ - page = alloc_pages(GFP_KERNEL, 2); + page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER); if (!page) return -ENOMEM; page->index = sgt & _REGION_ENTRY_ORIGIN; @@ -1781,10 +1781,10 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, } spin_unlock(&sg->guest_table_lock); /* Make sgt read-only in parent gmap page table */ - raddr = (saddr & 0xffffffff80000000UL) | _SHADOW_RMAP_REGION3; + raddr = (saddr & _REGION3_MASK) | _SHADOW_RMAP_REGION3; origin = sgt & _REGION_ENTRY_ORIGIN; - offset = ((sgt & _REGION_ENTRY_OFFSET) >> 6) * 4096; - len = ((sgt & _REGION_ENTRY_LENGTH) + 1) * 4096 - offset; + offset = ((sgt & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE; + len = ((sgt & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset; rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ); spin_lock(&sg->guest_table_lock); if (!rc) { @@ -1801,7 +1801,7 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, return rc; out_free: spin_unlock(&sg->guest_table_lock); - __free_pages(page, 2); + __free_pages(page, CRST_ALLOC_ORDER); return rc; } EXPORT_SYMBOL_GPL(gmap_shadow_sgt); @@ -1902,7 +1902,7 @@ int 
gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt, } spin_unlock(&sg->guest_table_lock); /* Make pgt read-only in parent gmap page table (not the pgste) */ - raddr = (saddr & 0xfffffffffff00000UL) | _SHADOW_RMAP_SEGMENT; + raddr = (saddr & _SEGMENT_MASK) | _SHADOW_RMAP_SEGMENT; origin = pgt & _SEGMENT_ENTRY_ORIGIN & PAGE_MASK; rc = gmap_protect_rmap(sg, raddr, origin, PAGE_SIZE, PROT_READ); spin_lock(&sg->guest_table_lock); @@ -2021,7 +2021,7 @@ static void gmap_shadow_notify(struct gmap *sg, unsigned long vmaddr, } /* Check for top level table */ start = sg->orig_asce & _ASCE_ORIGIN; - end = start + ((sg->orig_asce & _ASCE_TABLE_LENGTH) + 1) * 4096; + end = start + ((sg->orig_asce & _ASCE_TABLE_LENGTH) + 1) * PAGE_SIZE; if (!(sg->orig_asce & _ASCE_REAL_SPACE) && gaddr >= start && gaddr < end) { /* The complete shadow table has to go */ @@ -2032,7 +2032,7 @@ static void gmap_shadow_notify(struct gmap *sg, unsigned long vmaddr, return; } /* Remove the page table tree from on specific entry */ - head = radix_tree_delete(&sg->host_to_rmap, vmaddr >> 12); + head = radix_tree_delete(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT); gmap_for_each_rmap_safe(rmap, rnext, head) { bits = rmap->raddr & _SHADOW_RMAP_MASK; raddr = rmap->raddr ^ bits; @@ -2076,7 +2076,7 @@ void ptep_notify(struct mm_struct *mm, unsigned long vmaddr, struct gmap *gmap, *sg, *next; offset = ((unsigned long) pte) & (255 * sizeof(pte_t)); - offset = offset * (4096 / sizeof(pte_t)); + offset = offset * (PAGE_SIZE / sizeof(pte_t)); rcu_read_lock(); list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { spin_lock(&gmap->guest_table_lock); @@ -2121,6 +2121,37 @@ static inline void thp_split_mm(struct mm_struct *mm) } /* + * Remove all empty zero pages from the mapping for lazy refaulting + * - This must be called after mm->context.has_pgste is set, to avoid + * future creation of zero pages + * - This must be called after THP was enabled + */ +static int __zap_zero_pages(pmd_t *pmd, unsigned long start, + unsigned long end, struct mm_walk *walk) +{ + unsigned long addr; + + for (addr = start; addr != end; addr += PAGE_SIZE) { + pte_t *ptep; + spinlock_t *ptl; + + ptep = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); + if (is_zero_pfn(pte_pfn(*ptep))) + ptep_xchg_direct(walk->mm, addr, ptep, __pte(_PAGE_INVALID)); + pte_unmap_unlock(ptep, ptl); + } + return 0; +} + +static inline void zap_zero_pages(struct mm_struct *mm) +{ + struct mm_walk walk = { .pmd_entry = __zap_zero_pages }; + + walk.mm = mm; + walk_page_range(0, TASK_SIZE, &walk); +} + +/* * switch on pgstes for its userspace process (for kvm) */ int s390_enable_sie(void) @@ -2137,6 +2168,7 @@ int s390_enable_sie(void) mm->context.has_pgste = 1; /* split thp mappings and disable thp for future mappings */ thp_split_mm(mm); + zap_zero_pages(mm); up_write(&mm->mmap_sem); return 0; } @@ -2149,13 +2181,6 @@ EXPORT_SYMBOL_GPL(s390_enable_sie); static int __s390_enable_skey(pte_t *pte, unsigned long addr, unsigned long next, struct mm_walk *walk) { - /* - * Remove all zero page mappings, - * after establishing a policy to forbid zero page mappings - * following faults for that page will get fresh anonymous pages - */ - if (is_zero_pfn(pte_pfn(*pte))) - ptep_xchg_direct(walk->mm, addr, pte, __pte(_PAGE_INVALID)); /* Clear storage key */ ptep_zap_key(walk->mm, addr, pte); return 0; diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 8111694ce55a..3b567838b905 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -26,6 +26,7 @@ #include 
<linux/poison.h> #include <linux/initrd.h> #include <linux/export.h> +#include <linux/cma.h> #include <linux/gfp.h> #include <linux/memblock.h> #include <asm/processor.h> @@ -84,7 +85,7 @@ void __init paging_init(void) psw_t psw; init_mm.pgd = swapper_pg_dir; - if (VMALLOC_END > (1UL << 42)) { + if (VMALLOC_END > _REGION2_SIZE) { asce_bits = _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH; pgd_type = _REGION2_ENTRY_EMPTY; } else { @@ -93,8 +94,7 @@ void __init paging_init(void) } init_mm.context.asce = (__pa(init_mm.pgd) & PAGE_MASK) | asce_bits; S390_lowcore.kernel_asce = init_mm.context.asce; - clear_table((unsigned long *) init_mm.pgd, pgd_type, - sizeof(unsigned long)*2048); + crst_table_init((unsigned long *) init_mm.pgd, pgd_type); vmem_map_init(); /* enable virtual mapping in kernel mode */ @@ -137,6 +137,8 @@ void __init mem_init(void) free_all_bootmem(); setup_zero_pages(); /* Setup zeroed pages. */ + cmma_init_nodat(); + mem_init_print_info(NULL); } @@ -166,6 +168,58 @@ unsigned long memory_block_size_bytes(void) } #ifdef CONFIG_MEMORY_HOTPLUG + +#ifdef CONFIG_CMA + +/* Prevent memory blocks which contain cma regions from going offline */ + +struct s390_cma_mem_data { + unsigned long start; + unsigned long end; +}; + +static int s390_cma_check_range(struct cma *cma, void *data) +{ + struct s390_cma_mem_data *mem_data; + unsigned long start, end; + + mem_data = data; + start = cma_get_base(cma); + end = start + cma_get_size(cma); + if (end < mem_data->start) + return 0; + if (start >= mem_data->end) + return 0; + return -EBUSY; +} + +static int s390_cma_mem_notifier(struct notifier_block *nb, + unsigned long action, void *data) +{ + struct s390_cma_mem_data mem_data; + struct memory_notify *arg; + int rc = 0; + + arg = data; + mem_data.start = arg->start_pfn << PAGE_SHIFT; + mem_data.end = mem_data.start + (arg->nr_pages << PAGE_SHIFT); + if (action == MEM_GOING_OFFLINE) + rc = cma_for_each_area(s390_cma_check_range, &mem_data); + return notifier_from_errno(rc); +} + +static struct notifier_block s390_cma_mem_nb = { + .notifier_call = s390_cma_mem_notifier, +}; + +static int __init s390_cma_mem_init(void) +{ + return register_memory_notifier(&s390_cma_mem_nb); +} +device_initcall(s390_cma_mem_init); + +#endif /* CONFIG_CMA */ + int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) { unsigned long start_pfn = PFN_DOWN(start); diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c index 2e10d2b8ad35..5bea139517a2 100644 --- a/arch/s390/mm/mmap.c +++ b/arch/s390/mm/mmap.c @@ -119,7 +119,8 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, return addr; check_asce_limit: - if (addr + len > current->mm->context.asce_limit) { + if (addr + len > current->mm->context.asce_limit && + addr + len <= TASK_SIZE) { rc = crst_table_upgrade(mm, addr + len); if (rc) return (unsigned long) rc; @@ -183,7 +184,8 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, } check_asce_limit: - if (addr + len > current->mm->context.asce_limit) { + if (addr + len > current->mm->context.asce_limit && + addr + len <= TASK_SIZE) { rc = crst_table_upgrade(mm, addr + len); if (rc) return (unsigned long) rc; diff --git a/arch/s390/mm/page-states.c b/arch/s390/mm/page-states.c index 69a7b01ae746..07fa7b8ae233 100644 --- a/arch/s390/mm/page-states.c +++ b/arch/s390/mm/page-states.c @@ -10,9 +10,10 @@ #include <linux/errno.h> #include <linux/types.h> #include <linux/mm.h> +#include <linux/memblock.h> #include <linux/gfp.h> #include <linux/init.h> - +#include 
<asm/facility.h> #include <asm/page-states.h> static int cmma_flag = 1; @@ -36,14 +37,16 @@ __setup("cmma=", cmma); static inline int cmma_test_essa(void) { register unsigned long tmp asm("0") = 0; - register int rc asm("1") = -EOPNOTSUPP; + register int rc asm("1"); + /* test ESSA_GET_STATE */ asm volatile( - " .insn rrf,0xb9ab0000,%1,%1,0,0\n" + " .insn rrf,0xb9ab0000,%1,%1,%2,0\n" "0: la %0,0\n" "1:\n" EX_TABLE(0b,1b) - : "+&d" (rc), "+&d" (tmp)); + : "=&d" (rc), "+&d" (tmp) + : "i" (ESSA_GET_STATE), "0" (-EOPNOTSUPP)); return rc; } @@ -51,11 +54,26 @@ void __init cmma_init(void) { if (!cmma_flag) return; - if (cmma_test_essa()) + if (cmma_test_essa()) { cmma_flag = 0; + return; + } + if (test_facility(147)) + cmma_flag = 2; } -static inline void set_page_unstable(struct page *page, int order) +static inline unsigned char get_page_state(struct page *page) +{ + unsigned char state; + + asm volatile(" .insn rrf,0xb9ab0000,%0,%1,%2,0" + : "=&d" (state) + : "a" (page_to_phys(page)), + "i" (ESSA_GET_STATE)); + return state & 0x3f; +} + +static inline void set_page_unused(struct page *page, int order) { int i, rc; @@ -66,14 +84,18 @@ static inline void set_page_unstable(struct page *page, int order) "i" (ESSA_SET_UNUSED)); } -void arch_free_page(struct page *page, int order) +static inline void set_page_stable_dat(struct page *page, int order) { - if (!cmma_flag) - return; - set_page_unstable(page, order); + int i, rc; + + for (i = 0; i < (1 << order); i++) + asm volatile(".insn rrf,0xb9ab0000,%0,%1,%2,0" + : "=&d" (rc) + : "a" (page_to_phys(page + i)), + "i" (ESSA_SET_STABLE)); } -static inline void set_page_stable(struct page *page, int order) +static inline void set_page_stable_nodat(struct page *page, int order) { int i, rc; @@ -81,14 +103,154 @@ static inline void set_page_stable(struct page *page, int order) asm volatile(".insn rrf,0xb9ab0000,%0,%1,%2,0" : "=&d" (rc) : "a" (page_to_phys(page + i)), - "i" (ESSA_SET_STABLE)); + "i" (ESSA_SET_STABLE_NODAT)); +} + +static void mark_kernel_pmd(pud_t *pud, unsigned long addr, unsigned long end) +{ + unsigned long next; + struct page *page; + pmd_t *pmd; + + pmd = pmd_offset(pud, addr); + do { + next = pmd_addr_end(addr, end); + if (pmd_none(*pmd) || pmd_large(*pmd)) + continue; + page = virt_to_page(pmd_val(*pmd)); + set_bit(PG_arch_1, &page->flags); + } while (pmd++, addr = next, addr != end); +} + +static void mark_kernel_pud(p4d_t *p4d, unsigned long addr, unsigned long end) +{ + unsigned long next; + struct page *page; + pud_t *pud; + int i; + + pud = pud_offset(p4d, addr); + do { + next = pud_addr_end(addr, end); + if (pud_none(*pud) || pud_large(*pud)) + continue; + if (!pud_folded(*pud)) { + page = virt_to_page(pud_val(*pud)); + for (i = 0; i < 3; i++) + set_bit(PG_arch_1, &page[i].flags); + } + mark_kernel_pmd(pud, addr, next); + } while (pud++, addr = next, addr != end); +} + +static void mark_kernel_p4d(pgd_t *pgd, unsigned long addr, unsigned long end) +{ + unsigned long next; + struct page *page; + p4d_t *p4d; + int i; + + p4d = p4d_offset(pgd, addr); + do { + next = p4d_addr_end(addr, end); + if (p4d_none(*p4d)) + continue; + if (!p4d_folded(*p4d)) { + page = virt_to_page(p4d_val(*p4d)); + for (i = 0; i < 3; i++) + set_bit(PG_arch_1, &page[i].flags); + } + mark_kernel_pud(p4d, addr, next); + } while (p4d++, addr = next, addr != end); +} + +static void mark_kernel_pgd(void) +{ + unsigned long addr, next; + struct page *page; + pgd_t *pgd; + int i; + + addr = 0; + pgd = pgd_offset_k(addr); + do { + next = pgd_addr_end(addr, 
MODULES_END); + if (pgd_none(*pgd)) + continue; + if (!pgd_folded(*pgd)) { + page = virt_to_page(pgd_val(*pgd)); + for (i = 0; i < 3; i++) + set_bit(PG_arch_1, &page[i].flags); + } + mark_kernel_p4d(pgd, addr, next); + } while (pgd++, addr = next, addr != MODULES_END); +} + +void __init cmma_init_nodat(void) +{ + struct memblock_region *reg; + struct page *page; + unsigned long start, end, ix; + + if (cmma_flag < 2) + return; + /* Mark pages used in kernel page tables */ + mark_kernel_pgd(); + + /* Set all kernel pages not used for page tables to stable/no-dat */ + for_each_memblock(memory, reg) { + start = memblock_region_memory_base_pfn(reg); + end = memblock_region_memory_end_pfn(reg); + page = pfn_to_page(start); + for (ix = start; ix < end; ix++, page++) { + if (__test_and_clear_bit(PG_arch_1, &page->flags)) + continue; /* skip page table pages */ + if (!list_empty(&page->lru)) + continue; /* skip free pages */ + set_page_stable_nodat(page, 0); + } + } +} + +void arch_free_page(struct page *page, int order) +{ + if (!cmma_flag) + return; + set_page_unused(page, order); } void arch_alloc_page(struct page *page, int order) { if (!cmma_flag) return; - set_page_stable(page, order); + if (cmma_flag < 2) + set_page_stable_dat(page, order); + else + set_page_stable_nodat(page, order); +} + +void arch_set_page_dat(struct page *page, int order) +{ + if (!cmma_flag) + return; + set_page_stable_dat(page, order); +} + +void arch_set_page_nodat(struct page *page, int order) +{ + if (cmma_flag < 2) + return; + set_page_stable_nodat(page, order); +} + +int arch_test_page_nodat(struct page *page) +{ + unsigned char state; + + if (cmma_flag < 2) + return 0; + state = get_page_state(page); + return !!(state & 0x20); } void arch_set_page_states(int make_stable) @@ -108,9 +270,9 @@ void arch_set_page_states(int make_stable) list_for_each(l, &zone->free_area[order].free_list[t]) { page = list_entry(l, struct page, lru); if (make_stable) - set_page_stable(page, order); + set_page_stable_dat(page, 0); else - set_page_unstable(page, order); + set_page_unused(page, order); } } spin_unlock_irqrestore(&zone->lock, flags); diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c index 180481589246..552f898dfa74 100644 --- a/arch/s390/mm/pageattr.c +++ b/arch/s390/mm/pageattr.c @@ -7,6 +7,7 @@ #include <asm/cacheflush.h> #include <asm/facility.h> #include <asm/pgtable.h> +#include <asm/pgalloc.h> #include <asm/page.h> #include <asm/set_memory.h> @@ -191,7 +192,7 @@ static int split_pud_page(pud_t *pudp, unsigned long addr) pud_t new; int i, ro, nx; - pm_dir = vmem_pmd_alloc(); + pm_dir = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY); if (!pm_dir) return -ENOMEM; pmd_addr = pud_pfn(*pudp) << PAGE_SHIFT; @@ -328,7 +329,7 @@ static void ipte_range(pte_t *pte, unsigned long address, int nr) return; } for (i = 0; i < nr; i++) { - __ptep_ipte(address, pte, IPTE_GLOBAL); + __ptep_ipte(address, pte, 0, 0, IPTE_GLOBAL); address += PAGE_SIZE; pte++; } diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index 18918e394ce4..c5b74dd61197 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -57,6 +57,7 @@ unsigned long *crst_table_alloc(struct mm_struct *mm) if (!page) return NULL; + arch_set_page_dat(page, 2); return (unsigned long *) page_to_phys(page); } @@ -82,7 +83,7 @@ int crst_table_upgrade(struct mm_struct *mm, unsigned long end) int rc, notify; /* upgrade should only happen from 3 to 4, 3 to 5, or 4 to 5 levels */ - BUG_ON(mm->context.asce_limit < (1UL << 42)); + BUG_ON(mm->context.asce_limit < 
_REGION2_SIZE); if (end >= TASK_SIZE_MAX) return -ENOMEM; rc = 0; @@ -95,11 +96,11 @@ int crst_table_upgrade(struct mm_struct *mm, unsigned long end) } spin_lock_bh(&mm->page_table_lock); pgd = (unsigned long *) mm->pgd; - if (mm->context.asce_limit == (1UL << 42)) { + if (mm->context.asce_limit == _REGION2_SIZE) { crst_table_init(table, _REGION2_ENTRY_EMPTY); p4d_populate(mm, (p4d_t *) table, (pud_t *) pgd); mm->pgd = (pgd_t *) table; - mm->context.asce_limit = 1UL << 53; + mm->context.asce_limit = _REGION1_SIZE; mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | _ASCE_USER_BITS | _ASCE_TYPE_REGION2; } else { @@ -123,7 +124,7 @@ void crst_table_downgrade(struct mm_struct *mm) pgd_t *pgd; /* downgrade should only happen from 3 to 2 levels (compat only) */ - BUG_ON(mm->context.asce_limit != (1UL << 42)); + BUG_ON(mm->context.asce_limit != _REGION2_SIZE); if (current->active_mm == mm) { clear_user_asce(); @@ -132,7 +133,7 @@ void crst_table_downgrade(struct mm_struct *mm) pgd = mm->pgd; mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN); - mm->context.asce_limit = 1UL << 31; + mm->context.asce_limit = _REGION3_SIZE; mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | _ASCE_USER_BITS | _ASCE_TYPE_SEGMENT; crst_table_free(mm, (unsigned long *) pgd); @@ -214,6 +215,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm) __free_page(page); return NULL; } + arch_set_page_dat(page, 0); /* Initialize page table */ table = (unsigned long *) page_to_phys(page); if (mm_alloc_pgste(mm)) { diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 4a1f7366b17a..4198a71b8fdd 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -25,8 +25,49 @@ #include <asm/mmu_context.h> #include <asm/page-states.h> +static inline void ptep_ipte_local(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, int nodat) +{ + unsigned long opt, asce; + + if (MACHINE_HAS_TLB_GUEST) { + opt = 0; + asce = READ_ONCE(mm->context.gmap_asce); + if (asce == 0UL || nodat) + opt |= IPTE_NODAT; + if (asce != -1UL) { + asce = asce ? : mm->context.asce; + opt |= IPTE_GUEST_ASCE; + } + __ptep_ipte(addr, ptep, opt, asce, IPTE_LOCAL); + } else { + __ptep_ipte(addr, ptep, 0, 0, IPTE_LOCAL); + } +} + +static inline void ptep_ipte_global(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, int nodat) +{ + unsigned long opt, asce; + + if (MACHINE_HAS_TLB_GUEST) { + opt = 0; + asce = READ_ONCE(mm->context.gmap_asce); + if (asce == 0UL || nodat) + opt |= IPTE_NODAT; + if (asce != -1UL) { + asce = asce ? 
: mm->context.asce; + opt |= IPTE_GUEST_ASCE; + } + __ptep_ipte(addr, ptep, opt, asce, IPTE_GLOBAL); + } else { + __ptep_ipte(addr, ptep, 0, 0, IPTE_GLOBAL); + } +} + static inline pte_t ptep_flush_direct(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) + unsigned long addr, pte_t *ptep, + int nodat) { pte_t old; @@ -36,15 +77,16 @@ static inline pte_t ptep_flush_direct(struct mm_struct *mm, atomic_inc(&mm->context.flush_count); if (MACHINE_HAS_TLB_LC && cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) - __ptep_ipte(addr, ptep, IPTE_LOCAL); + ptep_ipte_local(mm, addr, ptep, nodat); else - __ptep_ipte(addr, ptep, IPTE_GLOBAL); + ptep_ipte_global(mm, addr, ptep, nodat); atomic_dec(&mm->context.flush_count); return old; } static inline pte_t ptep_flush_lazy(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) + unsigned long addr, pte_t *ptep, + int nodat) { pte_t old; @@ -57,7 +99,7 @@ static inline pte_t ptep_flush_lazy(struct mm_struct *mm, pte_val(*ptep) |= _PAGE_INVALID; mm->context.flush_mm = 1; } else - __ptep_ipte(addr, ptep, IPTE_GLOBAL); + ptep_ipte_global(mm, addr, ptep, nodat); atomic_dec(&mm->context.flush_count); return old; } @@ -229,10 +271,12 @@ pte_t ptep_xchg_direct(struct mm_struct *mm, unsigned long addr, { pgste_t pgste; pte_t old; + int nodat; preempt_disable(); pgste = ptep_xchg_start(mm, addr, ptep); - old = ptep_flush_direct(mm, addr, ptep); + nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); + old = ptep_flush_direct(mm, addr, ptep, nodat); old = ptep_xchg_commit(mm, addr, ptep, pgste, old, new); preempt_enable(); return old; @@ -244,10 +288,12 @@ pte_t ptep_xchg_lazy(struct mm_struct *mm, unsigned long addr, { pgste_t pgste; pte_t old; + int nodat; preempt_disable(); pgste = ptep_xchg_start(mm, addr, ptep); - old = ptep_flush_lazy(mm, addr, ptep); + nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); + old = ptep_flush_lazy(mm, addr, ptep, nodat); old = ptep_xchg_commit(mm, addr, ptep, pgste, old, new); preempt_enable(); return old; @@ -259,10 +305,12 @@ pte_t ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, { pgste_t pgste; pte_t old; + int nodat; preempt_disable(); pgste = ptep_xchg_start(mm, addr, ptep); - old = ptep_flush_lazy(mm, addr, ptep); + nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); + old = ptep_flush_lazy(mm, addr, ptep, nodat); if (mm_has_pgste(mm)) { pgste = pgste_update_all(old, pgste, mm); pgste_set(ptep, pgste); @@ -290,6 +338,28 @@ void ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, } EXPORT_SYMBOL(ptep_modify_prot_commit); +static inline void pmdp_idte_local(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp) +{ + if (MACHINE_HAS_TLB_GUEST) + __pmdp_idte(addr, pmdp, IDTE_NODAT | IDTE_GUEST_ASCE, + mm->context.asce, IDTE_LOCAL); + else + __pmdp_idte(addr, pmdp, 0, 0, IDTE_LOCAL); +} + +static inline void pmdp_idte_global(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp) +{ + if (MACHINE_HAS_TLB_GUEST) + __pmdp_idte(addr, pmdp, IDTE_NODAT | IDTE_GUEST_ASCE, + mm->context.asce, IDTE_GLOBAL); + else if (MACHINE_HAS_IDTE) + __pmdp_idte(addr, pmdp, 0, 0, IDTE_GLOBAL); + else + __pmdp_csp(pmdp); +} + static inline pmd_t pmdp_flush_direct(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp) { @@ -298,16 +368,12 @@ static inline pmd_t pmdp_flush_direct(struct mm_struct *mm, old = *pmdp; if (pmd_val(old) & _SEGMENT_ENTRY_INVALID) return old; - if (!MACHINE_HAS_IDTE) { - __pmdp_csp(pmdp); - return old; - } atomic_inc(&mm->context.flush_count); if (MACHINE_HAS_TLB_LC && 
cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) - __pmdp_idte(addr, pmdp, IDTE_LOCAL); + pmdp_idte_local(mm, addr, pmdp); else - __pmdp_idte(addr, pmdp, IDTE_GLOBAL); + pmdp_idte_global(mm, addr, pmdp); atomic_dec(&mm->context.flush_count); return old; } @@ -325,10 +391,9 @@ static inline pmd_t pmdp_flush_lazy(struct mm_struct *mm, cpumask_of(smp_processor_id()))) { pmd_val(*pmdp) |= _SEGMENT_ENTRY_INVALID; mm->context.flush_mm = 1; - } else if (MACHINE_HAS_IDTE) - __pmdp_idte(addr, pmdp, IDTE_GLOBAL); - else - __pmdp_csp(pmdp); + } else { + pmdp_idte_global(mm, addr, pmdp); + } atomic_dec(&mm->context.flush_count); return old; } @@ -359,28 +424,46 @@ pmd_t pmdp_xchg_lazy(struct mm_struct *mm, unsigned long addr, } EXPORT_SYMBOL(pmdp_xchg_lazy); -static inline pud_t pudp_flush_direct(struct mm_struct *mm, - unsigned long addr, pud_t *pudp) +static inline void pudp_idte_local(struct mm_struct *mm, + unsigned long addr, pud_t *pudp) { - pud_t old; + if (MACHINE_HAS_TLB_GUEST) + __pudp_idte(addr, pudp, IDTE_NODAT | IDTE_GUEST_ASCE, + mm->context.asce, IDTE_LOCAL); + else + __pudp_idte(addr, pudp, 0, 0, IDTE_LOCAL); +} - old = *pudp; - if (pud_val(old) & _REGION_ENTRY_INVALID) - return old; - if (!MACHINE_HAS_IDTE) { +static inline void pudp_idte_global(struct mm_struct *mm, + unsigned long addr, pud_t *pudp) +{ + if (MACHINE_HAS_TLB_GUEST) + __pudp_idte(addr, pudp, IDTE_NODAT | IDTE_GUEST_ASCE, + mm->context.asce, IDTE_GLOBAL); + else if (MACHINE_HAS_IDTE) + __pudp_idte(addr, pudp, 0, 0, IDTE_GLOBAL); + else /* * Invalid bit position is the same for pmd and pud, so we can * re-use _pmd_csp() here */ __pmdp_csp((pmd_t *) pudp); +} + +static inline pud_t pudp_flush_direct(struct mm_struct *mm, + unsigned long addr, pud_t *pudp) +{ + pud_t old; + + old = *pudp; + if (pud_val(old) & _REGION_ENTRY_INVALID) return old; - } atomic_inc(&mm->context.flush_count); if (MACHINE_HAS_TLB_LC && cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) - __pudp_idte(addr, pudp, IDTE_LOCAL); + pudp_idte_local(mm, addr, pudp); else - __pudp_idte(addr, pudp, IDTE_GLOBAL); + pudp_idte_global(mm, addr, pudp); atomic_dec(&mm->context.flush_count); return old; } @@ -482,7 +565,7 @@ int ptep_force_prot(struct mm_struct *mm, unsigned long addr, { pte_t entry; pgste_t pgste; - int pte_i, pte_p; + int pte_i, pte_p, nodat; pgste = pgste_get_lock(ptep); entry = *ptep; @@ -495,13 +578,14 @@ int ptep_force_prot(struct mm_struct *mm, unsigned long addr, return -EAGAIN; } /* Change access rights and set pgste bit */ + nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); if (prot == PROT_NONE && !pte_i) { - ptep_flush_direct(mm, addr, ptep); + ptep_flush_direct(mm, addr, ptep, nodat); pgste = pgste_update_all(entry, pgste, mm); pte_val(entry) |= _PAGE_INVALID; } if (prot == PROT_READ && !pte_p) { - ptep_flush_direct(mm, addr, ptep); + ptep_flush_direct(mm, addr, ptep, nodat); pte_val(entry) &= ~_PAGE_INVALID; pte_val(entry) |= _PAGE_PROTECT; } @@ -541,10 +625,12 @@ int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr, void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep) { pgste_t pgste; + int nodat; pgste = pgste_get_lock(ptep); /* notifier is called by the caller */ - ptep_flush_direct(mm, saddr, ptep); + nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); + ptep_flush_direct(mm, saddr, ptep, nodat); /* don't touch the storage key - it belongs to parent pgste */ pgste = pgste_set_pte(ptep, pgste, __pte(_PAGE_INVALID)); pgste_set_unlock(ptep, pgste); @@ -617,6 +703,7 @@ bool 
test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long addr) pte_t *ptep; pte_t pte; bool dirty; + int nodat; pgd = pgd_offset(mm, addr); p4d = p4d_alloc(mm, pgd, addr); @@ -645,7 +732,8 @@ bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long addr) pte = *ptep; if (dirty && (pte_val(pte) & _PAGE_PRESENT)) { pgste = pgste_pte_notify(mm, addr, ptep, pgste); - __ptep_ipte(addr, ptep, IPTE_GLOBAL); + nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); + ptep_ipte_global(mm, addr, ptep, nodat); if (MACHINE_HAS_ESOP || !(pte_val(pte) & _PAGE_WRITE)) pte_val(pte) |= _PAGE_PROTECT; else diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index d8398962a723..c0af0d7b6e5f 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -38,37 +38,14 @@ static void __ref *vmem_alloc_pages(unsigned int order) return (void *) memblock_alloc(size, size); } -static inline p4d_t *vmem_p4d_alloc(void) +void *vmem_crst_alloc(unsigned long val) { - p4d_t *p4d = NULL; + unsigned long *table; - p4d = vmem_alloc_pages(2); - if (!p4d) - return NULL; - clear_table((unsigned long *) p4d, _REGION2_ENTRY_EMPTY, PAGE_SIZE * 4); - return p4d; -} - -static inline pud_t *vmem_pud_alloc(void) -{ - pud_t *pud = NULL; - - pud = vmem_alloc_pages(2); - if (!pud) - return NULL; - clear_table((unsigned long *) pud, _REGION3_ENTRY_EMPTY, PAGE_SIZE * 4); - return pud; -} - -pmd_t *vmem_pmd_alloc(void) -{ - pmd_t *pmd = NULL; - - pmd = vmem_alloc_pages(2); - if (!pmd) - return NULL; - clear_table((unsigned long *) pmd, _SEGMENT_ENTRY_EMPTY, PAGE_SIZE * 4); - return pmd; + table = vmem_alloc_pages(CRST_ALLOC_ORDER); + if (table) + crst_table_init(table, val); + return table; } pte_t __ref *vmem_pte_alloc(void) @@ -114,14 +91,14 @@ static int vmem_add_mem(unsigned long start, unsigned long size) while (address < end) { pg_dir = pgd_offset_k(address); if (pgd_none(*pg_dir)) { - p4_dir = vmem_p4d_alloc(); + p4_dir = vmem_crst_alloc(_REGION2_ENTRY_EMPTY); if (!p4_dir) goto out; pgd_populate(&init_mm, pg_dir, p4_dir); } p4_dir = p4d_offset(pg_dir, address); if (p4d_none(*p4_dir)) { - pu_dir = vmem_pud_alloc(); + pu_dir = vmem_crst_alloc(_REGION3_ENTRY_EMPTY); if (!pu_dir) goto out; p4d_populate(&init_mm, p4_dir, pu_dir); @@ -136,7 +113,7 @@ static int vmem_add_mem(unsigned long start, unsigned long size) continue; } if (pud_none(*pu_dir)) { - pm_dir = vmem_pmd_alloc(); + pm_dir = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY); if (!pm_dir) goto out; pud_populate(&init_mm, pu_dir, pm_dir); @@ -253,7 +230,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node) for (address = start; address < end;) { pg_dir = pgd_offset_k(address); if (pgd_none(*pg_dir)) { - p4_dir = vmem_p4d_alloc(); + p4_dir = vmem_crst_alloc(_REGION2_ENTRY_EMPTY); if (!p4_dir) goto out; pgd_populate(&init_mm, pg_dir, p4_dir); @@ -261,7 +238,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node) p4_dir = p4d_offset(pg_dir, address); if (p4d_none(*p4_dir)) { - pu_dir = vmem_pud_alloc(); + pu_dir = vmem_crst_alloc(_REGION3_ENTRY_EMPTY); if (!pu_dir) goto out; p4d_populate(&init_mm, p4_dir, pu_dir); @@ -269,7 +246,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node) pu_dir = pud_offset(p4_dir, address); if (pud_none(*pu_dir)) { - pm_dir = vmem_pmd_alloc(); + pm_dir = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY); if (!pm_dir) goto out; pud_populate(&init_mm, pu_dir, pm_dir); diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 
1803797fc885..8ec88497a28d 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -1093,15 +1093,27 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i case BPF_JMP | BPF_JSGT | BPF_K: /* ((s64) dst > (s64) imm) */ mask = 0x2000; /* jh */ goto branch_ks; + case BPF_JMP | BPF_JSLT | BPF_K: /* ((s64) dst < (s64) imm) */ + mask = 0x4000; /* jl */ + goto branch_ks; case BPF_JMP | BPF_JSGE | BPF_K: /* ((s64) dst >= (s64) imm) */ mask = 0xa000; /* jhe */ goto branch_ks; + case BPF_JMP | BPF_JSLE | BPF_K: /* ((s64) dst <= (s64) imm) */ + mask = 0xc000; /* jle */ + goto branch_ks; case BPF_JMP | BPF_JGT | BPF_K: /* (dst_reg > imm) */ mask = 0x2000; /* jh */ goto branch_ku; + case BPF_JMP | BPF_JLT | BPF_K: /* (dst_reg < imm) */ + mask = 0x4000; /* jl */ + goto branch_ku; case BPF_JMP | BPF_JGE | BPF_K: /* (dst_reg >= imm) */ mask = 0xa000; /* jhe */ goto branch_ku; + case BPF_JMP | BPF_JLE | BPF_K: /* (dst_reg <= imm) */ + mask = 0xc000; /* jle */ + goto branch_ku; case BPF_JMP | BPF_JNE | BPF_K: /* (dst_reg != imm) */ mask = 0x7000; /* jne */ goto branch_ku; @@ -1119,15 +1131,27 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i case BPF_JMP | BPF_JSGT | BPF_X: /* ((s64) dst > (s64) src) */ mask = 0x2000; /* jh */ goto branch_xs; + case BPF_JMP | BPF_JSLT | BPF_X: /* ((s64) dst < (s64) src) */ + mask = 0x4000; /* jl */ + goto branch_xs; case BPF_JMP | BPF_JSGE | BPF_X: /* ((s64) dst >= (s64) src) */ mask = 0xa000; /* jhe */ goto branch_xs; + case BPF_JMP | BPF_JSLE | BPF_X: /* ((s64) dst <= (s64) src) */ + mask = 0xc000; /* jle */ + goto branch_xs; case BPF_JMP | BPF_JGT | BPF_X: /* (dst > src) */ mask = 0x2000; /* jh */ goto branch_xu; + case BPF_JMP | BPF_JLT | BPF_X: /* (dst < src) */ + mask = 0x4000; /* jl */ + goto branch_xu; case BPF_JMP | BPF_JGE | BPF_X: /* (dst >= src) */ mask = 0xa000; /* jhe */ goto branch_xu; + case BPF_JMP | BPF_JLE | BPF_X: /* (dst <= src) */ + mask = 0xc000; /* jle */ + goto branch_xu; case BPF_JMP | BPF_JNE | BPF_X: /* (dst != src) */ mask = 0x7000; /* jne */ goto branch_xu; diff --git a/arch/s390/pci/pci_clp.c b/arch/s390/pci/pci_clp.c index bd534b4d40e3..0ae3936e266f 100644 --- a/arch/s390/pci/pci_clp.c +++ b/arch/s390/pci/pci_clp.c @@ -24,6 +24,14 @@ bool zpci_unique_uid; +static void update_uid_checking(bool new) +{ + if (zpci_unique_uid != new) + zpci_dbg(1, "uid checking:%d\n", new); + + zpci_unique_uid = new; +} + static inline void zpci_err_clp(unsigned int rsp, int rc) { struct { @@ -319,7 +327,7 @@ static int clp_list_pci(struct clp_req_rsp_list_pci *rrb, void *data, goto out; } - zpci_unique_uid = rrb->response.uid_checking; + update_uid_checking(rrb->response.uid_checking); WARN_ON_ONCE(rrb->response.entry_size != sizeof(struct clp_fh_list_entry)); diff --git a/arch/s390/tools/gen_facilities.c b/arch/s390/tools/gen_facilities.c index 025ea20fc4b4..29d72bf8ed2b 100644 --- a/arch/s390/tools/gen_facilities.c +++ b/arch/s390/tools/gen_facilities.c @@ -41,7 +41,7 @@ static struct facility_def facility_defs[] = { 27, /* mvcos */ 32, /* compare and swap and store */ 33, /* compare and swap and store 2 */ - 34, /* general extension facility */ + 34, /* general instructions extension */ 35, /* execute extensions */ #endif #ifdef CONFIG_HAVE_MARCH_Z196_FEATURES @@ -54,6 +54,9 @@ static struct facility_def facility_defs[] = { #ifdef CONFIG_HAVE_MARCH_Z13_FEATURES 53, /* load-and-zero-rightmost-byte, etc. 
*/ #endif +#ifdef CONFIG_HAVE_MARCH_Z14_FEATURES + 58, /* miscellaneous-instruction-extension 2 */ +#endif -1 /* END */ } }, diff --git a/arch/sh/configs/se7751_defconfig b/arch/sh/configs/se7751_defconfig index 75c92fc1876b..56b5e4ce8d4a 100644 --- a/arch/sh/configs/se7751_defconfig +++ b/arch/sh/configs/se7751_defconfig @@ -28,7 +28,6 @@ CONFIG_IP_PNP_RARP=y # CONFIG_INET_LRO is not set # CONFIG_IPV6 is not set CONFIG_NETFILTER=y -CONFIG_NETFILTER_DEBUG=y CONFIG_IP_NF_QUEUE=y CONFIG_MTD=y CONFIG_MTD_PARTITIONS=y diff --git a/arch/sh/include/asm/futex.h b/arch/sh/include/asm/futex.h index d0078747d308..8f8cf941a8cd 100644 --- a/arch/sh/include/asm/futex.h +++ b/arch/sh/include/asm/futex.h @@ -27,21 +27,12 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, return atomic_futex_op_cmpxchg_inatomic(uval, uaddr, oldval, newval); } -static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) +static inline int arch_futex_atomic_op_inuser(int op, u32 oparg, int *oval, + u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - u32 oparg = (encoded_op << 8) >> 20; - u32 cmparg = (encoded_op << 20) >> 20; u32 oldval, newval, prev; int ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; - pagefault_disable(); do { @@ -80,17 +71,8 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = ((int)oldval < (int)cmparg); break; - case FUTEX_OP_CMP_GE: ret = ((int)oldval >= (int)cmparg); break; - case FUTEX_OP_CMP_LE: ret = ((int)oldval <= (int)cmparg); break; - case FUTEX_OP_CMP_GT: ret = ((int)oldval > (int)cmparg); break; - default: ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; return ret; } diff --git a/arch/sh/include/asm/spinlock-cas.h b/arch/sh/include/asm/spinlock-cas.h index c46e8cc7b515..5ed7dbbd94ff 100644 --- a/arch/sh/include/asm/spinlock-cas.h +++ b/arch/sh/include/asm/spinlock-cas.h @@ -29,11 +29,6 @@ static inline unsigned __sl_cas(volatile unsigned *p, unsigned old, unsigned new #define arch_spin_is_locked(x) ((x)->lock <= 0) #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) -static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - smp_cond_load_acquire(&lock->lock, VAL > 0); -} - static inline void arch_spin_lock(arch_spinlock_t *lock) { while (!__sl_cas(&lock->lock, 1, 0)); diff --git a/arch/sh/include/asm/spinlock-llsc.h b/arch/sh/include/asm/spinlock-llsc.h index cec78143fa83..f77263aae760 100644 --- a/arch/sh/include/asm/spinlock-llsc.h +++ b/arch/sh/include/asm/spinlock-llsc.h @@ -21,11 +21,6 @@ #define arch_spin_is_locked(x) ((x)->lock <= 0) #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) -static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - smp_cond_load_acquire(&lock->lock, VAL > 0); -} - /* * Simple spin lock operations. There are two variants, one clears IRQ's * on the local processor, one does not. 
diff --git a/arch/sparc/crypto/aes_glue.c b/arch/sparc/crypto/aes_glue.c index c90930de76ba..3cd4f6b198b6 100644 --- a/arch/sparc/crypto/aes_glue.c +++ b/arch/sparc/crypto/aes_glue.c @@ -344,8 +344,7 @@ static void ctr_crypt_final(struct crypto_sparc64_aes_ctx *ctx, ctx->ops->ecb_encrypt(&ctx->key[0], (const u64 *)ctrblk, keystream, AES_BLOCK_SIZE); - crypto_xor((u8 *) keystream, src, nbytes); - memcpy(dst, keystream, nbytes); + crypto_xor_cpy(dst, (u8 *) keystream, src, nbytes); crypto_inc(ctrblk, AES_BLOCK_SIZE); } diff --git a/arch/sparc/include/asm/atomic_32.h b/arch/sparc/include/asm/atomic_32.h index ee3f11c43cda..7643e979e333 100644 --- a/arch/sparc/include/asm/atomic_32.h +++ b/arch/sparc/include/asm/atomic_32.h @@ -29,6 +29,8 @@ int atomic_xchg(atomic_t *, int); int __atomic_add_unless(atomic_t *, int, int); void atomic_set(atomic_t *, int); +#define atomic_set_release(v, i) atomic_set((v), (i)) + #define atomic_read(v) ACCESS_ONCE((v)->counter) #define atomic_add(i, v) ((void)atomic_add_return( (int)(i), (v))) diff --git a/arch/sparc/include/asm/futex_64.h b/arch/sparc/include/asm/futex_64.h index 4e899b0dabf7..1cfd89d92208 100644 --- a/arch/sparc/include/asm/futex_64.h +++ b/arch/sparc/include/asm/futex_64.h @@ -29,22 +29,14 @@ : "r" (uaddr), "r" (oparg), "i" (-EFAULT) \ : "memory") -static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) +static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, + u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret, tem; - if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))) - return -EFAULT; if (unlikely((((unsigned long) uaddr) & 0x3UL))) return -EINVAL; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - pagefault_disable(); switch (op) { @@ -69,17 +61,9 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } diff --git a/arch/sparc/include/asm/spinlock_32.h b/arch/sparc/include/asm/spinlock_32.h index 8011e79f59c9..67345b2dc408 100644 --- a/arch/sparc/include/asm/spinlock_32.h +++ b/arch/sparc/include/asm/spinlock_32.h @@ -14,11 +14,6 @@ #define arch_spin_is_locked(lock) (*((volatile unsigned char *)(lock)) != 0) -static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - smp_cond_load_acquire(&lock->lock, !VAL); -} - static inline void arch_spin_lock(arch_spinlock_t *lock) { __asm__ __volatile__( diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h index 186fd8199f54..b2f5c50d0947 100644 --- a/arch/sparc/include/uapi/asm/socket.h +++ b/arch/sparc/include/uapi/asm/socket.h @@ -98,6 +98,8 @@ #define SO_PEERGROUPS 0x003d +#define SO_ZEROCOPY 0x003e + /* Security levels - as per NRL IPv6 - don't actually do anything */ #define SO_SECURITY_AUTHENTICATION 0x5001 #define SO_SECURITY_ENCRYPTION_TRANSPORT 0x5002 diff --git a/arch/sparc/net/bpf_jit_comp_64.c b/arch/sparc/net/bpf_jit_comp_64.c index 
8799ae9a8788..c340af7b1371 100644 --- a/arch/sparc/net/bpf_jit_comp_64.c +++ b/arch/sparc/net/bpf_jit_comp_64.c @@ -128,6 +128,8 @@ static u32 WDISP10(u32 off) #define BA (BRANCH | CONDA) #define BG (BRANCH | CONDG) +#define BL (BRANCH | CONDL) +#define BLE (BRANCH | CONDLE) #define BGU (BRANCH | CONDGU) #define BLEU (BRANCH | CONDLEU) #define BGE (BRANCH | CONDGE) @@ -715,9 +717,15 @@ static int emit_compare_and_branch(const u8 code, const u8 dst, u8 src, case BPF_JGT: br_opcode = BGU; break; + case BPF_JLT: + br_opcode = BLU; + break; case BPF_JGE: br_opcode = BGEU; break; + case BPF_JLE: + br_opcode = BLEU; + break; case BPF_JSET: case BPF_JNE: br_opcode = BNE; @@ -725,9 +733,15 @@ static int emit_compare_and_branch(const u8 code, const u8 dst, u8 src, case BPF_JSGT: br_opcode = BG; break; + case BPF_JSLT: + br_opcode = BL; + break; case BPF_JSGE: br_opcode = BGE; break; + case BPF_JSLE: + br_opcode = BLE; + break; default: /* Make sure we dont leak kernel information to the * user. @@ -746,18 +760,30 @@ static int emit_compare_and_branch(const u8 code, const u8 dst, u8 src, case BPF_JGT: cbcond_opcode = CBCONDGU; break; + case BPF_JLT: + cbcond_opcode = CBCONDLU; + break; case BPF_JGE: cbcond_opcode = CBCONDGEU; break; + case BPF_JLE: + cbcond_opcode = CBCONDLEU; + break; case BPF_JNE: cbcond_opcode = CBCONDNE; break; case BPF_JSGT: cbcond_opcode = CBCONDG; break; + case BPF_JSLT: + cbcond_opcode = CBCONDL; + break; case BPF_JSGE: cbcond_opcode = CBCONDGE; break; + case BPF_JSLE: + cbcond_opcode = CBCONDLE; + break; default: /* Make sure we dont leak kernel information to the * user. @@ -1176,10 +1202,14 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx) /* IF (dst COND src) JUMP off */ case BPF_JMP | BPF_JEQ | BPF_X: case BPF_JMP | BPF_JGT | BPF_X: + case BPF_JMP | BPF_JLT | BPF_X: case BPF_JMP | BPF_JGE | BPF_X: + case BPF_JMP | BPF_JLE | BPF_X: case BPF_JMP | BPF_JNE | BPF_X: case BPF_JMP | BPF_JSGT | BPF_X: + case BPF_JMP | BPF_JSLT | BPF_X: case BPF_JMP | BPF_JSGE | BPF_X: + case BPF_JMP | BPF_JSLE | BPF_X: case BPF_JMP | BPF_JSET | BPF_X: { int err; @@ -1191,10 +1221,14 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx) /* IF (dst COND imm) JUMP off */ case BPF_JMP | BPF_JEQ | BPF_K: case BPF_JMP | BPF_JGT | BPF_K: + case BPF_JMP | BPF_JLT | BPF_K: case BPF_JMP | BPF_JGE | BPF_K: + case BPF_JMP | BPF_JLE | BPF_K: case BPF_JMP | BPF_JNE | BPF_K: case BPF_JMP | BPF_JSGT | BPF_K: + case BPF_JMP | BPF_JSLT | BPF_K: case BPF_JMP | BPF_JSGE | BPF_K: + case BPF_JMP | BPF_JSLE | BPF_K: case BPF_JMP | BPF_JSET | BPF_K: { int err; diff --git a/arch/tile/include/asm/atomic_32.h b/arch/tile/include/asm/atomic_32.h index a93774255136..53a423e7cb92 100644 --- a/arch/tile/include/asm/atomic_32.h +++ b/arch/tile/include/asm/atomic_32.h @@ -101,6 +101,8 @@ static inline void atomic_set(atomic_t *v, int n) _atomic_xchg(&v->counter, n); } +#define atomic_set_release(v, i) atomic_set((v), (i)) + /* A 64bit atomic type */ typedef struct { diff --git a/arch/tile/include/asm/futex.h b/arch/tile/include/asm/futex.h index e64a1b75fc38..83c1e639b411 100644 --- a/arch/tile/include/asm/futex.h +++ b/arch/tile/include/asm/futex.h @@ -106,12 +106,9 @@ lock = __atomic_hashed_lock((int __force *)uaddr) #endif -static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) +static inline int arch_futex_atomic_op_inuser(int op, u32 oparg, int *oval, + u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int 
oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int uninitialized_var(val), ret; __futex_prolog(); @@ -119,12 +116,6 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) /* The 32-bit futex code makes this assumption, so validate it here. */ BUILD_BUG_ON(sizeof(atomic_t) != sizeof(int)); - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; - pagefault_disable(); switch (op) { case FUTEX_OP_SET: @@ -148,30 +139,9 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) } pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: - ret = (val == cmparg); - break; - case FUTEX_OP_CMP_NE: - ret = (val != cmparg); - break; - case FUTEX_OP_CMP_LT: - ret = (val < cmparg); - break; - case FUTEX_OP_CMP_GE: - ret = (val >= cmparg); - break; - case FUTEX_OP_CMP_LE: - ret = (val <= cmparg); - break; - case FUTEX_OP_CMP_GT: - ret = (val > cmparg); - break; - default: - ret = -ENOSYS; - } - } + if (!ret) + *oval = val; + return ret; } diff --git a/arch/tile/include/asm/spinlock_32.h b/arch/tile/include/asm/spinlock_32.h index b14b1ba5bf9c..cba8ba9b8da6 100644 --- a/arch/tile/include/asm/spinlock_32.h +++ b/arch/tile/include/asm/spinlock_32.h @@ -64,8 +64,6 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock) lock->current_ticket = old_ticket + TICKET_QUANTUM; } -void arch_spin_unlock_wait(arch_spinlock_t *lock); - /* * Read-write spinlocks, allowing multiple readers * but only one writer. diff --git a/arch/tile/include/asm/spinlock_64.h b/arch/tile/include/asm/spinlock_64.h index b9718fb4e74a..9a2c2d605752 100644 --- a/arch/tile/include/asm/spinlock_64.h +++ b/arch/tile/include/asm/spinlock_64.h @@ -58,8 +58,6 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock) __insn_fetchadd4(&lock->lock, 1U << __ARCH_SPIN_CURRENT_SHIFT); } -void arch_spin_unlock_wait(arch_spinlock_t *lock); - void arch_spin_lock_slow(arch_spinlock_t *lock, u32 val); /* Grab the "next" ticket number and bump it atomically. diff --git a/arch/tile/lib/spinlock_32.c b/arch/tile/lib/spinlock_32.c index 076c6cc43113..db9333f2447c 100644 --- a/arch/tile/lib/spinlock_32.c +++ b/arch/tile/lib/spinlock_32.c @@ -62,29 +62,6 @@ int arch_spin_trylock(arch_spinlock_t *lock) } EXPORT_SYMBOL(arch_spin_trylock); -void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - u32 iterations = 0; - int curr = READ_ONCE(lock->current_ticket); - int next = READ_ONCE(lock->next_ticket); - - /* Return immediately if unlocked. */ - if (next == curr) - return; - - /* Wait until the current locker has released the lock. */ - do { - delay_backoff(iterations++); - } while (READ_ONCE(lock->current_ticket) == curr); - - /* - * The TILE architecture doesn't do read speculation; therefore - * a control dependency guarantees a LOAD->{LOAD,STORE} order. - */ - barrier(); -} -EXPORT_SYMBOL(arch_spin_unlock_wait); - /* * The low byte is always reserved to be the marker for a "tns" operation * since the low bit is set to "1" by a tns. 
The next seven bits are diff --git a/arch/tile/lib/spinlock_64.c b/arch/tile/lib/spinlock_64.c index a4b5b2cbce93..de414c22892f 100644 --- a/arch/tile/lib/spinlock_64.c +++ b/arch/tile/lib/spinlock_64.c @@ -62,28 +62,6 @@ int arch_spin_trylock(arch_spinlock_t *lock) } EXPORT_SYMBOL(arch_spin_trylock); -void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - u32 iterations = 0; - u32 val = READ_ONCE(lock->lock); - u32 curr = arch_spin_current(val); - - /* Return immediately if unlocked. */ - if (arch_spin_next(val) == curr) - return; - - /* Wait until the current locker has released the lock. */ - do { - delay_backoff(iterations++); - } while (arch_spin_current(READ_ONCE(lock->lock)) == curr); - - /* - * The TILE architecture doesn't do read speculation; therefore - * a control dependency guarantees a LOAD->{LOAD,STORE} order. - */ - barrier(); -} -EXPORT_SYMBOL(arch_spin_unlock_wait); /* * If the read lock fails due to a writer, we retry periodically diff --git a/arch/um/include/asm/unwind.h b/arch/um/include/asm/unwind.h new file mode 100644 index 000000000000..7ffa5437b761 --- /dev/null +++ b/arch/um/include/asm/unwind.h @@ -0,0 +1,8 @@ +#ifndef _ASM_UML_UNWIND_H +#define _ASM_UML_UNWIND_H + +static inline void +unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size, + void *orc, size_t orc_size) {} + +#endif /* _ASM_UML_UNWIND_H */ diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild index 586b786b3edf..0038a2d10a7a 100644 --- a/arch/x86/Kbuild +++ b/arch/x86/Kbuild @@ -8,10 +8,7 @@ obj-$(CONFIG_KVM) += kvm/ obj-$(CONFIG_XEN) += xen/ # Hyper-V paravirtualization support -obj-$(CONFIG_HYPERVISOR_GUEST) += hyperv/ - -# lguest paravirtualization support -obj-$(CONFIG_LGUEST_GUEST) += lguest/ +obj-$(subst m,y,$(CONFIG_HYPERV)) += hyperv/ obj-y += realmode/ obj-y += kernel/ diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e4844e934728..4b278a33ccbb 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -55,6 +55,8 @@ config X86 select ARCH_HAS_KCOV if X86_64 select ARCH_HAS_MMIO_FLUSH select ARCH_HAS_PMEM_API if X86_64 + # Causing hangs/crashes, see the commit that added this change for details. 
+ select ARCH_HAS_REFCOUNT if BROKEN select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64 select ARCH_HAS_SET_MEMORY select ARCH_HAS_SG_CHAIN @@ -73,7 +75,6 @@ config X86 select ARCH_USE_QUEUED_RWLOCKS select ARCH_USE_QUEUED_SPINLOCKS select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH - select ARCH_WANT_FRAME_POINTERS select ARCH_WANTS_DYNAMIC_TASK_STRUCT select ARCH_WANTS_THP_SWAP if X86_64 select BUILDTIME_EXTABLE_SORT @@ -158,6 +159,7 @@ config X86 select HAVE_MEMBLOCK select HAVE_MEMBLOCK_NODE_MAP select HAVE_MIXED_BREAKPOINTS_REGS + select HAVE_MOD_ARCH_SPECIFIC select HAVE_NMI select HAVE_OPROFILE select HAVE_OPTPROBES @@ -167,8 +169,9 @@ config X86 select HAVE_HARDLOCKUP_DETECTOR_PERF if PERF_EVENTS && HAVE_PERF_EVENTS_NMI select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP + select HAVE_RCU_TABLE_FREE select HAVE_REGS_AND_STACK_ACCESS_API - select HAVE_RELIABLE_STACKTRACE if X86_64 && FRAME_POINTER && STACK_VALIDATION + select HAVE_RELIABLE_STACKTRACE if X86_64 && FRAME_POINTER_UNWINDER && STACK_VALIDATION select HAVE_STACK_VALIDATION if X86_64 select HAVE_SYSCALL_TRACEPOINTS select HAVE_UNSTABLE_SCHED_CLOCK @@ -426,16 +429,16 @@ config GOLDFISH def_bool y depends on X86_GOLDFISH -config INTEL_RDT_A - bool "Intel Resource Director Technology Allocation support" +config INTEL_RDT + bool "Intel Resource Director Technology support" default n depends on X86 && CPU_SUP_INTEL select KERNFS help - Select to enable resource allocation which is a sub-feature of - Intel Resource Director Technology(RDT). More information about - RDT can be found in the Intel x86 Architecture Software - Developer Manual. + Select to enable resource allocation and monitoring which are + sub-features of Intel Resource Director Technology(RDT). More + information about RDT can be found in the Intel x86 + Architecture Software Developer Manual. Say N if unsure. @@ -779,8 +782,6 @@ config KVM_DEBUG_FS Statistics are displayed in debugfs filesystem. Enabling this option may incur significant overhead. -source "arch/x86/lguest/Kconfig" - config PARAVIRT_TIME_ACCOUNTING bool "Paravirtual steal time accounting" depends on PARAVIRT @@ -1805,7 +1806,9 @@ config X86_SMAP config X86_INTEL_MPX prompt "Intel MPX (Memory Protection Extensions)" def_bool n - depends on CPU_SUP_INTEL + # Note: only available in 64-bit mode due to VMA flags shortage + depends on CPU_SUP_INTEL && X86_64 + select ARCH_USES_HIGH_VMA_FLAGS ---help--- MPX provides hardware features that can be used in conjunction with compiler-instrumented code to check diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index cd20ca0b4043..71a48a30fc84 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -305,8 +305,6 @@ config DEBUG_ENTRY Some of these sanity checks may slow down kernel entries and exits or otherwise impact performance. - This is currently used to help test NMI code. - If unsure, say N. config DEBUG_NMI_SELFTEST @@ -358,4 +356,61 @@ config PUNIT_ATOM_DEBUG The current power state can be read from /sys/kernel/debug/punit_atom/dev_power_state +choice + prompt "Choose kernel unwinder" + default FRAME_POINTER_UNWINDER + ---help--- + This determines which method will be used for unwinding kernel stack + traces for panics, oopses, bugs, warnings, perf, /proc/<pid>/stack, + livepatch, lockdep, and more. + +config FRAME_POINTER_UNWINDER + bool "Frame pointer unwinder" + select FRAME_POINTER + ---help--- + This option enables the frame pointer unwinder for unwinding kernel + stack traces. 
+ + The unwinder itself is fast and it uses less RAM than the ORC + unwinder, but the kernel text size will grow by ~3% and the kernel's + overall performance will degrade by roughly 5-10%. + + This option is recommended if you want to use the livepatch + consistency model, as this is currently the only way to get a + reliable stack trace (CONFIG_HAVE_RELIABLE_STACKTRACE). + +config ORC_UNWINDER + bool "ORC unwinder" + depends on X86_64 + select STACK_VALIDATION + ---help--- + This option enables the ORC (Oops Rewind Capability) unwinder for + unwinding kernel stack traces. It uses a custom data format which is + a simplified version of the DWARF Call Frame Information standard. + + This unwinder is more accurate across interrupt entry frames than the + frame pointer unwinder. It also enables a 5-10% performance + improvement across the entire kernel compared to frame pointers. + + Enabling this option will increase the kernel's runtime memory usage + by roughly 2-4MB, depending on your kernel config. + +config GUESS_UNWINDER + bool "Guess unwinder" + depends on EXPERT + ---help--- + This option enables the "guess" unwinder for unwinding kernel stack + traces. It scans the stack and reports every kernel text address it + finds. Some of the addresses it reports may be incorrect. + + While this option often produces false positives, it can still be + useful in many cases. Unlike the other unwinders, it has no runtime + overhead. + +endchoice + +config FRAME_POINTER + depends on !ORC_UNWINDER && !GUESS_UNWINDER + bool + endmenu diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 1e902f926be3..6276572259c8 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -14,9 +14,11 @@ endif # For gcc stack alignment is specified with -mpreferred-stack-boundary, # clang has the option -mstack-alignment for that purpose. ifneq ($(call cc-option, -mpreferred-stack-boundary=4),) - cc_stack_align_opt := -mpreferred-stack-boundary -else ifneq ($(call cc-option, -mstack-alignment=4),) - cc_stack_align_opt := -mstack-alignment + cc_stack_align4 := -mpreferred-stack-boundary=2 + cc_stack_align8 := -mpreferred-stack-boundary=3 +else ifneq ($(call cc-option, -mstack-alignment=16),) + cc_stack_align4 := -mstack-alignment=4 + cc_stack_align8 := -mstack-alignment=8 endif # How to compile the 16-bit code. Note we always compile for -march=i386; @@ -36,7 +38,7 @@ REALMODE_CFLAGS := $(M16_CFLAGS) -g -Os -D__KERNEL__ \ REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -ffreestanding) REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -fno-stack-protector) -REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), $(cc_stack_align_opt)=2) +REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), $(cc_stack_align4)) export REALMODE_CFLAGS # BITS is used as extension for files which are available in a 32 bit @@ -76,7 +78,7 @@ ifeq ($(CONFIG_X86_32),y) # Align the stack to the register width instead of using the default # alignment of 16 bytes. This reduces stack usage and the number of # alignment instructions. - KBUILD_CFLAGS += $(call cc-option,$(cc_stack_align_opt)=2) + KBUILD_CFLAGS += $(call cc-option,$(cc_stack_align4)) # Disable unit-at-a-time mode on pre-gcc-4.0 compilers, it makes gcc use # a lot more stack due to the lack of sharing of stacklots: @@ -115,7 +117,7 @@ else # default alignment which keep the stack *mis*aligned. # Furthermore an alignment to the register width reduces stack usage # and the number of alignment instructions. 
- KBUILD_CFLAGS += $(call cc-option,$(cc_stack_align_opt)=3) + KBUILD_CFLAGS += $(call cc-option,$(cc_stack_align8)) # Use -mskip-rax-setup if supported. KBUILD_CFLAGS += $(call cc-option,-mskip-rax-setup) @@ -232,9 +234,6 @@ KBUILD_CFLAGS += -Wno-sign-compare # KBUILD_CFLAGS += -fno-asynchronous-unwind-tables -KBUILD_CFLAGS += $(mflags-y) -KBUILD_AFLAGS += $(mflags-y) - archscripts: scripts_basic $(Q)$(MAKE) $(build)=arch/x86/tools relocs diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index a1686f3dc295..e56dbc67e837 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -767,7 +767,7 @@ static efi_status_t setup_e820(struct boot_params *params, m |= (u64)efi->efi_memmap_hi << 32; #endif - d = (efi_memory_desc_t *)(m + (i * efi->efi_memdesc_size)); + d = efi_early_memdesc_ptr(m, efi->efi_memdesc_size, i); switch (d->type) { case EFI_RESERVED_TYPE: case EFI_RUNTIME_SERVICES_CODE: @@ -1061,7 +1061,7 @@ struct boot_params *efi_main(struct efi_config *c, desc->s = DESC_TYPE_CODE_DATA; desc->dpl = 0; desc->p = 1; - desc->limit = 0xf; + desc->limit1 = 0xf; desc->avl = 0; desc->l = 0; desc->d = SEG_OP_SIZE_32BIT; @@ -1081,7 +1081,7 @@ struct boot_params *efi_main(struct efi_config *c, desc->s = DESC_TYPE_CODE_DATA; desc->dpl = 0; desc->p = 1; - desc->limit = 0xf; + desc->limit1 = 0xf; desc->avl = 0; if (IS_ENABLED(CONFIG_X86_64)) { desc->l = 1; @@ -1102,7 +1102,7 @@ struct boot_params *efi_main(struct efi_config *c, desc->s = DESC_TYPE_CODE_DATA; desc->dpl = 0; desc->p = 1; - desc->limit = 0xf; + desc->limit1 = 0xf; desc->avl = 0; desc->l = 0; desc->d = SEG_OP_SIZE_32BIT; @@ -1119,7 +1119,7 @@ struct boot_params *efi_main(struct efi_config *c, desc->s = 0; desc->dpl = 0; desc->p = 1; - desc->limit = 0x0; + desc->limit1 = 0x0; desc->avl = 0; desc->l = 0; desc->d = 0; diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index d85b9625e836..11c68cf53d4e 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -61,71 +61,6 @@ __HEAD ENTRY(startup_32) -#ifdef CONFIG_EFI_STUB - jmp preferred_addr - - /* - * We don't need the return address, so set up the stack so - * efi_main() can find its arguments. - */ -ENTRY(efi_pe_entry) - add $0x4, %esp - - call 1f -1: popl %esi - subl $1b, %esi - - popl %ecx - movl %ecx, efi32_config(%esi) /* Handle */ - popl %ecx - movl %ecx, efi32_config+8(%esi) /* EFI System table pointer */ - - /* Relocate efi_config->call() */ - leal efi32_config(%esi), %eax - add %esi, 40(%eax) - pushl %eax - - call make_boot_params - cmpl $0, %eax - je fail - movl %esi, BP_code32_start(%eax) - popl %ecx - pushl %eax - pushl %ecx - jmp 2f /* Skip efi_config initialization */ - -ENTRY(efi32_stub_entry) - add $0x4, %esp - popl %ecx - popl %edx - - call 1f -1: popl %esi - subl $1b, %esi - - movl %ecx, efi32_config(%esi) /* Handle */ - movl %edx, efi32_config+8(%esi) /* EFI System table pointer */ - - /* Relocate efi_config->call() */ - leal efi32_config(%esi), %eax - add %esi, 40(%eax) - pushl %eax -2: - call efi_main - cmpl $0, %eax - movl %eax, %esi - jne 2f -fail: - /* EFI init failed, so hang. 
*/ - hlt - jmp fail -2: - movl BP_code32_start(%esi), %eax - leal preferred_addr(%eax), %eax - jmp *%eax - -preferred_addr: -#endif cld /* * Test KEEP_SEGMENTS flag to see if the bootloader is asking @@ -208,6 +143,70 @@ preferred_addr: jmp *%eax ENDPROC(startup_32) +#ifdef CONFIG_EFI_STUB +/* + * We don't need the return address, so set up the stack so efi_main() can find + * its arguments. + */ +ENTRY(efi_pe_entry) + add $0x4, %esp + + call 1f +1: popl %esi + subl $1b, %esi + + popl %ecx + movl %ecx, efi32_config(%esi) /* Handle */ + popl %ecx + movl %ecx, efi32_config+8(%esi) /* EFI System table pointer */ + + /* Relocate efi_config->call() */ + leal efi32_config(%esi), %eax + add %esi, 40(%eax) + pushl %eax + + call make_boot_params + cmpl $0, %eax + je fail + movl %esi, BP_code32_start(%eax) + popl %ecx + pushl %eax + pushl %ecx + jmp 2f /* Skip efi_config initialization */ +ENDPROC(efi_pe_entry) + +ENTRY(efi32_stub_entry) + add $0x4, %esp + popl %ecx + popl %edx + + call 1f +1: popl %esi + subl $1b, %esi + + movl %ecx, efi32_config(%esi) /* Handle */ + movl %edx, efi32_config+8(%esi) /* EFI System table pointer */ + + /* Relocate efi_config->call() */ + leal efi32_config(%esi), %eax + add %esi, 40(%eax) + pushl %eax +2: + call efi_main + cmpl $0, %eax + movl %eax, %esi + jne 2f +fail: + /* EFI init failed, so hang. */ + hlt + jmp fail +2: + movl BP_code32_start(%esi), %eax + leal startup_32(%eax), %eax + jmp *%eax +ENDPROC(efi32_stub_entry) +#endif + .text relocated: diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index fbf4c32d0b62..b4a5d284391c 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -243,65 +243,6 @@ ENTRY(startup_64) * that maps our entire kernel(text+data+bss+brk), zero page * and command line. */ -#ifdef CONFIG_EFI_STUB - /* - * The entry point for the PE/COFF executable is efi_pe_entry, so - * only legacy boot loaders will execute this jmp. - */ - jmp preferred_addr - -ENTRY(efi_pe_entry) - movq %rcx, efi64_config(%rip) /* Handle */ - movq %rdx, efi64_config+8(%rip) /* EFI System table pointer */ - - leaq efi64_config(%rip), %rax - movq %rax, efi_config(%rip) - - call 1f -1: popq %rbp - subq $1b, %rbp - - /* - * Relocate efi_config->call(). - */ - addq %rbp, efi64_config+40(%rip) - - movq %rax, %rdi - call make_boot_params - cmpq $0,%rax - je fail - mov %rax, %rsi - leaq startup_32(%rip), %rax - movl %eax, BP_code32_start(%rsi) - jmp 2f /* Skip the relocation */ - -handover_entry: - call 1f -1: popq %rbp - subq $1b, %rbp - - /* - * Relocate efi_config->call(). - */ - movq efi_config(%rip), %rax - addq %rbp, 40(%rax) -2: - movq efi_config(%rip), %rdi - call efi_main - movq %rax,%rsi - cmpq $0,%rax - jne 2f -fail: - /* EFI init failed, so hang. */ - hlt - jmp fail -2: - movl BP_code32_start(%esi), %eax - leaq preferred_addr(%rax), %rax - jmp *%rax - -preferred_addr: -#endif /* Setup data segments. */ xorl %eax, %eax @@ -413,6 +354,59 @@ lvl5: jmp *%rax #ifdef CONFIG_EFI_STUB + +/* The entry point for the PE/COFF executable is efi_pe_entry. */ +ENTRY(efi_pe_entry) + movq %rcx, efi64_config(%rip) /* Handle */ + movq %rdx, efi64_config+8(%rip) /* EFI System table pointer */ + + leaq efi64_config(%rip), %rax + movq %rax, efi_config(%rip) + + call 1f +1: popq %rbp + subq $1b, %rbp + + /* + * Relocate efi_config->call(). 
+ */ + addq %rbp, efi64_config+40(%rip) + + movq %rax, %rdi + call make_boot_params + cmpq $0,%rax + je fail + mov %rax, %rsi + leaq startup_32(%rip), %rax + movl %eax, BP_code32_start(%rsi) + jmp 2f /* Skip the relocation */ + +handover_entry: + call 1f +1: popq %rbp + subq $1b, %rbp + + /* + * Relocate efi_config->call(). + */ + movq efi_config(%rip), %rax + addq %rbp, 40(%rax) +2: + movq efi_config(%rip), %rdi + call efi_main + movq %rax,%rsi + cmpq $0,%rax + jne 2f +fail: + /* EFI init failed, so hang. */ + hlt + jmp fail +2: + movl BP_code32_start(%esi), %eax + leaq startup_64(%rax), %rax + jmp *%rax +ENDPROC(efi_pe_entry) + .org 0x390 ENTRY(efi64_stub_entry) movq %rdi, efi64_config(%rip) /* Handle */ diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index 99c7194f7ea6..17818ba6906f 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -37,7 +37,9 @@ #include <linux/uts.h> #include <linux/utsname.h> #include <linux/ctype.h> +#include <linux/efi.h> #include <generated/utsrelease.h> +#include <asm/efi.h> /* Macros used by the included decompressor code below. */ #define STATIC @@ -558,6 +560,87 @@ static void process_mem_region(struct mem_vector *entry, } } +#ifdef CONFIG_EFI +/* + * Returns true if mirror region found (and must have been processed + * for slots adding) + */ +static bool +process_efi_entries(unsigned long minimum, unsigned long image_size) +{ + struct efi_info *e = &boot_params->efi_info; + bool efi_mirror_found = false; + struct mem_vector region; + efi_memory_desc_t *md; + unsigned long pmap; + char *signature; + u32 nr_desc; + int i; + + signature = (char *)&e->efi_loader_signature; + if (strncmp(signature, EFI32_LOADER_SIGNATURE, 4) && + strncmp(signature, EFI64_LOADER_SIGNATURE, 4)) + return false; + +#ifdef CONFIG_X86_32 + /* Can't handle data above 4GB at this time */ + if (e->efi_memmap_hi) { + warn("EFI memmap is above 4GB, can't be handled now on x86_32. EFI should be disabled.\n"); + return false; + } + pmap = e->efi_memmap; +#else + pmap = (e->efi_memmap | ((__u64)e->efi_memmap_hi << 32)); +#endif + + nr_desc = e->efi_memmap_size / e->efi_memdesc_size; + for (i = 0; i < nr_desc; i++) { + md = efi_early_memdesc_ptr(pmap, e->efi_memdesc_size, i); + if (md->attribute & EFI_MEMORY_MORE_RELIABLE) { + efi_mirror_found = true; + break; + } + } + + for (i = 0; i < nr_desc; i++) { + md = efi_early_memdesc_ptr(pmap, e->efi_memdesc_size, i); + + /* + * Here we are more conservative in picking free memory than + * the EFI spec allows: + * + * According to the spec, EFI_BOOT_SERVICES_{CODE|DATA} are also + * free memory and thus available to place the kernel image into, + * but in practice there's firmware where using that memory leads + * to crashes. + * + * Only EFI_CONVENTIONAL_MEMORY is guaranteed to be free. 
+ */ + if (md->type != EFI_CONVENTIONAL_MEMORY) + continue; + + if (efi_mirror_found && + !(md->attribute & EFI_MEMORY_MORE_RELIABLE)) + continue; + + region.start = md->phys_addr; + region.size = md->num_pages << EFI_PAGE_SHIFT; + process_mem_region(®ion, minimum, image_size); + if (slot_area_index == MAX_SLOT_AREA) { + debug_putstr("Aborted EFI scan (slot_areas full)!\n"); + break; + } + } + return true; +} +#else +static inline bool +process_efi_entries(unsigned long minimum, unsigned long image_size) +{ + return false; +} +#endif + static void process_e820_entries(unsigned long minimum, unsigned long image_size) { @@ -586,13 +669,16 @@ static unsigned long find_random_phys_addr(unsigned long minimum, { /* Check if we had too many memmaps. */ if (memmap_too_large) { - debug_putstr("Aborted e820 scan (more than 4 memmap= args)!\n"); + debug_putstr("Aborted memory entries scan (more than 4 memmap= args)!\n"); return 0; } /* Make sure minimum is aligned. */ minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN); + if (process_efi_entries(minimum, image_size)) + return slots_fetch_random(); + process_e820_entries(minimum, image_size); return slots_fetch_random(); } @@ -652,7 +738,7 @@ void choose_random_location(unsigned long input, */ min_addr = min(*output, 512UL << 20); - /* Walk e820 and find a random address. */ + /* Walk available memory entries to find a random address. */ random_addr = find_random_phys_addr(min_addr, output_size); if (!random_addr) { warn("Physical KASLR disabled: no suitable memory region!"); diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index a0838ab929f2..c14217cd0155 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -116,8 +116,7 @@ void __putstr(const char *s) } } - if (boot_params->screen_info.orig_video_mode == 0 && - lines == 0 && cols == 0) + if (lines == 0 || cols == 0) return; x = boot_params->screen_info.orig_x; diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index 2ed8f0c25def..1bb08ecffd24 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -520,8 +520,14 @@ pref_address: .quad LOAD_PHYSICAL_ADDR # preferred load addr # the description in lib/decompressor_xxx.c for specific information. # # extra_bytes = (uncompressed_size >> 12) + 65536 + 128 +# +# LZ4 is even worse: data that cannot be further compressed grows by 0.4%, +# or one byte per 256 bytes. OTOH, we can safely get rid of the +128 as +# the size-dependent part now grows so fast. 
+# +# extra_bytes = (uncompressed_size >> 8) + 65536 -#define ZO_z_extra_bytes ((ZO_z_output_len >> 12) + 65536 + 128) +#define ZO_z_extra_bytes ((ZO_z_output_len >> 8) + 65536) #if ZO_z_output_len > ZO_z_input_len # define ZO_z_extract_offset (ZO_z_output_len + ZO_z_extra_bytes - \ ZO_z_input_len) diff --git a/arch/x86/configs/tiny.config b/arch/x86/configs/tiny.config index 4b429df40d7a..550cd5012b73 100644 --- a/arch/x86/configs/tiny.config +++ b/arch/x86/configs/tiny.config @@ -1,3 +1,5 @@ CONFIG_NOHIGHMEM=y # CONFIG_HIGHMEM4G is not set # CONFIG_HIGHMEM64G is not set +CONFIG_GUESS_UNWINDER=y +# CONFIG_FRAME_POINTER_UNWINDER is not set diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 4a55cdcdc008..5c15d6b57329 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -475,8 +475,8 @@ static void ctr_crypt_final(struct crypto_aes_ctx *ctx, unsigned int nbytes = walk->nbytes; aesni_enc(ctx, keystream, ctrblk); - crypto_xor(keystream, src, nbytes); - memcpy(dst, keystream, nbytes); + crypto_xor_cpy(dst, keystream, src, nbytes); + crypto_inc(ctrblk, AES_BLOCK_SIZE); } diff --git a/arch/x86/crypto/blowfish_glue.c b/arch/x86/crypto/blowfish_glue.c index 17c05531dfd1..f9eca34301e2 100644 --- a/arch/x86/crypto/blowfish_glue.c +++ b/arch/x86/crypto/blowfish_glue.c @@ -271,8 +271,7 @@ static void ctr_crypt_final(struct bf_ctx *ctx, struct blkcipher_walk *walk) unsigned int nbytes = walk->nbytes; blowfish_enc_blk(ctx, keystream, ctrblk); - crypto_xor(keystream, src, nbytes); - memcpy(dst, keystream, nbytes); + crypto_xor_cpy(dst, keystream, src, nbytes); crypto_inc(ctrblk, BF_BLOCK_SIZE); } diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c index 8648158f3916..dbea6020ffe7 100644 --- a/arch/x86/crypto/cast5_avx_glue.c +++ b/arch/x86/crypto/cast5_avx_glue.c @@ -256,8 +256,7 @@ static void ctr_crypt_final(struct blkcipher_desc *desc, unsigned int nbytes = walk->nbytes; __cast5_encrypt(ctx, keystream, ctrblk); - crypto_xor(keystream, src, nbytes); - memcpy(dst, keystream, nbytes); + crypto_xor_cpy(dst, keystream, src, nbytes); crypto_inc(ctrblk, CAST5_BLOCK_SIZE); } diff --git a/arch/x86/crypto/des3_ede_glue.c b/arch/x86/crypto/des3_ede_glue.c index d6fc59aaaadf..30c0a37f4882 100644 --- a/arch/x86/crypto/des3_ede_glue.c +++ b/arch/x86/crypto/des3_ede_glue.c @@ -277,8 +277,7 @@ static void ctr_crypt_final(struct des3_ede_x86_ctx *ctx, unsigned int nbytes = walk->nbytes; des3_ede_enc_blk(ctx, keystream, ctrblk); - crypto_xor(keystream, src, nbytes); - memcpy(dst, keystream, nbytes); + crypto_xor_cpy(dst, keystream, src, nbytes); crypto_inc(ctrblk, DES3_EDE_BLOCK_SIZE); } diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile index 9976fcecd17e..af28a8a24366 100644 --- a/arch/x86/entry/Makefile +++ b/arch/x86/entry/Makefile @@ -2,7 +2,6 @@ # Makefile for the x86 low level entry code # -OBJECT_FILES_NON_STANDARD_entry_$(BITS).o := y OBJECT_FILES_NON_STANDARD_entry_64_compat.o := y CFLAGS_syscall_64.o += $(call cc-option,-Wno-override-init,) diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index 05ed3d393da7..640aafebdc00 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -1,4 +1,5 @@ #include <linux/jump_label.h> +#include <asm/unwind_hints.h> /* @@ -112,6 +113,7 @@ For 32-bit we have the following conventions - kernel is built with movq %rdx, 12*8+\offset(%rsp) movq %rsi, 13*8+\offset(%rsp) movq %rdi, 14*8+\offset(%rsp) + UNWIND_HINT_REGS offset=\offset 
extra=0 .endm .macro SAVE_C_REGS offset=0 SAVE_C_REGS_HELPER \offset, 1, 1, 1, 1 @@ -136,6 +138,7 @@ For 32-bit we have the following conventions - kernel is built with movq %r12, 3*8+\offset(%rsp) movq %rbp, 4*8+\offset(%rsp) movq %rbx, 5*8+\offset(%rsp) + UNWIND_HINT_REGS offset=\offset .endm .macro RESTORE_EXTRA_REGS offset=0 @@ -145,6 +148,7 @@ For 32-bit we have the following conventions - kernel is built with movq 3*8+\offset(%rsp), %r12 movq 4*8+\offset(%rsp), %rbp movq 5*8+\offset(%rsp), %rbx + UNWIND_HINT_REGS offset=\offset extra=0 .endm .macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1 @@ -167,6 +171,7 @@ For 32-bit we have the following conventions - kernel is built with .endif movq 13*8(%rsp), %rsi movq 14*8(%rsp), %rdi + UNWIND_HINT_IRET_REGS offset=16*8 .endm .macro RESTORE_C_REGS RESTORE_C_REGS_HELPER 1,1,1,1,1 diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index cdefcfdd9e63..03505ffbe1b6 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -23,6 +23,7 @@ #include <linux/user-return-notifier.h> #include <linux/uprobes.h> #include <linux/livepatch.h> +#include <linux/syscalls.h> #include <asm/desc.h> #include <asm/traps.h> @@ -183,6 +184,8 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs) struct thread_info *ti = current_thread_info(); u32 cached_flags; + addr_limit_user_check(); + if (IS_ENABLED(CONFIG_PROVE_LOCKING) && WARN_ON(!irqs_disabled())) local_irq_disable(); diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 48ef7bb32c42..8a13d468635a 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -673,16 +673,8 @@ ENTRY(name) \ jmp ret_from_intr; \ ENDPROC(name) - -#ifdef CONFIG_TRACING -# define TRACE_BUILD_INTERRUPT(name, nr) BUILD_INTERRUPT3(trace_##name, nr, smp_trace_##name) -#else -# define TRACE_BUILD_INTERRUPT(name, nr) -#endif - #define BUILD_INTERRUPT(name, nr) \ BUILD_INTERRUPT3(name, nr, smp_##name); \ - TRACE_BUILD_INTERRUPT(name, nr) /* The include is where all of the SMP etc. interrupts come from */ #include <asm/entry_arch.h> @@ -880,25 +872,17 @@ ENTRY(xen_failsafe_callback) ENDPROC(xen_failsafe_callback) BUILD_INTERRUPT3(xen_hvm_callback_vector, HYPERVISOR_CALLBACK_VECTOR, - xen_evtchn_do_upcall) + xen_evtchn_do_upcall) #endif /* CONFIG_XEN */ #if IS_ENABLED(CONFIG_HYPERV) BUILD_INTERRUPT3(hyperv_callback_vector, HYPERVISOR_CALLBACK_VECTOR, - hyperv_vector_handler) + hyperv_vector_handler) #endif /* CONFIG_HYPERV */ -#ifdef CONFIG_TRACING -ENTRY(trace_page_fault) - ASM_CLAC - pushl $trace_do_page_fault - jmp common_exception -END(trace_page_fault) -#endif - ENTRY(page_fault) ASM_CLAC pushl $do_page_fault diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 6d078b89a5e8..49167258d587 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -36,6 +36,7 @@ #include <asm/smap.h> #include <asm/pgtable_types.h> #include <asm/export.h> +#include <asm/frame.h> #include <linux/err.h> .code64 @@ -43,9 +44,10 @@ #ifdef CONFIG_PARAVIRT ENTRY(native_usergs_sysret64) + UNWIND_HINT_EMPTY swapgs sysretq -ENDPROC(native_usergs_sysret64) +END(native_usergs_sysret64) #endif /* CONFIG_PARAVIRT */ .macro TRACE_IRQS_IRETQ @@ -134,19 +136,14 @@ ENDPROC(native_usergs_sysret64) */ ENTRY(entry_SYSCALL_64) + UNWIND_HINT_EMPTY /* * Interrupts are off on entry. * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, * it is too small to ever cause noticeable irq latency. 
*/ - SWAPGS_UNSAFE_STACK - /* - * A hypervisor implementation might want to use a label - * after the swapgs, so that it can do the swapgs - * for the guest and jump here on syscall. - */ -GLOBAL(entry_SYSCALL_64_after_swapgs) + swapgs movq %rsp, PER_CPU_VAR(rsp_scratch) movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp @@ -158,6 +155,7 @@ GLOBAL(entry_SYSCALL_64_after_swapgs) pushq %r11 /* pt_regs->flags */ pushq $__USER_CS /* pt_regs->cs */ pushq %rcx /* pt_regs->ip */ +GLOBAL(entry_SYSCALL_64_after_hwframe) pushq %rax /* pt_regs->orig_ax */ pushq %rdi /* pt_regs->di */ pushq %rsi /* pt_regs->si */ @@ -169,6 +167,7 @@ GLOBAL(entry_SYSCALL_64_after_swapgs) pushq %r10 /* pt_regs->r10 */ pushq %r11 /* pt_regs->r11 */ sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */ + UNWIND_HINT_REGS extra=0 /* * If we need to do entry work or if we guess we'll need to do @@ -223,6 +222,7 @@ entry_SYSCALL_64_fastpath: movq EFLAGS(%rsp), %r11 RESTORE_C_REGS_EXCEPT_RCX_R11 movq RSP(%rsp), %rsp + UNWIND_HINT_EMPTY USERGS_SYSRET64 1: @@ -316,6 +316,7 @@ syscall_return_via_sysret: /* rcx and r11 are already restored (see code above) */ RESTORE_C_REGS_EXCEPT_RCX_R11 movq RSP(%rsp), %rsp + UNWIND_HINT_EMPTY USERGS_SYSRET64 opportunistic_sysret_failed: @@ -343,6 +344,7 @@ ENTRY(stub_ptregs_64) DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF popq %rax + UNWIND_HINT_REGS extra=0 jmp entry_SYSCALL64_slow_path 1: @@ -351,6 +353,7 @@ END(stub_ptregs_64) .macro ptregs_stub func ENTRY(ptregs_\func) + UNWIND_HINT_FUNC leaq \func(%rip), %rax jmp stub_ptregs_64 END(ptregs_\func) @@ -367,6 +370,7 @@ END(ptregs_\func) * %rsi: next task */ ENTRY(__switch_to_asm) + UNWIND_HINT_FUNC /* * Save callee-saved registers * This must match the order in inactive_task_frame @@ -406,6 +410,7 @@ END(__switch_to_asm) * r12: kernel thread arg */ ENTRY(ret_from_fork) + UNWIND_HINT_EMPTY movq %rax, %rdi call schedule_tail /* rdi: 'prev' task parameter */ @@ -413,6 +418,7 @@ ENTRY(ret_from_fork) jnz 1f /* kernel threads are uncommon */ 2: + UNWIND_HINT_REGS movq %rsp, %rdi call syscall_return_slowpath /* returns with IRQs disabled */ TRACE_IRQS_ON /* user mode is traced as IRQS on */ @@ -440,13 +446,102 @@ END(ret_from_fork) ENTRY(irq_entries_start) vector=FIRST_EXTERNAL_VECTOR .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) + UNWIND_HINT_IRET_REGS pushq $(~vector+0x80) /* Note: always in signed byte range */ - vector=vector+1 jmp common_interrupt .align 8 + vector=vector+1 .endr END(irq_entries_start) +.macro DEBUG_ENTRY_ASSERT_IRQS_OFF +#ifdef CONFIG_DEBUG_ENTRY + pushfq + testl $X86_EFLAGS_IF, (%rsp) + jz .Lokay_\@ + ud2 +.Lokay_\@: + addq $8, %rsp +#endif +.endm + +/* + * Enters the IRQ stack if we're not already using it. NMI-safe. Clobbers + * flags and puts old RSP into old_rsp, and leaves all other GPRs alone. + * Requires kernel GSBASE. + * + * The invariant is that, if irq_count != -1, then the IRQ stack is in use. + */ +.macro ENTER_IRQ_STACK regs=1 old_rsp + DEBUG_ENTRY_ASSERT_IRQS_OFF + movq %rsp, \old_rsp + + .if \regs + UNWIND_HINT_REGS base=\old_rsp + .endif + + incl PER_CPU_VAR(irq_count) + jnz .Lirq_stack_push_old_rsp_\@ + + /* + * Right now, if we just incremented irq_count to zero, we've + * claimed the IRQ stack but we haven't switched to it yet. + * + * If anything is added that can interrupt us here without using IST, + * it must be *extremely* careful to limit its stack usage. This + * could include kprobes and a hypothetical future IST-less #DB + * handler. 
+ * + * The OOPS unwinder relies on the word at the top of the IRQ + * stack linking back to the previous RSP for the entire time we're + * on the IRQ stack. For this to work reliably, we need to write + * it before we actually move ourselves to the IRQ stack. + */ + + movq \old_rsp, PER_CPU_VAR(irq_stack_union + IRQ_STACK_SIZE - 8) + movq PER_CPU_VAR(irq_stack_ptr), %rsp + +#ifdef CONFIG_DEBUG_ENTRY + /* + * If the first movq above becomes wrong due to IRQ stack layout + * changes, the only way we'll notice is if we try to unwind right + * here. Assert that we set up the stack right to catch this type + * of bug quickly. + */ + cmpq -8(%rsp), \old_rsp + je .Lirq_stack_okay\@ + ud2 + .Lirq_stack_okay\@: +#endif + +.Lirq_stack_push_old_rsp_\@: + pushq \old_rsp + + .if \regs + UNWIND_HINT_REGS indirect=1 + .endif +.endm + +/* + * Undoes ENTER_IRQ_STACK. + */ +.macro LEAVE_IRQ_STACK regs=1 + DEBUG_ENTRY_ASSERT_IRQS_OFF + /* We need to be off the IRQ stack before decrementing irq_count. */ + popq %rsp + + .if \regs + UNWIND_HINT_REGS + .endif + + /* + * As in ENTER_IRQ_STACK, irq_count == 0, we are still claiming + * the irq stack but we're not on it. + */ + + decl PER_CPU_VAR(irq_count) +.endm + /* * Interrupt entry/exit. * @@ -485,17 +580,7 @@ END(irq_entries_start) CALL_enter_from_user_mode 1: - /* - * Save previous stack pointer, optionally switch to interrupt stack. - * irq_count is used to check if a CPU is already on an interrupt stack - * or not. While this is essentially redundant with preempt_count it is - * a little cheaper to use a separate counter in the PDA (short of - * moving irq_enter into assembly, which would be too much work) - */ - movq %rsp, %rdi - incl PER_CPU_VAR(irq_count) - cmovzq PER_CPU_VAR(irq_stack_ptr), %rsp - pushq %rdi + ENTER_IRQ_STACK old_rsp=%rdi /* We entered an interrupt context - irqs are off: */ TRACE_IRQS_OFF @@ -515,10 +600,8 @@ common_interrupt: ret_from_intr: DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF - decl PER_CPU_VAR(irq_count) - /* Restore saved previous stack */ - popq %rsp + LEAVE_IRQ_STACK testb $3, CS(%rsp) jz retint_kernel @@ -561,6 +644,7 @@ restore_c_regs_and_iret: INTERRUPT_RETURN ENTRY(native_iret) + UNWIND_HINT_IRET_REGS /* * Are we returning to a stack segment from the LDT? Note: in * 64-bit mode SS:RSP on the exception stack is always valid. 
@@ -633,6 +717,7 @@ native_irq_return_ldt: orq PER_CPU_VAR(espfix_stack), %rax SWAPGS movq %rax, %rsp + UNWIND_HINT_IRET_REGS offset=8 /* * At this point, we cannot write to the stack any more, but we can @@ -654,6 +739,7 @@ END(common_interrupt) */ .macro apicinterrupt3 num sym do_sym ENTRY(\sym) + UNWIND_HINT_IRET_REGS ASM_CLAC pushq $~(\num) .Lcommon_\sym: @@ -662,31 +748,13 @@ ENTRY(\sym) END(\sym) .endm -#ifdef CONFIG_TRACING -#define trace(sym) trace_##sym -#define smp_trace(sym) smp_trace_##sym - -.macro trace_apicinterrupt num sym -apicinterrupt3 \num trace(\sym) smp_trace(\sym) -.endm -#else -.macro trace_apicinterrupt num sym do_sym -.endm -#endif - /* Make sure APIC interrupt handlers end up in the irqentry section: */ -#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN) -# define PUSH_SECTION_IRQENTRY .pushsection .irqentry.text, "ax" -# define POP_SECTION_IRQENTRY .popsection -#else -# define PUSH_SECTION_IRQENTRY -# define POP_SECTION_IRQENTRY -#endif +#define PUSH_SECTION_IRQENTRY .pushsection .irqentry.text, "ax" +#define POP_SECTION_IRQENTRY .popsection .macro apicinterrupt num sym do_sym PUSH_SECTION_IRQENTRY apicinterrupt3 \num \sym \do_sym -trace_apicinterrupt \num \sym POP_SECTION_IRQENTRY .endm @@ -740,13 +808,14 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt .macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ENTRY(\sym) + UNWIND_HINT_IRET_REGS offset=8 + /* Sanity check */ .if \shift_ist != -1 && \paranoid == 0 .error "using shift_ist requires paranoid=1" .endif ASM_CLAC - PARAVIRT_ADJUST_EXCEPTION_FRAME .ifeq \has_error_code pushq $-1 /* ORIG_RAX: no syscall to restart */ @@ -763,6 +832,7 @@ ENTRY(\sym) .else call error_entry .endif + UNWIND_HINT_REGS /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */ .if \paranoid @@ -829,17 +899,6 @@ ENTRY(\sym) END(\sym) .endm -#ifdef CONFIG_TRACING -.macro trace_idtentry sym do_sym has_error_code:req -idtentry trace(\sym) trace(\do_sym) has_error_code=\has_error_code -idtentry \sym \do_sym has_error_code=\has_error_code -.endm -#else -.macro trace_idtentry sym do_sym has_error_code:req -idtentry \sym \do_sym has_error_code=\has_error_code -.endm -#endif - idtentry divide_error do_divide_error has_error_code=0 idtentry overflow do_overflow has_error_code=0 idtentry bounds do_bounds has_error_code=0 @@ -860,6 +919,7 @@ idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0 * edi: new selector */ ENTRY(native_load_gs_index) + FRAME_BEGIN pushfq DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI) SWAPGS @@ -868,8 +928,9 @@ ENTRY(native_load_gs_index) 2: ALTERNATIVE "", "mfence", X86_BUG_SWAPGS_FENCE SWAPGS popfq + FRAME_END ret -END(native_load_gs_index) +ENDPROC(native_load_gs_index) EXPORT_SYMBOL(native_load_gs_index) _ASM_EXTABLE(.Lgs_change, bad_gs) @@ -892,17 +953,15 @@ bad_gs: ENTRY(do_softirq_own_stack) pushq %rbp mov %rsp, %rbp - incl PER_CPU_VAR(irq_count) - cmove PER_CPU_VAR(irq_stack_ptr), %rsp - push %rbp /* frame pointer backlink */ + ENTER_IRQ_STACK regs=0 old_rsp=%r11 call __do_softirq + LEAVE_IRQ_STACK regs=0 leaveq - decl PER_CPU_VAR(irq_count) ret -END(do_softirq_own_stack) +ENDPROC(do_softirq_own_stack) #ifdef CONFIG_XEN -idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0 +idtentry hypervisor_callback xen_do_hypervisor_callback has_error_code=0 /* * A note on the "critical region" in our callback handler. 
@@ -923,14 +982,14 @@ ENTRY(xen_do_hypervisor_callback) /* do_hypervisor_callback(struct *pt_regs) */ * Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will * see the correct pointer to the pt_regs */ + UNWIND_HINT_FUNC movq %rdi, %rsp /* we don't return, adjust the stack frame */ -11: incl PER_CPU_VAR(irq_count) - movq %rsp, %rbp - cmovzq PER_CPU_VAR(irq_stack_ptr), %rsp - pushq %rbp /* frame pointer backlink */ + UNWIND_HINT_REGS + + ENTER_IRQ_STACK old_rsp=%r10 call xen_evtchn_do_upcall - popq %rsp - decl PER_CPU_VAR(irq_count) + LEAVE_IRQ_STACK + #ifndef CONFIG_PREEMPT call xen_maybe_preempt_hcall #endif @@ -951,6 +1010,7 @@ END(xen_do_hypervisor_callback) * with its current contents: any discrepancy means we in category 1. */ ENTRY(xen_failsafe_callback) + UNWIND_HINT_EMPTY movl %ds, %ecx cmpw %cx, 0x10(%rsp) jne 1f @@ -968,13 +1028,13 @@ ENTRY(xen_failsafe_callback) movq 8(%rsp), %r11 addq $0x30, %rsp pushq $0 /* RIP */ - pushq %r11 - pushq %rcx + UNWIND_HINT_IRET_REGS offset=8 jmp general_protection 1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */ movq (%rsp), %rcx movq 8(%rsp), %r11 addq $0x30, %rsp + UNWIND_HINT_IRET_REGS pushq $-1 /* orig_ax = -1 => not a system call */ ALLOC_PT_GPREGS_ON_STACK SAVE_C_REGS @@ -998,13 +1058,12 @@ idtentry int3 do_int3 has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK idtentry stack_segment do_stack_segment has_error_code=1 #ifdef CONFIG_XEN -idtentry xen_debug do_debug has_error_code=0 -idtentry xen_int3 do_int3 has_error_code=0 -idtentry xen_stack_segment do_stack_segment has_error_code=1 +idtentry xendebug do_debug has_error_code=0 +idtentry xenint3 do_int3 has_error_code=0 #endif idtentry general_protection do_general_protection has_error_code=1 -trace_idtentry page_fault do_page_fault has_error_code=1 +idtentry page_fault do_page_fault has_error_code=1 #ifdef CONFIG_KVM_GUEST idtentry async_page_fault do_async_page_fault has_error_code=1 @@ -1020,6 +1079,7 @@ idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vec * Return: ebx=0: need swapgs on exit, ebx=1: otherwise */ ENTRY(paranoid_entry) + UNWIND_HINT_FUNC cld SAVE_C_REGS 8 SAVE_EXTRA_REGS 8 @@ -1047,6 +1107,7 @@ END(paranoid_entry) * On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */ ENTRY(paranoid_exit) + UNWIND_HINT_REGS DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF_DEBUG testl %ebx, %ebx /* swapgs needed? */ @@ -1068,6 +1129,7 @@ END(paranoid_exit) * Return: EBX=0: came from user mode; EBX=1: otherwise */ ENTRY(error_entry) + UNWIND_HINT_FUNC cld SAVE_C_REGS 8 SAVE_EXTRA_REGS 8 @@ -1152,6 +1214,7 @@ END(error_entry) * 0: user gsbase is loaded, we need SWAPGS and standard preparation for return to usermode */ ENTRY(error_exit) + UNWIND_HINT_REGS DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF testl %ebx, %ebx @@ -1160,19 +1223,9 @@ ENTRY(error_exit) END(error_exit) /* Runs on exception stack */ +/* XXX: broken on Xen PV */ ENTRY(nmi) - /* - * Fix up the exception frame if we're on Xen. - * PARAVIRT_ADJUST_EXCEPTION_FRAME is guaranteed to push at most - * one value to the stack on native, so it may clobber the rdx - * scratch slot, but it won't clobber any of the important - * slots past it. - * - * Xen is a different story, because the Xen frame itself overlaps - * the "NMI executing" variable. - */ - PARAVIRT_ADJUST_EXCEPTION_FRAME - + UNWIND_HINT_IRET_REGS /* * We allow breakpoints in NMIs. If a breakpoint occurs, then * the iretq it performs will take us out of NMI context. 
@@ -1234,11 +1287,13 @@ ENTRY(nmi) cld movq %rsp, %rdx movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp + UNWIND_HINT_IRET_REGS base=%rdx offset=8 pushq 5*8(%rdx) /* pt_regs->ss */ pushq 4*8(%rdx) /* pt_regs->rsp */ pushq 3*8(%rdx) /* pt_regs->flags */ pushq 2*8(%rdx) /* pt_regs->cs */ pushq 1*8(%rdx) /* pt_regs->rip */ + UNWIND_HINT_IRET_REGS pushq $-1 /* pt_regs->orig_ax */ pushq %rdi /* pt_regs->di */ pushq %rsi /* pt_regs->si */ @@ -1255,6 +1310,7 @@ ENTRY(nmi) pushq %r13 /* pt_regs->r13 */ pushq %r14 /* pt_regs->r14 */ pushq %r15 /* pt_regs->r15 */ + UNWIND_HINT_REGS ENCODE_FRAME_POINTER /* @@ -1409,6 +1465,7 @@ first_nmi: .rept 5 pushq 11*8(%rsp) .endr + UNWIND_HINT_IRET_REGS /* Everything up to here is safe from nested NMIs */ @@ -1424,6 +1481,7 @@ first_nmi: pushq $__KERNEL_CS /* CS */ pushq $1f /* RIP */ INTERRUPT_RETURN /* continues at repeat_nmi below */ + UNWIND_HINT_IRET_REGS 1: #endif @@ -1473,6 +1531,7 @@ end_repeat_nmi: * exceptions might do. */ call paranoid_entry + UNWIND_HINT_REGS /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ movq %rsp, %rdi @@ -1510,17 +1569,19 @@ nmi_restore: END(nmi) ENTRY(ignore_sysret) + UNWIND_HINT_EMPTY mov $-ENOSYS, %eax sysret END(ignore_sysret) ENTRY(rewind_stack_do_exit) + UNWIND_HINT_FUNC /* Prevent any naive code from trying to unwind to our caller. */ xorl %ebp, %ebp movq PER_CPU_VAR(cpu_current_top_of_stack), %rax - leaq -TOP_OF_KERNEL_STACK_PADDING-PTREGS_SIZE(%rax), %rsp + leaq -PTREGS_SIZE(%rax), %rsp + UNWIND_HINT_FUNC sp_offset=PTREGS_SIZE call do_exit -1: jmp 1b END(rewind_stack_do_exit) diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index e1721dafbcb1..e26c25ca7756 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -183,21 +183,20 @@ ENDPROC(entry_SYSENTER_compat) */ ENTRY(entry_SYSCALL_compat) /* Interrupts are off on entry. */ - SWAPGS_UNSAFE_STACK + swapgs /* Stash user ESP and switch to the kernel stack. */ movl %esp, %r8d movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp - /* Zero-extending 32-bit regs, do not remove */ - movl %eax, %eax - /* Construct struct pt_regs on stack */ pushq $__USER32_DS /* pt_regs->ss */ pushq %r8 /* pt_regs->sp */ pushq %r11 /* pt_regs->flags */ pushq $__USER32_CS /* pt_regs->cs */ pushq %rcx /* pt_regs->ip */ +GLOBAL(entry_SYSCALL_compat_after_hwframe) + movl %eax, %eax /* discard orig_ax high bits */ pushq %rax /* pt_regs->orig_ax */ pushq %rdi /* pt_regs->di */ pushq %rsi /* pt_regs->si */ @@ -294,7 +293,6 @@ ENTRY(entry_INT80_compat) /* * Interrupts are off on entry. */ - PARAVIRT_ADJUST_EXCEPTION_FRAME ASM_CLAC /* Do this early to minimize exposure */ SWAPGS @@ -342,8 +340,7 @@ ENTRY(entry_INT80_compat) jmp restore_regs_and_iret END(entry_INT80_compat) - ALIGN -GLOBAL(stub32_clone) +ENTRY(stub32_clone) /* * The 32-bit clone ABI is: clone(..., int tls_val, int *child_tidptr). * The 64-bit clone ABI is: clone(..., int *child_tidptr, int tls_val). 
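The ABI note above is the whole story behind the xchg %r8, %rcx that closes this stub: the compat and native clone() differ only in the order of their last two arguments, so swapping the registers that carry them lets the compat path fall straight through to sys_clone. A hedged userspace sketch of the same reordering; clone64/clone32 here are illustrative stand-ins, not the kernel's prototypes:

#include <stdio.h>

/* Stand-in for the 64-bit entry point: (..., child_tid, tls). */
static long clone64(unsigned long flags, void *stack,
                    int *parent_tid, int *child_tid, unsigned long tls)
{
        (void)flags; (void)stack; (void)parent_tid;
        printf("64-bit order: child_tid=%p tls=%lu\n", (void *)child_tid, tls);
        return 0;
}

/* 32-bit order passes tls before child_tid; swapping the last two
 * arguments is all that is needed to reuse the 64-bit implementation,
 * which is what the register exchange does in stub32_clone. */
static long clone32(unsigned long flags, void *stack,
                    int *parent_tid, unsigned long tls, int *child_tid)
{
        return clone64(flags, stack, parent_tid, child_tid, tls);
}

int main(void)
{
        int child_tid = 0;
        return (int)clone32(0, NULL, NULL, 42, &child_tid);
}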
@@ -353,3 +350,4 @@ GLOBAL(stub32_clone) */ xchg %r8, %rcx jmp sys_clone +ENDPROC(stub32_clone) diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 726355ce8497..1911310959f8 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -351,7 +351,7 @@ static void vgetcpu_cpu_init(void *arg) * and 8 bits for the node) */ d.limit0 = cpu | ((node & 0xf) << 12); - d.limit = node >> 4; + d.limit1 = node >> 4; d.type = 5; /* RO data, expand down, accessed */ d.dpl = 3; /* Visible to user code */ d.s = 1; /* Not a system segment */ diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c index ad44af0dd667..f5cbbba99283 100644 --- a/arch/x86/events/amd/uncore.c +++ b/arch/x86/events/amd/uncore.c @@ -400,11 +400,24 @@ static int amd_uncore_cpu_starting(unsigned int cpu) if (amd_uncore_llc) { unsigned int apicid = cpu_data(cpu).apicid; - unsigned int nshared; + unsigned int nshared, subleaf, prev_eax = 0; uncore = *per_cpu_ptr(amd_uncore_llc, cpu); - cpuid_count(0x8000001d, 2, &eax, &ebx, &ecx, &edx); - nshared = ((eax >> 14) & 0xfff) + 1; + /* + * Iterate over Cache Topology Definition leaves until no + * more cache descriptions are available. + */ + for (subleaf = 0; subleaf < 5; subleaf++) { + cpuid_count(0x8000001d, subleaf, &eax, &ebx, &ecx, &edx); + + /* EAX[0:4] gives type of cache */ + if (!(eax & 0x1f)) + break; + + prev_eax = eax; + } + nshared = ((prev_eax >> 14) & 0xfff) + 1; + uncore->id = apicid - (apicid % nshared); uncore = amd_uncore_find_online_sibling(uncore, amd_uncore_llc); @@ -555,7 +568,7 @@ static int __init amd_uncore_init(void) ret = 0; } - if (boot_cpu_has(X86_FEATURE_PERFCTR_L2)) { + if (boot_cpu_has(X86_FEATURE_PERFCTR_LLC)) { amd_uncore_llc = alloc_percpu(struct amd_uncore *); if (!amd_uncore_llc) { ret = -ENOMEM; diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index af12e294caed..80534d3c2480 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -487,22 +487,28 @@ static inline int precise_br_compat(struct perf_event *event) return m == b; } -int x86_pmu_hw_config(struct perf_event *event) +int x86_pmu_max_precise(void) { - if (event->attr.precise_ip) { - int precise = 0; + int precise = 0; - /* Support for constant skid */ - if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) { + /* Support for constant skid */ + if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) { + precise++; + + /* Support for IP fixup */ + if (x86_pmu.lbr_nr || x86_pmu.intel_cap.pebs_format >= 2) precise++; - /* Support for IP fixup */ - if (x86_pmu.lbr_nr || x86_pmu.intel_cap.pebs_format >= 2) - precise++; + if (x86_pmu.pebs_prec_dist) + precise++; + } + return precise; +} - if (x86_pmu.pebs_prec_dist) - precise++; - } +int x86_pmu_hw_config(struct perf_event *event) +{ + if (event->attr.precise_ip) { + int precise = x86_pmu_max_precise(); if (event->attr.precise_ip > precise) return -EOPNOTSUPP; @@ -1751,6 +1757,7 @@ ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event) } static struct attribute_group x86_pmu_attr_group; +static struct attribute_group x86_pmu_caps_group; static int __init init_hw_perf_events(void) { @@ -1799,6 +1806,14 @@ static int __init init_hw_perf_events(void) x86_pmu_format_group.attrs = x86_pmu.format_attrs; + if (x86_pmu.caps_attrs) { + struct attribute **tmp; + + tmp = merge_attr(x86_pmu_caps_group.attrs, x86_pmu.caps_attrs); + if (!WARN_ON(!tmp)) + x86_pmu_caps_group.attrs = tmp; + } + if (x86_pmu.event_attrs) x86_pmu_events_group.attrs = x86_pmu.event_attrs; @@ -2213,10 +2228,30 @@ 
static struct attribute_group x86_pmu_attr_group = { .attrs = x86_pmu_attrs, }; +static ssize_t max_precise_show(struct device *cdev, + struct device_attribute *attr, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", x86_pmu_max_precise()); +} + +static DEVICE_ATTR_RO(max_precise); + +static struct attribute *x86_pmu_caps_attrs[] = { + &dev_attr_max_precise.attr, + NULL +}; + +static struct attribute_group x86_pmu_caps_group = { + .name = "caps", + .attrs = x86_pmu_caps_attrs, +}; + static const struct attribute_group *x86_pmu_attr_groups[] = { &x86_pmu_attr_group, &x86_pmu_format_group, &x86_pmu_events_group, + &x86_pmu_caps_group, NULL, }; @@ -2335,12 +2370,9 @@ static unsigned long get_segment_base(unsigned int segment) #ifdef CONFIG_MODIFY_LDT_SYSCALL struct ldt_struct *ldt; - if (idx > LDT_ENTRIES) - return 0; - /* IRQs are off, so this synchronizes with smp_store_release */ ldt = lockless_dereference(current->active_mm->context.ldt); - if (!ldt || idx > ldt->nr_entries) + if (!ldt || idx >= ldt->nr_entries) return 0; desc = &ldt->entries[idx]; @@ -2348,7 +2380,7 @@ static unsigned long get_segment_base(unsigned int segment) return 0; #endif } else { - if (idx > GDT_ENTRIES) + if (idx >= GDT_ENTRIES) return 0; desc = raw_cpu_ptr(gdt_page.gdt) + idx; diff --git a/arch/x86/events/intel/Makefile b/arch/x86/events/intel/Makefile index 06c2baa51814..e9d8520a801a 100644 --- a/arch/x86/events/intel/Makefile +++ b/arch/x86/events/intel/Makefile @@ -1,4 +1,4 @@ -obj-$(CONFIG_CPU_SUP_INTEL) += core.o bts.o cqm.o +obj-$(CONFIG_CPU_SUP_INTEL) += core.o bts.o obj-$(CONFIG_CPU_SUP_INTEL) += ds.o knc.o obj-$(CONFIG_CPU_SUP_INTEL) += lbr.o p4.o p6.o pt.o obj-$(CONFIG_PERF_EVENTS_INTEL_RAPL) += intel-rapl-perf.o diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c index ddd8d3516bfc..16076eb34699 100644 --- a/arch/x86/events/intel/bts.c +++ b/arch/x86/events/intel/bts.c @@ -268,7 +268,7 @@ static void bts_event_start(struct perf_event *event, int flags) bts->ds_back.bts_absolute_maximum = cpuc->ds->bts_absolute_maximum; bts->ds_back.bts_interrupt_threshold = cpuc->ds->bts_interrupt_threshold; - event->hw.itrace_started = 1; + perf_event_itrace_started(event); event->hw.state = 0; __bts_event_start(event); diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 98b0f0729527..829e89cfcee2 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3415,12 +3415,26 @@ static struct attribute *intel_arch3_formats_attr[] = { &format_attr_any.attr, &format_attr_inv.attr, &format_attr_cmask.attr, + NULL, +}; + +static struct attribute *hsw_format_attr[] = { &format_attr_in_tx.attr, &format_attr_in_tx_cp.attr, + &format_attr_offcore_rsp.attr, + &format_attr_ldlat.attr, + NULL +}; - &format_attr_offcore_rsp.attr, /* XXX do NHM/WSM + SNB breakout */ - &format_attr_ldlat.attr, /* PEBS load latency */ - NULL, +static struct attribute *nhm_format_attr[] = { + &format_attr_offcore_rsp.attr, + &format_attr_ldlat.attr, + NULL +}; + +static struct attribute *slm_format_attr[] = { + &format_attr_offcore_rsp.attr, + NULL }; static struct attribute *skl_format_attr[] = { @@ -3781,6 +3795,36 @@ done: static DEVICE_ATTR_RW(freeze_on_smi); +static ssize_t branches_show(struct device *cdev, + struct device_attribute *attr, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", x86_pmu.lbr_nr); +} + +static DEVICE_ATTR_RO(branches); + +static struct attribute *lbr_attrs[] = { + &dev_attr_branches.attr, + NULL +}; + +static char pmu_name_str[30]; + 
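The refactoring above pulls the skid calculation out of x86_pmu_hw_config() into x86_pmu_max_precise(), and the new caps attribute group exposes the result to userspace as caps/max_precise. A small self-contained sketch of the same cumulative capability counting; struct pmu_caps and the flag names are made-up stand-ins for what the kernel reads out of x86_pmu:

#include <stdio.h>
#include <stdbool.h>

/* Hypothetical capability flags standing in for x86_pmu fields. */
struct pmu_caps {
        bool pebs_active;       /* PEBS usable: constant skid (precise_ip=1) */
        bool ip_fixup;          /* LBR or PEBS v2+: exact IP (precise_ip=2)  */
        bool prec_dist;         /* precise distribution (precise_ip=3)       */
};

static int max_precise(const struct pmu_caps *c)
{
        int precise = 0;

        if (c->pebs_active) {
                precise++;              /* constant skid */
                if (c->ip_fixup)
                        precise++;      /* IP fixup */
                if (c->prec_dist)
                        precise++;      /* precise distribution */
        }
        return precise;
}

int main(void)
{
        struct pmu_caps caps = { true, true, true };

        /* A perf_event_attr.precise_ip larger than this is rejected with
         * -EOPNOTSUPP, mirroring the check kept in x86_pmu_hw_config(). */
        printf("max precise_ip: %d\n", max_precise(&caps));
        return 0;
}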
+static ssize_t pmu_name_show(struct device *cdev, + struct device_attribute *attr, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%s\n", pmu_name_str); +} + +static DEVICE_ATTR_RO(pmu_name); + +static struct attribute *intel_pmu_caps_attrs[] = { + &dev_attr_pmu_name.attr, + NULL +}; + static struct attribute *intel_pmu_attrs[] = { &dev_attr_freeze_on_smi.attr, NULL, @@ -3795,6 +3839,8 @@ __init int intel_pmu_init(void) unsigned int unused; struct extra_reg *er; int version, i; + struct attribute **extra_attr = NULL; + char *name; if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { switch (boot_cpu_data.x86) { @@ -3862,6 +3908,7 @@ __init int intel_pmu_init(void) switch (boot_cpu_data.x86_model) { case INTEL_FAM6_CORE_YONAH: pr_cont("Core events, "); + name = "core"; break; case INTEL_FAM6_CORE2_MEROM: @@ -3877,6 +3924,7 @@ __init int intel_pmu_init(void) x86_pmu.event_constraints = intel_core2_event_constraints; x86_pmu.pebs_constraints = intel_core2_pebs_event_constraints; pr_cont("Core2 events, "); + name = "core2"; break; case INTEL_FAM6_NEHALEM: @@ -3905,8 +3953,11 @@ __init int intel_pmu_init(void) intel_pmu_pebs_data_source_nhm(); x86_add_quirk(intel_nehalem_quirk); + x86_pmu.pebs_no_tlb = 1; + extra_attr = nhm_format_attr; pr_cont("Nehalem events, "); + name = "nehalem"; break; case INTEL_FAM6_ATOM_PINEVIEW: @@ -3923,6 +3974,7 @@ __init int intel_pmu_init(void) x86_pmu.pebs_constraints = intel_atom_pebs_event_constraints; x86_pmu.pebs_aliases = intel_pebs_aliases_core2; pr_cont("Atom events, "); + name = "bonnell"; break; case INTEL_FAM6_ATOM_SILVERMONT1: @@ -3940,7 +3992,9 @@ __init int intel_pmu_init(void) x86_pmu.extra_regs = intel_slm_extra_regs; x86_pmu.flags |= PMU_FL_HAS_RSP_1; x86_pmu.cpu_events = slm_events_attrs; + extra_attr = slm_format_attr; pr_cont("Silvermont events, "); + name = "silvermont"; break; case INTEL_FAM6_ATOM_GOLDMONT: @@ -3965,7 +4019,9 @@ __init int intel_pmu_init(void) x86_pmu.lbr_pt_coexist = true; x86_pmu.flags |= PMU_FL_HAS_RSP_1; x86_pmu.cpu_events = glm_events_attrs; + extra_attr = slm_format_attr; pr_cont("Goldmont events, "); + name = "goldmont"; break; case INTEL_FAM6_ATOM_GEMINI_LAKE: @@ -3991,7 +4047,9 @@ __init int intel_pmu_init(void) x86_pmu.cpu_events = glm_events_attrs; /* Goldmont Plus has 4-wide pipeline */ event_attr_td_total_slots_scale_glm.event_str = "4"; + extra_attr = slm_format_attr; pr_cont("Goldmont plus events, "); + name = "goldmont_plus"; break; case INTEL_FAM6_WESTMERE: @@ -4020,7 +4078,9 @@ __init int intel_pmu_init(void) X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1); intel_pmu_pebs_data_source_nhm(); + extra_attr = nhm_format_attr; pr_cont("Westmere events, "); + name = "westmere"; break; case INTEL_FAM6_SANDYBRIDGE: @@ -4056,7 +4116,10 @@ __init int intel_pmu_init(void) intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = X86_CONFIG(.event=0xb1, .umask=0x01, .inv=1, .cmask=1); + extra_attr = nhm_format_attr; + pr_cont("SandyBridge events, "); + name = "sandybridge"; break; case INTEL_FAM6_IVYBRIDGE: @@ -4090,7 +4153,10 @@ __init int intel_pmu_init(void) intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1); + extra_attr = nhm_format_attr; + pr_cont("IvyBridge events, "); + name = "ivybridge"; break; @@ -4118,7 +4184,10 @@ __init int intel_pmu_init(void) x86_pmu.get_event_constraints = hsw_get_event_constraints; x86_pmu.cpu_events = hsw_events_attrs; x86_pmu.lbr_double_abort = true; + extra_attr = boot_cpu_has(X86_FEATURE_RTM) 
? + hsw_format_attr : nhm_format_attr; pr_cont("Haswell events, "); + name = "haswell"; break; case INTEL_FAM6_BROADWELL_CORE: @@ -4154,7 +4223,10 @@ __init int intel_pmu_init(void) x86_pmu.get_event_constraints = hsw_get_event_constraints; x86_pmu.cpu_events = hsw_events_attrs; x86_pmu.limit_period = bdw_limit_period; + extra_attr = boot_cpu_has(X86_FEATURE_RTM) ? + hsw_format_attr : nhm_format_attr; pr_cont("Broadwell events, "); + name = "broadwell"; break; case INTEL_FAM6_XEON_PHI_KNL: @@ -4172,8 +4244,9 @@ __init int intel_pmu_init(void) /* all extra regs are per-cpu when HT is on */ x86_pmu.flags |= PMU_FL_HAS_RSP_1; x86_pmu.flags |= PMU_FL_NO_HT_SHARING; - + extra_attr = slm_format_attr; pr_cont("Knights Landing/Mill events, "); + name = "knights-landing"; break; case INTEL_FAM6_SKYLAKE_MOBILE: @@ -4203,11 +4276,14 @@ __init int intel_pmu_init(void) x86_pmu.hw_config = hsw_hw_config; x86_pmu.get_event_constraints = hsw_get_event_constraints; - x86_pmu.format_attrs = merge_attr(intel_arch3_formats_attr, - skl_format_attr); - WARN_ON(!x86_pmu.format_attrs); + extra_attr = boot_cpu_has(X86_FEATURE_RTM) ? + hsw_format_attr : nhm_format_attr; + extra_attr = merge_attr(extra_attr, skl_format_attr); x86_pmu.cpu_events = hsw_events_attrs; + intel_pmu_pebs_data_source_skl( + boot_cpu_data.x86_model == INTEL_FAM6_SKYLAKE_X); pr_cont("Skylake events, "); + name = "skylake"; break; default: @@ -4215,6 +4291,7 @@ __init int intel_pmu_init(void) case 1: x86_pmu.event_constraints = intel_v1_event_constraints; pr_cont("generic architected perfmon v1, "); + name = "generic_arch_v1"; break; default: /* @@ -4222,10 +4299,19 @@ __init int intel_pmu_init(void) */ x86_pmu.event_constraints = intel_gen_event_constraints; pr_cont("generic architected perfmon, "); + name = "generic_arch_v2+"; break; } } + snprintf(pmu_name_str, sizeof pmu_name_str, "%s", name); + + if (version >= 2 && extra_attr) { + x86_pmu.format_attrs = merge_attr(intel_arch3_formats_attr, + extra_attr); + WARN_ON(!x86_pmu.format_attrs); + } + if (x86_pmu.num_counters > INTEL_PMC_MAX_GENERIC) { WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!", x86_pmu.num_counters, INTEL_PMC_MAX_GENERIC); @@ -4272,8 +4358,13 @@ __init int intel_pmu_init(void) x86_pmu.lbr_nr = 0; } - if (x86_pmu.lbr_nr) + x86_pmu.caps_attrs = intel_pmu_caps_attrs; + + if (x86_pmu.lbr_nr) { + x86_pmu.caps_attrs = merge_attr(x86_pmu.caps_attrs, lbr_attrs); pr_cont("%d-deep LBR, ", x86_pmu.lbr_nr); + } + /* * Access extra MSR may cause #GP under certain circumstances. * E.g. KVM doesn't support offcore event diff --git a/arch/x86/events/intel/cqm.c b/arch/x86/events/intel/cqm.c deleted file mode 100644 index 2521f771f2f5..000000000000 --- a/arch/x86/events/intel/cqm.c +++ /dev/null @@ -1,1766 +0,0 @@ -/* - * Intel Cache Quality-of-Service Monitoring (CQM) support. - * - * Based very, very heavily on work by Peter Zijlstra. - */ - -#include <linux/perf_event.h> -#include <linux/slab.h> -#include <asm/cpu_device_id.h> -#include <asm/intel_rdt_common.h> -#include "../perf_event.h" - -#define MSR_IA32_QM_CTR 0x0c8e -#define MSR_IA32_QM_EVTSEL 0x0c8d - -#define MBM_CNTR_WIDTH 24 -/* - * Guaranteed time in ms as per SDM where MBM counters will not overflow. - */ -#define MBM_CTR_OVERFLOW_TIME 1000 - -static u32 cqm_max_rmid = -1; -static unsigned int cqm_l3_scale; /* supposedly cacheline size */ -static bool cqm_enabled, mbm_enabled; -unsigned int mbm_socket_max; - -/* - * The cached intel_pqr_state is strictly per CPU and can never be - * updated from a remote CPU. 
Both functions which modify the state - * (intel_cqm_event_start and intel_cqm_event_stop) are called with - * interrupts disabled, which is sufficient for the protection. - */ -DEFINE_PER_CPU(struct intel_pqr_state, pqr_state); -static struct hrtimer *mbm_timers; -/** - * struct sample - mbm event's (local or total) data - * @total_bytes #bytes since we began monitoring - * @prev_msr previous value of MSR - */ -struct sample { - u64 total_bytes; - u64 prev_msr; -}; - -/* - * samples profiled for total memory bandwidth type events - */ -static struct sample *mbm_total; -/* - * samples profiled for local memory bandwidth type events - */ -static struct sample *mbm_local; - -#define pkg_id topology_physical_package_id(smp_processor_id()) -/* - * rmid_2_index returns the index for the rmid in mbm_local/mbm_total array. - * mbm_total[] and mbm_local[] are linearly indexed by socket# * max number of - * rmids per socket, an example is given below - * RMID1 of Socket0: vrmid = 1 - * RMID1 of Socket1: vrmid = 1 * (cqm_max_rmid + 1) + 1 - * RMID1 of Socket2: vrmid = 2 * (cqm_max_rmid + 1) + 1 - */ -#define rmid_2_index(rmid) ((pkg_id * (cqm_max_rmid + 1)) + rmid) -/* - * Protects cache_cgroups and cqm_rmid_free_lru and cqm_rmid_limbo_lru. - * Also protects event->hw.cqm_rmid - * - * Hold either for stability, both for modification of ->hw.cqm_rmid. - */ -static DEFINE_MUTEX(cache_mutex); -static DEFINE_RAW_SPINLOCK(cache_lock); - -/* - * Groups of events that have the same target(s), one RMID per group. - */ -static LIST_HEAD(cache_groups); - -/* - * Mask of CPUs for reading CQM values. We only need one per-socket. - */ -static cpumask_t cqm_cpumask; - -#define RMID_VAL_ERROR (1ULL << 63) -#define RMID_VAL_UNAVAIL (1ULL << 62) - -/* - * Event IDs are used to program IA32_QM_EVTSEL before reading event - * counter from IA32_QM_CTR - */ -#define QOS_L3_OCCUP_EVENT_ID 0x01 -#define QOS_MBM_TOTAL_EVENT_ID 0x02 -#define QOS_MBM_LOCAL_EVENT_ID 0x03 - -/* - * This is central to the rotation algorithm in __intel_cqm_rmid_rotate(). - * - * This rmid is always free and is guaranteed to have an associated - * near-zero occupancy value, i.e. no cachelines are tagged with this - * RMID, once __intel_cqm_rmid_rotate() returns. - */ -static u32 intel_cqm_rotation_rmid; - -#define INVALID_RMID (-1) - -/* - * Is @rmid valid for programming the hardware? - * - * rmid 0 is reserved by the hardware for all non-monitored tasks, which - * means that we should never come across an rmid with that value. - * Likewise, an rmid value of -1 is used to indicate "no rmid currently - * assigned" and is used as part of the rotation code. - */ -static inline bool __rmid_valid(u32 rmid) -{ - if (!rmid || rmid == INVALID_RMID) - return false; - - return true; -} - -static u64 __rmid_read(u32 rmid) -{ - u64 val; - - /* - * Ignore the SDM, this thing is _NOTHING_ like a regular perfcnt, - * it just says that to increase confusion. - */ - wrmsr(MSR_IA32_QM_EVTSEL, QOS_L3_OCCUP_EVENT_ID, rmid); - rdmsrl(MSR_IA32_QM_CTR, val); - - /* - * Aside from the ERROR and UNAVAIL bits, assume this thing returns - * the number of cachelines tagged with @rmid. - */ - return val; -} - -enum rmid_recycle_state { - RMID_YOUNG = 0, - RMID_AVAILABLE, - RMID_DIRTY, -}; - -struct cqm_rmid_entry { - u32 rmid; - enum rmid_recycle_state state; - struct list_head list; - unsigned long queue_time; -}; - -/* - * cqm_rmid_free_lru - A least recently used list of RMIDs. - * - * Oldest entry at the head, newest (most recently used) entry at the - * tail. 
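The rmid_2_index() comment above describes a flat, socket-major layout for the mbm_local/mbm_total sample arrays: each socket owns a contiguous block of (cqm_max_rmid + 1) slots. A standalone sketch of that indexing, with an assumed cqm_max_rmid of 3 (a stride of 4 entries per socket); the real value comes from CPUID:

#include <stdio.h>

static const unsigned int cqm_max_rmid = 3;     /* assumed for illustration */

/* Socket-major linear index into mbm_local[]/mbm_total[]. */
static unsigned int rmid_2_index(unsigned int pkg_id, unsigned int rmid)
{
        return pkg_id * (cqm_max_rmid + 1) + rmid;
}

int main(void)
{
        /* RMID 1 on sockets 0, 1 and 2 -> indices 1, 5 and 9. */
        for (unsigned int socket = 0; socket < 3; socket++)
                printf("socket %u, rmid 1 -> index %u\n",
                       socket, rmid_2_index(socket, 1));
        return 0;
}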
This list is never traversed, it's only used to keep track of - * the lru order. That is, we only pick entries of the head or insert - * them on the tail. - * - * All entries on the list are 'free', and their RMIDs are not currently - * in use. To mark an RMID as in use, remove its entry from the lru - * list. - * - * - * cqm_rmid_limbo_lru - list of currently unused but (potentially) dirty RMIDs. - * - * This list is contains RMIDs that no one is currently using but that - * may have a non-zero occupancy value associated with them. The - * rotation worker moves RMIDs from the limbo list to the free list once - * the occupancy value drops below __intel_cqm_threshold. - * - * Both lists are protected by cache_mutex. - */ -static LIST_HEAD(cqm_rmid_free_lru); -static LIST_HEAD(cqm_rmid_limbo_lru); - -/* - * We use a simple array of pointers so that we can lookup a struct - * cqm_rmid_entry in O(1). This alleviates the callers of __get_rmid() - * and __put_rmid() from having to worry about dealing with struct - * cqm_rmid_entry - they just deal with rmids, i.e. integers. - * - * Once this array is initialized it is read-only. No locks are required - * to access it. - * - * All entries for all RMIDs can be looked up in the this array at all - * times. - */ -static struct cqm_rmid_entry **cqm_rmid_ptrs; - -static inline struct cqm_rmid_entry *__rmid_entry(u32 rmid) -{ - struct cqm_rmid_entry *entry; - - entry = cqm_rmid_ptrs[rmid]; - WARN_ON(entry->rmid != rmid); - - return entry; -} - -/* - * Returns < 0 on fail. - * - * We expect to be called with cache_mutex held. - */ -static u32 __get_rmid(void) -{ - struct cqm_rmid_entry *entry; - - lockdep_assert_held(&cache_mutex); - - if (list_empty(&cqm_rmid_free_lru)) - return INVALID_RMID; - - entry = list_first_entry(&cqm_rmid_free_lru, struct cqm_rmid_entry, list); - list_del(&entry->list); - - return entry->rmid; -} - -static void __put_rmid(u32 rmid) -{ - struct cqm_rmid_entry *entry; - - lockdep_assert_held(&cache_mutex); - - WARN_ON(!__rmid_valid(rmid)); - entry = __rmid_entry(rmid); - - entry->queue_time = jiffies; - entry->state = RMID_YOUNG; - - list_add_tail(&entry->list, &cqm_rmid_limbo_lru); -} - -static void cqm_cleanup(void) -{ - int i; - - if (!cqm_rmid_ptrs) - return; - - for (i = 0; i < cqm_max_rmid; i++) - kfree(cqm_rmid_ptrs[i]); - - kfree(cqm_rmid_ptrs); - cqm_rmid_ptrs = NULL; - cqm_enabled = false; -} - -static int intel_cqm_setup_rmid_cache(void) -{ - struct cqm_rmid_entry *entry; - unsigned int nr_rmids; - int r = 0; - - nr_rmids = cqm_max_rmid + 1; - cqm_rmid_ptrs = kzalloc(sizeof(struct cqm_rmid_entry *) * - nr_rmids, GFP_KERNEL); - if (!cqm_rmid_ptrs) - return -ENOMEM; - - for (; r <= cqm_max_rmid; r++) { - struct cqm_rmid_entry *entry; - - entry = kmalloc(sizeof(*entry), GFP_KERNEL); - if (!entry) - goto fail; - - INIT_LIST_HEAD(&entry->list); - entry->rmid = r; - cqm_rmid_ptrs[r] = entry; - - list_add_tail(&entry->list, &cqm_rmid_free_lru); - } - - /* - * RMID 0 is special and is always allocated. It's used for all - * tasks that are not monitored. - */ - entry = __rmid_entry(0); - list_del(&entry->list); - - mutex_lock(&cache_mutex); - intel_cqm_rotation_rmid = __get_rmid(); - mutex_unlock(&cache_mutex); - - return 0; - -fail: - cqm_cleanup(); - return -ENOMEM; -} - -/* - * Determine if @a and @b measure the same set of tasks. - * - * If @a and @b measure the same set of tasks then we want to share a - * single RMID. 
- */ -static bool __match_event(struct perf_event *a, struct perf_event *b) -{ - /* Per-cpu and task events don't mix */ - if ((a->attach_state & PERF_ATTACH_TASK) != - (b->attach_state & PERF_ATTACH_TASK)) - return false; - -#ifdef CONFIG_CGROUP_PERF - if (a->cgrp != b->cgrp) - return false; -#endif - - /* If not task event, we're machine wide */ - if (!(b->attach_state & PERF_ATTACH_TASK)) - return true; - - /* - * Events that target same task are placed into the same cache group. - * Mark it as a multi event group, so that we update ->count - * for every event rather than just the group leader later. - */ - if (a->hw.target == b->hw.target) { - b->hw.is_group_event = true; - return true; - } - - /* - * Are we an inherited event? - */ - if (b->parent == a) - return true; - - return false; -} - -#ifdef CONFIG_CGROUP_PERF -static inline struct perf_cgroup *event_to_cgroup(struct perf_event *event) -{ - if (event->attach_state & PERF_ATTACH_TASK) - return perf_cgroup_from_task(event->hw.target, event->ctx); - - return event->cgrp; -} -#endif - -/* - * Determine if @a's tasks intersect with @b's tasks - * - * There are combinations of events that we explicitly prohibit, - * - * PROHIBITS - * system-wide -> cgroup and task - * cgroup -> system-wide - * -> task in cgroup - * task -> system-wide - * -> task in cgroup - * - * Call this function before allocating an RMID. - */ -static bool __conflict_event(struct perf_event *a, struct perf_event *b) -{ -#ifdef CONFIG_CGROUP_PERF - /* - * We can have any number of cgroups but only one system-wide - * event at a time. - */ - if (a->cgrp && b->cgrp) { - struct perf_cgroup *ac = a->cgrp; - struct perf_cgroup *bc = b->cgrp; - - /* - * This condition should have been caught in - * __match_event() and we should be sharing an RMID. - */ - WARN_ON_ONCE(ac == bc); - - if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) || - cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup)) - return true; - - return false; - } - - if (a->cgrp || b->cgrp) { - struct perf_cgroup *ac, *bc; - - /* - * cgroup and system-wide events are mutually exclusive - */ - if ((a->cgrp && !(b->attach_state & PERF_ATTACH_TASK)) || - (b->cgrp && !(a->attach_state & PERF_ATTACH_TASK))) - return true; - - /* - * Ensure neither event is part of the other's cgroup - */ - ac = event_to_cgroup(a); - bc = event_to_cgroup(b); - if (ac == bc) - return true; - - /* - * Must have cgroup and non-intersecting task events. - */ - if (!ac || !bc) - return false; - - /* - * We have cgroup and task events, and the task belongs - * to a cgroup. Check for for overlap. - */ - if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) || - cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup)) - return true; - - return false; - } -#endif - /* - * If one of them is not a task, same story as above with cgroups. - */ - if (!(a->attach_state & PERF_ATTACH_TASK) || - !(b->attach_state & PERF_ATTACH_TASK)) - return true; - - /* - * Must be non-overlapping. 
- */ - return false; -} - -struct rmid_read { - u32 rmid; - u32 evt_type; - atomic64_t value; -}; - -static void __intel_cqm_event_count(void *info); -static void init_mbm_sample(u32 rmid, u32 evt_type); -static void __intel_mbm_event_count(void *info); - -static bool is_cqm_event(int e) -{ - return (e == QOS_L3_OCCUP_EVENT_ID); -} - -static bool is_mbm_event(int e) -{ - return (e >= QOS_MBM_TOTAL_EVENT_ID && e <= QOS_MBM_LOCAL_EVENT_ID); -} - -static void cqm_mask_call(struct rmid_read *rr) -{ - if (is_mbm_event(rr->evt_type)) - on_each_cpu_mask(&cqm_cpumask, __intel_mbm_event_count, rr, 1); - else - on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, rr, 1); -} - -/* - * Exchange the RMID of a group of events. - */ -static u32 intel_cqm_xchg_rmid(struct perf_event *group, u32 rmid) -{ - struct perf_event *event; - struct list_head *head = &group->hw.cqm_group_entry; - u32 old_rmid = group->hw.cqm_rmid; - - lockdep_assert_held(&cache_mutex); - - /* - * If our RMID is being deallocated, perform a read now. - */ - if (__rmid_valid(old_rmid) && !__rmid_valid(rmid)) { - struct rmid_read rr = { - .rmid = old_rmid, - .evt_type = group->attr.config, - .value = ATOMIC64_INIT(0), - }; - - cqm_mask_call(&rr); - local64_set(&group->count, atomic64_read(&rr.value)); - } - - raw_spin_lock_irq(&cache_lock); - - group->hw.cqm_rmid = rmid; - list_for_each_entry(event, head, hw.cqm_group_entry) - event->hw.cqm_rmid = rmid; - - raw_spin_unlock_irq(&cache_lock); - - /* - * If the allocation is for mbm, init the mbm stats. - * Need to check if each event in the group is mbm event - * because there could be multiple type of events in the same group. - */ - if (__rmid_valid(rmid)) { - event = group; - if (is_mbm_event(event->attr.config)) - init_mbm_sample(rmid, event->attr.config); - - list_for_each_entry(event, head, hw.cqm_group_entry) { - if (is_mbm_event(event->attr.config)) - init_mbm_sample(rmid, event->attr.config); - } - } - - return old_rmid; -} - -/* - * If we fail to assign a new RMID for intel_cqm_rotation_rmid because - * cachelines are still tagged with RMIDs in limbo, we progressively - * increment the threshold until we find an RMID in limbo with <= - * __intel_cqm_threshold lines tagged. This is designed to mitigate the - * problem where cachelines tagged with an RMID are not steadily being - * evicted. - * - * On successful rotations we decrease the threshold back towards zero. - * - * __intel_cqm_max_threshold provides an upper bound on the threshold, - * and is measured in bytes because it's exposed to userland. - */ -static unsigned int __intel_cqm_threshold; -static unsigned int __intel_cqm_max_threshold; - -/* - * Test whether an RMID has a zero occupancy value on this cpu. - */ -static void intel_cqm_stable(void *arg) -{ - struct cqm_rmid_entry *entry; - - list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) { - if (entry->state != RMID_AVAILABLE) - break; - - if (__rmid_read(entry->rmid) > __intel_cqm_threshold) - entry->state = RMID_DIRTY; - } -} - -/* - * If we have group events waiting for an RMID that don't conflict with - * events already running, assign @rmid. 
- */ -static bool intel_cqm_sched_in_event(u32 rmid) -{ - struct perf_event *leader, *event; - - lockdep_assert_held(&cache_mutex); - - leader = list_first_entry(&cache_groups, struct perf_event, - hw.cqm_groups_entry); - event = leader; - - list_for_each_entry_continue(event, &cache_groups, - hw.cqm_groups_entry) { - if (__rmid_valid(event->hw.cqm_rmid)) - continue; - - if (__conflict_event(event, leader)) - continue; - - intel_cqm_xchg_rmid(event, rmid); - return true; - } - - return false; -} - -/* - * Initially use this constant for both the limbo queue time and the - * rotation timer interval, pmu::hrtimer_interval_ms. - * - * They don't need to be the same, but the two are related since if you - * rotate faster than you recycle RMIDs, you may run out of available - * RMIDs. - */ -#define RMID_DEFAULT_QUEUE_TIME 250 /* ms */ - -static unsigned int __rmid_queue_time_ms = RMID_DEFAULT_QUEUE_TIME; - -/* - * intel_cqm_rmid_stabilize - move RMIDs from limbo to free list - * @nr_available: number of freeable RMIDs on the limbo list - * - * Quiescent state; wait for all 'freed' RMIDs to become unused, i.e. no - * cachelines are tagged with those RMIDs. After this we can reuse them - * and know that the current set of active RMIDs is stable. - * - * Return %true or %false depending on whether stabilization needs to be - * reattempted. - * - * If we return %true then @nr_available is updated to indicate the - * number of RMIDs on the limbo list that have been queued for the - * minimum queue time (RMID_AVAILABLE), but whose data occupancy values - * are above __intel_cqm_threshold. - */ -static bool intel_cqm_rmid_stabilize(unsigned int *available) -{ - struct cqm_rmid_entry *entry, *tmp; - - lockdep_assert_held(&cache_mutex); - - *available = 0; - list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) { - unsigned long min_queue_time; - unsigned long now = jiffies; - - /* - * We hold RMIDs placed into limbo for a minimum queue - * time. Before the minimum queue time has elapsed we do - * not recycle RMIDs. - * - * The reasoning is that until a sufficient time has - * passed since we stopped using an RMID, any RMID - * placed onto the limbo list will likely still have - * data tagged in the cache, which means we'll probably - * fail to recycle it anyway. - * - * We can save ourselves an expensive IPI by skipping - * any RMIDs that have not been queued for the minimum - * time. - */ - min_queue_time = entry->queue_time + - msecs_to_jiffies(__rmid_queue_time_ms); - - if (time_after(min_queue_time, now)) - break; - - entry->state = RMID_AVAILABLE; - (*available)++; - } - - /* - * Fast return if none of the RMIDs on the limbo list have been - * sitting on the queue for the minimum queue time. - */ - if (!*available) - return false; - - /* - * Test whether an RMID is free for each package. - */ - on_each_cpu_mask(&cqm_cpumask, intel_cqm_stable, NULL, true); - - list_for_each_entry_safe(entry, tmp, &cqm_rmid_limbo_lru, list) { - /* - * Exhausted all RMIDs that have waited min queue time. - */ - if (entry->state == RMID_YOUNG) - break; - - if (entry->state == RMID_DIRTY) - continue; - - list_del(&entry->list); /* remove from limbo */ - - /* - * The rotation RMID gets priority if it's - * currently invalid. In which case, skip adding - * the RMID to the the free lru. - */ - if (!__rmid_valid(intel_cqm_rotation_rmid)) { - intel_cqm_rotation_rmid = entry->rmid; - continue; - } - - /* - * If we have groups waiting for RMIDs, hand - * them one now provided they don't conflict. 
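intel_cqm_rmid_stabilize() above ages limbo entries with jiffies arithmetic: an RMID is only considered for recycling once entry->queue_time plus __rmid_queue_time_ms has elapsed, and time_after() keeps that comparison correct across counter wrap. A hedged userspace sketch of the same wrap-safe check, using a plain unsigned tick counter in place of jiffies:

#include <stdio.h>
#include <stdbool.h>

/* Wrap-safe "is a after b" for a free-running unsigned counter; the same
 * signed-difference trick time_after() uses for jiffies. */
static bool tick_after(unsigned long a, unsigned long b)
{
        return (long)(b - a) < 0;
}

int main(void)
{
        const unsigned long queue_time_ticks = 250;     /* RMID_DEFAULT_QUEUE_TIME */
        unsigned long queued_at = (unsigned long)-100;  /* just before wrap */
        unsigned long now = 200;                        /* after wrap */
        unsigned long min_queue_time = queued_at + queue_time_ticks;

        /* The entry becomes eligible once 'now' has moved past
         * min_queue_time, even though the counter wrapped in between. */
        if (tick_after(min_queue_time, now))
                printf("still in limbo, skip recycling\n");
        else
                printf("aged out, mark RMID_AVAILABLE\n");
        return 0;
}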
- */ - if (intel_cqm_sched_in_event(entry->rmid)) - continue; - - /* - * Otherwise place it onto the free list. - */ - list_add_tail(&entry->list, &cqm_rmid_free_lru); - } - - - return __rmid_valid(intel_cqm_rotation_rmid); -} - -/* - * Pick a victim group and move it to the tail of the group list. - * @next: The first group without an RMID - */ -static void __intel_cqm_pick_and_rotate(struct perf_event *next) -{ - struct perf_event *rotor; - u32 rmid; - - lockdep_assert_held(&cache_mutex); - - rotor = list_first_entry(&cache_groups, struct perf_event, - hw.cqm_groups_entry); - - /* - * The group at the front of the list should always have a valid - * RMID. If it doesn't then no groups have RMIDs assigned and we - * don't need to rotate the list. - */ - if (next == rotor) - return; - - rmid = intel_cqm_xchg_rmid(rotor, INVALID_RMID); - __put_rmid(rmid); - - list_rotate_left(&cache_groups); -} - -/* - * Deallocate the RMIDs from any events that conflict with @event, and - * place them on the back of the group list. - */ -static void intel_cqm_sched_out_conflicting_events(struct perf_event *event) -{ - struct perf_event *group, *g; - u32 rmid; - - lockdep_assert_held(&cache_mutex); - - list_for_each_entry_safe(group, g, &cache_groups, hw.cqm_groups_entry) { - if (group == event) - continue; - - rmid = group->hw.cqm_rmid; - - /* - * Skip events that don't have a valid RMID. - */ - if (!__rmid_valid(rmid)) - continue; - - /* - * No conflict? No problem! Leave the event alone. - */ - if (!__conflict_event(group, event)) - continue; - - intel_cqm_xchg_rmid(group, INVALID_RMID); - __put_rmid(rmid); - } -} - -/* - * Attempt to rotate the groups and assign new RMIDs. - * - * We rotate for two reasons, - * 1. To handle the scheduling of conflicting events - * 2. To recycle RMIDs - * - * Rotating RMIDs is complicated because the hardware doesn't give us - * any clues. - * - * There's problems with the hardware interface; when you change the - * task:RMID map cachelines retain their 'old' tags, giving a skewed - * picture. In order to work around this, we must always keep one free - * RMID - intel_cqm_rotation_rmid. - * - * Rotation works by taking away an RMID from a group (the old RMID), - * and assigning the free RMID to another group (the new RMID). We must - * then wait for the old RMID to not be used (no cachelines tagged). - * This ensure that all cachelines are tagged with 'active' RMIDs. At - * this point we can start reading values for the new RMID and treat the - * old RMID as the free RMID for the next rotation. - * - * Return %true or %false depending on whether we did any rotating. - */ -static bool __intel_cqm_rmid_rotate(void) -{ - struct perf_event *group, *start = NULL; - unsigned int threshold_limit; - unsigned int nr_needed = 0; - unsigned int nr_available; - bool rotated = false; - - mutex_lock(&cache_mutex); - -again: - /* - * Fast path through this function if there are no groups and no - * RMIDs that need cleaning. - */ - if (list_empty(&cache_groups) && list_empty(&cqm_rmid_limbo_lru)) - goto out; - - list_for_each_entry(group, &cache_groups, hw.cqm_groups_entry) { - if (!__rmid_valid(group->hw.cqm_rmid)) { - if (!start) - start = group; - nr_needed++; - } - } - - /* - * We have some event groups, but they all have RMIDs assigned - * and no RMIDs need cleaning. 
- */ - if (!nr_needed && list_empty(&cqm_rmid_limbo_lru)) - goto out; - - if (!nr_needed) - goto stabilize; - - /* - * We have more event groups without RMIDs than available RMIDs, - * or we have event groups that conflict with the ones currently - * scheduled. - * - * We force deallocate the rmid of the group at the head of - * cache_groups. The first event group without an RMID then gets - * assigned intel_cqm_rotation_rmid. This ensures we always make - * forward progress. - * - * Rotate the cache_groups list so the previous head is now the - * tail. - */ - __intel_cqm_pick_and_rotate(start); - - /* - * If the rotation is going to succeed, reduce the threshold so - * that we don't needlessly reuse dirty RMIDs. - */ - if (__rmid_valid(intel_cqm_rotation_rmid)) { - intel_cqm_xchg_rmid(start, intel_cqm_rotation_rmid); - intel_cqm_rotation_rmid = __get_rmid(); - - intel_cqm_sched_out_conflicting_events(start); - - if (__intel_cqm_threshold) - __intel_cqm_threshold--; - } - - rotated = true; - -stabilize: - /* - * We now need to stablize the RMID we freed above (if any) to - * ensure that the next time we rotate we have an RMID with zero - * occupancy value. - * - * Alternatively, if we didn't need to perform any rotation, - * we'll have a bunch of RMIDs in limbo that need stabilizing. - */ - threshold_limit = __intel_cqm_max_threshold / cqm_l3_scale; - - while (intel_cqm_rmid_stabilize(&nr_available) && - __intel_cqm_threshold < threshold_limit) { - unsigned int steal_limit; - - /* - * Don't spin if nobody is actively waiting for an RMID, - * the rotation worker will be kicked as soon as an - * event needs an RMID anyway. - */ - if (!nr_needed) - break; - - /* Allow max 25% of RMIDs to be in limbo. */ - steal_limit = (cqm_max_rmid + 1) / 4; - - /* - * We failed to stabilize any RMIDs so our rotation - * logic is now stuck. In order to make forward progress - * we have a few options: - * - * 1. rotate ("steal") another RMID - * 2. increase the threshold - * 3. do nothing - * - * We do both of 1. and 2. until we hit the steal limit. - * - * The steal limit prevents all RMIDs ending up on the - * limbo list. This can happen if every RMID has a - * non-zero occupancy above threshold_limit, and the - * occupancy values aren't dropping fast enough. - * - * Note that there is prioritisation at work here - we'd - * rather increase the number of RMIDs on the limbo list - * than increase the threshold, because increasing the - * threshold skews the event data (because we reuse - * dirty RMIDs) - threshold bumps are a last resort. 
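The recycling notes above reduce to a simple escalation rule: keep stealing RMIDs while fewer than a quarter of them sit in limbo, and only then start relaxing the occupancy threshold so limbo entries can drain. A simplified sketch of just that decision, assuming cqm_max_rmid = 55 (a hypothetical 56-RMID part); the surrounding retry loop and stabilization check are elided:

#include <stdio.h>

int main(void)
{
        const unsigned int cqm_max_rmid = 55;                   /* assumed */
        const unsigned int steal_limit = (cqm_max_rmid + 1) / 4; /* 14: max 25% in limbo */
        unsigned int nr_limbo = 10;     /* aged-but-dirty RMIDs on the limbo list */
        unsigned int threshold = 0;     /* __intel_cqm_threshold, in cachelines */

        if (nr_limbo < steal_limit)
                printf("steal another RMID (limbo %u < limit %u)\n",
                       nr_limbo, steal_limit);
        else
                printf("bump threshold to %u cachelines\n", ++threshold);
        return 0;
}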
- */ - if (nr_available < steal_limit) - goto again; - - __intel_cqm_threshold++; - } - -out: - mutex_unlock(&cache_mutex); - return rotated; -} - -static void intel_cqm_rmid_rotate(struct work_struct *work); - -static DECLARE_DELAYED_WORK(intel_cqm_rmid_work, intel_cqm_rmid_rotate); - -static struct pmu intel_cqm_pmu; - -static void intel_cqm_rmid_rotate(struct work_struct *work) -{ - unsigned long delay; - - __intel_cqm_rmid_rotate(); - - delay = msecs_to_jiffies(intel_cqm_pmu.hrtimer_interval_ms); - schedule_delayed_work(&intel_cqm_rmid_work, delay); -} - -static u64 update_sample(unsigned int rmid, u32 evt_type, int first) -{ - struct sample *mbm_current; - u32 vrmid = rmid_2_index(rmid); - u64 val, bytes, shift; - u32 eventid; - - if (evt_type == QOS_MBM_LOCAL_EVENT_ID) { - mbm_current = &mbm_local[vrmid]; - eventid = QOS_MBM_LOCAL_EVENT_ID; - } else { - mbm_current = &mbm_total[vrmid]; - eventid = QOS_MBM_TOTAL_EVENT_ID; - } - - wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid); - rdmsrl(MSR_IA32_QM_CTR, val); - if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) - return mbm_current->total_bytes; - - if (first) { - mbm_current->prev_msr = val; - mbm_current->total_bytes = 0; - return mbm_current->total_bytes; - } - - /* - * The h/w guarantees that counters will not overflow - * so long as we poll them at least once per second. - */ - shift = 64 - MBM_CNTR_WIDTH; - bytes = (val << shift) - (mbm_current->prev_msr << shift); - bytes >>= shift; - - bytes *= cqm_l3_scale; - - mbm_current->total_bytes += bytes; - mbm_current->prev_msr = val; - - return mbm_current->total_bytes; -} - -static u64 rmid_read_mbm(unsigned int rmid, u32 evt_type) -{ - return update_sample(rmid, evt_type, 0); -} - -static void __intel_mbm_event_init(void *info) -{ - struct rmid_read *rr = info; - - update_sample(rr->rmid, rr->evt_type, 1); -} - -static void init_mbm_sample(u32 rmid, u32 evt_type) -{ - struct rmid_read rr = { - .rmid = rmid, - .evt_type = evt_type, - .value = ATOMIC64_INIT(0), - }; - - /* on each socket, init sample */ - on_each_cpu_mask(&cqm_cpumask, __intel_mbm_event_init, &rr, 1); -} - -/* - * Find a group and setup RMID. - * - * If we're part of a group, we use the group's RMID. - */ -static void intel_cqm_setup_event(struct perf_event *event, - struct perf_event **group) -{ - struct perf_event *iter; - bool conflict = false; - u32 rmid; - - event->hw.is_group_event = false; - list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) { - rmid = iter->hw.cqm_rmid; - - if (__match_event(iter, event)) { - /* All tasks in a group share an RMID */ - event->hw.cqm_rmid = rmid; - *group = iter; - if (is_mbm_event(event->attr.config) && __rmid_valid(rmid)) - init_mbm_sample(rmid, event->attr.config); - return; - } - - /* - * We only care about conflicts for events that are - * actually scheduled in (and hence have a valid RMID). - */ - if (__conflict_event(iter, event) && __rmid_valid(rmid)) - conflict = true; - } - - if (conflict) - rmid = INVALID_RMID; - else - rmid = __get_rmid(); - - if (is_mbm_event(event->attr.config) && __rmid_valid(rmid)) - init_mbm_sample(rmid, event->attr.config); - - event->hw.cqm_rmid = rmid; -} - -static void intel_cqm_event_read(struct perf_event *event) -{ - unsigned long flags; - u32 rmid; - u64 val; - - /* - * Task events are handled by intel_cqm_event_count(). 
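update_sample() above turns two readings of a 24-bit free-running counter into a byte delta by shifting both values up to the top of the 64-bit word, subtracting, and shifting back down, which makes a single wrap between polls harmless. A self-contained sketch of that arithmetic, with an assumed cqm_l3_scale of 64 bytes per count (the kernel reads the real scale from CPUID):

#include <stdio.h>
#include <stdint.h>

#define MBM_CNTR_WIDTH 24

/* Wrap-safe delta of a 24-bit counter, scaled to bytes. */
static uint64_t mbm_delta_bytes(uint64_t prev_msr, uint64_t cur_msr,
                                uint64_t cqm_l3_scale)
{
        const unsigned int shift = 64 - MBM_CNTR_WIDTH;
        uint64_t bytes = (cur_msr << shift) - (prev_msr << shift);

        bytes >>= shift;                /* back to counter units, modulo 2^24 */
        return bytes * cqm_l3_scale;
}

int main(void)
{
        /* The counter wrapped: it ran from near the 24-bit ceiling past zero,
         * yet the delta still comes out as 0x20 counts = 2048 bytes. */
        uint64_t prev = 0xFFFFF0, cur = 0x000010;

        printf("delta = %llu bytes\n",
               (unsigned long long)mbm_delta_bytes(prev, cur, 64));
        return 0;
}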
- */ - if (event->cpu == -1) - return; - - raw_spin_lock_irqsave(&cache_lock, flags); - rmid = event->hw.cqm_rmid; - - if (!__rmid_valid(rmid)) - goto out; - - if (is_mbm_event(event->attr.config)) - val = rmid_read_mbm(rmid, event->attr.config); - else - val = __rmid_read(rmid); - - /* - * Ignore this reading on error states and do not update the value. - */ - if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) - goto out; - - local64_set(&event->count, val); -out: - raw_spin_unlock_irqrestore(&cache_lock, flags); -} - -static void __intel_cqm_event_count(void *info) -{ - struct rmid_read *rr = info; - u64 val; - - val = __rmid_read(rr->rmid); - - if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) - return; - - atomic64_add(val, &rr->value); -} - -static inline bool cqm_group_leader(struct perf_event *event) -{ - return !list_empty(&event->hw.cqm_groups_entry); -} - -static void __intel_mbm_event_count(void *info) -{ - struct rmid_read *rr = info; - u64 val; - - val = rmid_read_mbm(rr->rmid, rr->evt_type); - if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) - return; - atomic64_add(val, &rr->value); -} - -static enum hrtimer_restart mbm_hrtimer_handle(struct hrtimer *hrtimer) -{ - struct perf_event *iter, *iter1; - int ret = HRTIMER_RESTART; - struct list_head *head; - unsigned long flags; - u32 grp_rmid; - - /* - * Need to cache_lock as the timer Event Select MSR reads - * can race with the mbm/cqm count() and mbm_init() reads. - */ - raw_spin_lock_irqsave(&cache_lock, flags); - - if (list_empty(&cache_groups)) { - ret = HRTIMER_NORESTART; - goto out; - } - - list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) { - grp_rmid = iter->hw.cqm_rmid; - if (!__rmid_valid(grp_rmid)) - continue; - if (is_mbm_event(iter->attr.config)) - update_sample(grp_rmid, iter->attr.config, 0); - - head = &iter->hw.cqm_group_entry; - if (list_empty(head)) - continue; - list_for_each_entry(iter1, head, hw.cqm_group_entry) { - if (!iter1->hw.is_group_event) - break; - if (is_mbm_event(iter1->attr.config)) - update_sample(iter1->hw.cqm_rmid, - iter1->attr.config, 0); - } - } - - hrtimer_forward_now(hrtimer, ms_to_ktime(MBM_CTR_OVERFLOW_TIME)); -out: - raw_spin_unlock_irqrestore(&cache_lock, flags); - - return ret; -} - -static void __mbm_start_timer(void *info) -{ - hrtimer_start(&mbm_timers[pkg_id], ms_to_ktime(MBM_CTR_OVERFLOW_TIME), - HRTIMER_MODE_REL_PINNED); -} - -static void __mbm_stop_timer(void *info) -{ - hrtimer_cancel(&mbm_timers[pkg_id]); -} - -static void mbm_start_timers(void) -{ - on_each_cpu_mask(&cqm_cpumask, __mbm_start_timer, NULL, 1); -} - -static void mbm_stop_timers(void) -{ - on_each_cpu_mask(&cqm_cpumask, __mbm_stop_timer, NULL, 1); -} - -static void mbm_hrtimer_init(void) -{ - struct hrtimer *hr; - int i; - - for (i = 0; i < mbm_socket_max; i++) { - hr = &mbm_timers[i]; - hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hr->function = mbm_hrtimer_handle; - } -} - -static u64 intel_cqm_event_count(struct perf_event *event) -{ - unsigned long flags; - struct rmid_read rr = { - .evt_type = event->attr.config, - .value = ATOMIC64_INIT(0), - }; - - /* - * We only need to worry about task events. System-wide events - * are handled like usual, i.e. entirely with - * intel_cqm_event_read(). 
- */ - if (event->cpu != -1) - return __perf_event_count(event); - - /* - * Only the group leader gets to report values except in case of - * multiple events in the same group, we still need to read the - * other events.This stops us - * reporting duplicate values to userspace, and gives us a clear - * rule for which task gets to report the values. - * - * Note that it is impossible to attribute these values to - * specific packages - we forfeit that ability when we create - * task events. - */ - if (!cqm_group_leader(event) && !event->hw.is_group_event) - return 0; - - /* - * Getting up-to-date values requires an SMP IPI which is not - * possible if we're being called in interrupt context. Return - * the cached values instead. - */ - if (unlikely(in_interrupt())) - goto out; - - /* - * Notice that we don't perform the reading of an RMID - * atomically, because we can't hold a spin lock across the - * IPIs. - * - * Speculatively perform the read, since @event might be - * assigned a different (possibly invalid) RMID while we're - * busying performing the IPI calls. It's therefore necessary to - * check @event's RMID afterwards, and if it has changed, - * discard the result of the read. - */ - rr.rmid = ACCESS_ONCE(event->hw.cqm_rmid); - - if (!__rmid_valid(rr.rmid)) - goto out; - - cqm_mask_call(&rr); - - raw_spin_lock_irqsave(&cache_lock, flags); - if (event->hw.cqm_rmid == rr.rmid) - local64_set(&event->count, atomic64_read(&rr.value)); - raw_spin_unlock_irqrestore(&cache_lock, flags); -out: - return __perf_event_count(event); -} - -static void intel_cqm_event_start(struct perf_event *event, int mode) -{ - struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); - u32 rmid = event->hw.cqm_rmid; - - if (!(event->hw.cqm_state & PERF_HES_STOPPED)) - return; - - event->hw.cqm_state &= ~PERF_HES_STOPPED; - - if (state->rmid_usecnt++) { - if (!WARN_ON_ONCE(state->rmid != rmid)) - return; - } else { - WARN_ON_ONCE(state->rmid); - } - - state->rmid = rmid; - wrmsr(MSR_IA32_PQR_ASSOC, rmid, state->closid); -} - -static void intel_cqm_event_stop(struct perf_event *event, int mode) -{ - struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); - - if (event->hw.cqm_state & PERF_HES_STOPPED) - return; - - event->hw.cqm_state |= PERF_HES_STOPPED; - - intel_cqm_event_read(event); - - if (!--state->rmid_usecnt) { - state->rmid = 0; - wrmsr(MSR_IA32_PQR_ASSOC, 0, state->closid); - } else { - WARN_ON_ONCE(!state->rmid); - } -} - -static int intel_cqm_event_add(struct perf_event *event, int mode) -{ - unsigned long flags; - u32 rmid; - - raw_spin_lock_irqsave(&cache_lock, flags); - - event->hw.cqm_state = PERF_HES_STOPPED; - rmid = event->hw.cqm_rmid; - - if (__rmid_valid(rmid) && (mode & PERF_EF_START)) - intel_cqm_event_start(event, mode); - - raw_spin_unlock_irqrestore(&cache_lock, flags); - - return 0; -} - -static void intel_cqm_event_destroy(struct perf_event *event) -{ - struct perf_event *group_other = NULL; - unsigned long flags; - - mutex_lock(&cache_mutex); - /* - * Hold the cache_lock as mbm timer handlers could be - * scanning the list of events. - */ - raw_spin_lock_irqsave(&cache_lock, flags); - - /* - * If there's another event in this group... - */ - if (!list_empty(&event->hw.cqm_group_entry)) { - group_other = list_first_entry(&event->hw.cqm_group_entry, - struct perf_event, - hw.cqm_group_entry); - list_del(&event->hw.cqm_group_entry); - } - - /* - * And we're the group leader.. 
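intel_cqm_event_count() above reads the RMID without holding cache_lock, performs the expensive cross-package read, and only commits the result if the RMID is still the same once the lock is taken. A minimal sketch of that speculate-then-revalidate shape; the names and the stand-in for the IPI read are illustrative only:

#include <pthread.h>
#include <stdio.h>
#include <stdint.h>

/* Stand-ins for event->hw.cqm_rmid and event->count. */
static pthread_mutex_t cache_lock = PTHREAD_MUTEX_INITIALIZER;
static uint32_t event_rmid = 7;
static uint64_t event_count;

/* Stand-in for the cross-package IPI read; too expensive to run with
 * cache_lock held, so it races with possible RMID changes. */
static uint64_t expensive_read(uint32_t rmid)
{
        return rmid * 1000ull;
}

static void speculative_count_update(void)
{
        uint32_t rmid = event_rmid;             /* speculative, unlocked read */
        uint64_t val = expensive_read(rmid);

        pthread_mutex_lock(&cache_lock);
        if (event_rmid == rmid)                 /* still the RMID we sampled? */
                event_count = val;              /* yes: commit */
        pthread_mutex_unlock(&cache_lock);      /* no: discard the stale value */
}

int main(void)
{
        speculative_count_update();
        printf("count = %llu\n", (unsigned long long)event_count);
        return 0;
}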
- */ - if (cqm_group_leader(event)) { - /* - * If there was a group_other, make that leader, otherwise - * destroy the group and return the RMID. - */ - if (group_other) { - list_replace(&event->hw.cqm_groups_entry, - &group_other->hw.cqm_groups_entry); - } else { - u32 rmid = event->hw.cqm_rmid; - - if (__rmid_valid(rmid)) - __put_rmid(rmid); - list_del(&event->hw.cqm_groups_entry); - } - } - - raw_spin_unlock_irqrestore(&cache_lock, flags); - - /* - * Stop the mbm overflow timers when the last event is destroyed. - */ - if (mbm_enabled && list_empty(&cache_groups)) - mbm_stop_timers(); - - mutex_unlock(&cache_mutex); -} - -static int intel_cqm_event_init(struct perf_event *event) -{ - struct perf_event *group = NULL; - bool rotate = false; - unsigned long flags; - - if (event->attr.type != intel_cqm_pmu.type) - return -ENOENT; - - if ((event->attr.config < QOS_L3_OCCUP_EVENT_ID) || - (event->attr.config > QOS_MBM_LOCAL_EVENT_ID)) - return -EINVAL; - - if ((is_cqm_event(event->attr.config) && !cqm_enabled) || - (is_mbm_event(event->attr.config) && !mbm_enabled)) - return -EINVAL; - - /* unsupported modes and filters */ - if (event->attr.exclude_user || - event->attr.exclude_kernel || - event->attr.exclude_hv || - event->attr.exclude_idle || - event->attr.exclude_host || - event->attr.exclude_guest || - event->attr.sample_period) /* no sampling */ - return -EINVAL; - - INIT_LIST_HEAD(&event->hw.cqm_group_entry); - INIT_LIST_HEAD(&event->hw.cqm_groups_entry); - - event->destroy = intel_cqm_event_destroy; - - mutex_lock(&cache_mutex); - - /* - * Start the mbm overflow timers when the first event is created. - */ - if (mbm_enabled && list_empty(&cache_groups)) - mbm_start_timers(); - - /* Will also set rmid */ - intel_cqm_setup_event(event, &group); - - /* - * Hold the cache_lock as mbm timer handlers be - * scanning the list of events. - */ - raw_spin_lock_irqsave(&cache_lock, flags); - - if (group) { - list_add_tail(&event->hw.cqm_group_entry, - &group->hw.cqm_group_entry); - } else { - list_add_tail(&event->hw.cqm_groups_entry, - &cache_groups); - - /* - * All RMIDs are either in use or have recently been - * used. Kick the rotation worker to clean/free some. - * - * We only do this for the group leader, rather than for - * every event in a group to save on needless work. 
- */ - if (!__rmid_valid(event->hw.cqm_rmid)) - rotate = true; - } - - raw_spin_unlock_irqrestore(&cache_lock, flags); - mutex_unlock(&cache_mutex); - - if (rotate) - schedule_delayed_work(&intel_cqm_rmid_work, 0); - - return 0; -} - -EVENT_ATTR_STR(llc_occupancy, intel_cqm_llc, "event=0x01"); -EVENT_ATTR_STR(llc_occupancy.per-pkg, intel_cqm_llc_pkg, "1"); -EVENT_ATTR_STR(llc_occupancy.unit, intel_cqm_llc_unit, "Bytes"); -EVENT_ATTR_STR(llc_occupancy.scale, intel_cqm_llc_scale, NULL); -EVENT_ATTR_STR(llc_occupancy.snapshot, intel_cqm_llc_snapshot, "1"); - -EVENT_ATTR_STR(total_bytes, intel_cqm_total_bytes, "event=0x02"); -EVENT_ATTR_STR(total_bytes.per-pkg, intel_cqm_total_bytes_pkg, "1"); -EVENT_ATTR_STR(total_bytes.unit, intel_cqm_total_bytes_unit, "MB"); -EVENT_ATTR_STR(total_bytes.scale, intel_cqm_total_bytes_scale, "1e-6"); - -EVENT_ATTR_STR(local_bytes, intel_cqm_local_bytes, "event=0x03"); -EVENT_ATTR_STR(local_bytes.per-pkg, intel_cqm_local_bytes_pkg, "1"); -EVENT_ATTR_STR(local_bytes.unit, intel_cqm_local_bytes_unit, "MB"); -EVENT_ATTR_STR(local_bytes.scale, intel_cqm_local_bytes_scale, "1e-6"); - -static struct attribute *intel_cqm_events_attr[] = { - EVENT_PTR(intel_cqm_llc), - EVENT_PTR(intel_cqm_llc_pkg), - EVENT_PTR(intel_cqm_llc_unit), - EVENT_PTR(intel_cqm_llc_scale), - EVENT_PTR(intel_cqm_llc_snapshot), - NULL, -}; - -static struct attribute *intel_mbm_events_attr[] = { - EVENT_PTR(intel_cqm_total_bytes), - EVENT_PTR(intel_cqm_local_bytes), - EVENT_PTR(intel_cqm_total_bytes_pkg), - EVENT_PTR(intel_cqm_local_bytes_pkg), - EVENT_PTR(intel_cqm_total_bytes_unit), - EVENT_PTR(intel_cqm_local_bytes_unit), - EVENT_PTR(intel_cqm_total_bytes_scale), - EVENT_PTR(intel_cqm_local_bytes_scale), - NULL, -}; - -static struct attribute *intel_cmt_mbm_events_attr[] = { - EVENT_PTR(intel_cqm_llc), - EVENT_PTR(intel_cqm_total_bytes), - EVENT_PTR(intel_cqm_local_bytes), - EVENT_PTR(intel_cqm_llc_pkg), - EVENT_PTR(intel_cqm_total_bytes_pkg), - EVENT_PTR(intel_cqm_local_bytes_pkg), - EVENT_PTR(intel_cqm_llc_unit), - EVENT_PTR(intel_cqm_total_bytes_unit), - EVENT_PTR(intel_cqm_local_bytes_unit), - EVENT_PTR(intel_cqm_llc_scale), - EVENT_PTR(intel_cqm_total_bytes_scale), - EVENT_PTR(intel_cqm_local_bytes_scale), - EVENT_PTR(intel_cqm_llc_snapshot), - NULL, -}; - -static struct attribute_group intel_cqm_events_group = { - .name = "events", - .attrs = NULL, -}; - -PMU_FORMAT_ATTR(event, "config:0-7"); -static struct attribute *intel_cqm_formats_attr[] = { - &format_attr_event.attr, - NULL, -}; - -static struct attribute_group intel_cqm_format_group = { - .name = "format", - .attrs = intel_cqm_formats_attr, -}; - -static ssize_t -max_recycle_threshold_show(struct device *dev, struct device_attribute *attr, - char *page) -{ - ssize_t rv; - - mutex_lock(&cache_mutex); - rv = snprintf(page, PAGE_SIZE-1, "%u\n", __intel_cqm_max_threshold); - mutex_unlock(&cache_mutex); - - return rv; -} - -static ssize_t -max_recycle_threshold_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - unsigned int bytes, cachelines; - int ret; - - ret = kstrtouint(buf, 0, &bytes); - if (ret) - return ret; - - mutex_lock(&cache_mutex); - - __intel_cqm_max_threshold = bytes; - cachelines = bytes / cqm_l3_scale; - - /* - * The new maximum takes effect immediately. 
- */ - if (__intel_cqm_threshold > cachelines) - __intel_cqm_threshold = cachelines; - - mutex_unlock(&cache_mutex); - - return count; -} - -static DEVICE_ATTR_RW(max_recycle_threshold); - -static struct attribute *intel_cqm_attrs[] = { - &dev_attr_max_recycle_threshold.attr, - NULL, -}; - -static const struct attribute_group intel_cqm_group = { - .attrs = intel_cqm_attrs, -}; - -static const struct attribute_group *intel_cqm_attr_groups[] = { - &intel_cqm_events_group, - &intel_cqm_format_group, - &intel_cqm_group, - NULL, -}; - -static struct pmu intel_cqm_pmu = { - .hrtimer_interval_ms = RMID_DEFAULT_QUEUE_TIME, - .attr_groups = intel_cqm_attr_groups, - .task_ctx_nr = perf_sw_context, - .event_init = intel_cqm_event_init, - .add = intel_cqm_event_add, - .del = intel_cqm_event_stop, - .start = intel_cqm_event_start, - .stop = intel_cqm_event_stop, - .read = intel_cqm_event_read, - .count = intel_cqm_event_count, -}; - -static inline void cqm_pick_event_reader(int cpu) -{ - int reader; - - /* First online cpu in package becomes the reader */ - reader = cpumask_any_and(&cqm_cpumask, topology_core_cpumask(cpu)); - if (reader >= nr_cpu_ids) - cpumask_set_cpu(cpu, &cqm_cpumask); -} - -static int intel_cqm_cpu_starting(unsigned int cpu) -{ - struct intel_pqr_state *state = &per_cpu(pqr_state, cpu); - struct cpuinfo_x86 *c = &cpu_data(cpu); - - state->rmid = 0; - state->closid = 0; - state->rmid_usecnt = 0; - - WARN_ON(c->x86_cache_max_rmid != cqm_max_rmid); - WARN_ON(c->x86_cache_occ_scale != cqm_l3_scale); - - cqm_pick_event_reader(cpu); - return 0; -} - -static int intel_cqm_cpu_exit(unsigned int cpu) -{ - int target; - - /* Is @cpu the current cqm reader for this package ? */ - if (!cpumask_test_and_clear_cpu(cpu, &cqm_cpumask)) - return 0; - - /* Find another online reader in this package */ - target = cpumask_any_but(topology_core_cpumask(cpu), cpu); - - if (target < nr_cpu_ids) - cpumask_set_cpu(target, &cqm_cpumask); - - return 0; -} - -static const struct x86_cpu_id intel_cqm_match[] = { - { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_OCCUP_LLC }, - {} -}; - -static void mbm_cleanup(void) -{ - if (!mbm_enabled) - return; - - kfree(mbm_local); - kfree(mbm_total); - mbm_enabled = false; -} - -static const struct x86_cpu_id intel_mbm_local_match[] = { - { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_MBM_LOCAL }, - {} -}; - -static const struct x86_cpu_id intel_mbm_total_match[] = { - { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_MBM_TOTAL }, - {} -}; - -static int intel_mbm_init(void) -{ - int ret = 0, array_size, maxid = cqm_max_rmid + 1; - - mbm_socket_max = topology_max_packages(); - array_size = sizeof(struct sample) * maxid * mbm_socket_max; - mbm_local = kmalloc(array_size, GFP_KERNEL); - if (!mbm_local) - return -ENOMEM; - - mbm_total = kmalloc(array_size, GFP_KERNEL); - if (!mbm_total) { - ret = -ENOMEM; - goto out; - } - - array_size = sizeof(struct hrtimer) * mbm_socket_max; - mbm_timers = kmalloc(array_size, GFP_KERNEL); - if (!mbm_timers) { - ret = -ENOMEM; - goto out; - } - mbm_hrtimer_init(); - -out: - if (ret) - mbm_cleanup(); - - return ret; -} - -static int __init intel_cqm_init(void) -{ - char *str = NULL, scale[20]; - int cpu, ret; - - if (x86_match_cpu(intel_cqm_match)) - cqm_enabled = true; - - if (x86_match_cpu(intel_mbm_local_match) && - x86_match_cpu(intel_mbm_total_match)) - mbm_enabled = true; - - if (!cqm_enabled && !mbm_enabled) - return -ENODEV; - - cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale; - - /* - * It's possible that 
not all resources support the same number - * of RMIDs. Instead of making scheduling much more complicated - * (where we have to match a task's RMID to a cpu that supports - * that many RMIDs) just find the minimum RMIDs supported across - * all cpus. - * - * Also, check that the scales match on all cpus. - */ - cpus_read_lock(); - for_each_online_cpu(cpu) { - struct cpuinfo_x86 *c = &cpu_data(cpu); - - if (c->x86_cache_max_rmid < cqm_max_rmid) - cqm_max_rmid = c->x86_cache_max_rmid; - - if (c->x86_cache_occ_scale != cqm_l3_scale) { - pr_err("Multiple LLC scale values, disabling\n"); - ret = -EINVAL; - goto out; - } - } - - /* - * A reasonable upper limit on the max threshold is the number - * of lines tagged per RMID if all RMIDs have the same number of - * lines tagged in the LLC. - * - * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC. - */ - __intel_cqm_max_threshold = - boot_cpu_data.x86_cache_size * 1024 / (cqm_max_rmid + 1); - - snprintf(scale, sizeof(scale), "%u", cqm_l3_scale); - str = kstrdup(scale, GFP_KERNEL); - if (!str) { - ret = -ENOMEM; - goto out; - } - - event_attr_intel_cqm_llc_scale.event_str = str; - - ret = intel_cqm_setup_rmid_cache(); - if (ret) - goto out; - - if (mbm_enabled) - ret = intel_mbm_init(); - if (ret && !cqm_enabled) - goto out; - - if (cqm_enabled && mbm_enabled) - intel_cqm_events_group.attrs = intel_cmt_mbm_events_attr; - else if (!cqm_enabled && mbm_enabled) - intel_cqm_events_group.attrs = intel_mbm_events_attr; - else if (cqm_enabled && !mbm_enabled) - intel_cqm_events_group.attrs = intel_cqm_events_attr; - - ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", -1); - if (ret) { - pr_err("Intel CQM perf registration failed: %d\n", ret); - goto out; - } - - if (cqm_enabled) - pr_info("Intel CQM monitoring enabled\n"); - if (mbm_enabled) - pr_info("Intel MBM enabled\n"); - - /* - * Setup the hot cpu notifier once we are sure cqm - * is enabled to avoid notifier leak. 
- */ - cpuhp_setup_state_cpuslocked(CPUHP_AP_PERF_X86_CQM_STARTING, - "perf/x86/cqm:starting", - intel_cqm_cpu_starting, NULL); - cpuhp_setup_state_cpuslocked(CPUHP_AP_PERF_X86_CQM_ONLINE, - "perf/x86/cqm:online", - NULL, intel_cqm_cpu_exit); -out: - cpus_read_unlock(); - - if (ret) { - kfree(str); - cqm_cleanup(); - mbm_cleanup(); - } - - return ret; -} -device_initcall(intel_cqm_init); diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index a322fed5f8ed..e1965e5ff570 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -49,34 +49,47 @@ union intel_x86_pebs_dse { */ #define P(a, b) PERF_MEM_S(a, b) #define OP_LH (P(OP, LOAD) | P(LVL, HIT)) +#define LEVEL(x) P(LVLNUM, x) +#define REM P(REMOTE, REMOTE) #define SNOOP_NONE_MISS (P(SNOOP, NONE) | P(SNOOP, MISS)) /* Version for Sandy Bridge and later */ static u64 pebs_data_source[] = { - P(OP, LOAD) | P(LVL, MISS) | P(LVL, L3) | P(SNOOP, NA),/* 0x00:ukn L3 */ - OP_LH | P(LVL, L1) | P(SNOOP, NONE), /* 0x01: L1 local */ - OP_LH | P(LVL, LFB) | P(SNOOP, NONE), /* 0x02: LFB hit */ - OP_LH | P(LVL, L2) | P(SNOOP, NONE), /* 0x03: L2 hit */ - OP_LH | P(LVL, L3) | P(SNOOP, NONE), /* 0x04: L3 hit */ - OP_LH | P(LVL, L3) | P(SNOOP, MISS), /* 0x05: L3 hit, snoop miss */ - OP_LH | P(LVL, L3) | P(SNOOP, HIT), /* 0x06: L3 hit, snoop hit */ - OP_LH | P(LVL, L3) | P(SNOOP, HITM), /* 0x07: L3 hit, snoop hitm */ - OP_LH | P(LVL, REM_CCE1) | P(SNOOP, HIT), /* 0x08: L3 miss snoop hit */ - OP_LH | P(LVL, REM_CCE1) | P(SNOOP, HITM), /* 0x09: L3 miss snoop hitm*/ - OP_LH | P(LVL, LOC_RAM) | P(SNOOP, HIT), /* 0x0a: L3 miss, shared */ - OP_LH | P(LVL, REM_RAM1) | P(SNOOP, HIT), /* 0x0b: L3 miss, shared */ - OP_LH | P(LVL, LOC_RAM) | SNOOP_NONE_MISS,/* 0x0c: L3 miss, excl */ - OP_LH | P(LVL, REM_RAM1) | SNOOP_NONE_MISS,/* 0x0d: L3 miss, excl */ - OP_LH | P(LVL, IO) | P(SNOOP, NONE), /* 0x0e: I/O */ - OP_LH | P(LVL, UNC) | P(SNOOP, NONE), /* 0x0f: uncached */ + P(OP, LOAD) | P(LVL, MISS) | LEVEL(L3) | P(SNOOP, NA),/* 0x00:ukn L3 */ + OP_LH | P(LVL, L1) | LEVEL(L1) | P(SNOOP, NONE), /* 0x01: L1 local */ + OP_LH | P(LVL, LFB) | LEVEL(LFB) | P(SNOOP, NONE), /* 0x02: LFB hit */ + OP_LH | P(LVL, L2) | LEVEL(L2) | P(SNOOP, NONE), /* 0x03: L2 hit */ + OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, NONE), /* 0x04: L3 hit */ + OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, MISS), /* 0x05: L3 hit, snoop miss */ + OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HIT), /* 0x06: L3 hit, snoop hit */ + OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM), /* 0x07: L3 hit, snoop hitm */ + OP_LH | P(LVL, REM_CCE1) | REM | LEVEL(L3) | P(SNOOP, HIT), /* 0x08: L3 miss snoop hit */ + OP_LH | P(LVL, REM_CCE1) | REM | LEVEL(L3) | P(SNOOP, HITM), /* 0x09: L3 miss snoop hitm*/ + OP_LH | P(LVL, LOC_RAM) | LEVEL(RAM) | P(SNOOP, HIT), /* 0x0a: L3 miss, shared */ + OP_LH | P(LVL, REM_RAM1) | REM | LEVEL(L3) | P(SNOOP, HIT), /* 0x0b: L3 miss, shared */ + OP_LH | P(LVL, LOC_RAM) | LEVEL(RAM) | SNOOP_NONE_MISS, /* 0x0c: L3 miss, excl */ + OP_LH | P(LVL, REM_RAM1) | LEVEL(RAM) | REM | SNOOP_NONE_MISS, /* 0x0d: L3 miss, excl */ + OP_LH | P(LVL, IO) | LEVEL(NA) | P(SNOOP, NONE), /* 0x0e: I/O */ + OP_LH | P(LVL, UNC) | LEVEL(NA) | P(SNOOP, NONE), /* 0x0f: uncached */ }; /* Patch up minor differences in the bits */ void __init intel_pmu_pebs_data_source_nhm(void) { - pebs_data_source[0x05] = OP_LH | P(LVL, L3) | P(SNOOP, HIT); - pebs_data_source[0x06] = OP_LH | P(LVL, L3) | P(SNOOP, HITM); - pebs_data_source[0x07] = OP_LH | P(LVL, L3) | P(SNOOP, HITM); + pebs_data_source[0x05] = OP_LH 
| P(LVL, L3) | LEVEL(L3) | P(SNOOP, HIT); + pebs_data_source[0x06] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM); + pebs_data_source[0x07] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM); +} + +void __init intel_pmu_pebs_data_source_skl(bool pmem) +{ + u64 pmem_or_l4 = pmem ? LEVEL(PMEM) : LEVEL(L4); + + pebs_data_source[0x08] = OP_LH | pmem_or_l4 | P(SNOOP, HIT); + pebs_data_source[0x09] = OP_LH | pmem_or_l4 | REM | P(SNOOP, HIT); + pebs_data_source[0x0b] = OP_LH | LEVEL(RAM) | REM | P(SNOOP, NONE); + pebs_data_source[0x0c] = OP_LH | LEVEL(ANY_CACHE) | REM | P(SNOOPX, FWD); + pebs_data_source[0x0d] = OP_LH | LEVEL(ANY_CACHE) | REM | P(SNOOP, HITM); } static u64 precise_store_data(u64 status) @@ -149,8 +162,6 @@ static u64 load_latency_data(u64 status) { union intel_x86_pebs_dse dse; u64 val; - int model = boot_cpu_data.x86_model; - int fam = boot_cpu_data.x86; dse.val = status; @@ -162,8 +173,7 @@ static u64 load_latency_data(u64 status) /* * Nehalem models do not support TLB, Lock infos */ - if (fam == 0x6 && (model == 26 || model == 30 - || model == 31 || model == 46)) { + if (x86_pmu.pebs_no_tlb) { val |= P(TLB, NA) | P(LOCK, NA); return val; } @@ -1175,7 +1185,7 @@ static void setup_pebs_sample_data(struct perf_event *event, else regs->flags &= ~PERF_EFLAGS_EXACT; - if ((sample_type & PERF_SAMPLE_ADDR) && + if ((sample_type & (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR)) && x86_pmu.intel_cap.pebs_format >= 1) data->addr = pebs->dla; diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 955457a30197..8a6bbacd17dc 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -109,6 +109,9 @@ enum { X86_BR_ZERO_CALL = 1 << 15,/* zero length call */ X86_BR_CALL_STACK = 1 << 16,/* call stack */ X86_BR_IND_JMP = 1 << 17,/* indirect jump */ + + X86_BR_TYPE_SAVE = 1 << 18,/* indicate to save branch type */ + }; #define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL) @@ -514,6 +517,7 @@ static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) cpuc->lbr_entries[i].in_tx = 0; cpuc->lbr_entries[i].abort = 0; cpuc->lbr_entries[i].cycles = 0; + cpuc->lbr_entries[i].type = 0; cpuc->lbr_entries[i].reserved = 0; } cpuc->lbr_stack.nr = i; @@ -600,6 +604,7 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) cpuc->lbr_entries[out].in_tx = in_tx; cpuc->lbr_entries[out].abort = abort; cpuc->lbr_entries[out].cycles = cycles; + cpuc->lbr_entries[out].type = 0; cpuc->lbr_entries[out].reserved = 0; out++; } @@ -677,6 +682,10 @@ static int intel_pmu_setup_sw_lbr_filter(struct perf_event *event) if (br_type & PERF_SAMPLE_BRANCH_CALL) mask |= X86_BR_CALL | X86_BR_ZERO_CALL; + + if (br_type & PERF_SAMPLE_BRANCH_TYPE_SAVE) + mask |= X86_BR_TYPE_SAVE; + /* * stash actual user request into reg, it may * be used by fixup code for some CPU @@ -930,6 +939,43 @@ static int branch_type(unsigned long from, unsigned long to, int abort) return ret; } +#define X86_BR_TYPE_MAP_MAX 16 + +static int branch_map[X86_BR_TYPE_MAP_MAX] = { + PERF_BR_CALL, /* X86_BR_CALL */ + PERF_BR_RET, /* X86_BR_RET */ + PERF_BR_SYSCALL, /* X86_BR_SYSCALL */ + PERF_BR_SYSRET, /* X86_BR_SYSRET */ + PERF_BR_UNKNOWN, /* X86_BR_INT */ + PERF_BR_UNKNOWN, /* X86_BR_IRET */ + PERF_BR_COND, /* X86_BR_JCC */ + PERF_BR_UNCOND, /* X86_BR_JMP */ + PERF_BR_UNKNOWN, /* X86_BR_IRQ */ + PERF_BR_IND_CALL, /* X86_BR_IND_CALL */ + PERF_BR_UNKNOWN, /* X86_BR_ABORT */ + PERF_BR_UNKNOWN, /* X86_BR_IN_TX */ + PERF_BR_UNKNOWN, /* X86_BR_NO_TX */ + PERF_BR_CALL, /* X86_BR_ZERO_CALL */ + PERF_BR_UNKNOWN, /* 
X86_BR_CALL_STACK */ + PERF_BR_IND, /* X86_BR_IND_JMP */ +}; + +static int +common_branch_type(int type) +{ + int i; + + type >>= 2; /* skip X86_BR_USER and X86_BR_KERNEL */ + + if (type) { + i = __ffs(type); + if (i < X86_BR_TYPE_MAP_MAX) + return branch_map[i]; + } + + return PERF_BR_UNKNOWN; +} + /* * implement actual branch filter based on user demand. * Hardware may not exactly satisfy that request, thus @@ -946,7 +992,8 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc) bool compress = false; /* if sampling all branches, then nothing to filter */ - if ((br_sel & X86_BR_ALL) == X86_BR_ALL) + if (((br_sel & X86_BR_ALL) == X86_BR_ALL) && + ((br_sel & X86_BR_TYPE_SAVE) != X86_BR_TYPE_SAVE)) return; for (i = 0; i < cpuc->lbr_stack.nr; i++) { @@ -967,6 +1014,9 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc) cpuc->lbr_entries[i].from = 0; compress = true; } + + if ((br_sel & X86_BR_TYPE_SAVE) == X86_BR_TYPE_SAVE) + cpuc->lbr_entries[i].type = common_branch_type(type); } if (!compress) diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index ae8324d65e61..81fd41d5a0d9 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -471,8 +471,9 @@ static void pt_config(struct perf_event *event) struct pt *pt = this_cpu_ptr(&pt_ctx); u64 reg; - if (!event->hw.itrace_started) { - event->hw.itrace_started = 1; + /* First round: clear STATUS, in particular the PSB byte counter. */ + if (!event->hw.config) { + perf_event_itrace_started(event); wrmsrl(MSR_IA32_RTIT_STATUS, 0); } diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 476aec3a4cab..4196f81ec0e1 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -91,7 +91,7 @@ struct amd_nb { (PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_ADDR | \ PERF_SAMPLE_ID | PERF_SAMPLE_CPU | PERF_SAMPLE_STREAM_ID | \ PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \ - PERF_SAMPLE_TRANSACTION) + PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR) /* * A debug store configuration. 
@@ -558,6 +558,7 @@ struct x86_pmu { int attr_rdpmc; struct attribute **format_attrs; struct attribute **event_attrs; + struct attribute **caps_attrs; ssize_t (*events_sysfs_show)(char *page, u64 config); struct attribute **cpu_events; @@ -591,7 +592,8 @@ struct x86_pmu { pebs :1, pebs_active :1, pebs_broken :1, - pebs_prec_dist :1; + pebs_prec_dist :1, + pebs_no_tlb :1; int pebs_record_size; int pebs_buffer_size; void (*drain_pebs)(struct pt_regs *regs); @@ -741,6 +743,8 @@ int x86_reserve_hardware(void); void x86_release_hardware(void); +int x86_pmu_max_precise(void); + void hw_perf_lbr_event_destroy(struct perf_event *event); int x86_setup_perfctr(struct perf_event *event); @@ -947,6 +951,8 @@ void intel_pmu_lbr_init_knl(void); void intel_pmu_pebs_data_source_nhm(void); +void intel_pmu_pebs_data_source_skl(bool pmem); + int intel_pmu_setup_lbr_filter(struct perf_event *event); void intel_pt_interrupt(void); diff --git a/arch/x86/hyperv/Makefile b/arch/x86/hyperv/Makefile index 171ae09864d7..367a8203cfcf 100644 --- a/arch/x86/hyperv/Makefile +++ b/arch/x86/hyperv/Makefile @@ -1 +1 @@ -obj-y := hv_init.o +obj-y := hv_init.o mmu.o diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 5b882cc0c0e9..1a8eb550c40f 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -26,6 +26,8 @@ #include <linux/mm.h> #include <linux/clockchips.h> #include <linux/hyperv.h> +#include <linux/slab.h> +#include <linux/cpuhotplug.h> #ifdef CONFIG_HYPERV_TSCPAGE @@ -75,10 +77,25 @@ static struct clocksource hyperv_cs_msr = { .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; -static void *hypercall_pg; +void *hv_hypercall_pg; +EXPORT_SYMBOL_GPL(hv_hypercall_pg); struct clocksource *hyperv_cs; EXPORT_SYMBOL_GPL(hyperv_cs); +u32 *hv_vp_index; +EXPORT_SYMBOL_GPL(hv_vp_index); + +static int hv_cpu_init(unsigned int cpu) +{ + u64 msr_vp_index; + + hv_get_vp_index(msr_vp_index); + + hv_vp_index[smp_processor_id()] = msr_vp_index; + + return 0; +} + /* * This function is to be invoked early in the boot sequence after the * hypervisor has been detected. @@ -94,6 +111,16 @@ void hyperv_init(void) if (x86_hyper != &x86_hyper_ms_hyperv) return; + /* Allocate percpu VP index */ + hv_vp_index = kmalloc_array(num_possible_cpus(), sizeof(*hv_vp_index), + GFP_KERNEL); + if (!hv_vp_index) + return; + + if (cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/hyperv_init:online", + hv_cpu_init, NULL) < 0) + goto free_vp_index; + /* * Setup the hypercall page and enable hypercalls. * 1. Register the guest ID @@ -102,17 +129,19 @@ void hyperv_init(void) guest_id = generate_guest_id(0, LINUX_VERSION_CODE, 0); wrmsrl(HV_X64_MSR_GUEST_OS_ID, guest_id); - hypercall_pg = __vmalloc(PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL_RX); - if (hypercall_pg == NULL) { + hv_hypercall_pg = __vmalloc(PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL_RX); + if (hv_hypercall_pg == NULL) { wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0); - return; + goto free_vp_index; } rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); hypercall_msr.enable = 1; - hypercall_msr.guest_physical_address = vmalloc_to_pfn(hypercall_pg); + hypercall_msr.guest_physical_address = vmalloc_to_pfn(hv_hypercall_pg); wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); + hyper_alloc_mmu(); + /* * Register Hyper-V specific clocksource. 
*/ @@ -148,6 +177,12 @@ register_msr_cs: hyperv_cs = &hyperv_cs_msr; if (ms_hyperv.features & HV_X64_MSR_TIME_REF_COUNT_AVAILABLE) clocksource_register_hz(&hyperv_cs_msr, NSEC_PER_SEC/100); + + return; + +free_vp_index: + kfree(hv_vp_index); + hv_vp_index = NULL; } /* @@ -170,51 +205,6 @@ void hyperv_cleanup(void) } EXPORT_SYMBOL_GPL(hyperv_cleanup); -/* - * hv_do_hypercall- Invoke the specified hypercall - */ -u64 hv_do_hypercall(u64 control, void *input, void *output) -{ - u64 input_address = (input) ? virt_to_phys(input) : 0; - u64 output_address = (output) ? virt_to_phys(output) : 0; -#ifdef CONFIG_X86_64 - u64 hv_status = 0; - - if (!hypercall_pg) - return (u64)ULLONG_MAX; - - __asm__ __volatile__("mov %0, %%r8" : : "r" (output_address) : "r8"); - __asm__ __volatile__("call *%3" : "=a" (hv_status) : - "c" (control), "d" (input_address), - "m" (hypercall_pg)); - - return hv_status; - -#else - - u32 control_hi = control >> 32; - u32 control_lo = control & 0xFFFFFFFF; - u32 hv_status_hi = 1; - u32 hv_status_lo = 1; - u32 input_address_hi = input_address >> 32; - u32 input_address_lo = input_address & 0xFFFFFFFF; - u32 output_address_hi = output_address >> 32; - u32 output_address_lo = output_address & 0xFFFFFFFF; - - if (!hypercall_pg) - return (u64)ULLONG_MAX; - - __asm__ __volatile__ ("call *%8" : "=d"(hv_status_hi), - "=a"(hv_status_lo) : "d" (control_hi), - "a" (control_lo), "b" (input_address_hi), - "c" (input_address_lo), "D"(output_address_hi), - "S"(output_address_lo), "m" (hypercall_pg)); - - return hv_status_lo | ((u64)hv_status_hi << 32); -#endif /* !x86_64 */ -} -EXPORT_SYMBOL_GPL(hv_do_hypercall); - void hyperv_report_panic(struct pt_regs *regs) { static bool panic_reported; diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c new file mode 100644 index 000000000000..39e7f6e50919 --- /dev/null +++ b/arch/x86/hyperv/mmu.c @@ -0,0 +1,272 @@ +#define pr_fmt(fmt) "Hyper-V: " fmt + +#include <linux/hyperv.h> +#include <linux/log2.h> +#include <linux/slab.h> +#include <linux/types.h> + +#include <asm/fpu/api.h> +#include <asm/mshyperv.h> +#include <asm/msr.h> +#include <asm/tlbflush.h> + +#define CREATE_TRACE_POINTS +#include <asm/trace/hyperv.h> + +/* HvFlushVirtualAddressSpace, HvFlushVirtualAddressList hypercalls */ +struct hv_flush_pcpu { + u64 address_space; + u64 flags; + u64 processor_mask; + u64 gva_list[]; +}; + +/* HvFlushVirtualAddressSpaceEx, HvFlushVirtualAddressListEx hypercalls */ +struct hv_flush_pcpu_ex { + u64 address_space; + u64 flags; + struct { + u64 format; + u64 valid_bank_mask; + u64 bank_contents[]; + } hv_vp_set; + u64 gva_list[]; +}; + +/* Each gva in gva_list encodes up to 4096 pages to flush */ +#define HV_TLB_FLUSH_UNIT (4096 * PAGE_SIZE) + +static struct hv_flush_pcpu __percpu *pcpu_flush; + +static struct hv_flush_pcpu_ex __percpu *pcpu_flush_ex; + +/* + * Fills in gva_list starting from offset. Returns the number of items added. + */ +static inline int fill_gva_list(u64 gva_list[], int offset, + unsigned long start, unsigned long end) +{ + int gva_n = offset; + unsigned long cur = start, diff; + + do { + diff = end > cur ? end - cur : 0; + + gva_list[gva_n] = cur & PAGE_MASK; + /* + * Lower 12 bits encode the number of additional + * pages to flush (in addition to the 'cur' page). 
+ */ + if (diff >= HV_TLB_FLUSH_UNIT) + gva_list[gva_n] |= ~PAGE_MASK; + else if (diff) + gva_list[gva_n] |= (diff - 1) >> PAGE_SHIFT; + + cur += HV_TLB_FLUSH_UNIT; + gva_n++; + + } while (cur < end); + + return gva_n - offset; +} + +/* Return the number of banks in the resulting vp_set */ +static inline int cpumask_to_vp_set(struct hv_flush_pcpu_ex *flush, + const struct cpumask *cpus) +{ + int cpu, vcpu, vcpu_bank, vcpu_offset, nr_bank = 1; + + /* + * Some banks may end up being empty but this is acceptable. + */ + for_each_cpu(cpu, cpus) { + vcpu = hv_cpu_number_to_vp_number(cpu); + vcpu_bank = vcpu / 64; + vcpu_offset = vcpu % 64; + + /* valid_bank_mask can represent up to 64 banks */ + if (vcpu_bank >= 64) + return 0; + + __set_bit(vcpu_offset, (unsigned long *) + &flush->hv_vp_set.bank_contents[vcpu_bank]); + if (vcpu_bank >= nr_bank) + nr_bank = vcpu_bank + 1; + } + flush->hv_vp_set.valid_bank_mask = GENMASK_ULL(nr_bank - 1, 0); + + return nr_bank; +} + +static void hyperv_flush_tlb_others(const struct cpumask *cpus, + const struct flush_tlb_info *info) +{ + int cpu, vcpu, gva_n, max_gvas; + struct hv_flush_pcpu *flush; + u64 status = U64_MAX; + unsigned long flags; + + trace_hyperv_mmu_flush_tlb_others(cpus, info); + + if (!pcpu_flush || !hv_hypercall_pg) + goto do_native; + + if (cpumask_empty(cpus)) + return; + + local_irq_save(flags); + + flush = this_cpu_ptr(pcpu_flush); + + if (info->mm) { + flush->address_space = virt_to_phys(info->mm->pgd); + flush->flags = 0; + } else { + flush->address_space = 0; + flush->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES; + } + + flush->processor_mask = 0; + if (cpumask_equal(cpus, cpu_present_mask)) { + flush->flags |= HV_FLUSH_ALL_PROCESSORS; + } else { + for_each_cpu(cpu, cpus) { + vcpu = hv_cpu_number_to_vp_number(cpu); + if (vcpu >= 64) + goto do_native; + + __set_bit(vcpu, (unsigned long *) + &flush->processor_mask); + } + } + + /* + * We can flush not more than max_gvas with one hypercall. Flush the + * whole address space if we were asked to do more. 
+ */ + max_gvas = (PAGE_SIZE - sizeof(*flush)) / sizeof(flush->gva_list[0]); + + if (info->end == TLB_FLUSH_ALL) { + flush->flags |= HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY; + status = hv_do_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE, + flush, NULL); + } else if (info->end && + ((info->end - info->start)/HV_TLB_FLUSH_UNIT) > max_gvas) { + status = hv_do_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE, + flush, NULL); + } else { + gva_n = fill_gva_list(flush->gva_list, 0, + info->start, info->end); + status = hv_do_rep_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST, + gva_n, 0, flush, NULL); + } + + local_irq_restore(flags); + + if (!(status & HV_HYPERCALL_RESULT_MASK)) + return; +do_native: + native_flush_tlb_others(cpus, info); +} + +static void hyperv_flush_tlb_others_ex(const struct cpumask *cpus, + const struct flush_tlb_info *info) +{ + int nr_bank = 0, max_gvas, gva_n; + struct hv_flush_pcpu_ex *flush; + u64 status = U64_MAX; + unsigned long flags; + + trace_hyperv_mmu_flush_tlb_others(cpus, info); + + if (!pcpu_flush_ex || !hv_hypercall_pg) + goto do_native; + + if (cpumask_empty(cpus)) + return; + + local_irq_save(flags); + + flush = this_cpu_ptr(pcpu_flush_ex); + + if (info->mm) { + flush->address_space = virt_to_phys(info->mm->pgd); + flush->flags = 0; + } else { + flush->address_space = 0; + flush->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES; + } + + flush->hv_vp_set.valid_bank_mask = 0; + + if (!cpumask_equal(cpus, cpu_present_mask)) { + flush->hv_vp_set.format = HV_GENERIC_SET_SPARCE_4K; + nr_bank = cpumask_to_vp_set(flush, cpus); + } + + if (!nr_bank) { + flush->hv_vp_set.format = HV_GENERIC_SET_ALL; + flush->flags |= HV_FLUSH_ALL_PROCESSORS; + } + + /* + * We can flush not more than max_gvas with one hypercall. Flush the + * whole address space if we were asked to do more. 
+ */ + max_gvas = + (PAGE_SIZE - sizeof(*flush) - nr_bank * + sizeof(flush->hv_vp_set.bank_contents[0])) / + sizeof(flush->gva_list[0]); + + if (info->end == TLB_FLUSH_ALL) { + flush->flags |= HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY; + status = hv_do_rep_hypercall( + HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX, + 0, nr_bank + 2, flush, NULL); + } else if (info->end && + ((info->end - info->start)/HV_TLB_FLUSH_UNIT) > max_gvas) { + status = hv_do_rep_hypercall( + HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX, + 0, nr_bank + 2, flush, NULL); + } else { + gva_n = fill_gva_list(flush->gva_list, nr_bank, + info->start, info->end); + status = hv_do_rep_hypercall( + HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX, + gva_n, nr_bank + 2, flush, NULL); + } + + local_irq_restore(flags); + + if (!(status & HV_HYPERCALL_RESULT_MASK)) + return; +do_native: + native_flush_tlb_others(cpus, info); +} + +void hyperv_setup_mmu_ops(void) +{ + if (!(ms_hyperv.hints & HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED)) + return; + + setup_clear_cpu_cap(X86_FEATURE_PCID); + + if (!(ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED)) { + pr_info("Using hypercall for remote TLB flush\n"); + pv_mmu_ops.flush_tlb_others = hyperv_flush_tlb_others; + } else { + pr_info("Using ext hypercall for remote TLB flush\n"); + pv_mmu_ops.flush_tlb_others = hyperv_flush_tlb_others_ex; + } +} + +void hyper_alloc_mmu(void) +{ + if (!(ms_hyperv.hints & HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED)) + return; + + if (!(ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED)) + pcpu_flush = __alloc_percpu(PAGE_SIZE, PAGE_SIZE); + else + pcpu_flush_ex = __alloc_percpu(PAGE_SIZE, PAGE_SIZE); +} diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 724153797209..e0bb46c02857 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -226,7 +226,7 @@ static void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs, if (ksig->ka.sa.sa_flags & SA_ONSTACK) sp = sigsp(sp, ksig); /* This is the legacy signal stack switching. 
*/ - else if ((regs->ss & 0xffff) != __USER32_DS && + else if (regs->ss != __USER32_DS && !(ksig->ka.sa.sa_flags & SA_RESTORER) && ksig->ka.sa.sa_restorer) sp = (unsigned long) ksig->ka.sa.sa_restorer; diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 7a9df3beb89b..676ee5807d86 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -74,6 +74,9 @@ # define _ASM_EXTABLE_EX(from, to) \ _ASM_EXTABLE_HANDLE(from, to, ex_handler_ext) +# define _ASM_EXTABLE_REFCOUNT(from, to) \ + _ASM_EXTABLE_HANDLE(from, to, ex_handler_refcount) + # define _ASM_NOKPROBE(entry) \ .pushsection "_kprobe_blacklist","aw" ; \ _ASM_ALIGN ; \ @@ -123,6 +126,9 @@ # define _ASM_EXTABLE_EX(from, to) \ _ASM_EXTABLE_HANDLE(from, to, ex_handler_ext) +# define _ASM_EXTABLE_REFCOUNT(from, to) \ + _ASM_EXTABLE_HANDLE(from, to, ex_handler_refcount) + /* For C file, we already have NOKPROBE_SYMBOL macro */ #endif diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index 33380b871463..0874ebda3069 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -197,35 +197,56 @@ static inline int atomic_xchg(atomic_t *v, int new) return xchg(&v->counter, new); } -#define ATOMIC_OP(op) \ -static inline void atomic_##op(int i, atomic_t *v) \ -{ \ - asm volatile(LOCK_PREFIX #op"l %1,%0" \ - : "+m" (v->counter) \ - : "ir" (i) \ - : "memory"); \ +static inline void atomic_and(int i, atomic_t *v) +{ + asm volatile(LOCK_PREFIX "andl %1,%0" + : "+m" (v->counter) + : "ir" (i) + : "memory"); +} + +static inline int atomic_fetch_and(int i, atomic_t *v) +{ + int val = atomic_read(v); + + do { } while (!atomic_try_cmpxchg(v, &val, val & i)); + + return val; } -#define ATOMIC_FETCH_OP(op, c_op) \ -static inline int atomic_fetch_##op(int i, atomic_t *v) \ -{ \ - int val = atomic_read(v); \ - do { \ - } while (!atomic_try_cmpxchg(v, &val, val c_op i)); \ - return val; \ +static inline void atomic_or(int i, atomic_t *v) +{ + asm volatile(LOCK_PREFIX "orl %1,%0" + : "+m" (v->counter) + : "ir" (i) + : "memory"); } -#define ATOMIC_OPS(op, c_op) \ - ATOMIC_OP(op) \ - ATOMIC_FETCH_OP(op, c_op) +static inline int atomic_fetch_or(int i, atomic_t *v) +{ + int val = atomic_read(v); -ATOMIC_OPS(and, &) -ATOMIC_OPS(or , |) -ATOMIC_OPS(xor, ^) + do { } while (!atomic_try_cmpxchg(v, &val, val | i)); -#undef ATOMIC_OPS -#undef ATOMIC_FETCH_OP -#undef ATOMIC_OP + return val; +} + +static inline void atomic_xor(int i, atomic_t *v) +{ + asm volatile(LOCK_PREFIX "xorl %1,%0" + : "+m" (v->counter) + : "ir" (i) + : "memory"); +} + +static inline int atomic_fetch_xor(int i, atomic_t *v) +{ + int val = atomic_read(v); + + do { } while (!atomic_try_cmpxchg(v, &val, val ^ i)); + + return val; +} /** * __atomic_add_unless - add unless the number is already a given value @@ -239,10 +260,12 @@ ATOMIC_OPS(xor, ^) static __always_inline int __atomic_add_unless(atomic_t *v, int a, int u) { int c = atomic_read(v); + do { if (unlikely(c == u)) break; } while (!atomic_try_cmpxchg(v, &c, c + a)); + return c; } diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h index 71d7705fb303..9e206f31ce2a 100644 --- a/arch/x86/include/asm/atomic64_32.h +++ b/arch/x86/include/asm/atomic64_32.h @@ -312,37 +312,70 @@ static inline long long atomic64_dec_if_positive(atomic64_t *v) #undef alternative_atomic64 #undef __alternative_atomic64 -#define ATOMIC64_OP(op, c_op) \ -static inline void atomic64_##op(long long i, atomic64_t *v) \ -{ \ - long long old, c = 0; \ - while ((old = 
atomic64_cmpxchg(v, c, c c_op i)) != c) \ - c = old; \ +static inline void atomic64_and(long long i, atomic64_t *v) +{ + long long old, c = 0; + + while ((old = atomic64_cmpxchg(v, c, c & i)) != c) + c = old; } -#define ATOMIC64_FETCH_OP(op, c_op) \ -static inline long long atomic64_fetch_##op(long long i, atomic64_t *v) \ -{ \ - long long old, c = 0; \ - while ((old = atomic64_cmpxchg(v, c, c c_op i)) != c) \ - c = old; \ - return old; \ +static inline long long atomic64_fetch_and(long long i, atomic64_t *v) +{ + long long old, c = 0; + + while ((old = atomic64_cmpxchg(v, c, c & i)) != c) + c = old; + + return old; } -ATOMIC64_FETCH_OP(add, +) +static inline void atomic64_or(long long i, atomic64_t *v) +{ + long long old, c = 0; -#define atomic64_fetch_sub(i, v) atomic64_fetch_add(-(i), (v)) + while ((old = atomic64_cmpxchg(v, c, c | i)) != c) + c = old; +} + +static inline long long atomic64_fetch_or(long long i, atomic64_t *v) +{ + long long old, c = 0; + + while ((old = atomic64_cmpxchg(v, c, c | i)) != c) + c = old; + + return old; +} -#define ATOMIC64_OPS(op, c_op) \ - ATOMIC64_OP(op, c_op) \ - ATOMIC64_FETCH_OP(op, c_op) +static inline void atomic64_xor(long long i, atomic64_t *v) +{ + long long old, c = 0; + + while ((old = atomic64_cmpxchg(v, c, c ^ i)) != c) + c = old; +} -ATOMIC64_OPS(and, &) -ATOMIC64_OPS(or, |) -ATOMIC64_OPS(xor, ^) +static inline long long atomic64_fetch_xor(long long i, atomic64_t *v) +{ + long long old, c = 0; + + while ((old = atomic64_cmpxchg(v, c, c ^ i)) != c) + c = old; + + return old; +} -#undef ATOMIC64_OPS -#undef ATOMIC64_FETCH_OP -#undef ATOMIC64_OP +static inline long long atomic64_fetch_add(long long i, atomic64_t *v) +{ + long long old, c = 0; + + while ((old = atomic64_cmpxchg(v, c, c + i)) != c) + c = old; + + return old; +} + +#define atomic64_fetch_sub(i, v) atomic64_fetch_add(-(i), (v)) #endif /* _ASM_X86_ATOMIC64_32_H */ diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h index 6189a433c9a9..5d9de36a2f04 100644 --- a/arch/x86/include/asm/atomic64_64.h +++ b/arch/x86/include/asm/atomic64_64.h @@ -177,7 +177,7 @@ static inline long atomic64_cmpxchg(atomic64_t *v, long old, long new) } #define atomic64_try_cmpxchg atomic64_try_cmpxchg -static __always_inline bool atomic64_try_cmpxchg(atomic64_t *v, long *old, long new) +static __always_inline bool atomic64_try_cmpxchg(atomic64_t *v, s64 *old, long new) { return try_cmpxchg(&v->counter, old, new); } @@ -198,7 +198,7 @@ static inline long atomic64_xchg(atomic64_t *v, long new) */ static inline bool atomic64_add_unless(atomic64_t *v, long a, long u) { - long c = atomic64_read(v); + s64 c = atomic64_read(v); do { if (unlikely(c == u)) return false; @@ -217,7 +217,7 @@ static inline bool atomic64_add_unless(atomic64_t *v, long a, long u) */ static inline long atomic64_dec_if_positive(atomic64_t *v) { - long dec, c = atomic64_read(v); + s64 dec, c = atomic64_read(v); do { dec = c - 1; if (unlikely(dec < 0)) @@ -226,34 +226,55 @@ static inline long atomic64_dec_if_positive(atomic64_t *v) return dec; } -#define ATOMIC64_OP(op) \ -static inline void atomic64_##op(long i, atomic64_t *v) \ -{ \ - asm volatile(LOCK_PREFIX #op"q %1,%0" \ - : "+m" (v->counter) \ - : "er" (i) \ - : "memory"); \ +static inline void atomic64_and(long i, atomic64_t *v) +{ + asm volatile(LOCK_PREFIX "andq %1,%0" + : "+m" (v->counter) + : "er" (i) + : "memory"); } -#define ATOMIC64_FETCH_OP(op, c_op) \ -static inline long atomic64_fetch_##op(long i, atomic64_t *v) \ -{ \ - long val = 
atomic64_read(v); \ - do { \ - } while (!atomic64_try_cmpxchg(v, &val, val c_op i)); \ - return val; \ +static inline long atomic64_fetch_and(long i, atomic64_t *v) +{ + s64 val = atomic64_read(v); + + do { + } while (!atomic64_try_cmpxchg(v, &val, val & i)); + return val; } -#define ATOMIC64_OPS(op, c_op) \ - ATOMIC64_OP(op) \ - ATOMIC64_FETCH_OP(op, c_op) +static inline void atomic64_or(long i, atomic64_t *v) +{ + asm volatile(LOCK_PREFIX "orq %1,%0" + : "+m" (v->counter) + : "er" (i) + : "memory"); +} -ATOMIC64_OPS(and, &) -ATOMIC64_OPS(or, |) -ATOMIC64_OPS(xor, ^) +static inline long atomic64_fetch_or(long i, atomic64_t *v) +{ + s64 val = atomic64_read(v); -#undef ATOMIC64_OPS -#undef ATOMIC64_FETCH_OP -#undef ATOMIC64_OP + do { + } while (!atomic64_try_cmpxchg(v, &val, val | i)); + return val; +} + +static inline void atomic64_xor(long i, atomic64_t *v) +{ + asm volatile(LOCK_PREFIX "xorq %1,%0" + : "+m" (v->counter) + : "er" (i) + : "memory"); +} + +static inline long atomic64_fetch_xor(long i, atomic64_t *v) +{ + s64 val = atomic64_read(v); + + do { + } while (!atomic64_try_cmpxchg(v, &val, val ^ i)); + return val; +} #endif /* _ASM_X86_ATOMIC64_64_H */ diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h index d90296d061e8..b5069e802d5c 100644 --- a/arch/x86/include/asm/cmpxchg.h +++ b/arch/x86/include/asm/cmpxchg.h @@ -157,7 +157,7 @@ extern void __add_wrong_size(void) #define __raw_try_cmpxchg(_ptr, _pold, _new, size, lock) \ ({ \ bool success; \ - __typeof__(_ptr) _old = (_pold); \ + __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold); \ __typeof__(*(_ptr)) __old = *_old; \ __typeof__(*(_ptr)) __new = (_new); \ switch (size) { \ diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 66ac08607471..42bbbf0f173d 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -177,7 +177,7 @@ #define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* NB performance counter extensions */ #define X86_FEATURE_BPEXT (6*32+26) /* data breakpoint extension */ #define X86_FEATURE_PTSC ( 6*32+27) /* performance time-stamp counter */ -#define X86_FEATURE_PERFCTR_L2 ( 6*32+28) /* L2 performance counter extensions */ +#define X86_FEATURE_PERFCTR_LLC ( 6*32+28) /* Last Level Cache performance counter extensions */ #define X86_FEATURE_MWAITX ( 6*32+29) /* MWAIT extension (MONITORX/MWAITX) */ /* diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index d0a21b12dd58..1a2ba368da39 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -5,6 +5,7 @@ #include <asm/ldt.h> #include <asm/mmu.h> #include <asm/fixmap.h> +#include <asm/irq_vectors.h> #include <linux/smp.h> #include <linux/percpu.h> @@ -22,7 +23,7 @@ static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *in desc->s = 1; desc->dpl = 0x3; desc->p = info->seg_not_present ^ 1; - desc->limit = (info->limit & 0xf0000) >> 16; + desc->limit1 = (info->limit & 0xf0000) >> 16; desc->avl = info->useable; desc->d = info->seg_32bit; desc->g = info->limit_in_pages; @@ -83,33 +84,25 @@ static inline phys_addr_t get_cpu_gdt_paddr(unsigned int cpu) return per_cpu_ptr_to_phys(get_cpu_gdt_rw(cpu)); } -#ifdef CONFIG_X86_64 - static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func, unsigned dpl, unsigned ist, unsigned seg) { - gate->offset_low = PTR_LOW(func); + gate->offset_low = (u16) func; + gate->bits.p = 1; + gate->bits.dpl = dpl; + gate->bits.zero = 0; + gate->bits.type = type; + gate->offset_middle 
= (u16) (func >> 16); +#ifdef CONFIG_X86_64 gate->segment = __KERNEL_CS; - gate->ist = ist; - gate->p = 1; - gate->dpl = dpl; - gate->zero0 = 0; - gate->zero1 = 0; - gate->type = type; - gate->offset_middle = PTR_MIDDLE(func); - gate->offset_high = PTR_HIGH(func); -} - + gate->bits.ist = ist; + gate->reserved = 0; + gate->offset_high = (u32) (func >> 32); #else -static inline void pack_gate(gate_desc *gate, unsigned char type, - unsigned long base, unsigned dpl, unsigned flags, - unsigned short seg) -{ - gate->a = (seg << 16) | (base & 0xffff); - gate->b = (base & 0xffff0000) | (((0x80 | type | (dpl << 5)) & 0xff) << 8); -} - + gate->segment = seg; + gate->bits.ist = 0; #endif +} static inline int desc_empty(const void *ptr) { @@ -173,35 +166,22 @@ native_write_gdt_entry(struct desc_struct *gdt, int entry, const void *desc, int memcpy(&gdt[entry], desc, size); } -static inline void pack_descriptor(struct desc_struct *desc, unsigned long base, - unsigned long limit, unsigned char type, - unsigned char flags) -{ - desc->a = ((base & 0xffff) << 16) | (limit & 0xffff); - desc->b = (base & 0xff000000) | ((base & 0xff0000) >> 16) | - (limit & 0x000f0000) | ((type & 0xff) << 8) | - ((flags & 0xf) << 20); - desc->p = 1; -} - - -static inline void set_tssldt_descriptor(void *d, unsigned long addr, unsigned type, unsigned size) +static inline void set_tssldt_descriptor(void *d, unsigned long addr, + unsigned type, unsigned size) { -#ifdef CONFIG_X86_64 - struct ldttss_desc64 *desc = d; + struct ldttss_desc *desc = d; memset(desc, 0, sizeof(*desc)); - desc->limit0 = size & 0xFFFF; - desc->base0 = PTR_LOW(addr); - desc->base1 = PTR_MIDDLE(addr) & 0xFF; + desc->limit0 = (u16) size; + desc->base0 = (u16) addr; + desc->base1 = (addr >> 16) & 0xFF; desc->type = type; desc->p = 1; desc->limit1 = (size >> 16) & 0xF; - desc->base2 = (PTR_MIDDLE(addr) >> 8) & 0xFF; - desc->base3 = PTR_HIGH(addr); -#else - pack_descriptor((struct desc_struct *)d, addr, size, 0x80 | type, 0); + desc->base2 = (addr >> 24) & 0xFF; +#ifdef CONFIG_X86_64 + desc->base3 = (u32) (addr >> 32); #endif } @@ -401,147 +381,20 @@ static inline void set_desc_base(struct desc_struct *desc, unsigned long base) static inline unsigned long get_desc_limit(const struct desc_struct *desc) { - return desc->limit0 | (desc->limit << 16); + return desc->limit0 | (desc->limit1 << 16); } static inline void set_desc_limit(struct desc_struct *desc, unsigned long limit) { desc->limit0 = limit & 0xffff; - desc->limit = (limit >> 16) & 0xf; -} - -#ifdef CONFIG_X86_64 -static inline void set_nmi_gate(int gate, void *addr) -{ - gate_desc s; - - pack_gate(&s, GATE_INTERRUPT, (unsigned long)addr, 0, 0, __KERNEL_CS); - write_idt_entry(debug_idt_table, gate, &s); + desc->limit1 = (limit >> 16) & 0xf; } -#endif -#ifdef CONFIG_TRACING -extern struct desc_ptr trace_idt_descr; -extern gate_desc trace_idt_table[]; -static inline void write_trace_idt_entry(int entry, const gate_desc *gate) -{ - write_idt_entry(trace_idt_table, entry, gate); -} +void update_intr_gate(unsigned int n, const void *addr); +void alloc_intr_gate(unsigned int n, const void *addr); -static inline void _trace_set_gate(int gate, unsigned type, void *addr, - unsigned dpl, unsigned ist, unsigned seg) -{ - gate_desc s; - - pack_gate(&s, type, (unsigned long)addr, dpl, ist, seg); - /* - * does not need to be atomic because it is only done once at - * setup time - */ - write_trace_idt_entry(gate, &s); -} -#else -static inline void write_trace_idt_entry(int entry, const gate_desc *gate) -{ -} - -#define 
_trace_set_gate(gate, type, addr, dpl, ist, seg) -#endif - -static inline void _set_gate(int gate, unsigned type, void *addr, - unsigned dpl, unsigned ist, unsigned seg) -{ - gate_desc s; - - pack_gate(&s, type, (unsigned long)addr, dpl, ist, seg); - /* - * does not need to be atomic because it is only done once at - * setup time - */ - write_idt_entry(idt_table, gate, &s); - write_trace_idt_entry(gate, &s); -} - -/* - * This needs to use 'idt_table' rather than 'idt', and - * thus use the _nonmapped_ version of the IDT, as the - * Pentium F0 0F bugfix can have resulted in the mapped - * IDT being write-protected. - */ -#define set_intr_gate_notrace(n, addr) \ - do { \ - BUG_ON((unsigned)n > 0xFF); \ - _set_gate(n, GATE_INTERRUPT, (void *)addr, 0, 0, \ - __KERNEL_CS); \ - } while (0) - -#define set_intr_gate(n, addr) \ - do { \ - set_intr_gate_notrace(n, addr); \ - _trace_set_gate(n, GATE_INTERRUPT, (void *)trace_##addr,\ - 0, 0, __KERNEL_CS); \ - } while (0) - -extern int first_system_vector; -/* used_vectors is BITMAP for irq is not managed by percpu vector_irq */ extern unsigned long used_vectors[]; -static inline void alloc_system_vector(int vector) -{ - if (!test_bit(vector, used_vectors)) { - set_bit(vector, used_vectors); - if (first_system_vector > vector) - first_system_vector = vector; - } else { - BUG(); - } -} - -#define alloc_intr_gate(n, addr) \ - do { \ - alloc_system_vector(n); \ - set_intr_gate(n, addr); \ - } while (0) - -/* - * This routine sets up an interrupt gate at directory privilege level 3. - */ -static inline void set_system_intr_gate(unsigned int n, void *addr) -{ - BUG_ON((unsigned)n > 0xFF); - _set_gate(n, GATE_INTERRUPT, addr, 0x3, 0, __KERNEL_CS); -} - -static inline void set_system_trap_gate(unsigned int n, void *addr) -{ - BUG_ON((unsigned)n > 0xFF); - _set_gate(n, GATE_TRAP, addr, 0x3, 0, __KERNEL_CS); -} - -static inline void set_trap_gate(unsigned int n, void *addr) -{ - BUG_ON((unsigned)n > 0xFF); - _set_gate(n, GATE_TRAP, addr, 0, 0, __KERNEL_CS); -} - -static inline void set_task_gate(unsigned int n, unsigned int gdt_entry) -{ - BUG_ON((unsigned)n > 0xFF); - _set_gate(n, GATE_TASK, (void *)0, 0, 0, (gdt_entry<<3)); -} - -static inline void set_intr_gate_ist(int n, void *addr, unsigned ist) -{ - BUG_ON((unsigned)n > 0xFF); - _set_gate(n, GATE_INTERRUPT, addr, 0, ist, __KERNEL_CS); -} - -static inline void set_system_intr_gate_ist(int n, void *addr, unsigned ist) -{ - BUG_ON((unsigned)n > 0xFF); - _set_gate(n, GATE_INTERRUPT, addr, 0x3, ist, __KERNEL_CS); -} - #ifdef CONFIG_X86_64 DECLARE_PER_CPU(u32, debug_idt_ctr); static inline bool is_debug_idt_enabled(void) @@ -567,31 +420,6 @@ static inline void load_debug_idt(void) } #endif -#ifdef CONFIG_TRACING -extern atomic_t trace_idt_ctr; -static inline bool is_trace_idt_enabled(void) -{ - if (atomic_read(&trace_idt_ctr)) - return true; - - return false; -} - -static inline void load_trace_idt(void) -{ - load_idt((const struct desc_ptr *)&trace_idt_descr); -} -#else -static inline bool is_trace_idt_enabled(void) -{ - return false; -} - -static inline void load_trace_idt(void) -{ -} -#endif - /* * The load_current_idt() must be called with interrupts disabled * to avoid races. 
That way the IDT will always be set back to the expected @@ -603,9 +431,25 @@ static inline void load_current_idt(void) { if (is_debug_idt_enabled()) load_debug_idt(); - else if (is_trace_idt_enabled()) - load_trace_idt(); else load_idt((const struct desc_ptr *)&idt_descr); } + +extern void idt_setup_early_handler(void); +extern void idt_setup_early_traps(void); +extern void idt_setup_traps(void); +extern void idt_setup_apic_and_irq_gates(void); + +#ifdef CONFIG_X86_64 +extern void idt_setup_early_pf(void); +extern void idt_setup_ist_traps(void); +extern void idt_setup_debugidt_traps(void); +#else +static inline void idt_setup_early_pf(void) { } +static inline void idt_setup_ist_traps(void) { } +static inline void idt_setup_debugidt_traps(void) { } +#endif + +extern void idt_invalidate(void *addr); + #endif /* _ASM_X86_DESC_H */ diff --git a/arch/x86/include/asm/desc_defs.h b/arch/x86/include/asm/desc_defs.h index 49265345d4d2..346d252029b7 100644 --- a/arch/x86/include/asm/desc_defs.h +++ b/arch/x86/include/asm/desc_defs.h @@ -11,34 +11,30 @@ #include <linux/types.h> -/* - * FIXME: Accessing the desc_struct through its fields is more elegant, - * and should be the one valid thing to do. However, a lot of open code - * still touches the a and b accessors, and doing this allow us to do it - * incrementally. We keep the signature as a struct, rather than a union, - * so we can get rid of it transparently in the future -- glommer - */ /* 8 byte segment descriptor */ struct desc_struct { - union { - struct { - unsigned int a; - unsigned int b; - }; - struct { - u16 limit0; - u16 base0; - unsigned base1: 8, type: 4, s: 1, dpl: 2, p: 1; - unsigned limit: 4, avl: 1, l: 1, d: 1, g: 1, base2: 8; - }; - }; + u16 limit0; + u16 base0; + u16 base1: 8, type: 4, s: 1, dpl: 2, p: 1; + u16 limit1: 4, avl: 1, l: 1, d: 1, g: 1, base2: 8; } __attribute__((packed)); -#define GDT_ENTRY_INIT(flags, base, limit) { { { \ - .a = ((limit) & 0xffff) | (((base) & 0xffff) << 16), \ - .b = (((base) & 0xff0000) >> 16) | (((flags) & 0xf0ff) << 8) | \ - ((limit) & 0xf0000) | ((base) & 0xff000000), \ - } } } +#define GDT_ENTRY_INIT(flags, base, limit) \ + { \ + .limit0 = (u16) (limit), \ + .limit1 = ((limit) >> 16) & 0x0F, \ + .base0 = (u16) (base), \ + .base1 = ((base) >> 16) & 0xFF, \ + .base2 = ((base) >> 24) & 0xFF, \ + .type = (flags & 0x0f), \ + .s = (flags >> 4) & 0x01, \ + .dpl = (flags >> 5) & 0x03, \ + .p = (flags >> 7) & 0x01, \ + .avl = (flags >> 12) & 0x01, \ + .l = (flags >> 13) & 0x01, \ + .d = (flags >> 14) & 0x01, \ + .g = (flags >> 15) & 0x01, \ + } enum { GATE_INTERRUPT = 0xE, @@ -47,49 +43,63 @@ enum { GATE_TASK = 0x5, }; -/* 16byte gate */ -struct gate_struct64 { - u16 offset_low; - u16 segment; - unsigned ist : 3, zero0 : 5, type : 5, dpl : 2, p : 1; - u16 offset_middle; - u32 offset_high; - u32 zero1; -} __attribute__((packed)); - -#define PTR_LOW(x) ((unsigned long long)(x) & 0xFFFF) -#define PTR_MIDDLE(x) (((unsigned long long)(x) >> 16) & 0xFFFF) -#define PTR_HIGH(x) ((unsigned long long)(x) >> 32) - enum { DESC_TSS = 0x9, DESC_LDT = 0x2, DESCTYPE_S = 0x10, /* !system */ }; -/* LDT or TSS descriptor in the GDT. 16 bytes. */ -struct ldttss_desc64 { - u16 limit0; - u16 base0; - unsigned base1 : 8, type : 5, dpl : 2, p : 1; - unsigned limit1 : 4, zero0 : 3, g : 1, base2 : 8; - u32 base3; - u32 zero1; +/* LDT or TSS descriptor in the GDT. 
*/ +struct ldttss_desc { + u16 limit0; + u16 base0; + + u16 base1 : 8, type : 5, dpl : 2, p : 1; + u16 limit1 : 4, zero0 : 3, g : 1, base2 : 8; +#ifdef CONFIG_X86_64 + u32 base3; + u32 zero1; +#endif } __attribute__((packed)); +typedef struct ldttss_desc ldt_desc; +typedef struct ldttss_desc tss_desc; + +struct idt_bits { + u16 ist : 3, + zero : 5, + type : 5, + dpl : 2, + p : 1; +} __attribute__((packed)); + +struct gate_struct { + u16 offset_low; + u16 segment; + struct idt_bits bits; + u16 offset_middle; +#ifdef CONFIG_X86_64 + u32 offset_high; + u32 reserved; +#endif +} __attribute__((packed)); + +typedef struct gate_struct gate_desc; + +static inline unsigned long gate_offset(const gate_desc *g) +{ #ifdef CONFIG_X86_64 -typedef struct gate_struct64 gate_desc; -typedef struct ldttss_desc64 ldt_desc; -typedef struct ldttss_desc64 tss_desc; -#define gate_offset(g) ((g).offset_low | ((unsigned long)(g).offset_middle << 16) | ((unsigned long)(g).offset_high << 32)) -#define gate_segment(g) ((g).segment) + return g->offset_low | ((unsigned long)g->offset_middle << 16) | + ((unsigned long) g->offset_high << 32); #else -typedef struct desc_struct gate_desc; -typedef struct desc_struct ldt_desc; -typedef struct desc_struct tss_desc; -#define gate_offset(g) (((g).b & 0xffff0000) | ((g).a & 0x0000ffff)) -#define gate_segment(g) ((g).a >> 16) + return g->offset_low | ((unsigned long)g->offset_middle << 16); #endif +} + +static inline unsigned long gate_segment(const gate_desc *g) +{ + return g->segment; +} struct desc_ptr { unsigned short size; diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index a3de31ffb722..04330c8d9af9 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -126,15 +126,15 @@ do { \ pr_reg[4] = regs->di; \ pr_reg[5] = regs->bp; \ pr_reg[6] = regs->ax; \ - pr_reg[7] = regs->ds & 0xffff; \ - pr_reg[8] = regs->es & 0xffff; \ - pr_reg[9] = regs->fs & 0xffff; \ + pr_reg[7] = regs->ds; \ + pr_reg[8] = regs->es; \ + pr_reg[9] = regs->fs; \ pr_reg[11] = regs->orig_ax; \ pr_reg[12] = regs->ip; \ - pr_reg[13] = regs->cs & 0xffff; \ + pr_reg[13] = regs->cs; \ pr_reg[14] = regs->flags; \ pr_reg[15] = regs->sp; \ - pr_reg[16] = regs->ss & 0xffff; \ + pr_reg[16] = regs->ss; \ } while (0); #define ELF_CORE_COPY_REGS(pr_reg, regs) \ @@ -204,6 +204,7 @@ void set_personality_ia32(bool); #define ELF_CORE_COPY_REGS(pr_reg, regs) \ do { \ + unsigned long base; \ unsigned v; \ (pr_reg)[0] = (regs)->r15; \ (pr_reg)[1] = (regs)->r14; \ @@ -226,8 +227,8 @@ do { \ (pr_reg)[18] = (regs)->flags; \ (pr_reg)[19] = (regs)->sp; \ (pr_reg)[20] = (regs)->ss; \ - (pr_reg)[21] = current->thread.fsbase; \ - (pr_reg)[22] = current->thread.gsbase; \ + rdmsrl(MSR_FS_BASE, base); (pr_reg)[21] = base; \ + rdmsrl(MSR_KERNEL_GS_BASE, base); (pr_reg)[22] = base; \ asm("movl %%ds,%0" : "=r" (v)); (pr_reg)[23] = v; \ asm("movl %%es,%0" : "=r" (v)); (pr_reg)[24] = v; \ asm("movl %%fs,%0" : "=r" (v)); (pr_reg)[25] = v; \ diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h index 07b06955a05d..aa15d1f7e530 100644 --- a/arch/x86/include/asm/entry_arch.h +++ b/arch/x86/include/asm/entry_arch.h @@ -13,20 +13,14 @@ BUILD_INTERRUPT(reschedule_interrupt,RESCHEDULE_VECTOR) BUILD_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR) BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR) -BUILD_INTERRUPT3(irq_move_cleanup_interrupt, IRQ_MOVE_CLEANUP_VECTOR, - smp_irq_move_cleanup_interrupt) -BUILD_INTERRUPT3(reboot_interrupt, REBOOT_VECTOR, 
smp_reboot_interrupt) +BUILD_INTERRUPT(irq_move_cleanup_interrupt, IRQ_MOVE_CLEANUP_VECTOR) +BUILD_INTERRUPT(reboot_interrupt, REBOOT_VECTOR) #endif -BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR) - #ifdef CONFIG_HAVE_KVM -BUILD_INTERRUPT3(kvm_posted_intr_ipi, POSTED_INTR_VECTOR, - smp_kvm_posted_intr_ipi) -BUILD_INTERRUPT3(kvm_posted_intr_wakeup_ipi, POSTED_INTR_WAKEUP_VECTOR, - smp_kvm_posted_intr_wakeup_ipi) -BUILD_INTERRUPT3(kvm_posted_intr_nested_ipi, POSTED_INTR_NESTED_VECTOR, - smp_kvm_posted_intr_nested_ipi) +BUILD_INTERRUPT(kvm_posted_intr_ipi, POSTED_INTR_VECTOR) +BUILD_INTERRUPT(kvm_posted_intr_wakeup_ipi, POSTED_INTR_WAKEUP_VECTOR) +BUILD_INTERRUPT(kvm_posted_intr_nested_ipi, POSTED_INTR_NESTED_VECTOR) #endif /* @@ -41,6 +35,7 @@ BUILD_INTERRUPT3(kvm_posted_intr_nested_ipi, POSTED_INTR_NESTED_VECTOR, BUILD_INTERRUPT(apic_timer_interrupt,LOCAL_TIMER_VECTOR) BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR) BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR) +BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR) #ifdef CONFIG_IRQ_WORK BUILD_INTERRUPT(irq_work_interrupt, IRQ_WORK_VECTOR) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 255645f60ca2..554cdb205d17 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -450,10 +450,10 @@ static inline int copy_fpregs_to_fpstate(struct fpu *fpu) return 0; } -static inline void __copy_kernel_to_fpregs(union fpregs_state *fpstate) +static inline void __copy_kernel_to_fpregs(union fpregs_state *fpstate, u64 mask) { if (use_xsave()) { - copy_kernel_to_xregs(&fpstate->xsave, -1); + copy_kernel_to_xregs(&fpstate->xsave, mask); } else { if (use_fxsr()) copy_kernel_to_fxregs(&fpstate->fxsave); @@ -477,7 +477,7 @@ static inline void copy_kernel_to_fpregs(union fpregs_state *fpstate) : : [addr] "m" (fpstate)); } - __copy_kernel_to_fpregs(fpstate); + __copy_kernel_to_fpregs(fpstate, -1); } extern int copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size); diff --git a/arch/x86/include/asm/futex.h b/arch/x86/include/asm/futex.h index b4c1f5453436..f4dc9b63bdda 100644 --- a/arch/x86/include/asm/futex.h +++ b/arch/x86/include/asm/futex.h @@ -41,20 +41,11 @@ "+m" (*uaddr), "=&r" (tem) \ : "r" (oparg), "i" (-EFAULT), "1" (0)) -static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) +static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, + u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret, tem; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; - pagefault_disable(); switch (op) { @@ -80,30 +71,9 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: - ret = (oldval == cmparg); - break; - case FUTEX_OP_CMP_NE: - ret = (oldval != cmparg); - break; - case FUTEX_OP_CMP_LT: - ret = (oldval < cmparg); - break; - case FUTEX_OP_CMP_GE: - ret = (oldval >= cmparg); - break; - case FUTEX_OP_CMP_LE: - ret = (oldval <= cmparg); - break; - case FUTEX_OP_CMP_GT: - ret = (oldval > cmparg); - break; - default: - ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 
d6dbafbd4207..6dfe366a8804 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -46,26 +46,6 @@ extern asmlinkage void deferred_error_interrupt(void); extern asmlinkage void call_function_interrupt(void); extern asmlinkage void call_function_single_interrupt(void); -#ifdef CONFIG_TRACING -/* Interrupt handlers registered during init_IRQ */ -extern void trace_apic_timer_interrupt(void); -extern void trace_x86_platform_ipi(void); -extern void trace_error_interrupt(void); -extern void trace_irq_work_interrupt(void); -extern void trace_spurious_interrupt(void); -extern void trace_thermal_interrupt(void); -extern void trace_reschedule_interrupt(void); -extern void trace_threshold_interrupt(void); -extern void trace_deferred_error_interrupt(void); -extern void trace_call_function_interrupt(void); -extern void trace_call_function_single_interrupt(void); -#define trace_irq_move_cleanup_interrupt irq_move_cleanup_interrupt -#define trace_reboot_interrupt reboot_interrupt -#define trace_kvm_posted_intr_ipi kvm_posted_intr_ipi -#define trace_kvm_posted_intr_wakeup_ipi kvm_posted_intr_wakeup_ipi -#define trace_kvm_posted_intr_nested_ipi kvm_posted_intr_nested_ipi -#endif /* CONFIG_TRACING */ - #ifdef CONFIG_X86_LOCAL_APIC struct irq_data; struct pci_dev; diff --git a/arch/x86/include/asm/intel_rdt.h b/arch/x86/include/asm/intel_rdt.h deleted file mode 100644 index 597dc4995678..000000000000 --- a/arch/x86/include/asm/intel_rdt.h +++ /dev/null @@ -1,286 +0,0 @@ -#ifndef _ASM_X86_INTEL_RDT_H -#define _ASM_X86_INTEL_RDT_H - -#ifdef CONFIG_INTEL_RDT_A - -#include <linux/sched.h> -#include <linux/kernfs.h> -#include <linux/jump_label.h> - -#include <asm/intel_rdt_common.h> - -#define IA32_L3_QOS_CFG 0xc81 -#define IA32_L3_CBM_BASE 0xc90 -#define IA32_L2_CBM_BASE 0xd10 -#define IA32_MBA_THRTL_BASE 0xd50 - -#define L3_QOS_CDP_ENABLE 0x01ULL - -/** - * struct rdtgroup - store rdtgroup's data in resctrl file system. - * @kn: kernfs node - * @rdtgroup_list: linked list for all rdtgroups - * @closid: closid for this rdtgroup - * @cpu_mask: CPUs assigned to this rdtgroup - * @flags: status bits - * @waitcount: how many cpus expect to find this - * group when they acquire rdtgroup_mutex - */ -struct rdtgroup { - struct kernfs_node *kn; - struct list_head rdtgroup_list; - int closid; - struct cpumask cpu_mask; - int flags; - atomic_t waitcount; -}; - -/* rdtgroup.flags */ -#define RDT_DELETED 1 - -/* rftype.flags */ -#define RFTYPE_FLAGS_CPUS_LIST 1 - -/* List of all resource groups */ -extern struct list_head rdt_all_groups; - -extern int max_name_width, max_data_width; - -int __init rdtgroup_init(void); - -/** - * struct rftype - describe each file in the resctrl file system - * @name: File name - * @mode: Access mode - * @kf_ops: File operations - * @flags: File specific RFTYPE_FLAGS_* flags - * @seq_show: Show content of the file - * @write: Write to the file - */ -struct rftype { - char *name; - umode_t mode; - struct kernfs_ops *kf_ops; - unsigned long flags; - - int (*seq_show)(struct kernfs_open_file *of, - struct seq_file *sf, void *v); - /* - * write() is the generic write callback which maps directly to - * kernfs write operation and overrides all other operations. - * Maximum write size is determined by ->max_write_len. 
- */ - ssize_t (*write)(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off); -}; - -/** - * struct rdt_domain - group of cpus sharing an RDT resource - * @list: all instances of this resource - * @id: unique id for this instance - * @cpu_mask: which cpus share this resource - * @ctrl_val: array of cache or mem ctrl values (indexed by CLOSID) - * @new_ctrl: new ctrl value to be loaded - * @have_new_ctrl: did user provide new_ctrl for this domain - */ -struct rdt_domain { - struct list_head list; - int id; - struct cpumask cpu_mask; - u32 *ctrl_val; - u32 new_ctrl; - bool have_new_ctrl; -}; - -/** - * struct msr_param - set a range of MSRs from a domain - * @res: The resource to use - * @low: Beginning index from base MSR - * @high: End index - */ -struct msr_param { - struct rdt_resource *res; - int low; - int high; -}; - -/** - * struct rdt_cache - Cache allocation related data - * @cbm_len: Length of the cache bit mask - * @min_cbm_bits: Minimum number of consecutive bits to be set - * @cbm_idx_mult: Multiplier of CBM index - * @cbm_idx_offset: Offset of CBM index. CBM index is computed by: - * closid * cbm_idx_multi + cbm_idx_offset - * in a cache bit mask - */ -struct rdt_cache { - unsigned int cbm_len; - unsigned int min_cbm_bits; - unsigned int cbm_idx_mult; - unsigned int cbm_idx_offset; -}; - -/** - * struct rdt_membw - Memory bandwidth allocation related data - * @max_delay: Max throttle delay. Delay is the hardware - * representation for memory bandwidth. - * @min_bw: Minimum memory bandwidth percentage user can request - * @bw_gran: Granularity at which the memory bandwidth is allocated - * @delay_linear: True if memory B/W delay is in linear scale - * @mb_map: Mapping of memory B/W percentage to memory B/W delay - */ -struct rdt_membw { - u32 max_delay; - u32 min_bw; - u32 bw_gran; - u32 delay_linear; - u32 *mb_map; -}; - -/** - * struct rdt_resource - attributes of an RDT resource - * @enabled: Is this feature enabled on this machine - * @capable: Is this feature available on this machine - * @name: Name to use in "schemata" file - * @num_closid: Number of CLOSIDs available - * @cache_level: Which cache level defines scope of this resource - * @default_ctrl: Specifies default cache cbm or memory B/W percent. 
- * @msr_base: Base MSR address for CBMs - * @msr_update: Function pointer to update QOS MSRs - * @data_width: Character width of data when displaying - * @domains: All domains for this resource - * @cache: Cache allocation related data - * @info_files: resctrl info files for the resource - * @nr_info_files: Number of info files - * @format_str: Per resource format string to show domain value - * @parse_ctrlval: Per resource function pointer to parse control values - */ -struct rdt_resource { - bool enabled; - bool capable; - char *name; - int num_closid; - int cache_level; - u32 default_ctrl; - unsigned int msr_base; - void (*msr_update) (struct rdt_domain *d, struct msr_param *m, - struct rdt_resource *r); - int data_width; - struct list_head domains; - struct rdt_cache cache; - struct rdt_membw membw; - struct rftype *info_files; - int nr_info_files; - const char *format_str; - int (*parse_ctrlval) (char *buf, struct rdt_resource *r, - struct rdt_domain *d); -}; - -void rdt_get_cache_infofile(struct rdt_resource *r); -void rdt_get_mba_infofile(struct rdt_resource *r); -int parse_cbm(char *buf, struct rdt_resource *r, struct rdt_domain *d); -int parse_bw(char *buf, struct rdt_resource *r, struct rdt_domain *d); - -extern struct mutex rdtgroup_mutex; - -extern struct rdt_resource rdt_resources_all[]; -extern struct rdtgroup rdtgroup_default; -DECLARE_STATIC_KEY_FALSE(rdt_enable_key); - -int __init rdtgroup_init(void); - -enum { - RDT_RESOURCE_L3, - RDT_RESOURCE_L3DATA, - RDT_RESOURCE_L3CODE, - RDT_RESOURCE_L2, - RDT_RESOURCE_MBA, - - /* Must be the last */ - RDT_NUM_RESOURCES, -}; - -#define for_each_capable_rdt_resource(r) \ - for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ - r++) \ - if (r->capable) - -#define for_each_enabled_rdt_resource(r) \ - for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ - r++) \ - if (r->enabled) - -/* CPUID.(EAX=10H, ECX=ResID=1).EAX */ -union cpuid_0x10_1_eax { - struct { - unsigned int cbm_len:5; - } split; - unsigned int full; -}; - -/* CPUID.(EAX=10H, ECX=ResID=3).EAX */ -union cpuid_0x10_3_eax { - struct { - unsigned int max_delay:12; - } split; - unsigned int full; -}; - -/* CPUID.(EAX=10H, ECX=ResID).EDX */ -union cpuid_0x10_x_edx { - struct { - unsigned int cos_max:16; - } split; - unsigned int full; -}; - -DECLARE_PER_CPU_READ_MOSTLY(int, cpu_closid); - -void rdt_ctrl_update(void *arg); -struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn); -void rdtgroup_kn_unlock(struct kernfs_node *kn); -ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off); -int rdtgroup_schemata_show(struct kernfs_open_file *of, - struct seq_file *s, void *v); - -/* - * intel_rdt_sched_in() - Writes the task's CLOSid to IA32_PQR_MSR - * - * Following considerations are made so that this has minimal impact - * on scheduler hot path: - * - This will stay as no-op unless we are running on an Intel SKU - * which supports resource control and we enable by mounting the - * resctrl file system. - * - Caches the per cpu CLOSid values and does the MSR write only - * when a task with a different CLOSid is scheduled in. - * - * Must be called with preemption disabled. - */ -static inline void intel_rdt_sched_in(void) -{ - if (static_branch_likely(&rdt_enable_key)) { - struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); - int closid; - - /* - * If this task has a closid assigned, use it. - * Else use the closid assigned to this cpu. 
- */ - closid = current->closid; - if (closid == 0) - closid = this_cpu_read(cpu_closid); - - if (closid != state->closid) { - state->closid = closid; - wrmsr(MSR_IA32_PQR_ASSOC, state->rmid, closid); - } - } -} - -#else - -static inline void intel_rdt_sched_in(void) {} - -#endif /* CONFIG_INTEL_RDT_A */ -#endif /* _ASM_X86_INTEL_RDT_H */ diff --git a/arch/x86/include/asm/intel_rdt_common.h b/arch/x86/include/asm/intel_rdt_common.h deleted file mode 100644 index b31081b89407..000000000000 --- a/arch/x86/include/asm/intel_rdt_common.h +++ /dev/null @@ -1,27 +0,0 @@ -#ifndef _ASM_X86_INTEL_RDT_COMMON_H -#define _ASM_X86_INTEL_RDT_COMMON_H - -#define MSR_IA32_PQR_ASSOC 0x0c8f - -/** - * struct intel_pqr_state - State cache for the PQR MSR - * @rmid: The cached Resource Monitoring ID - * @closid: The cached Class Of Service ID - * @rmid_usecnt: The usage counter for rmid - * - * The upper 32 bits of MSR_IA32_PQR_ASSOC contain closid and the - * lower 10 bits rmid. The update to MSR_IA32_PQR_ASSOC always - * contains both parts, so we need to cache them. - * - * The cache also helps to avoid pointless updates if the value does - * not change. - */ -struct intel_pqr_state { - u32 rmid; - u32 closid; - int rmid_usecnt; -}; - -DECLARE_PER_CPU(struct intel_pqr_state, pqr_state); - -#endif /* _ASM_X86_INTEL_RDT_COMMON_H */ diff --git a/arch/x86/include/asm/intel_rdt_sched.h b/arch/x86/include/asm/intel_rdt_sched.h new file mode 100644 index 000000000000..b4bbf8b21512 --- /dev/null +++ b/arch/x86/include/asm/intel_rdt_sched.h @@ -0,0 +1,92 @@ +#ifndef _ASM_X86_INTEL_RDT_SCHED_H +#define _ASM_X86_INTEL_RDT_SCHED_H + +#ifdef CONFIG_INTEL_RDT + +#include <linux/sched.h> +#include <linux/jump_label.h> + +#define IA32_PQR_ASSOC 0x0c8f + +/** + * struct intel_pqr_state - State cache for the PQR MSR + * @cur_rmid: The cached Resource Monitoring ID + * @cur_closid: The cached Class Of Service ID + * @default_rmid: The user assigned Resource Monitoring ID + * @default_closid: The user assigned cached Class Of Service ID + * + * The upper 32 bits of IA32_PQR_ASSOC contain closid and the + * lower 10 bits rmid. The update to IA32_PQR_ASSOC always + * contains both parts, so we need to cache them. This also + * stores the user configured per cpu CLOSID and RMID. + * + * The cache also helps to avoid pointless updates if the value does + * not change. + */ +struct intel_pqr_state { + u32 cur_rmid; + u32 cur_closid; + u32 default_rmid; + u32 default_closid; +}; + +DECLARE_PER_CPU(struct intel_pqr_state, pqr_state); + +DECLARE_STATIC_KEY_FALSE(rdt_enable_key); +DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key); +DECLARE_STATIC_KEY_FALSE(rdt_mon_enable_key); + +/* + * __intel_rdt_sched_in() - Writes the task's CLOSid/RMID to IA32_PQR_MSR + * + * Following considerations are made so that this has minimal impact + * on scheduler hot path: + * - This will stay as no-op unless we are running on an Intel SKU + * which supports resource control or monitoring and we enable by + * mounting the resctrl file system. + * - Caches the per cpu CLOSid/RMID values and does the MSR write only + * when a task with a different CLOSid/RMID is scheduled in. + * - We allocate RMIDs/CLOSids globally in order to keep this as + * simple as possible. + * Must be called with preemption disabled. + */ +static void __intel_rdt_sched_in(void) +{ + struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); + u32 closid = state->default_closid; + u32 rmid = state->default_rmid; + + /* + * If this task has a closid/rmid assigned, use it. 
+ * Else use the closid/rmid assigned to this cpu. + */ + if (static_branch_likely(&rdt_alloc_enable_key)) { + if (current->closid) + closid = current->closid; + } + + if (static_branch_likely(&rdt_mon_enable_key)) { + if (current->rmid) + rmid = current->rmid; + } + + if (closid != state->cur_closid || rmid != state->cur_rmid) { + state->cur_closid = closid; + state->cur_rmid = rmid; + wrmsr(IA32_PQR_ASSOC, rmid, closid); + } +} + +static inline void intel_rdt_sched_in(void) +{ + if (static_branch_likely(&rdt_enable_key)) + __intel_rdt_sched_in(); +} + +#else + +static inline void intel_rdt_sched_in(void) {} + +#endif /* CONFIG_INTEL_RDT */ + +#endif /* _ASM_X86_INTEL_RDT_SCHED_H */ diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 4bc6f459a8b6..c40a95c33bb8 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -69,6 +69,9 @@ build_mmio_write(__writeb, "b", unsigned char, "q", ) build_mmio_write(__writew, "w", unsigned short, "r", ) build_mmio_write(__writel, "l", unsigned int, "r", ) +#define readb readb +#define readw readw +#define readl readl #define readb_relaxed(a) __readb(a) #define readw_relaxed(a) __readw(a) #define readl_relaxed(a) __readl(a) @@ -76,6 +79,9 @@ build_mmio_write(__writel, "l", unsigned int, "r", ) #define __raw_readw __readw #define __raw_readl __readl +#define writeb writeb +#define writew writew +#define writel writel #define writeb_relaxed(v, a) __writeb(v, a) #define writew_relaxed(v, a) __writew(v, a) #define writel_relaxed(v, a) __writel(v, a) @@ -88,13 +94,15 @@ build_mmio_write(__writel, "l", unsigned int, "r", ) #ifdef CONFIG_X86_64 build_mmio_read(readq, "q", unsigned long, "=r", :"memory") +build_mmio_read(__readq, "q", unsigned long, "=r", ) build_mmio_write(writeq, "q", unsigned long, "r", :"memory") +build_mmio_write(__writeq, "q", unsigned long, "r", ) -#define readq_relaxed(a) readq(a) -#define writeq_relaxed(v, a) writeq(v, a) +#define readq_relaxed(a) __readq(a) +#define writeq_relaxed(v, a) __writeq(v, a) -#define __raw_readq(a) readq(a) -#define __raw_writeq(val, addr) writeq(val, addr) +#define __raw_readq __readq +#define __raw_writeq __writeq /* Let people know that we have them */ #define readq readq @@ -119,6 +127,7 @@ static inline phys_addr_t virt_to_phys(volatile void *address) { return __pa(address); } +#define virt_to_phys virt_to_phys /** * phys_to_virt - map physical address to virtual @@ -137,6 +146,7 @@ static inline void *phys_to_virt(phys_addr_t address) { return __va(address); } +#define phys_to_virt phys_to_virt /* * Change "struct page" to physical address. @@ -169,11 +179,14 @@ static inline unsigned int isa_virt_to_bus(volatile void *address) * else, you probably want one of the following. 
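/*
 * Editorial note, not part of the patch: the self-referential
 * "#define readb readb" (and writeb/virt_to_phys/ioremap_*) markers added
 * above exist because this header now pulls in <asm-generic/io.h> near its
 * end.  The generic header only supplies a fallback when the corresponding
 * macro is not already defined, roughly like this (simplified sketch of the
 * generic pattern, not the exact upstream text):
 */
#ifndef readb
#define readb readb
static inline u8 readb(const volatile void __iomem *addr)
{
        return __raw_readb(addr);
}
#endif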
*/ extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size); +#define ioremap_nocache ioremap_nocache extern void __iomem *ioremap_uc(resource_size_t offset, unsigned long size); #define ioremap_uc ioremap_uc extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size); +#define ioremap_cache ioremap_cache extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, unsigned long prot_val); +#define ioremap_prot ioremap_prot /** * ioremap - map bus memory into CPU space @@ -193,8 +206,10 @@ static inline void __iomem *ioremap(resource_size_t offset, unsigned long size) { return ioremap_nocache(offset, size); } +#define ioremap ioremap extern void iounmap(volatile void __iomem *addr); +#define iounmap iounmap extern void set_iounmap_nonlazy(void); @@ -203,53 +218,6 @@ extern void set_iounmap_nonlazy(void); #include <asm-generic/iomap.h> /* - * Convert a virtual cached pointer to an uncached pointer - */ -#define xlate_dev_kmem_ptr(p) p - -/** - * memset_io Set a range of I/O memory to a constant value - * @addr: The beginning of the I/O-memory range to set - * @val: The value to set the memory to - * @count: The number of bytes to set - * - * Set a range of I/O memory to a given value. - */ -static inline void -memset_io(volatile void __iomem *addr, unsigned char val, size_t count) -{ - memset((void __force *)addr, val, count); -} - -/** - * memcpy_fromio Copy a block of data from I/O memory - * @dst: The (RAM) destination for the copy - * @src: The (I/O memory) source for the data - * @count: The number of bytes to copy - * - * Copy a block of data from I/O memory. - */ -static inline void -memcpy_fromio(void *dst, const volatile void __iomem *src, size_t count) -{ - memcpy(dst, (const void __force *)src, count); -} - -/** - * memcpy_toio Copy a block of data into I/O memory - * @dst: The (I/O memory) destination for the copy - * @src: The (RAM) source for the data - * @count: The number of bytes to copy - * - * Copy a block of data to I/O memory. - */ -static inline void -memcpy_toio(volatile void __iomem *dst, const void *src, size_t count) -{ - memcpy((void __force *)dst, src, count); -} - -/* * ISA space is 'always mapped' on a typical x86 system, no need to * explicitly ioremap() it. 
The fact that the ISA IO space is mapped * to PAGE_OFFSET is pure coincidence - it does not mean ISA values @@ -341,13 +309,38 @@ BUILDIO(b, b, char) BUILDIO(w, w, short) BUILDIO(l, , int) +#define inb inb +#define inw inw +#define inl inl +#define inb_p inb_p +#define inw_p inw_p +#define inl_p inl_p +#define insb insb +#define insw insw +#define insl insl + +#define outb outb +#define outw outw +#define outl outl +#define outb_p outb_p +#define outw_p outw_p +#define outl_p outl_p +#define outsb outsb +#define outsw outsw +#define outsl outsl + extern void *xlate_dev_mem_ptr(phys_addr_t phys); extern void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr); +#define xlate_dev_mem_ptr xlate_dev_mem_ptr +#define unxlate_dev_mem_ptr unxlate_dev_mem_ptr + extern int ioremap_change_attr(unsigned long vaddr, unsigned long size, enum page_cache_mode pcm); extern void __iomem *ioremap_wc(resource_size_t offset, unsigned long size); +#define ioremap_wc ioremap_wc extern void __iomem *ioremap_wt(resource_size_t offset, unsigned long size); +#define ioremap_wt ioremap_wt extern bool is_early_ioremap_ptep(pte_t *ptep); @@ -365,6 +358,9 @@ extern bool xen_biovec_phys_mergeable(const struct bio_vec *vec1, #define IO_SPACE_LIMIT 0xffff +#include <asm-generic/io.h> +#undef PCI_IOBASE + #ifdef CONFIG_MTRR extern int __must_check arch_phys_wc_index(int handle); #define arch_phys_wc_index arch_phys_wc_index diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index 668cca540025..9958ceea2fa3 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -42,10 +42,6 @@ extern bool handle_irq(struct irq_desc *desc, struct pt_regs *regs); extern __visible unsigned int do_IRQ(struct pt_regs *regs); -/* Interrupt vector management */ -extern DECLARE_BITMAP(used_vectors, NR_VECTORS); -extern int vector_used_by_percpu_irq(unsigned int vector); - extern void init_ISA_irqs(void); #ifdef CONFIG_X86_LOCAL_APIC diff --git a/arch/x86/include/asm/irq_work.h b/arch/x86/include/asm/irq_work.h index f70604125286..ddbb8ea0f5a9 100644 --- a/arch/x86/include/asm/irq_work.h +++ b/arch/x86/include/asm/irq_work.h @@ -3,9 +3,17 @@ #include <asm/cpufeature.h> +#ifdef CONFIG_X86_LOCAL_APIC static inline bool arch_irq_work_has_interrupt(void) { return boot_cpu_has(X86_FEATURE_APIC); } +extern void arch_irq_work_raise(void); +#else +static inline bool arch_irq_work_has_interrupt(void) +{ + return false; +} +#endif #endif /* _ASM_IRQ_WORK_H */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 7cbaab523f22..369e41c23f07 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -492,6 +492,7 @@ struct kvm_vcpu_arch { unsigned long cr4; unsigned long cr4_guest_owned_bits; unsigned long cr8; + u32 pkru; u32 hflags; u64 efer; u64 apic_base; @@ -1374,8 +1375,6 @@ int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); int kvm_cpu_get_interrupt(struct kvm_vcpu *v); void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event); void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu); -void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm, - unsigned long address); void kvm_define_shared_msr(unsigned index, u32 msr); int kvm_set_shared_msr(unsigned index, u64 val, u64 mask); diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h deleted file mode 100644 index 73d0c9b92087..000000000000 --- a/arch/x86/include/asm/lguest.h +++ /dev/null @@ -1,91 +0,0 @@ -#ifndef _ASM_X86_LGUEST_H -#define _ASM_X86_LGUEST_H - -#define 
GDT_ENTRY_LGUEST_CS 10 -#define GDT_ENTRY_LGUEST_DS 11 -#define LGUEST_CS (GDT_ENTRY_LGUEST_CS * 8) -#define LGUEST_DS (GDT_ENTRY_LGUEST_DS * 8) - -#ifndef __ASSEMBLY__ -#include <asm/desc.h> - -#define GUEST_PL 1 - -/* Page for Switcher text itself, then two pages per cpu */ -#define SWITCHER_TEXT_PAGES (1) -#define SWITCHER_STACK_PAGES (2 * nr_cpu_ids) -#define TOTAL_SWITCHER_PAGES (SWITCHER_TEXT_PAGES + SWITCHER_STACK_PAGES) - -/* Where we map the Switcher, in both Host and Guest. */ -extern unsigned long switcher_addr; - -/* Found in switcher.S */ -extern unsigned long default_idt_entries[]; - -/* Declarations for definitions in arch/x86/lguest/head_32.S */ -extern char lguest_noirq_iret[]; -extern const char lgstart_cli[], lgend_cli[]; -extern const char lgstart_pushf[], lgend_pushf[]; - -extern void lguest_iret(void); -extern void lguest_init(void); - -struct lguest_regs { - /* Manually saved part. */ - unsigned long eax, ebx, ecx, edx; - unsigned long esi, edi, ebp; - unsigned long gs; - unsigned long fs, ds, es; - unsigned long trapnum, errcode; - /* Trap pushed part */ - unsigned long eip; - unsigned long cs; - unsigned long eflags; - unsigned long esp; - unsigned long ss; -}; - -/* This is a guest-specific page (mapped ro) into the guest. */ -struct lguest_ro_state { - /* Host information we need to restore when we switch back. */ - u32 host_cr3; - struct desc_ptr host_idt_desc; - struct desc_ptr host_gdt_desc; - u32 host_sp; - - /* Fields which are used when guest is running. */ - struct desc_ptr guest_idt_desc; - struct desc_ptr guest_gdt_desc; - struct x86_hw_tss guest_tss; - struct desc_struct guest_idt[IDT_ENTRIES]; - struct desc_struct guest_gdt[GDT_ENTRIES]; -}; - -struct lg_cpu_arch { - /* The GDT entries copied into lguest_ro_state when running. */ - struct desc_struct gdt[GDT_ENTRIES]; - - /* The IDT entries: some copied into lguest_ro_state when running. */ - struct desc_struct idt[IDT_ENTRIES]; - - /* The address of the last guest-visible pagefault (ie. cr2). */ - unsigned long last_pagefault; -}; - -static inline void lguest_set_ts(void) -{ - u32 cr0; - - cr0 = read_cr0(); - if (!(cr0 & 8)) - write_cr0(cr0 | 8); -} - -/* Full 4G segment descriptors, suitable for CS and DS. 
*/ -#define FULL_EXEC_SEGMENT \ - ((struct desc_struct)GDT_ENTRY_INIT(0xc09b, 0, 0xfffff)) -#define FULL_SEGMENT ((struct desc_struct)GDT_ENTRY_INIT(0xc093, 0, 0xfffff)) - -#endif /* __ASSEMBLY__ */ - -#endif /* _ASM_X86_LGUEST_H */ diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h deleted file mode 100644 index 6c119cfae218..000000000000 --- a/arch/x86/include/asm/lguest_hcall.h +++ /dev/null @@ -1,74 +0,0 @@ -/* Architecture specific portion of the lguest hypercalls */ -#ifndef _ASM_X86_LGUEST_HCALL_H -#define _ASM_X86_LGUEST_HCALL_H - -#define LHCALL_FLUSH_ASYNC 0 -#define LHCALL_LGUEST_INIT 1 -#define LHCALL_SHUTDOWN 2 -#define LHCALL_NEW_PGTABLE 4 -#define LHCALL_FLUSH_TLB 5 -#define LHCALL_LOAD_IDT_ENTRY 6 -#define LHCALL_SET_STACK 7 -#define LHCALL_SET_CLOCKEVENT 9 -#define LHCALL_HALT 10 -#define LHCALL_SET_PMD 13 -#define LHCALL_SET_PTE 14 -#define LHCALL_SET_PGD 15 -#define LHCALL_LOAD_TLS 16 -#define LHCALL_LOAD_GDT_ENTRY 18 -#define LHCALL_SEND_INTERRUPTS 19 - -#define LGUEST_TRAP_ENTRY 0x1F - -/* Argument number 3 to LHCALL_LGUEST_SHUTDOWN */ -#define LGUEST_SHUTDOWN_POWEROFF 1 -#define LGUEST_SHUTDOWN_RESTART 2 - -#ifndef __ASSEMBLY__ -#include <asm/hw_irq.h> - -/*G:030 - * But first, how does our Guest contact the Host to ask for privileged - * operations? There are two ways: the direct way is to make a "hypercall", - * to make requests of the Host Itself. - * - * Our hypercall mechanism uses the highest unused trap code (traps 32 and - * above are used by real hardware interrupts). Seventeen hypercalls are - * available: the hypercall number is put in the %eax register, and the - * arguments (when required) are placed in %ebx, %ecx, %edx and %esi. - * If a return value makes sense, it's returned in %eax. - * - * Grossly invalid calls result in Sudden Death at the hands of the vengeful - * Host, rather than returning failure. This reflects Winston Churchill's - * definition of a gentleman: "someone who is only rude intentionally". - */ -static inline unsigned long -hcall(unsigned long call, - unsigned long arg1, unsigned long arg2, unsigned long arg3, - unsigned long arg4) -{ - /* "int" is the Intel instruction to trigger a trap. */ - asm volatile("int $" __stringify(LGUEST_TRAP_ENTRY) - /* The call in %eax (aka "a") might be overwritten */ - : "=a"(call) - /* The arguments are in %eax, %ebx, %ecx, %edx & %esi */ - : "a"(call), "b"(arg1), "c"(arg2), "d"(arg3), "S"(arg4) - /* "memory" means this might write somewhere in memory. - * This isn't true for all calls, but it's safe to tell - * gcc that it might happen so it doesn't get clever. */ - : "memory"); - return call; -} -/*:*/ - -/* Can't use our min() macro here: needs to be a constant */ -#define LGUEST_IRQS (NR_IRQS < 32 ? 
NR_IRQS: 32) - -#define LHCALL_RING_SIZE 64 -struct hcall_args { - /* These map directly onto eax/ebx/ecx/edx/esi in struct lguest_regs */ - unsigned long arg0, arg1, arg2, arg3, arg4; -}; - -#endif /* !__ASSEMBLY__ */ -#endif /* _ASM_X86_LGUEST_HCALL_H */ diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index d25d9f4abb15..7ae318c340d9 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -148,9 +148,7 @@ static inline int init_new_context(struct task_struct *tsk, mm->context.execute_only_pkey = -1; } #endif - init_new_context_ldt(tsk, mm); - - return 0; + return init_new_context_ldt(tsk, mm); } static inline void destroy_context(struct mm_struct *mm) { diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h index e3b7819caeef..9eb7c718aaf8 100644 --- a/arch/x86/include/asm/module.h +++ b/arch/x86/include/asm/module.h @@ -2,6 +2,15 @@ #define _ASM_X86_MODULE_H #include <asm-generic/module.h> +#include <asm/orc_types.h> + +struct mod_arch_specific { +#ifdef CONFIG_ORC_UNWINDER + unsigned int num_orcs; + int *orc_unwind_ip; + struct orc_entry *orc_unwind; +#endif +}; #ifdef CONFIG_X86_64 /* X86_64 does not define MODULE_PROC_FAMILY */ diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index 2b58c8c1eeaa..63cc96f064dc 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -3,6 +3,8 @@ #include <linux/types.h> #include <linux/atomic.h> +#include <linux/nmi.h> +#include <asm/io.h> #include <asm/hyperv.h> /* @@ -28,6 +30,8 @@ struct ms_hyperv_info { u32 features; u32 misc_features; u32 hints; + u32 max_vp_index; + u32 max_lp_index; }; extern struct ms_hyperv_info ms_hyperv; @@ -168,12 +172,155 @@ void hv_remove_crash_handler(void); #if IS_ENABLED(CONFIG_HYPERV) extern struct clocksource *hyperv_cs; +extern void *hv_hypercall_pg; + +static inline u64 hv_do_hypercall(u64 control, void *input, void *output) +{ + u64 input_address = input ? virt_to_phys(input) : 0; + u64 output_address = output ? 
virt_to_phys(output) : 0; + u64 hv_status; + register void *__sp asm(_ASM_SP); + +#ifdef CONFIG_X86_64 + if (!hv_hypercall_pg) + return U64_MAX; + + __asm__ __volatile__("mov %4, %%r8\n" + "call *%5" + : "=a" (hv_status), "+r" (__sp), + "+c" (control), "+d" (input_address) + : "r" (output_address), "m" (hv_hypercall_pg) + : "cc", "memory", "r8", "r9", "r10", "r11"); +#else + u32 input_address_hi = upper_32_bits(input_address); + u32 input_address_lo = lower_32_bits(input_address); + u32 output_address_hi = upper_32_bits(output_address); + u32 output_address_lo = lower_32_bits(output_address); + + if (!hv_hypercall_pg) + return U64_MAX; + + __asm__ __volatile__("call *%7" + : "=A" (hv_status), + "+c" (input_address_lo), "+r" (__sp) + : "A" (control), + "b" (input_address_hi), + "D"(output_address_hi), "S"(output_address_lo), + "m" (hv_hypercall_pg) + : "cc", "memory"); +#endif /* !x86_64 */ + return hv_status; +} + +#define HV_HYPERCALL_RESULT_MASK GENMASK_ULL(15, 0) +#define HV_HYPERCALL_FAST_BIT BIT(16) +#define HV_HYPERCALL_VARHEAD_OFFSET 17 +#define HV_HYPERCALL_REP_COMP_OFFSET 32 +#define HV_HYPERCALL_REP_COMP_MASK GENMASK_ULL(43, 32) +#define HV_HYPERCALL_REP_START_OFFSET 48 +#define HV_HYPERCALL_REP_START_MASK GENMASK_ULL(59, 48) + +/* Fast hypercall with 8 bytes of input and no output */ +static inline u64 hv_do_fast_hypercall8(u16 code, u64 input1) +{ + u64 hv_status, control = (u64)code | HV_HYPERCALL_FAST_BIT; + register void *__sp asm(_ASM_SP); + +#ifdef CONFIG_X86_64 + { + __asm__ __volatile__("call *%4" + : "=a" (hv_status), "+r" (__sp), + "+c" (control), "+d" (input1) + : "m" (hv_hypercall_pg) + : "cc", "r8", "r9", "r10", "r11"); + } +#else + { + u32 input1_hi = upper_32_bits(input1); + u32 input1_lo = lower_32_bits(input1); + + __asm__ __volatile__ ("call *%5" + : "=A"(hv_status), + "+c"(input1_lo), + "+r"(__sp) + : "A" (control), + "b" (input1_hi), + "m" (hv_hypercall_pg) + : "cc", "edi", "esi"); + } +#endif + return hv_status; +} + +/* + * Rep hypercalls. Callers of this functions are supposed to ensure that + * rep_count and varhead_size comply with Hyper-V hypercall definition. + */ +static inline u64 hv_do_rep_hypercall(u16 code, u16 rep_count, u16 varhead_size, + void *input, void *output) +{ + u64 control = code; + u64 status; + u16 rep_comp; + + control |= (u64)varhead_size << HV_HYPERCALL_VARHEAD_OFFSET; + control |= (u64)rep_count << HV_HYPERCALL_REP_COMP_OFFSET; + + do { + status = hv_do_hypercall(control, input, output); + if ((status & HV_HYPERCALL_RESULT_MASK) != HV_STATUS_SUCCESS) + return status; + + /* Bits 32-43 of status have 'Reps completed' data. */ + rep_comp = (status & HV_HYPERCALL_REP_COMP_MASK) >> + HV_HYPERCALL_REP_COMP_OFFSET; + + control &= ~HV_HYPERCALL_REP_START_MASK; + control |= (u64)rep_comp << HV_HYPERCALL_REP_START_OFFSET; + + touch_nmi_watchdog(); + } while (rep_comp < rep_count); + + return status; +} + +/* + * Hypervisor's notion of virtual processor ID is different from + * Linux' notion of CPU ID. This information can only be retrieved + * in the context of the calling CPU. Setup a map for easy access + * to this information. + */ +extern u32 *hv_vp_index; + +/** + * hv_cpu_number_to_vp_number() - Map CPU to VP. + * @cpu_number: CPU number in Linux terms + * + * This function returns the mapping between the Linux processor + * number and the hypervisor's virtual processor number, useful + * in making hypercalls and such that talk about specific + * processors. 
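/*
 * Editorial sketch, not part of the patch: typical use of the VP-index map
 * when a hypercall argument needs Hyper-V virtual processor numbers rather
 * than Linux CPU numbers.  The helper name and vp_list layout are
 * hypothetical, for illustration only; hv_cpu_number_to_vp_number() is
 * defined just below.
 */
static inline void hv_fill_vp_list(const struct cpumask *cpus, u64 *vp_list)
{
        unsigned int cpu, i = 0;

        for_each_cpu(cpu, cpus)
                vp_list[i++] = hv_cpu_number_to_vp_number(cpu);
}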
+ * + * Return: Virtual processor number in Hyper-V terms + */ +static inline int hv_cpu_number_to_vp_number(int cpu_number) +{ + return hv_vp_index[cpu_number]; +} void hyperv_init(void); +void hyperv_setup_mmu_ops(void); +void hyper_alloc_mmu(void); void hyperv_report_panic(struct pt_regs *regs); bool hv_is_hypercall_page_setup(void); void hyperv_cleanup(void); -#endif +#else /* CONFIG_HYPERV */ +static inline void hyperv_init(void) {} +static inline bool hv_is_hypercall_page_setup(void) { return false; } +static inline void hyperv_cleanup(void) {} +static inline void hyperv_setup_mmu_ops(void) {} +#endif /* CONFIG_HYPERV */ + #ifdef CONFIG_HYPERV_TSCPAGE struct ms_hyperv_tsc_page *hv_get_tsc_page(void); static inline u64 hv_read_tsc_page(const struct ms_hyperv_tsc_page *tsc_pg) diff --git a/arch/x86/include/asm/orc_lookup.h b/arch/x86/include/asm/orc_lookup.h new file mode 100644 index 000000000000..91c8d868424d --- /dev/null +++ b/arch/x86/include/asm/orc_lookup.h @@ -0,0 +1,46 @@ +/* + * Copyright (C) 2017 Josh Poimboeuf <jpoimboe@redhat.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + */ +#ifndef _ORC_LOOKUP_H +#define _ORC_LOOKUP_H + +/* + * This is a lookup table for speeding up access to the .orc_unwind table. + * Given an input address offset, the corresponding lookup table entry + * specifies a subset of the .orc_unwind table to search. + * + * Each block represents the end of the previous range and the start of the + * next range. An extra block is added to give the last range an end. + * + * The block size should be a power of 2 to avoid a costly 'div' instruction. + * + * A block size of 256 was chosen because it roughly doubles unwinder + * performance while only adding ~5% to the ORC data footprint. + */ +#define LOOKUP_BLOCK_ORDER 8 +#define LOOKUP_BLOCK_SIZE (1 << LOOKUP_BLOCK_ORDER) + +#ifndef LINKER_SCRIPT + +extern unsigned int orc_lookup[]; +extern unsigned int orc_lookup_end[]; + +#define LOOKUP_START_IP (unsigned long)_stext +#define LOOKUP_STOP_IP (unsigned long)_etext + +#endif /* LINKER_SCRIPT */ + +#endif /* _ORC_LOOKUP_H */ diff --git a/arch/x86/include/asm/orc_types.h b/arch/x86/include/asm/orc_types.h new file mode 100644 index 000000000000..9c9dc579bd7d --- /dev/null +++ b/arch/x86/include/asm/orc_types.h @@ -0,0 +1,107 @@ +/* + * Copyright (C) 2017 Josh Poimboeuf <jpoimboe@redhat.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef _ORC_TYPES_H +#define _ORC_TYPES_H + +#include <linux/types.h> +#include <linux/compiler.h> + +/* + * The ORC_REG_* registers are base registers which are used to find other + * registers on the stack. + * + * ORC_REG_PREV_SP, also known as DWARF Call Frame Address (CFA), is the + * address of the previous frame: the caller's SP before it called the current + * function. + * + * ORC_REG_UNDEFINED means the corresponding register's value didn't change in + * the current frame. + * + * The most commonly used base registers are SP and BP -- which the previous SP + * is usually based on -- and PREV_SP and UNDEFINED -- which the previous BP is + * usually based on. + * + * The rest of the base registers are needed for special cases like entry code + * and GCC realigned stacks. + */ +#define ORC_REG_UNDEFINED 0 +#define ORC_REG_PREV_SP 1 +#define ORC_REG_DX 2 +#define ORC_REG_DI 3 +#define ORC_REG_BP 4 +#define ORC_REG_SP 5 +#define ORC_REG_R10 6 +#define ORC_REG_R13 7 +#define ORC_REG_BP_INDIRECT 8 +#define ORC_REG_SP_INDIRECT 9 +#define ORC_REG_MAX 15 + +/* + * ORC_TYPE_CALL: Indicates that sp_reg+sp_offset resolves to PREV_SP (the + * caller's SP right before it made the call). Used for all callable + * functions, i.e. all C code and all callable asm functions. + * + * ORC_TYPE_REGS: Used in entry code to indicate that sp_reg+sp_offset points + * to a fully populated pt_regs from a syscall, interrupt, or exception. + * + * ORC_TYPE_REGS_IRET: Used in entry code to indicate that sp_reg+sp_offset + * points to the iret return frame. + * + * The UNWIND_HINT macros are used only for the unwind_hint struct. They + * aren't used in struct orc_entry due to size and complexity constraints. + * Objtool converts them to real types when it converts the hints to orc + * entries. + */ +#define ORC_TYPE_CALL 0 +#define ORC_TYPE_REGS 1 +#define ORC_TYPE_REGS_IRET 2 +#define UNWIND_HINT_TYPE_SAVE 3 +#define UNWIND_HINT_TYPE_RESTORE 4 + +#ifndef __ASSEMBLY__ +/* + * This struct is more or less a vastly simplified version of the DWARF Call + * Frame Information standard. It contains only the necessary parts of DWARF + * CFI, simplified for ease of access by the in-kernel unwinder. It tells the + * unwinder how to find the previous SP and BP (and sometimes entry regs) on + * the stack for a given code address. Each instance of the struct corresponds + * to one or more code locations. + */ +struct orc_entry { + s16 sp_offset; + s16 bp_offset; + unsigned sp_reg:4; + unsigned bp_reg:4; + unsigned type:2; +} __packed; + +/* + * This struct is used by asm and inline asm code to manually annotate the + * location of registers on the stack for the ORC unwinder. + * + * Type can be either ORC_TYPE_* or UNWIND_HINT_TYPE_*. 
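/*
 * Editorial sketch, not part of the patch: the essence of how the ORC
 * unwinder consumes one orc_entry.  The previous frame's SP (the CFA) is
 * the base register named by sp_reg plus sp_offset.  Simplified for
 * illustration; the real unwinder also handles the indirect, DI/DX/R10/R13
 * and pt_regs cases, and finds the entry to use through the orc_lookup[]
 * blocks described earlier.
 */
static inline unsigned long orc_prev_sp(const struct orc_entry *orc,
                                        unsigned long sp, unsigned long bp)
{
        switch (orc->sp_reg) {
        case ORC_REG_SP:
                return sp + orc->sp_offset;     /* ordinary C functions */
        case ORC_REG_BP:
                return bp + orc->sp_offset;     /* frame-pointer based frames */
        default:
                return 0;                       /* other bases omitted here */
        }
}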
+ */ +struct unwind_hint { + u32 ip; + s16 sp_offset; + u8 sp_reg; + u8 type; +}; +#endif /* __ASSEMBLY__ */ + +#endif /* _ORC_TYPES_H */ diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 9ccac1926587..c25dd22f7c70 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -960,11 +960,6 @@ extern void default_banner(void); #define GET_CR2_INTO_RAX \ call PARA_INDIRECT(pv_mmu_ops+PV_MMU_read_cr2) -#define PARAVIRT_ADJUST_EXCEPTION_FRAME \ - PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_adjust_exception_frame), \ - CLBR_NONE, \ - call PARA_INDIRECT(pv_irq_ops+PV_IRQ_adjust_exception_frame)) - #define USERGS_SYSRET64 \ PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret64), \ CLBR_NONE, \ diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 9ffc36bfe4cd..6b64fc6367f2 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -196,9 +196,6 @@ struct pv_irq_ops { void (*safe_halt)(void); void (*halt)(void); -#ifdef CONFIG_X86_64 - void (*adjust_exception_frame)(void); -#endif } __no_randomize_layout; struct pv_mmu_ops { diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index c61bab07a84e..3fa26a61eabc 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -22,6 +22,7 @@ struct vm86; #include <asm/nops.h> #include <asm/special_insns.h> #include <asm/fpu/types.h> +#include <asm/unwind_hints.h> #include <linux/personality.h> #include <linux/cache.h> @@ -667,7 +668,7 @@ static inline void sync_core(void) * In case NMI unmasking or performance ever becomes a problem, * the next best option appears to be MOV-to-CR2 and an * unconditional jump. That sequence also works on all CPUs, - * but it will fault at CPL3 (i.e. Xen PV and lguest). + * but it will fault at CPL3 (i.e. Xen PV). * * CPUID is the conventional way, but it's nasty: it doesn't * exist on some 486-like CPUs, and it usually exits to a @@ -690,6 +691,7 @@ static inline void sync_core(void) unsigned int tmp; asm volatile ( + UNWIND_HINT_SAVE "mov %%ss, %0\n\t" "pushq %q0\n\t" "pushq %%rsp\n\t" @@ -699,6 +701,7 @@ static inline void sync_core(void) "pushq %q0\n\t" "pushq $1f\n\t" "iretq\n\t" + UNWIND_HINT_RESTORE "1:" : "=&r" (tmp), "+r" (__sp) : : "cc", "memory"); #endif diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h index 8d3964fc5f91..b408b1886195 100644 --- a/arch/x86/include/asm/proto.h +++ b/arch/x86/include/asm/proto.h @@ -24,6 +24,9 @@ void entry_SYSENTER_compat(void); void __end_entry_SYSENTER_compat(void); void entry_SYSCALL_compat(void); void entry_INT80_compat(void); +#if defined(CONFIG_X86_64) && defined(CONFIG_XEN_PV) +void xen_entry_INT80_compat(void); +#endif #endif void x86_configure_nx(void); diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 2b5d686ea9f3..91c04c8e67fa 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -9,6 +9,20 @@ #ifdef __i386__ struct pt_regs { + /* + * NB: 32-bit x86 CPUs are inconsistent as what happens in the + * following cases (where %seg represents a segment register): + * + * - pushl %seg: some do a 16-bit write and leave the high + * bits alone + * - movl %seg, [mem]: some do a 16-bit write despite the movl + * - IDT entry: some (e.g. 486) will leave the high bits of CS + * and (if applicable) SS undefined. 
+ * + * Fortunately, x86-32 doesn't read the high bits on POP or IRET, + * so we can just treat all of the segment registers as 16-bit + * values. + */ unsigned long bx; unsigned long cx; unsigned long dx; @@ -16,16 +30,22 @@ struct pt_regs { unsigned long di; unsigned long bp; unsigned long ax; - unsigned long ds; - unsigned long es; - unsigned long fs; - unsigned long gs; + unsigned short ds; + unsigned short __dsh; + unsigned short es; + unsigned short __esh; + unsigned short fs; + unsigned short __fsh; + unsigned short gs; + unsigned short __gsh; unsigned long orig_ax; unsigned long ip; - unsigned long cs; + unsigned short cs; + unsigned short __csh; unsigned long flags; unsigned long sp; - unsigned long ss; + unsigned short ss; + unsigned short __ssh; }; #else /* __i386__ */ @@ -176,6 +196,17 @@ static inline unsigned long regs_get_register(struct pt_regs *regs, if (offset == offsetof(struct pt_regs, sp) && regs->cs == __KERNEL_CS) return kernel_stack_pointer(regs); + + /* The selector fields are 16-bit. */ + if (offset == offsetof(struct pt_regs, cs) || + offset == offsetof(struct pt_regs, ss) || + offset == offsetof(struct pt_regs, ds) || + offset == offsetof(struct pt_regs, es) || + offset == offsetof(struct pt_regs, fs) || + offset == offsetof(struct pt_regs, gs)) { + return *(u16 *)((unsigned long)regs + offset); + + } #endif return *(unsigned long *)((unsigned long)regs + offset); } diff --git a/arch/x86/include/asm/refcount.h b/arch/x86/include/asm/refcount.h new file mode 100644 index 000000000000..ff871210b9f2 --- /dev/null +++ b/arch/x86/include/asm/refcount.h @@ -0,0 +1,109 @@ +#ifndef __ASM_X86_REFCOUNT_H +#define __ASM_X86_REFCOUNT_H +/* + * x86-specific implementation of refcount_t. Based on PAX_REFCOUNT from + * PaX/grsecurity. + */ +#include <linux/refcount.h> + +/* + * This is the first portion of the refcount error handling, which lives in + * .text.unlikely, and is jumped to from the CPU flag check (in the + * following macros). This saves the refcount value location into CX for + * the exception handler to use (in mm/extable.c), and then triggers the + * central refcount exception. The fixup address for the exception points + * back to the regular execution flow in .text. + */ +#define _REFCOUNT_EXCEPTION \ + ".pushsection .text.unlikely\n" \ + "111:\tlea %[counter], %%" _ASM_CX "\n" \ + "112:\t" ASM_UD0 "\n" \ + ASM_UNREACHABLE \ + ".popsection\n" \ + "113:\n" \ + _ASM_EXTABLE_REFCOUNT(112b, 113b) + +/* Trigger refcount exception if refcount result is negative. */ +#define REFCOUNT_CHECK_LT_ZERO \ + "js 111f\n\t" \ + _REFCOUNT_EXCEPTION + +/* Trigger refcount exception if refcount result is zero or negative. */ +#define REFCOUNT_CHECK_LE_ZERO \ + "jz 111f\n\t" \ + REFCOUNT_CHECK_LT_ZERO + +/* Trigger refcount exception unconditionally. 
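/*
 * Editorial sketch, not part of the patch: the API this header accelerates
 * is the ordinary refcount_t get/put pattern.  Object and function names
 * here are hypothetical, and <linux/slab.h> would be needed for kfree().
 */
struct foo {
        refcount_t refs;
        /* ... payload ... */
};

static void foo_get(struct foo *f)
{
        refcount_inc(&f->refs);         /* an overflow traps via the checks above */
}

static void foo_put(struct foo *f)
{
        if (refcount_dec_and_test(&f->refs))
                kfree(f);
}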
*/ +#define REFCOUNT_ERROR \ + "jmp 111f\n\t" \ + _REFCOUNT_EXCEPTION + +static __always_inline void refcount_add(unsigned int i, refcount_t *r) +{ + asm volatile(LOCK_PREFIX "addl %1,%0\n\t" + REFCOUNT_CHECK_LT_ZERO + : [counter] "+m" (r->refs.counter) + : "ir" (i) + : "cc", "cx"); +} + +static __always_inline void refcount_inc(refcount_t *r) +{ + asm volatile(LOCK_PREFIX "incl %0\n\t" + REFCOUNT_CHECK_LT_ZERO + : [counter] "+m" (r->refs.counter) + : : "cc", "cx"); +} + +static __always_inline void refcount_dec(refcount_t *r) +{ + asm volatile(LOCK_PREFIX "decl %0\n\t" + REFCOUNT_CHECK_LE_ZERO + : [counter] "+m" (r->refs.counter) + : : "cc", "cx"); +} + +static __always_inline __must_check +bool refcount_sub_and_test(unsigned int i, refcount_t *r) +{ + GEN_BINARY_SUFFIXED_RMWcc(LOCK_PREFIX "subl", REFCOUNT_CHECK_LT_ZERO, + r->refs.counter, "er", i, "%0", e); +} + +static __always_inline __must_check bool refcount_dec_and_test(refcount_t *r) +{ + GEN_UNARY_SUFFIXED_RMWcc(LOCK_PREFIX "decl", REFCOUNT_CHECK_LT_ZERO, + r->refs.counter, "%0", e); +} + +static __always_inline __must_check +bool refcount_add_not_zero(unsigned int i, refcount_t *r) +{ + int c, result; + + c = atomic_read(&(r->refs)); + do { + if (unlikely(c == 0)) + return false; + + result = c + i; + + /* Did we try to increment from/to an undesirable state? */ + if (unlikely(c < 0 || c == INT_MAX || result < c)) { + asm volatile(REFCOUNT_ERROR + : : [counter] "m" (r->refs.counter) + : "cc", "cx"); + break; + } + + } while (!atomic_try_cmpxchg(&(r->refs), &c, result)); + + return c != 0; +} + +static __always_inline __must_check bool refcount_inc_not_zero(refcount_t *r) +{ + return refcount_add_not_zero(1, r); +} + +#endif diff --git a/arch/x86/include/asm/rmwcc.h b/arch/x86/include/asm/rmwcc.h index 661dd305694a..045f99211a99 100644 --- a/arch/x86/include/asm/rmwcc.h +++ b/arch/x86/include/asm/rmwcc.h @@ -1,45 +1,56 @@ #ifndef _ASM_X86_RMWcc #define _ASM_X86_RMWcc +#define __CLOBBERS_MEM "memory" +#define __CLOBBERS_MEM_CC_CX "memory", "cc", "cx" + #if !defined(__GCC_ASM_FLAG_OUTPUTS__) && defined(CC_HAVE_ASM_GOTO) /* Use asm goto */ -#define __GEN_RMWcc(fullop, var, cc, ...) \ +#define __GEN_RMWcc(fullop, var, cc, clobbers, ...) \ do { \ asm_volatile_goto (fullop "; j" #cc " %l[cc_label]" \ - : : "m" (var), ## __VA_ARGS__ \ - : "memory" : cc_label); \ + : : [counter] "m" (var), ## __VA_ARGS__ \ + : clobbers : cc_label); \ return 0; \ cc_label: \ return 1; \ } while (0) -#define GEN_UNARY_RMWcc(op, var, arg0, cc) \ - __GEN_RMWcc(op " " arg0, var, cc) +#define __BINARY_RMWcc_ARG " %1, " -#define GEN_BINARY_RMWcc(op, var, vcon, val, arg0, cc) \ - __GEN_RMWcc(op " %1, " arg0, var, cc, vcon (val)) #else /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CC_HAVE_ASM_GOTO) */ /* Use flags output or a set instruction */ -#define __GEN_RMWcc(fullop, var, cc, ...) \ +#define __GEN_RMWcc(fullop, var, cc, clobbers, ...) 
\ do { \ bool c; \ asm volatile (fullop ";" CC_SET(cc) \ - : "+m" (var), CC_OUT(cc) (c) \ - : __VA_ARGS__ : "memory"); \ + : [counter] "+m" (var), CC_OUT(cc) (c) \ + : __VA_ARGS__ : clobbers); \ return c; \ } while (0) +#define __BINARY_RMWcc_ARG " %2, " + +#endif /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CC_HAVE_ASM_GOTO) */ + #define GEN_UNARY_RMWcc(op, var, arg0, cc) \ - __GEN_RMWcc(op " " arg0, var, cc) + __GEN_RMWcc(op " " arg0, var, cc, __CLOBBERS_MEM) + +#define GEN_UNARY_SUFFIXED_RMWcc(op, suffix, var, arg0, cc) \ + __GEN_RMWcc(op " " arg0 "\n\t" suffix, var, cc, \ + __CLOBBERS_MEM_CC_CX) #define GEN_BINARY_RMWcc(op, var, vcon, val, arg0, cc) \ - __GEN_RMWcc(op " %2, " arg0, var, cc, vcon (val)) + __GEN_RMWcc(op __BINARY_RMWcc_ARG arg0, var, cc, \ + __CLOBBERS_MEM, vcon (val)) -#endif /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CC_HAVE_ASM_GOTO) */ +#define GEN_BINARY_SUFFIXED_RMWcc(op, suffix, var, vcon, val, arg0, cc) \ + __GEN_RMWcc(op __BINARY_RMWcc_ARG arg0 "\n\t" suffix, var, cc, \ + __CLOBBERS_MEM_CC_CX, vcon (val)) #endif /* _ASM_X86_RMWcc */ diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index 1549caa098f0..066aaf813141 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h @@ -238,9 +238,7 @@ #ifndef __ASSEMBLY__ extern const char early_idt_handler_array[NUM_EXCEPTION_VECTORS][EARLY_IDT_HANDLER_SIZE]; -#ifdef CONFIG_TRACING -# define trace_early_idt_handler_array early_idt_handler_array -#endif +extern void early_ignore_irq(void); /* * Load a segment. Fall back on loading the zero segment if something goes diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index e4585a393965..a65cf544686a 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -39,6 +39,7 @@ static inline void vsmp_init(void) { } #endif void setup_bios_corruption_check(void); +void early_platform_quirks(void); extern unsigned long saved_video_mode; diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index e00e1bd6e7b3..5161da1a0fa0 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -98,6 +98,7 @@ struct thread_info { #define TIF_SYSCALL_TRACEPOINT 28 /* syscall tracepoint instrumentation */ #define TIF_ADDR32 29 /* 32-bit address space on 64 bits */ #define TIF_X32 30 /* 32-bit native x86-64 binary */ +#define TIF_FSCHECK 31 /* Check FS is USER_DS on return */ #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) @@ -122,6 +123,7 @@ struct thread_info { #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) #define _TIF_ADDR32 (1 << TIF_ADDR32) #define _TIF_X32 (1 << TIF_X32) +#define _TIF_FSCHECK (1 << TIF_FSCHECK) /* * work to do in syscall_trace_enter(). 
Also includes TIF_NOHZ for @@ -137,7 +139,8 @@ struct thread_info { (_TIF_SYSCALL_TRACE | _TIF_NOTIFY_RESUME | _TIF_SIGPENDING | \ _TIF_NEED_RESCHED | _TIF_SINGLESTEP | _TIF_SYSCALL_EMU | \ _TIF_SYSCALL_AUDIT | _TIF_USER_RETURN_NOTIFY | _TIF_UPROBE | \ - _TIF_PATCH_PENDING | _TIF_NOHZ | _TIF_SYSCALL_TRACEPOINT) + _TIF_PATCH_PENDING | _TIF_NOHZ | _TIF_SYSCALL_TRACEPOINT | \ + _TIF_FSCHECK) /* flags to check in __switch_to() */ #define _TIF_WORK_CTXSW \ diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h index c7797307fc2b..79a4ca6a9606 100644 --- a/arch/x86/include/asm/tlb.h +++ b/arch/x86/include/asm/tlb.h @@ -15,4 +15,18 @@ #include <asm-generic/tlb.h> +/* + * While x86 architecture in general requires an IPI to perform TLB + * shootdown, enablement code for several hypervisors overrides + * .flush_tlb_others hook in pv_mmu_ops and implements it by issuing + * a hypercall. To keep software pagetable walkers safe in this case we + * switch to RCU based table free (HAVE_RCU_TABLE_FREE). See the comment + * below 'ifdef CONFIG_HAVE_RCU_TABLE_FREE' in include/asm-generic/tlb.h + * for more details. + */ +static inline void __tlb_remove_table(void *table) +{ + free_page_and_swap_cache(table); +} + #endif /* _ASM_X86_TLB_H */ diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index d23e61dc0640..4893abf7f74f 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -198,6 +198,8 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask) cr4_set_bits(mask); } +extern void initialize_tlbstate_and_flush(void); + static inline void __native_flush_tlb(void) { /* diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 6358a85e2270..c1d2a9892352 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -75,12 +75,6 @@ static inline const struct cpumask *cpumask_of_node(int node) extern void setup_node_to_cpumask_map(void); -/* - * Returns the number of the node containing Node 'node'. This - * architecture is flat, so it is a pretty simple function! 
- */ -#define parent_node(node) (node) - #define pcibus_to_node(bus) __pcibus_to_node(bus) extern int __node_distance(int, int); diff --git a/arch/x86/include/asm/trace/common.h b/arch/x86/include/asm/trace/common.h new file mode 100644 index 000000000000..57c8da027d99 --- /dev/null +++ b/arch/x86/include/asm/trace/common.h @@ -0,0 +1,16 @@ +#ifndef _ASM_TRACE_COMMON_H +#define _ASM_TRACE_COMMON_H + +#ifdef CONFIG_TRACING +DECLARE_STATIC_KEY_FALSE(trace_pagefault_key); +#define trace_pagefault_enabled() \ + static_branch_unlikely(&trace_pagefault_key) +DECLARE_STATIC_KEY_FALSE(trace_resched_ipi_key); +#define trace_resched_ipi_enabled() \ + static_branch_unlikely(&trace_resched_ipi_key) +#else +static inline bool trace_pagefault_enabled(void) { return false; } +static inline bool trace_resched_ipi_enabled(void) { return false; } +#endif + +#endif diff --git a/arch/x86/include/asm/trace/exceptions.h b/arch/x86/include/asm/trace/exceptions.h index 2422b14c50a7..5665bf205b8d 100644 --- a/arch/x86/include/asm/trace/exceptions.h +++ b/arch/x86/include/asm/trace/exceptions.h @@ -5,9 +5,10 @@ #define _TRACE_PAGE_FAULT_H #include <linux/tracepoint.h> +#include <asm/trace/common.h> -extern int trace_irq_vector_regfunc(void); -extern void trace_irq_vector_unregfunc(void); +extern int trace_pagefault_reg(void); +extern void trace_pagefault_unreg(void); DECLARE_EVENT_CLASS(x86_exceptions, @@ -37,8 +38,7 @@ DEFINE_EVENT_FN(x86_exceptions, name, \ TP_PROTO(unsigned long address, struct pt_regs *regs, \ unsigned long error_code), \ TP_ARGS(address, regs, error_code), \ - trace_irq_vector_regfunc, \ - trace_irq_vector_unregfunc); + trace_pagefault_reg, trace_pagefault_unreg); DEFINE_PAGE_FAULT_EVENT(page_fault_user); DEFINE_PAGE_FAULT_EVENT(page_fault_kernel); diff --git a/arch/x86/include/asm/trace/hyperv.h b/arch/x86/include/asm/trace/hyperv.h new file mode 100644 index 000000000000..4253bca99989 --- /dev/null +++ b/arch/x86/include/asm/trace/hyperv.h @@ -0,0 +1,40 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hyperv + +#if !defined(_TRACE_HYPERV_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HYPERV_H + +#include <linux/tracepoint.h> + +#if IS_ENABLED(CONFIG_HYPERV) + +TRACE_EVENT(hyperv_mmu_flush_tlb_others, + TP_PROTO(const struct cpumask *cpus, + const struct flush_tlb_info *info), + TP_ARGS(cpus, info), + TP_STRUCT__entry( + __field(unsigned int, ncpus) + __field(struct mm_struct *, mm) + __field(unsigned long, addr) + __field(unsigned long, end) + ), + TP_fast_assign(__entry->ncpus = cpumask_weight(cpus); + __entry->mm = info->mm; + __entry->addr = info->start; + __entry->end = info->end; + ), + TP_printk("ncpus %d mm %p addr %lx, end %lx", + __entry->ncpus, __entry->mm, + __entry->addr, __entry->end) + ); + +#endif /* CONFIG_HYPERV */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH asm/trace/ +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE hyperv +#endif /* _TRACE_HYPERV_H */ + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/arch/x86/include/asm/trace/irq_vectors.h b/arch/x86/include/asm/trace/irq_vectors.h index 32dd6a9e343c..1599d394c8c1 100644 --- a/arch/x86/include/asm/trace/irq_vectors.h +++ b/arch/x86/include/asm/trace/irq_vectors.h @@ -5,9 +5,12 @@ #define _TRACE_IRQ_VECTORS_H #include <linux/tracepoint.h> +#include <asm/trace/common.h> -extern int trace_irq_vector_regfunc(void); -extern void trace_irq_vector_unregfunc(void); +#ifdef CONFIG_X86_LOCAL_APIC + +extern int trace_resched_ipi_reg(void); +extern void 
trace_resched_ipi_unreg(void); DECLARE_EVENT_CLASS(x86_irq_vector, @@ -28,15 +31,22 @@ DECLARE_EVENT_CLASS(x86_irq_vector, #define DEFINE_IRQ_VECTOR_EVENT(name) \ DEFINE_EVENT_FN(x86_irq_vector, name##_entry, \ TP_PROTO(int vector), \ + TP_ARGS(vector), NULL, NULL); \ +DEFINE_EVENT_FN(x86_irq_vector, name##_exit, \ + TP_PROTO(int vector), \ + TP_ARGS(vector), NULL, NULL); + +#define DEFINE_RESCHED_IPI_EVENT(name) \ +DEFINE_EVENT_FN(x86_irq_vector, name##_entry, \ + TP_PROTO(int vector), \ TP_ARGS(vector), \ - trace_irq_vector_regfunc, \ - trace_irq_vector_unregfunc); \ + trace_resched_ipi_reg, \ + trace_resched_ipi_unreg); \ DEFINE_EVENT_FN(x86_irq_vector, name##_exit, \ TP_PROTO(int vector), \ TP_ARGS(vector), \ - trace_irq_vector_regfunc, \ - trace_irq_vector_unregfunc); - + trace_resched_ipi_reg, \ + trace_resched_ipi_unreg); /* * local_timer - called when entering/exiting a local timer interrupt @@ -45,11 +55,6 @@ DEFINE_EVENT_FN(x86_irq_vector, name##_exit, \ DEFINE_IRQ_VECTOR_EVENT(local_timer); /* - * reschedule - called when entering/exiting a reschedule vector handler - */ -DEFINE_IRQ_VECTOR_EVENT(reschedule); - -/* * spurious_apic - called when entering/exiting a spurious apic vector handler */ DEFINE_IRQ_VECTOR_EVENT(spurious_apic); @@ -65,6 +70,7 @@ DEFINE_IRQ_VECTOR_EVENT(error_apic); */ DEFINE_IRQ_VECTOR_EVENT(x86_platform_ipi); +#ifdef CONFIG_IRQ_WORK /* * irq_work - called when entering/exiting a irq work interrupt * vector handler @@ -81,6 +87,18 @@ DEFINE_IRQ_VECTOR_EVENT(irq_work); * 4) goto 1 */ TRACE_EVENT_PERF_PERM(irq_work_exit, is_sampling_event(p_event) ? -EPERM : 0); +#endif + +/* + * The ifdef is required because that tracepoint macro hell emits tracepoint + * code in files which include this header even if the tracepoint is not + * enabled. Brilliant stuff that. + */ +#ifdef CONFIG_SMP +/* + * reschedule - called when entering/exiting a reschedule vector handler + */ +DEFINE_RESCHED_IPI_EVENT(reschedule); /* * call_function - called when entering/exiting a call function interrupt @@ -93,24 +111,33 @@ DEFINE_IRQ_VECTOR_EVENT(call_function); * single interrupt vector handler */ DEFINE_IRQ_VECTOR_EVENT(call_function_single); +#endif +#ifdef CONFIG_X86_MCE_THRESHOLD /* * threshold_apic - called when entering/exiting a threshold apic interrupt * vector handler */ DEFINE_IRQ_VECTOR_EVENT(threshold_apic); +#endif +#ifdef CONFIG_X86_MCE_AMD /* * deferred_error_apic - called when entering/exiting a deferred apic interrupt * vector handler */ DEFINE_IRQ_VECTOR_EVENT(deferred_error_apic); +#endif +#ifdef CONFIG_X86_THERMAL_VECTOR /* * thermal_apic - called when entering/exiting a thermal apic interrupt * vector handler */ DEFINE_IRQ_VECTOR_EVENT(thermal_apic); +#endif + +#endif /* CONFIG_X86_LOCAL_APIC */ #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH . 
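/*
 * Editorial sketch, not part of the patch: the trace_*_reg()/_unreg()
 * callbacks wired into the events above pair each tracepoint with one of
 * the static keys from <asm/trace/common.h>, so the page-fault and
 * reschedule-IPI fast paths can test trace_pagefault_enabled() /
 * trace_resched_ipi_enabled() essentially for free.  Roughly (illustrative
 * only; the real definitions live in the fault and SMP/IPI code):
 */
int trace_pagefault_reg(void)
{
        static_branch_inc(&trace_pagefault_key);
        return 0;
}

void trace_pagefault_unreg(void)
{
        static_branch_dec(&trace_pagefault_key);
}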
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 01fd0a7f48cd..5545f6459bf5 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -13,9 +13,6 @@ asmlinkage void divide_error(void); asmlinkage void debug(void); asmlinkage void nmi(void); asmlinkage void int3(void); -asmlinkage void xen_debug(void); -asmlinkage void xen_int3(void); -asmlinkage void xen_stack_segment(void); asmlinkage void overflow(void); asmlinkage void bounds(void); asmlinkage void invalid_op(void); @@ -38,22 +35,29 @@ asmlinkage void machine_check(void); #endif /* CONFIG_X86_MCE */ asmlinkage void simd_coprocessor_error(void); -#ifdef CONFIG_TRACING -asmlinkage void trace_page_fault(void); -#define trace_stack_segment stack_segment -#define trace_divide_error divide_error -#define trace_bounds bounds -#define trace_invalid_op invalid_op -#define trace_device_not_available device_not_available -#define trace_coprocessor_segment_overrun coprocessor_segment_overrun -#define trace_invalid_TSS invalid_TSS -#define trace_segment_not_present segment_not_present -#define trace_general_protection general_protection -#define trace_spurious_interrupt_bug spurious_interrupt_bug -#define trace_coprocessor_error coprocessor_error -#define trace_alignment_check alignment_check -#define trace_simd_coprocessor_error simd_coprocessor_error -#define trace_async_page_fault async_page_fault +#if defined(CONFIG_X86_64) && defined(CONFIG_XEN_PV) +asmlinkage void xen_divide_error(void); +asmlinkage void xen_xendebug(void); +asmlinkage void xen_xenint3(void); +asmlinkage void xen_nmi(void); +asmlinkage void xen_overflow(void); +asmlinkage void xen_bounds(void); +asmlinkage void xen_invalid_op(void); +asmlinkage void xen_device_not_available(void); +asmlinkage void xen_double_fault(void); +asmlinkage void xen_coprocessor_segment_overrun(void); +asmlinkage void xen_invalid_TSS(void); +asmlinkage void xen_segment_not_present(void); +asmlinkage void xen_stack_segment(void); +asmlinkage void xen_general_protection(void); +asmlinkage void xen_page_fault(void); +asmlinkage void xen_spurious_interrupt_bug(void); +asmlinkage void xen_coprocessor_error(void); +asmlinkage void xen_alignment_check(void); +#ifdef CONFIG_X86_MCE +asmlinkage void xen_machine_check(void); +#endif /* CONFIG_X86_MCE */ +asmlinkage void xen_simd_coprocessor_error(void); #endif dotraplinkage void do_divide_error(struct pt_regs *, long); @@ -74,14 +78,6 @@ asmlinkage struct pt_regs *sync_regs(struct pt_regs *); #endif dotraplinkage void do_general_protection(struct pt_regs *, long); dotraplinkage void do_page_fault(struct pt_regs *, unsigned long); -#ifdef CONFIG_TRACING -dotraplinkage void trace_do_page_fault(struct pt_regs *, unsigned long); -#else -static inline void trace_do_page_fault(struct pt_regs *regs, unsigned long error) -{ - do_page_fault(regs, error); -} -#endif dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *, long); dotraplinkage void do_coprocessor_error(struct pt_regs *, long); dotraplinkage void do_alignment_check(struct pt_regs *, long); diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 30269dafec47..184eb9894dae 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -26,7 +26,12 @@ #define get_ds() (KERNEL_DS) #define get_fs() (current->thread.addr_limit) -#define set_fs(x) (current->thread.addr_limit = (x)) +static inline void set_fs(mm_segment_t fs) +{ + current->thread.addr_limit = fs; + /* On user-mode return, check fs is correct */ 
+ set_thread_flag(TIF_FSCHECK); +} #define segment_eq(a, b) ((a).seg == (b).seg) diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h index e6676495b125..e9f793e2df7a 100644 --- a/arch/x86/include/asm/unwind.h +++ b/arch/x86/include/asm/unwind.h @@ -12,11 +12,14 @@ struct unwind_state { struct task_struct *task; int graph_idx; bool error; -#ifdef CONFIG_FRAME_POINTER +#if defined(CONFIG_ORC_UNWINDER) + bool signal, full_regs; + unsigned long sp, bp, ip; + struct pt_regs *regs; +#elif defined(CONFIG_FRAME_POINTER_UNWINDER) bool got_irq; - unsigned long *bp, *orig_sp; + unsigned long *bp, *orig_sp, ip; struct pt_regs *regs; - unsigned long ip; #else unsigned long *sp; #endif @@ -24,41 +27,30 @@ struct unwind_state { void __unwind_start(struct unwind_state *state, struct task_struct *task, struct pt_regs *regs, unsigned long *first_frame); - bool unwind_next_frame(struct unwind_state *state); - unsigned long unwind_get_return_address(struct unwind_state *state); +unsigned long *unwind_get_return_address_ptr(struct unwind_state *state); static inline bool unwind_done(struct unwind_state *state) { return state->stack_info.type == STACK_TYPE_UNKNOWN; } -static inline -void unwind_start(struct unwind_state *state, struct task_struct *task, - struct pt_regs *regs, unsigned long *first_frame) -{ - first_frame = first_frame ? : get_stack_pointer(task, regs); - - __unwind_start(state, task, regs, first_frame); -} - static inline bool unwind_error(struct unwind_state *state) { return state->error; } -#ifdef CONFIG_FRAME_POINTER - static inline -unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) +void unwind_start(struct unwind_state *state, struct task_struct *task, + struct pt_regs *regs, unsigned long *first_frame) { - if (unwind_done(state)) - return NULL; + first_frame = first_frame ? : get_stack_pointer(task, regs); - return state->regs ? &state->regs->ip : state->bp + 1; + __unwind_start(state, task, regs, first_frame); } +#if defined(CONFIG_ORC_UNWINDER) || defined(CONFIG_FRAME_POINTER_UNWINDER) static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) { if (unwind_done(state)) @@ -66,20 +58,46 @@ static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) return state->regs; } - -#else /* !CONFIG_FRAME_POINTER */ - -static inline -unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) +#else +static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) { return NULL; } +#endif -static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) +#ifdef CONFIG_ORC_UNWINDER +void unwind_init(void); +void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size, + void *orc, size_t orc_size); +#else +static inline void unwind_init(void) {} +static inline +void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size, + void *orc, size_t orc_size) {} +#endif + +/* + * This disables KASAN checking when reading a value from another task's stack, + * since the other task could be running on another CPU and could have poisoned + * the stack in the meantime. 
+ */ +#define READ_ONCE_TASK_STACK(task, x) \ +({ \ + unsigned long val; \ + if (task == current) \ + val = READ_ONCE(x); \ + else \ + val = READ_ONCE_NOCHECK(x); \ + val; \ +}) + +static inline bool task_on_another_cpu(struct task_struct *task) { - return NULL; +#ifdef CONFIG_SMP + return task != current && task->on_cpu; +#else + return false; +#endif } -#endif /* CONFIG_FRAME_POINTER */ - #endif /* _ASM_X86_UNWIND_H */ diff --git a/arch/x86/include/asm/unwind_hints.h b/arch/x86/include/asm/unwind_hints.h new file mode 100644 index 000000000000..bae46fc6b9de --- /dev/null +++ b/arch/x86/include/asm/unwind_hints.h @@ -0,0 +1,105 @@ +#ifndef _ASM_X86_UNWIND_HINTS_H +#define _ASM_X86_UNWIND_HINTS_H + +#include "orc_types.h" + +#ifdef __ASSEMBLY__ + +/* + * In asm, there are two kinds of code: normal C-type callable functions and + * the rest. The normal callable functions can be called by other code, and + * don't do anything unusual with the stack. Such normal callable functions + * are annotated with the ENTRY/ENDPROC macros. Most asm code falls in this + * category. In this case, no special debugging annotations are needed because + * objtool can automatically generate the ORC data for the ORC unwinder to read + * at runtime. + * + * Anything which doesn't fall into the above category, such as syscall and + * interrupt handlers, tends to not be called directly by other functions, and + * often does unusual non-C-function-type things with the stack pointer. Such + * code needs to be annotated such that objtool can understand it. The + * following CFI hint macros are for this type of code. + * + * These macros provide hints to objtool about the state of the stack at each + * instruction. Objtool starts from the hints and follows the code flow, + * making automatic CFI adjustments when it sees pushes and pops, filling out + * the debuginfo as necessary. It will also warn if it sees any + * inconsistencies. + */ +.macro UNWIND_HINT sp_reg=ORC_REG_SP sp_offset=0 type=ORC_TYPE_CALL +#ifdef CONFIG_STACK_VALIDATION +.Lunwind_hint_ip_\@: + .pushsection .discard.unwind_hints + /* struct unwind_hint */ + .long .Lunwind_hint_ip_\@ - . 
+ .short \sp_offset + .byte \sp_reg + .byte \type + .popsection +#endif +.endm + +.macro UNWIND_HINT_EMPTY + UNWIND_HINT sp_reg=ORC_REG_UNDEFINED +.endm + +.macro UNWIND_HINT_REGS base=%rsp offset=0 indirect=0 extra=1 iret=0 + .if \base == %rsp + .if \indirect + .set sp_reg, ORC_REG_SP_INDIRECT + .else + .set sp_reg, ORC_REG_SP + .endif + .elseif \base == %rbp + .set sp_reg, ORC_REG_BP + .elseif \base == %rdi + .set sp_reg, ORC_REG_DI + .elseif \base == %rdx + .set sp_reg, ORC_REG_DX + .elseif \base == %r10 + .set sp_reg, ORC_REG_R10 + .else + .error "UNWIND_HINT_REGS: bad base register" + .endif + + .set sp_offset, \offset + + .if \iret + .set type, ORC_TYPE_REGS_IRET + .elseif \extra == 0 + .set type, ORC_TYPE_REGS_IRET + .set sp_offset, \offset + (16*8) + .else + .set type, ORC_TYPE_REGS + .endif + + UNWIND_HINT sp_reg=sp_reg sp_offset=sp_offset type=type +.endm + +.macro UNWIND_HINT_IRET_REGS base=%rsp offset=0 + UNWIND_HINT_REGS base=\base offset=\offset iret=1 +.endm + +.macro UNWIND_HINT_FUNC sp_offset=8 + UNWIND_HINT sp_offset=\sp_offset +.endm + +#else /* !__ASSEMBLY__ */ + +#define UNWIND_HINT(sp_reg, sp_offset, type) \ + "987: \n\t" \ + ".pushsection .discard.unwind_hints\n\t" \ + /* struct unwind_hint */ \ + ".long 987b - .\n\t" \ + ".short " __stringify(sp_offset) "\n\t" \ + ".byte " __stringify(sp_reg) "\n\t" \ + ".byte " __stringify(type) "\n\t" \ + ".popsection\n\t" + +#define UNWIND_HINT_SAVE UNWIND_HINT(0, 0, UNWIND_HINT_TYPE_SAVE) + +#define UNWIND_HINT_RESTORE UNWIND_HINT(0, 0, UNWIND_HINT_TYPE_RESTORE) + +#endif /* __ASSEMBLY__ */ + +#endif /* _ASM_X86_UNWIND_HINTS_H */ diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index 11071fcd630e..9606688caa4b 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -552,6 +552,8 @@ static inline void MULTI_update_descriptor(struct multicall_entry *mcl, u64 maddr, struct desc_struct desc) { + u32 *p = (u32 *) &desc; + mcl->op = __HYPERVISOR_update_descriptor; if (sizeof(maddr) == sizeof(long)) { mcl->args[0] = maddr; @@ -559,8 +561,8 @@ MULTI_update_descriptor(struct multicall_entry *mcl, u64 maddr, } else { mcl->args[0] = maddr; mcl->args[1] = maddr >> 32; - mcl->args[2] = desc.a; - mcl->args[3] = desc.b; + mcl->args[2] = *p++; + mcl->args[3] = *p; } trace_xen_mc_entry(mcl, sizeof(maddr) == sizeof(long) ? 2 : 4); diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h index ddef37b16af2..66b8f93333d1 100644 --- a/arch/x86/include/uapi/asm/bootparam.h +++ b/arch/x86/include/uapi/asm/bootparam.h @@ -201,7 +201,7 @@ struct boot_params { * * @X86_SUBARCH_PC: Should be used if the hardware is enumerable using standard * PC mechanisms (PCI, ACPI) and doesn't need a special boot flow. - * @X86_SUBARCH_LGUEST: Used for x86 hypervisor demo, lguest + * @X86_SUBARCH_LGUEST: Used for x86 hypervisor demo, lguest, deprecated * @X86_SUBARCH_XEN: Used for Xen guest types which follow the PV boot path, * which start at asm startup_xen() entry point and later jump to the C * xen_start_kernel() entry point. 
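
Each UNWIND_HINT invocation above emits one record into the .discard.unwind_hints section: a PC-relative .long for the hinted instruction, a .short stack offset, and two .byte fields for the base register and hint type. As a rough illustration only, that layout corresponds to a structure of the following shape; the struct and field names here are assumptions for readability, and the real definition lives in the orc_types.h header this file includes.

#include <linux/types.h>

struct unwind_hint_sketch {      /* illustrative name, not the kernel's */
	s32 ip;          /* .long: PC-relative offset of the hinted instruction */
	s16 sp_offset;   /* .short: stack pointer offset at that instruction    */
	u8  sp_reg;      /* .byte: ORC_REG_* base register for the stack pointer */
	u8  type;        /* .byte: ORC_TYPE_* / hint type                        */
};
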
Both domU and dom0 type of guests are diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h index 127ddadee1a5..7032f4d8dff3 100644 --- a/arch/x86/include/uapi/asm/hyperv.h +++ b/arch/x86/include/uapi/asm/hyperv.h @@ -149,6 +149,9 @@ */ #define HV_X64_DEPRECATING_AEOI_RECOMMENDED (1 << 9) +/* Recommend using the newer ExProcessorMasks interface */ +#define HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED (1 << 11) + /* * HV_VP_SET available */ @@ -242,7 +245,11 @@ (~((1ull << HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT) - 1)) /* Declare the various hypercall operations. */ +#define HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE 0x0002 +#define HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST 0x0003 #define HVCALL_NOTIFY_LONG_SPIN_WAIT 0x0008 +#define HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX 0x0013 +#define HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX 0x0014 #define HVCALL_POST_MESSAGE 0x005c #define HVCALL_SIGNAL_EVENT 0x005d @@ -259,6 +266,16 @@ #define HV_PROCESSOR_POWER_STATE_C2 2 #define HV_PROCESSOR_POWER_STATE_C3 3 +#define HV_FLUSH_ALL_PROCESSORS BIT(0) +#define HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES BIT(1) +#define HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY BIT(2) +#define HV_FLUSH_USE_EXTENDED_RANGE_FORMAT BIT(3) + +enum HV_GENERIC_SET_FORMAT { + HV_GENERIC_SET_SPARCE_4K, + HV_GENERIC_SET_ALL, +}; + /* hypercall status code */ #define HV_STATUS_SUCCESS 0 #define HV_STATUS_INVALID_HYPERCALL_CODE 2 diff --git a/arch/x86/include/uapi/asm/mman.h b/arch/x86/include/uapi/asm/mman.h index 39bca7fac087..3be08f07695c 100644 --- a/arch/x86/include/uapi/asm/mman.h +++ b/arch/x86/include/uapi/asm/mman.h @@ -3,9 +3,6 @@ #define MAP_32BIT 0x40 /* only give out 32bit addresses */ -#define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT) -#define MAP_HUGE_1GB (30 << MAP_HUGE_SHIFT) - #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS /* * Take the 4 protection key bits out of the vma->vm_flags diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index a01892bdd61a..fd0a7895b63f 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -42,7 +42,7 @@ CFLAGS_irq.o := -I$(src)/../include/asm/trace obj-y := process_$(BITS).o signal.o obj-$(CONFIG_COMPAT) += signal_compat.o -obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o +obj-y += traps.o idt.o irq.o irq_$(BITS).o dumpstack_$(BITS).o obj-y += time.o ioport.o dumpstack.o nmi.o obj-$(CONFIG_MODIFY_LDT_SYSCALL) += ldt.o obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o @@ -111,6 +111,7 @@ obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o obj-$(CONFIG_X86_PMEM_LEGACY_DEVICE) += pmem.o +obj-$(CONFIG_EISA) += eisa.o obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o @@ -126,11 +127,9 @@ obj-$(CONFIG_PERF_EVENTS) += perf_regs.o obj-$(CONFIG_TRACING) += tracepoint.o obj-$(CONFIG_SCHED_MC_PRIO) += itmt.o -ifdef CONFIG_FRAME_POINTER -obj-y += unwind_frame.o -else -obj-y += unwind_guess.o -endif +obj-$(CONFIG_ORC_UNWINDER) += unwind_orc.o +obj-$(CONFIG_FRAME_POINTER_UNWINDER) += unwind_frame.o +obj-$(CONFIG_GUESS_UNWINDER) += unwind_guess.o ### # 64 bit specific files diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 97bb2caf3428..f8ae286c1502 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -118,7 +118,7 @@ static u32 isa_irq_to_gsi[NR_IRQS_LEGACY] __read_mostly = { * This is just a simple wrapper around early_memremap(), * with sanity checks for phys == 0 and size == 0. 
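
For context on the HVCALL_FLUSH_* codes and HV_FLUSH_* bits added to hyperv.h earlier in this hunk, here is a minimal sketch of how a caller might combine them when building a TLB-flush hypercall input. The hv_flush_sketch layout and the hv_do_hypercall() call mentioned in the comment are assumptions used only for illustration; neither is defined by this patch.

#include <linux/types.h>

struct hv_flush_sketch {          /* assumed input layout, illustration only */
	u64 address_space;
	u64 flags;
	u64 processor_mask;
};

static void fill_flush_input(struct hv_flush_sketch *flush)
{
	flush->address_space  = 0;
	flush->flags          = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES |
				HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY;
	flush->processor_mask = 1;    /* flush on CPU 0 only */
	/*
	 * A caller would then issue something along the lines of:
	 * hv_do_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE, flush, NULL);
	 */
}
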
*/ -char *__init __acpi_map_table(unsigned long phys, unsigned long size) +void __init __iomem *__acpi_map_table(unsigned long phys, unsigned long size) { if (!phys || !size) @@ -127,7 +127,7 @@ char *__init __acpi_map_table(unsigned long phys, unsigned long size) return early_memremap(phys, size); } -void __init __acpi_unmap_table(char *map, unsigned long size) +void __init __acpi_unmap_table(void __iomem *map, unsigned long size) { if (!map || !size) return; @@ -199,8 +199,10 @@ static int __init acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end) { struct acpi_madt_local_x2apic *processor = NULL; +#ifdef CONFIG_X86_X2APIC int apic_id; u8 enabled; +#endif processor = (struct acpi_madt_local_x2apic *)header; @@ -209,9 +211,10 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end) acpi_table_print_madt_entry(header); +#ifdef CONFIG_X86_X2APIC apic_id = processor->local_apic_id; enabled = processor->lapic_flags & ACPI_MADT_ENABLED; -#ifdef CONFIG_X86_X2APIC + /* * We need to register disabled CPU as well to permit * counting disabled CPUs. This allows us to size @@ -1083,7 +1086,7 @@ static void __init mp_config_acpi_legacy_irqs(void) mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA; #endif set_bit(MP_ISA_BUS, mp_bus_not_pci); - pr_debug("Bus #%d is ISA\n", MP_ISA_BUS); + pr_debug("Bus #%d is ISA (nIRQs: %d)\n", MP_ISA_BUS, nr_legacy_irqs()); /* * Use the default configuration for the IRQs 0-15. Unless diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 32e14d137416..3344d3382e91 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -742,7 +742,16 @@ static void *bp_int3_handler, *bp_int3_addr; int poke_int3_handler(struct pt_regs *regs) { - /* bp_patching_in_progress */ + /* + * Having observed our INT3 instruction, we now must observe + * bp_patching_in_progress. + * + * in_progress = TRUE INT3 + * WMB RMB + * write INT3 if (in_progress) + * + * Idem for bp_int3_handler. + */ smp_rmb(); if (likely(!bp_patching_in_progress)) @@ -788,9 +797,8 @@ void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler) bp_int3_addr = (u8 *)addr + sizeof(int3); bp_patching_in_progress = true; /* - * Corresponding read barrier in int3 notifier for - * making sure the in_progress flags is correctly ordered wrt. - * patching + * Corresponding read barrier in int3 notifier for making sure the + * in_progress and handler are correctly ordered wrt. patching. */ smp_wmb(); @@ -815,9 +823,11 @@ void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler) text_poke(addr, opcode, sizeof(int3)); on_each_cpu(do_sync_core, NULL, 1); - + /* + * sync_core() implies an smp_mb() and orders this store against + * the writing of the new instruction. 
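
The ordering comments added to poke_int3_handler() and text_poke_bp() above describe a classic publish/observe pairing. A stripped-down sketch of that pattern follows, using the generic smp_wmb()/smp_rmb() helpers; patch_in_progress and patch_handler are illustrative stand-ins, not the kernel symbols.

#include <asm/barrier.h>

static bool patch_in_progress;
static void *patch_handler;

static void publisher(void *handler)
{
	patch_handler = handler;
	patch_in_progress = true;
	smp_wmb();              /* publish flag/handler before the INT3 write */
	/* ...write the INT3 byte into the instruction stream here... */
}

static int observer(void)
{
	/* ...we got here because an INT3 was hit... */
	smp_rmb();              /* pairs with smp_wmb() in publisher() */
	if (!patch_in_progress)
		return 0;       /* not a text_poke INT3, let others handle it */
	/* ...safe to consult patch_handler now... */
	return 1;
}
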
+ */ bp_patching_in_progress = false; - smp_wmb(); return addr; } diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 98b3dd8cf2bf..7834f73efbf1 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -177,8 +177,6 @@ static int disable_apic_timer __initdata; int local_apic_timer_c2_ok; EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); -int first_system_vector = FIRST_SYSTEM_VECTOR; - /* * Debug level, exported for io_apic.c */ @@ -599,9 +597,13 @@ static const struct x86_cpu_id deadline_match[] = { static void apic_check_deadline_errata(void) { - const struct x86_cpu_id *m = x86_match_cpu(deadline_match); + const struct x86_cpu_id *m; u32 rev; + if (!boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) + return; + + m = x86_match_cpu(deadline_match); if (!m) return; @@ -990,8 +992,7 @@ void setup_secondary_APIC_clock(void) */ static void local_apic_timer_interrupt(void) { - int cpu = smp_processor_id(); - struct clock_event_device *evt = &per_cpu(lapic_events, cpu); + struct clock_event_device *evt = this_cpu_ptr(&lapic_events); /* * Normally we should not be here till LAPIC has been initialized but @@ -1005,7 +1006,8 @@ static void local_apic_timer_interrupt(void) * spurious. */ if (!evt->event_handler) { - pr_warning("Spurious LAPIC timer interrupt on cpu %d\n", cpu); + pr_warning("Spurious LAPIC timer interrupt on cpu %d\n", + smp_processor_id()); /* Switch it off */ lapic_timer_shutdown(evt); return; @@ -1040,25 +1042,6 @@ __visible void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs) * interrupt lock, which is the WrongThing (tm) to do. */ entering_ack_irq(); - local_apic_timer_interrupt(); - exiting_irq(); - - set_irq_regs(old_regs); -} - -__visible void __irq_entry smp_trace_apic_timer_interrupt(struct pt_regs *regs) -{ - struct pt_regs *old_regs = set_irq_regs(regs); - - /* - * NOTE! We'd better ACK the irq immediately, - * because timer handling can be slow. - * - * update_process_times() expects us to have done irq_enter(). - * Besides, if we don't timer interrupts ignore the global - * interrupt lock, which is the WrongThing (tm) to do. - */ - entering_ack_irq(); trace_local_timer_entry(LOCAL_TIMER_VECTOR); local_apic_timer_interrupt(); trace_local_timer_exit(LOCAL_TIMER_VECTOR); @@ -1920,10 +1903,14 @@ void __init register_lapic_address(unsigned long address) /* * This interrupt should _never_ happen with our APIC/SMP architecture */ -static void __smp_spurious_interrupt(u8 vector) +__visible void __irq_entry smp_spurious_interrupt(struct pt_regs *regs) { + u8 vector = ~regs->orig_ax; u32 v; + entering_irq(); + trace_spurious_apic_entry(vector); + /* * Check if this really is a spurious interrupt and ACK it * if it is a vectored one. Just in case... 
@@ -1938,22 +1925,7 @@ static void __smp_spurious_interrupt(u8 vector) /* see sw-dev-man vol 3, chapter 7.4.13.5 */ pr_info("spurious APIC interrupt through vector %02x on CPU#%d, " "should never happen.\n", vector, smp_processor_id()); -} -__visible void __irq_entry smp_spurious_interrupt(struct pt_regs *regs) -{ - entering_irq(); - __smp_spurious_interrupt(~regs->orig_ax); - exiting_irq(); -} - -__visible void __irq_entry smp_trace_spurious_interrupt(struct pt_regs *regs) -{ - u8 vector = ~regs->orig_ax; - - entering_irq(); - trace_spurious_apic_entry(vector); - __smp_spurious_interrupt(vector); trace_spurious_apic_exit(vector); exiting_irq(); } @@ -1961,10 +1933,8 @@ __visible void __irq_entry smp_trace_spurious_interrupt(struct pt_regs *regs) /* * This interrupt should never happen with our APIC/SMP architecture */ -static void __smp_error_interrupt(struct pt_regs *regs) +__visible void __irq_entry smp_error_interrupt(struct pt_regs *regs) { - u32 v; - u32 i = 0; static const char * const error_interrupt_reason[] = { "Send CS error", /* APIC Error Bit 0 */ "Receive CS error", /* APIC Error Bit 1 */ @@ -1975,6 +1945,10 @@ static void __smp_error_interrupt(struct pt_regs *regs) "Received illegal vector", /* APIC Error Bit 6 */ "Illegal register address", /* APIC Error Bit 7 */ }; + u32 v, i = 0; + + entering_irq(); + trace_error_apic_entry(ERROR_APIC_VECTOR); /* First tickle the hardware, only then report what went on. -- REW */ if (lapic_get_maxlvt() > 3) /* Due to the Pentium erratum 3AP. */ @@ -1996,20 +1970,6 @@ static void __smp_error_interrupt(struct pt_regs *regs) apic_printk(APIC_DEBUG, KERN_CONT "\n"); -} - -__visible void __irq_entry smp_error_interrupt(struct pt_regs *regs) -{ - entering_irq(); - __smp_error_interrupt(regs); - exiting_irq(); -} - -__visible void __irq_entry smp_trace_error_interrupt(struct pt_regs *regs) -{ - entering_irq(); - trace_error_apic_entry(ERROR_APIC_VECTOR); - __smp_error_interrupt(regs); trace_error_apic_exit(ERROR_APIC_VECTOR); exiting_irq(); } diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 237e9c2341c7..70e48aa6af98 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1243,7 +1243,7 @@ static void io_apic_print_entries(unsigned int apic, unsigned int nr_entries) entry.vector, entry.irr, entry.delivery_status); if (ir_entry->format) printk(KERN_DEBUG "%s, remapped, I(%04X), Z(%X)\n", - buf, (ir_entry->index << 15) | ir_entry->index, + buf, (ir_entry->index2 << 15) | ir_entry->index, ir_entry->zero); else printk(KERN_DEBUG "%s, %s, D(%02X), M(%1d)\n", diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index b3af457ed667..88c214e75a6b 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -166,7 +166,7 @@ static int __assign_irq_vector(int irq, struct apic_chip_data *d, offset = current_offset; next: vector += 16; - if (vector >= first_system_vector) { + if (vector >= FIRST_SYSTEM_VECTOR) { offset = (offset + 1) % 16; vector = FIRST_EXTERNAL_VECTOR + offset; } diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 880aa093268d..710edab9e644 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -4,9 +4,6 @@ #include <asm/ucontext.h> -#include <linux/lguest.h> -#include "../../../drivers/lguest/lg.h" - #define __SYSCALL_I386(nr, sym, qual) [nr] = 1, static char syscalls[] = { #include <asm/syscalls_32.h> @@ -62,23 +59,6 @@ void foo(void) OFFSET(stack_canary_offset, 
stack_canary, canary); #endif -#if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE) - BLANK(); - OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); - OFFSET(LGUEST_DATA_irq_pending, lguest_data, irq_pending); - - BLANK(); - OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc); - OFFSET(LGUEST_PAGES_host_idt_desc, lguest_pages, state.host_idt_desc); - OFFSET(LGUEST_PAGES_host_cr3, lguest_pages, state.host_cr3); - OFFSET(LGUEST_PAGES_host_sp, lguest_pages, state.host_sp); - OFFSET(LGUEST_PAGES_guest_gdt_desc, lguest_pages,state.guest_gdt_desc); - OFFSET(LGUEST_PAGES_guest_idt_desc, lguest_pages,state.guest_idt_desc); - OFFSET(LGUEST_PAGES_guest_gdt, lguest_pages, state.guest_gdt); - OFFSET(LGUEST_PAGES_regs_trapnum, lguest_pages, regs.trapnum); - OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode); - OFFSET(LGUEST_PAGES_regs, lguest_pages, regs); -#endif BLANK(); DEFINE(__NR_syscall_max, sizeof(syscalls) - 1); DEFINE(NR_syscalls, sizeof(syscalls)); diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 99332f550c48..cf42206926af 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -20,7 +20,6 @@ static char syscalls_ia32[] = { int main(void) { #ifdef CONFIG_PARAVIRT - OFFSET(PV_IRQ_adjust_exception_frame, pv_irq_ops, adjust_exception_frame); OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64); OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs); BLANK(); diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index cdf82492b770..e17942c131c8 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -33,7 +33,7 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o -obj-$(CONFIG_INTEL_RDT_A) += intel_rdt.o intel_rdt_rdtgroup.o intel_rdt_schemata.o +obj-$(CONFIG_INTEL_RDT) += intel_rdt.o intel_rdt_rdtgroup.o intel_rdt_monitor.o intel_rdt_ctrlmondata.o obj-$(CONFIG_X86_MCE) += mcheck/ obj-$(CONFIG_MTRR) += mtrr/ diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 110ca5d2bb87..9862e2cd6d93 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -297,13 +297,29 @@ static int nearby_node(int apicid) } #endif +#ifdef CONFIG_SMP +/* + * Fix up cpu_core_id for pre-F17h systems to be in the + * [0 .. cores_per_node - 1] range. Not really needed but + * kept so as not to break existing setups. + */ +static void legacy_fixup_core_id(struct cpuinfo_x86 *c) +{ + u32 cus_per_node; + + if (c->x86 >= 0x17) + return; + + cus_per_node = c->x86_max_cores / nodes_per_socket; + c->cpu_core_id %= cus_per_node; +} + /* * Fixup core topology information for * (1) AMD multi-node processors * Assumption: Number of cores in each internal node is the same. * (2) AMD processors supporting compute units */ -#ifdef CONFIG_SMP static void amd_get_topology(struct cpuinfo_x86 *c) { u8 node_id; @@ -354,15 +370,9 @@ static void amd_get_topology(struct cpuinfo_x86 *c) } else return; - /* fixup multi-node processor information */ if (nodes_per_socket > 1) { - u32 cus_per_node; - set_cpu_cap(c, X86_FEATURE_AMD_DCM); - cus_per_node = c->x86_max_cores / nodes_per_socket; - - /* core id has to be in the [0 .. 
cores_per_node - 1] range */ - c->cpu_core_id %= cus_per_node; + legacy_fixup_core_id(c); } } #endif diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index b95cd94ca97b..fb1d3358a4af 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -333,6 +333,19 @@ static void setup_pcid(struct cpuinfo_x86 *c) { if (cpu_has(c, X86_FEATURE_PCID)) { if (cpu_has(c, X86_FEATURE_PGE)) { + /* + * We'd like to use cr4_set_bits_and_update_boot(), + * but we can't. CR4.PCIDE is special and can only + * be set in long mode, and the early CPU init code + * doesn't know this and would try to restore CR4.PCIDE + * prior to entering long mode. + * + * Instead, we rely on the fact that hotplug, resume, + * etc all fully restore CR4 before they write anything + * that could have nonzero PCID bits to CR3. CR4.PCIDE + * has no effect on the page tables themselves, so we + * don't need it to be restored early. + */ cr4_set_bits(X86_CR4_PCIDE); } else { /* @@ -1329,15 +1342,6 @@ static __init int setup_disablecpuid(char *arg) __setup("clearcpuid=", setup_disablecpuid); #ifdef CONFIG_X86_64 -struct desc_ptr idt_descr __ro_after_init = { - .size = NR_VECTORS * 16 - 1, - .address = (unsigned long) idt_table, -}; -const struct desc_ptr debug_idt_descr = { - .size = NR_VECTORS * 16 - 1, - .address = (unsigned long) debug_idt_table, -}; - DEFINE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) __aligned(PAGE_SIZE) __visible; @@ -1592,6 +1596,7 @@ void cpu_init(void) mmgrab(&init_mm); me->active_mm = &init_mm; BUG_ON(me->mm); + initialize_tlbstate_and_flush(); enter_lazy_tlb(&init_mm, me); load_sp0(t, ¤t->thread); @@ -1646,6 +1651,7 @@ void cpu_init(void) mmgrab(&init_mm); curr->active_mm = &init_mm; BUG_ON(curr->mm); + initialize_tlbstate_and_flush(); enter_lazy_tlb(&init_mm, curr); load_sp0(t, thread); diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index c55fb2cb2acc..24f749324c0f 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -811,7 +811,24 @@ static int __cache_amd_cpumap_setup(unsigned int cpu, int index, struct cacheinfo *this_leaf; int i, sibling; - if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { + /* + * For L3, always use the pre-calculated cpu_llc_shared_mask + * to derive shared_cpu_map. 
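
Looking back at the legacy_fixup_core_id() helper introduced in the amd.c hunk above, the fixup is a single modulo. A standalone userspace sketch with assumed topology values (16 cores per socket, 2 nodes per socket) shows the effect:

#include <stdio.h>

int main(void)
{
	unsigned int x86_max_cores    = 16;  /* assumed socket-wide core count */
	unsigned int nodes_per_socket = 2;   /* assumed */
	unsigned int cus_per_node = x86_max_cores / nodes_per_socket;
	unsigned int cpu_core_id  = 11;      /* raw id reported by the CPU */

	cpu_core_id %= cus_per_node;         /* same fixup as legacy_fixup_core_id() */
	printf("%u\n", cpu_core_id);         /* prints 3 */
	return 0;
}
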
+ */ + if (index == 3) { + for_each_cpu(i, cpu_llc_shared_mask(cpu)) { + this_cpu_ci = get_cpu_cacheinfo(i); + if (!this_cpu_ci->info_list) + continue; + this_leaf = this_cpu_ci->info_list + index; + for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) { + if (!cpu_online(sibling)) + continue; + cpumask_set_cpu(sibling, + &this_leaf->shared_cpu_map); + } + } + } else if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { unsigned int apicid, nshared, first, last; this_leaf = this_cpu_ci->info_list + index; @@ -839,19 +856,6 @@ static int __cache_amd_cpumap_setup(unsigned int cpu, int index, &this_leaf->shared_cpu_map); } } - } else if (index == 3) { - for_each_cpu(i, cpu_llc_shared_mask(cpu)) { - this_cpu_ci = get_cpu_cacheinfo(i); - if (!this_cpu_ci->info_list) - continue; - this_leaf = this_cpu_ci->info_list + index; - for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) { - if (!cpu_online(sibling)) - continue; - cpumask_set_cpu(sibling, - &this_leaf->shared_cpu_map); - } - } } else return 0; diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c index 5b366462f579..cd5fc61ba450 100644 --- a/arch/x86/kernel/cpu/intel_rdt.c +++ b/arch/x86/kernel/cpu/intel_rdt.c @@ -30,7 +30,8 @@ #include <linux/cpuhotplug.h> #include <asm/intel-family.h> -#include <asm/intel_rdt.h> +#include <asm/intel_rdt_sched.h> +#include "intel_rdt.h" #define MAX_MBA_BW 100u #define MBA_IS_LINEAR 0x4 @@ -38,7 +39,13 @@ /* Mutex to protect rdtgroup access. */ DEFINE_MUTEX(rdtgroup_mutex); -DEFINE_PER_CPU_READ_MOSTLY(int, cpu_closid); +/* + * The cached intel_pqr_state is strictly per CPU and can never be + * updated from a remote CPU. Functions which modify the state + * are called with interrupts disabled and no preemption, which + * is sufficient for the protection. + */ +DEFINE_PER_CPU(struct intel_pqr_state, pqr_state); /* * Used to store the max resource name width and max resource data width @@ -46,6 +53,12 @@ DEFINE_PER_CPU_READ_MOSTLY(int, cpu_closid); */ int max_name_width, max_data_width; +/* + * Global boolean for rdt_alloc which is true if any + * resource allocation is enabled. 
+ */ +bool rdt_alloc_capable; + static void mba_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r); static void @@ -54,7 +67,9 @@ cat_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r); #define domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].domains) struct rdt_resource rdt_resources_all[] = { + [RDT_RESOURCE_L3] = { + .rid = RDT_RESOURCE_L3, .name = "L3", .domains = domain_init(RDT_RESOURCE_L3), .msr_base = IA32_L3_CBM_BASE, @@ -67,8 +82,11 @@ struct rdt_resource rdt_resources_all[] = { }, .parse_ctrlval = parse_cbm, .format_str = "%d=%0*x", + .fflags = RFTYPE_RES_CACHE, }, + [RDT_RESOURCE_L3DATA] = { + .rid = RDT_RESOURCE_L3DATA, .name = "L3DATA", .domains = domain_init(RDT_RESOURCE_L3DATA), .msr_base = IA32_L3_CBM_BASE, @@ -81,8 +99,11 @@ struct rdt_resource rdt_resources_all[] = { }, .parse_ctrlval = parse_cbm, .format_str = "%d=%0*x", + .fflags = RFTYPE_RES_CACHE, }, + [RDT_RESOURCE_L3CODE] = { + .rid = RDT_RESOURCE_L3CODE, .name = "L3CODE", .domains = domain_init(RDT_RESOURCE_L3CODE), .msr_base = IA32_L3_CBM_BASE, @@ -95,8 +116,11 @@ struct rdt_resource rdt_resources_all[] = { }, .parse_ctrlval = parse_cbm, .format_str = "%d=%0*x", + .fflags = RFTYPE_RES_CACHE, }, + [RDT_RESOURCE_L2] = { + .rid = RDT_RESOURCE_L2, .name = "L2", .domains = domain_init(RDT_RESOURCE_L2), .msr_base = IA32_L2_CBM_BASE, @@ -109,8 +133,11 @@ struct rdt_resource rdt_resources_all[] = { }, .parse_ctrlval = parse_cbm, .format_str = "%d=%0*x", + .fflags = RFTYPE_RES_CACHE, }, + [RDT_RESOURCE_MBA] = { + .rid = RDT_RESOURCE_MBA, .name = "MB", .domains = domain_init(RDT_RESOURCE_MBA), .msr_base = IA32_MBA_THRTL_BASE, @@ -118,6 +145,7 @@ struct rdt_resource rdt_resources_all[] = { .cache_level = 3, .parse_ctrlval = parse_bw, .format_str = "%d=%*d", + .fflags = RFTYPE_RES_MB, }, }; @@ -144,33 +172,28 @@ static unsigned int cbm_idx(struct rdt_resource *r, unsigned int closid) * is always 20 on hsw server parts. The minimum cache bitmask length * allowed for HSW server is always 2 bits. Hardcode all of them. 
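
The Haswell probe comment above hard-codes a 20-bit capacity bitmask, and the reworked cache_alloc_hsw_probe() in the next hunk writes BIT_MASK(20) - 1 to IA32_L3_CBM_BASE to verify it sticks. A quick standalone check of that constant (BIT_MASK(n) expands to 1UL << n here):

#include <stdio.h>

int main(void)
{
	unsigned long max_cbm = (1UL << 20) - 1;   /* == BIT_MASK(20) - 1 */

	printf("0x%lx\n", max_cbm);                /* prints 0xfffff: 20 bits set */
	return 0;
}
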
*/ -static inline bool cache_alloc_hsw_probe(void) +static inline void cache_alloc_hsw_probe(void) { - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && - boot_cpu_data.x86 == 6 && - boot_cpu_data.x86_model == INTEL_FAM6_HASWELL_X) { - struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3]; - u32 l, h, max_cbm = BIT_MASK(20) - 1; - - if (wrmsr_safe(IA32_L3_CBM_BASE, max_cbm, 0)) - return false; - rdmsr(IA32_L3_CBM_BASE, l, h); + struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3]; + u32 l, h, max_cbm = BIT_MASK(20) - 1; - /* If all the bits were set in MSR, return success */ - if (l != max_cbm) - return false; + if (wrmsr_safe(IA32_L3_CBM_BASE, max_cbm, 0)) + return; + rdmsr(IA32_L3_CBM_BASE, l, h); - r->num_closid = 4; - r->default_ctrl = max_cbm; - r->cache.cbm_len = 20; - r->cache.min_cbm_bits = 2; - r->capable = true; - r->enabled = true; + /* If all the bits were set in MSR, return success */ + if (l != max_cbm) + return; - return true; - } + r->num_closid = 4; + r->default_ctrl = max_cbm; + r->cache.cbm_len = 20; + r->cache.shareable_bits = 0xc0000; + r->cache.min_cbm_bits = 2; + r->alloc_capable = true; + r->alloc_enabled = true; - return false; + rdt_alloc_capable = true; } /* @@ -213,15 +236,14 @@ static bool rdt_get_mem_config(struct rdt_resource *r) return false; } r->data_width = 3; - rdt_get_mba_infofile(r); - r->capable = true; - r->enabled = true; + r->alloc_capable = true; + r->alloc_enabled = true; return true; } -static void rdt_get_cache_config(int idx, struct rdt_resource *r) +static void rdt_get_cache_alloc_cfg(int idx, struct rdt_resource *r) { union cpuid_0x10_1_eax eax; union cpuid_0x10_x_edx edx; @@ -231,10 +253,10 @@ static void rdt_get_cache_config(int idx, struct rdt_resource *r) r->num_closid = edx.split.cos_max + 1; r->cache.cbm_len = eax.split.cbm_len + 1; r->default_ctrl = BIT_MASK(eax.split.cbm_len + 1) - 1; + r->cache.shareable_bits = ebx & r->default_ctrl; r->data_width = (r->cache.cbm_len + 3) / 4; - rdt_get_cache_infofile(r); - r->capable = true; - r->enabled = true; + r->alloc_capable = true; + r->alloc_enabled = true; } static void rdt_get_cdp_l3_config(int type) @@ -246,12 +268,12 @@ static void rdt_get_cdp_l3_config(int type) r->cache.cbm_len = r_l3->cache.cbm_len; r->default_ctrl = r_l3->default_ctrl; r->data_width = (r->cache.cbm_len + 3) / 4; - r->capable = true; + r->alloc_capable = true; /* * By default, CDP is disabled. CDP can be enabled by mount parameter * "cdp" during resctrl file system mount time. 
*/ - r->enabled = false; + r->alloc_enabled = false; } static int get_cache_id(int cpu, int level) @@ -300,6 +322,19 @@ cat_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r) wrmsrl(r->msr_base + cbm_idx(r, i), d->ctrl_val[i]); } +struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r) +{ + struct rdt_domain *d; + + list_for_each_entry(d, &r->domains, list) { + /* Find the domain that contains this CPU */ + if (cpumask_test_cpu(cpu, &d->cpu_mask)) + return d; + } + + return NULL; +} + void rdt_ctrl_update(void *arg) { struct msr_param *m = arg; @@ -307,12 +342,10 @@ void rdt_ctrl_update(void *arg) int cpu = smp_processor_id(); struct rdt_domain *d; - list_for_each_entry(d, &r->domains, list) { - /* Find the domain that contains this CPU */ - if (cpumask_test_cpu(cpu, &d->cpu_mask)) { - r->msr_update(d, m, r); - return; - } + d = get_domain_from_cpu(cpu, r); + if (d) { + r->msr_update(d, m, r); + return; } pr_warn_once("cpu %d not found in any domain for resource %s\n", cpu, r->name); @@ -326,8 +359,8 @@ void rdt_ctrl_update(void *arg) * caller, return the first domain whose id is bigger than the input id. * The domain list is sorted by id in ascending order. */ -static struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id, - struct list_head **pos) +struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id, + struct list_head **pos) { struct rdt_domain *d; struct list_head *l; @@ -377,6 +410,44 @@ static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d) return 0; } +static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d) +{ + size_t tsize; + + if (is_llc_occupancy_enabled()) { + d->rmid_busy_llc = kcalloc(BITS_TO_LONGS(r->num_rmid), + sizeof(unsigned long), + GFP_KERNEL); + if (!d->rmid_busy_llc) + return -ENOMEM; + INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo); + } + if (is_mbm_total_enabled()) { + tsize = sizeof(*d->mbm_total); + d->mbm_total = kcalloc(r->num_rmid, tsize, GFP_KERNEL); + if (!d->mbm_total) { + kfree(d->rmid_busy_llc); + return -ENOMEM; + } + } + if (is_mbm_local_enabled()) { + tsize = sizeof(*d->mbm_local); + d->mbm_local = kcalloc(r->num_rmid, tsize, GFP_KERNEL); + if (!d->mbm_local) { + kfree(d->rmid_busy_llc); + kfree(d->mbm_total); + return -ENOMEM; + } + } + + if (is_mbm_enabled()) { + INIT_DELAYED_WORK(&d->mbm_over, mbm_handle_overflow); + mbm_setup_overflow_handler(d, MBM_OVERFLOW_INTERVAL); + } + + return 0; +} + /* * domain_add_cpu - Add a cpu to a resource's domain list. * @@ -412,14 +483,26 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r) return; d->id = id; + cpumask_set_cpu(cpu, &d->cpu_mask); - if (domain_setup_ctrlval(r, d)) { + if (r->alloc_capable && domain_setup_ctrlval(r, d)) { + kfree(d); + return; + } + + if (r->mon_capable && domain_setup_mon_state(r, d)) { kfree(d); return; } - cpumask_set_cpu(cpu, &d->cpu_mask); list_add_tail(&d->list, add_pos); + + /* + * If resctrl is mounted, add + * per domain monitor data directories. + */ + if (static_branch_unlikely(&rdt_mon_enable_key)) + mkdir_mondata_subdir_allrdtgrp(r, d); } static void domain_remove_cpu(int cpu, struct rdt_resource *r) @@ -435,19 +518,58 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r) cpumask_clear_cpu(cpu, &d->cpu_mask); if (cpumask_empty(&d->cpu_mask)) { + /* + * If resctrl is mounted, remove all the + * per domain monitor data directories. 
+ */ + if (static_branch_unlikely(&rdt_mon_enable_key)) + rmdir_mondata_subdir_allrdtgrp(r, d->id); kfree(d->ctrl_val); + kfree(d->rmid_busy_llc); + kfree(d->mbm_total); + kfree(d->mbm_local); list_del(&d->list); + if (is_mbm_enabled()) + cancel_delayed_work(&d->mbm_over); + if (is_llc_occupancy_enabled() && has_busy_rmid(r, d)) { + /* + * When a package is going down, forcefully + * decrement rmid->ebusy. There is no way to know + * that the L3 was flushed and hence may lead to + * incorrect counts in rare scenarios, but leaving + * the RMID as busy creates RMID leaks if the + * package never comes back. + */ + __check_limbo(d, true); + cancel_delayed_work(&d->cqm_limbo); + } + kfree(d); + return; + } + + if (r == &rdt_resources_all[RDT_RESOURCE_L3]) { + if (is_mbm_enabled() && cpu == d->mbm_work_cpu) { + cancel_delayed_work(&d->mbm_over); + mbm_setup_overflow_handler(d, 0); + } + if (is_llc_occupancy_enabled() && cpu == d->cqm_work_cpu && + has_busy_rmid(r, d)) { + cancel_delayed_work(&d->cqm_limbo); + cqm_setup_limbo_handler(d, 0); + } } } -static void clear_closid(int cpu) +static void clear_closid_rmid(int cpu) { struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); - per_cpu(cpu_closid, cpu) = 0; - state->closid = 0; - wrmsr(MSR_IA32_PQR_ASSOC, state->rmid, 0); + state->default_closid = 0; + state->default_rmid = 0; + state->cur_closid = 0; + state->cur_rmid = 0; + wrmsr(IA32_PQR_ASSOC, 0, 0); } static int intel_rdt_online_cpu(unsigned int cpu) @@ -459,12 +581,23 @@ static int intel_rdt_online_cpu(unsigned int cpu) domain_add_cpu(cpu, r); /* The cpu is set in default rdtgroup after online. */ cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask); - clear_closid(cpu); + clear_closid_rmid(cpu); mutex_unlock(&rdtgroup_mutex); return 0; } +static void clear_childcpus(struct rdtgroup *r, unsigned int cpu) +{ + struct rdtgroup *cr; + + list_for_each_entry(cr, &r->mon.crdtgrp_list, mon.crdtgrp_list) { + if (cpumask_test_and_clear_cpu(cpu, &cr->cpu_mask)) { + break; + } + } +} + static int intel_rdt_offline_cpu(unsigned int cpu) { struct rdtgroup *rdtgrp; @@ -474,10 +607,12 @@ static int intel_rdt_offline_cpu(unsigned int cpu) for_each_capable_rdt_resource(r) domain_remove_cpu(cpu, r); list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { - if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) + if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) { + clear_childcpus(rdtgrp, cpu); break; + } } - clear_closid(cpu); + clear_closid_rmid(cpu); mutex_unlock(&rdtgroup_mutex); return 0; @@ -492,7 +627,7 @@ static __init void rdt_init_padding(void) struct rdt_resource *r; int cl; - for_each_capable_rdt_resource(r) { + for_each_alloc_capable_rdt_resource(r) { cl = strlen(r->name); if (cl > max_name_width) max_name_width = cl; @@ -502,38 +637,153 @@ static __init void rdt_init_padding(void) } } -static __init bool get_rdt_resources(void) +enum { + RDT_FLAG_CMT, + RDT_FLAG_MBM_TOTAL, + RDT_FLAG_MBM_LOCAL, + RDT_FLAG_L3_CAT, + RDT_FLAG_L3_CDP, + RDT_FLAG_L2_CAT, + RDT_FLAG_MBA, +}; + +#define RDT_OPT(idx, n, f) \ +[idx] = { \ + .name = n, \ + .flag = f \ +} + +struct rdt_options { + char *name; + int flag; + bool force_off, force_on; +}; + +static struct rdt_options rdt_options[] __initdata = { + RDT_OPT(RDT_FLAG_CMT, "cmt", X86_FEATURE_CQM_OCCUP_LLC), + RDT_OPT(RDT_FLAG_MBM_TOTAL, "mbmtotal", X86_FEATURE_CQM_MBM_TOTAL), + RDT_OPT(RDT_FLAG_MBM_LOCAL, "mbmlocal", X86_FEATURE_CQM_MBM_LOCAL), + RDT_OPT(RDT_FLAG_L3_CAT, "l3cat", X86_FEATURE_CAT_L3), + RDT_OPT(RDT_FLAG_L3_CDP, "l3cdp", 
X86_FEATURE_CDP_L3), + RDT_OPT(RDT_FLAG_L2_CAT, "l2cat", X86_FEATURE_CAT_L2), + RDT_OPT(RDT_FLAG_MBA, "mba", X86_FEATURE_MBA), +}; +#define NUM_RDT_OPTIONS ARRAY_SIZE(rdt_options) + +static int __init set_rdt_options(char *str) +{ + struct rdt_options *o; + bool force_off; + char *tok; + + if (*str == '=') + str++; + while ((tok = strsep(&str, ",")) != NULL) { + force_off = *tok == '!'; + if (force_off) + tok++; + for (o = rdt_options; o < &rdt_options[NUM_RDT_OPTIONS]; o++) { + if (strcmp(tok, o->name) == 0) { + if (force_off) + o->force_off = true; + else + o->force_on = true; + break; + } + } + } + return 1; +} +__setup("rdt", set_rdt_options); + +static bool __init rdt_cpu_has(int flag) +{ + bool ret = boot_cpu_has(flag); + struct rdt_options *o; + + if (!ret) + return ret; + + for (o = rdt_options; o < &rdt_options[NUM_RDT_OPTIONS]; o++) { + if (flag == o->flag) { + if (o->force_off) + ret = false; + if (o->force_on) + ret = true; + break; + } + } + return ret; +} + +static __init bool get_rdt_alloc_resources(void) { bool ret = false; - if (cache_alloc_hsw_probe()) + if (rdt_alloc_capable) return true; if (!boot_cpu_has(X86_FEATURE_RDT_A)) return false; - if (boot_cpu_has(X86_FEATURE_CAT_L3)) { - rdt_get_cache_config(1, &rdt_resources_all[RDT_RESOURCE_L3]); - if (boot_cpu_has(X86_FEATURE_CDP_L3)) { + if (rdt_cpu_has(X86_FEATURE_CAT_L3)) { + rdt_get_cache_alloc_cfg(1, &rdt_resources_all[RDT_RESOURCE_L3]); + if (rdt_cpu_has(X86_FEATURE_CDP_L3)) { rdt_get_cdp_l3_config(RDT_RESOURCE_L3DATA); rdt_get_cdp_l3_config(RDT_RESOURCE_L3CODE); } ret = true; } - if (boot_cpu_has(X86_FEATURE_CAT_L2)) { + if (rdt_cpu_has(X86_FEATURE_CAT_L2)) { /* CPUID 0x10.2 fields are same format at 0x10.1 */ - rdt_get_cache_config(2, &rdt_resources_all[RDT_RESOURCE_L2]); + rdt_get_cache_alloc_cfg(2, &rdt_resources_all[RDT_RESOURCE_L2]); ret = true; } - if (boot_cpu_has(X86_FEATURE_MBA)) { + if (rdt_cpu_has(X86_FEATURE_MBA)) { if (rdt_get_mem_config(&rdt_resources_all[RDT_RESOURCE_MBA])) ret = true; } - return ret; } +static __init bool get_rdt_mon_resources(void) +{ + if (rdt_cpu_has(X86_FEATURE_CQM_OCCUP_LLC)) + rdt_mon_features |= (1 << QOS_L3_OCCUP_EVENT_ID); + if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) + rdt_mon_features |= (1 << QOS_L3_MBM_TOTAL_EVENT_ID); + if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL)) + rdt_mon_features |= (1 << QOS_L3_MBM_LOCAL_EVENT_ID); + + if (!rdt_mon_features) + return false; + + return !rdt_get_mon_l3_config(&rdt_resources_all[RDT_RESOURCE_L3]); +} + +static __init void rdt_quirks(void) +{ + switch (boot_cpu_data.x86_model) { + case INTEL_FAM6_HASWELL_X: + if (!rdt_options[RDT_FLAG_L3_CAT].force_off) + cache_alloc_hsw_probe(); + break; + case INTEL_FAM6_SKYLAKE_X: + if (boot_cpu_data.x86_mask <= 4) + set_rdt_options("!cmt,!mbmtotal,!mbmlocal,!l3cat"); + } +} + +static __init bool get_rdt_resources(void) +{ + rdt_quirks(); + rdt_alloc_capable = get_rdt_alloc_resources(); + rdt_mon_capable = get_rdt_mon_resources(); + + return (rdt_mon_capable || rdt_alloc_capable); +} + static int __init intel_rdt_late_init(void) { struct rdt_resource *r; @@ -556,9 +806,12 @@ static int __init intel_rdt_late_init(void) return ret; } - for_each_capable_rdt_resource(r) + for_each_alloc_capable_rdt_resource(r) pr_info("Intel RDT %s allocation detected\n", r->name); + for_each_mon_capable_rdt_resource(r) + pr_info("Intel RDT %s monitoring detected\n", r->name); + return 0; } diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h new file mode 100644 index 000000000000..ebaddaeef023 
--- /dev/null +++ b/arch/x86/kernel/cpu/intel_rdt.h @@ -0,0 +1,440 @@ +#ifndef _ASM_X86_INTEL_RDT_H +#define _ASM_X86_INTEL_RDT_H + +#include <linux/sched.h> +#include <linux/kernfs.h> +#include <linux/jump_label.h> + +#define IA32_L3_QOS_CFG 0xc81 +#define IA32_L3_CBM_BASE 0xc90 +#define IA32_L2_CBM_BASE 0xd10 +#define IA32_MBA_THRTL_BASE 0xd50 + +#define L3_QOS_CDP_ENABLE 0x01ULL + +/* + * Event IDs are used to program IA32_QM_EVTSEL before reading event + * counter from IA32_QM_CTR + */ +#define QOS_L3_OCCUP_EVENT_ID 0x01 +#define QOS_L3_MBM_TOTAL_EVENT_ID 0x02 +#define QOS_L3_MBM_LOCAL_EVENT_ID 0x03 + +#define CQM_LIMBOCHECK_INTERVAL 1000 + +#define MBM_CNTR_WIDTH 24 +#define MBM_OVERFLOW_INTERVAL 1000 + +#define RMID_VAL_ERROR BIT_ULL(63) +#define RMID_VAL_UNAVAIL BIT_ULL(62) + +DECLARE_STATIC_KEY_FALSE(rdt_enable_key); + +/** + * struct mon_evt - Entry in the event list of a resource + * @evtid: event id + * @name: name of the event + */ +struct mon_evt { + u32 evtid; + char *name; + struct list_head list; +}; + +/** + * struct mon_data_bits - Monitoring details for each event file + * @rid: Resource id associated with the event file. + * @evtid: Event id associated with the event file + * @domid: The domain to which the event file belongs + */ +union mon_data_bits { + void *priv; + struct { + unsigned int rid : 10; + unsigned int evtid : 8; + unsigned int domid : 14; + } u; +}; + +struct rmid_read { + struct rdtgroup *rgrp; + struct rdt_domain *d; + int evtid; + bool first; + u64 val; +}; + +extern unsigned int intel_cqm_threshold; +extern bool rdt_alloc_capable; +extern bool rdt_mon_capable; +extern unsigned int rdt_mon_features; + +enum rdt_group_type { + RDTCTRL_GROUP = 0, + RDTMON_GROUP, + RDT_NUM_GROUP, +}; + +/** + * struct mongroup - store mon group's data in resctrl fs. + * @mon_data_kn kernlfs node for the mon_data directory + * @parent: parent rdtgrp + * @crdtgrp_list: child rdtgroup node list + * @rmid: rmid for this rdtgroup + */ +struct mongroup { + struct kernfs_node *mon_data_kn; + struct rdtgroup *parent; + struct list_head crdtgrp_list; + u32 rmid; +}; + +/** + * struct rdtgroup - store rdtgroup's data in resctrl file system. + * @kn: kernfs node + * @rdtgroup_list: linked list for all rdtgroups + * @closid: closid for this rdtgroup + * @cpu_mask: CPUs assigned to this rdtgroup + * @flags: status bits + * @waitcount: how many cpus expect to find this + * group when they acquire rdtgroup_mutex + * @type: indicates type of this rdtgroup - either + * monitor only or ctrl_mon group + * @mon: mongroup related data + */ +struct rdtgroup { + struct kernfs_node *kn; + struct list_head rdtgroup_list; + u32 closid; + struct cpumask cpu_mask; + int flags; + atomic_t waitcount; + enum rdt_group_type type; + struct mongroup mon; +}; + +/* rdtgroup.flags */ +#define RDT_DELETED 1 + +/* rftype.flags */ +#define RFTYPE_FLAGS_CPUS_LIST 1 + +/* + * Define the file type flags for base and info directories. 
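
The mon_data_bits union declared earlier in this header packs a resource id (10 bits), event id (8 bits) and domain id (14 bits) into the kernfs node's priv pointer, which is how rdtgroup_mondata_show() later recovers them. A minimal sketch of that round trip, reusing only identifiers defined in this header and series:

union mon_data_bits md = { .priv = NULL };

/* encode: this value is stored as kn->priv when a mon_data file is created */
md.u.rid   = RDT_RESOURCE_L3;
md.u.evtid = QOS_L3_OCCUP_EVENT_ID;
md.u.domid = 0;

/* decode on read: md.priv = of->kn->priv; then for example:
 * r = &rdt_resources_all[md.u.rid];
 * d = rdt_find_domain(r, md.u.domid, NULL);
 */
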
+ */ +#define RFTYPE_INFO BIT(0) +#define RFTYPE_BASE BIT(1) +#define RF_CTRLSHIFT 4 +#define RF_MONSHIFT 5 +#define RFTYPE_CTRL BIT(RF_CTRLSHIFT) +#define RFTYPE_MON BIT(RF_MONSHIFT) +#define RFTYPE_RES_CACHE BIT(8) +#define RFTYPE_RES_MB BIT(9) +#define RF_CTRL_INFO (RFTYPE_INFO | RFTYPE_CTRL) +#define RF_MON_INFO (RFTYPE_INFO | RFTYPE_MON) +#define RF_CTRL_BASE (RFTYPE_BASE | RFTYPE_CTRL) + +/* List of all resource groups */ +extern struct list_head rdt_all_groups; + +extern int max_name_width, max_data_width; + +int __init rdtgroup_init(void); + +/** + * struct rftype - describe each file in the resctrl file system + * @name: File name + * @mode: Access mode + * @kf_ops: File operations + * @flags: File specific RFTYPE_FLAGS_* flags + * @fflags: File specific RF_* or RFTYPE_* flags + * @seq_show: Show content of the file + * @write: Write to the file + */ +struct rftype { + char *name; + umode_t mode; + struct kernfs_ops *kf_ops; + unsigned long flags; + unsigned long fflags; + + int (*seq_show)(struct kernfs_open_file *of, + struct seq_file *sf, void *v); + /* + * write() is the generic write callback which maps directly to + * kernfs write operation and overrides all other operations. + * Maximum write size is determined by ->max_write_len. + */ + ssize_t (*write)(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off); +}; + +/** + * struct mbm_state - status for each MBM counter in each domain + * @chunks: Total data moved (multiply by rdt_group.mon_scale to get bytes) + * @prev_msr Value of IA32_QM_CTR for this RMID last time we read it + */ +struct mbm_state { + u64 chunks; + u64 prev_msr; +}; + +/** + * struct rdt_domain - group of cpus sharing an RDT resource + * @list: all instances of this resource + * @id: unique id for this instance + * @cpu_mask: which cpus share this resource + * @rmid_busy_llc: + * bitmap of which limbo RMIDs are above threshold + * @mbm_total: saved state for MBM total bandwidth + * @mbm_local: saved state for MBM local bandwidth + * @mbm_over: worker to periodically read MBM h/w counters + * @cqm_limbo: worker to periodically read CQM h/w counters + * @mbm_work_cpu: + * worker cpu for MBM h/w counters + * @cqm_work_cpu: + * worker cpu for CQM h/w counters + * @ctrl_val: array of cache or mem ctrl values (indexed by CLOSID) + * @new_ctrl: new ctrl value to be loaded + * @have_new_ctrl: did user provide new_ctrl for this domain + */ +struct rdt_domain { + struct list_head list; + int id; + struct cpumask cpu_mask; + unsigned long *rmid_busy_llc; + struct mbm_state *mbm_total; + struct mbm_state *mbm_local; + struct delayed_work mbm_over; + struct delayed_work cqm_limbo; + int mbm_work_cpu; + int cqm_work_cpu; + u32 *ctrl_val; + u32 new_ctrl; + bool have_new_ctrl; +}; + +/** + * struct msr_param - set a range of MSRs from a domain + * @res: The resource to use + * @low: Beginning index from base MSR + * @high: End index + */ +struct msr_param { + struct rdt_resource *res; + int low; + int high; +}; + +/** + * struct rdt_cache - Cache allocation related data + * @cbm_len: Length of the cache bit mask + * @min_cbm_bits: Minimum number of consecutive bits to be set + * @cbm_idx_mult: Multiplier of CBM index + * @cbm_idx_offset: Offset of CBM index. 
CBM index is computed by: + * closid * cbm_idx_multi + cbm_idx_offset + * in a cache bit mask + * @shareable_bits: Bitmask of shareable resource with other + * executing entities + */ +struct rdt_cache { + unsigned int cbm_len; + unsigned int min_cbm_bits; + unsigned int cbm_idx_mult; + unsigned int cbm_idx_offset; + unsigned int shareable_bits; +}; + +/** + * struct rdt_membw - Memory bandwidth allocation related data + * @max_delay: Max throttle delay. Delay is the hardware + * representation for memory bandwidth. + * @min_bw: Minimum memory bandwidth percentage user can request + * @bw_gran: Granularity at which the memory bandwidth is allocated + * @delay_linear: True if memory B/W delay is in linear scale + * @mb_map: Mapping of memory B/W percentage to memory B/W delay + */ +struct rdt_membw { + u32 max_delay; + u32 min_bw; + u32 bw_gran; + u32 delay_linear; + u32 *mb_map; +}; + +static inline bool is_llc_occupancy_enabled(void) +{ + return (rdt_mon_features & (1 << QOS_L3_OCCUP_EVENT_ID)); +} + +static inline bool is_mbm_total_enabled(void) +{ + return (rdt_mon_features & (1 << QOS_L3_MBM_TOTAL_EVENT_ID)); +} + +static inline bool is_mbm_local_enabled(void) +{ + return (rdt_mon_features & (1 << QOS_L3_MBM_LOCAL_EVENT_ID)); +} + +static inline bool is_mbm_enabled(void) +{ + return (is_mbm_total_enabled() || is_mbm_local_enabled()); +} + +static inline bool is_mbm_event(int e) +{ + return (e >= QOS_L3_MBM_TOTAL_EVENT_ID && + e <= QOS_L3_MBM_LOCAL_EVENT_ID); +} + +/** + * struct rdt_resource - attributes of an RDT resource + * @rid: The index of the resource + * @alloc_enabled: Is allocation enabled on this machine + * @mon_enabled: Is monitoring enabled for this feature + * @alloc_capable: Is allocation available on this machine + * @mon_capable: Is monitor feature available on this machine + * @name: Name to use in "schemata" file + * @num_closid: Number of CLOSIDs available + * @cache_level: Which cache level defines scope of this resource + * @default_ctrl: Specifies default cache cbm or memory B/W percent. 
+ * @msr_base: Base MSR address for CBMs + * @msr_update: Function pointer to update QOS MSRs + * @data_width: Character width of data when displaying + * @domains: All domains for this resource + * @cache: Cache allocation related data + * @format_str: Per resource format string to show domain value + * @parse_ctrlval: Per resource function pointer to parse control values + * @evt_list: List of monitoring events + * @num_rmid: Number of RMIDs available + * @mon_scale: cqm counter * mon_scale = occupancy in bytes + * @fflags: flags to choose base and info files + */ +struct rdt_resource { + int rid; + bool alloc_enabled; + bool mon_enabled; + bool alloc_capable; + bool mon_capable; + char *name; + int num_closid; + int cache_level; + u32 default_ctrl; + unsigned int msr_base; + void (*msr_update) (struct rdt_domain *d, struct msr_param *m, + struct rdt_resource *r); + int data_width; + struct list_head domains; + struct rdt_cache cache; + struct rdt_membw membw; + const char *format_str; + int (*parse_ctrlval) (char *buf, struct rdt_resource *r, + struct rdt_domain *d); + struct list_head evt_list; + int num_rmid; + unsigned int mon_scale; + unsigned long fflags; +}; + +int parse_cbm(char *buf, struct rdt_resource *r, struct rdt_domain *d); +int parse_bw(char *buf, struct rdt_resource *r, struct rdt_domain *d); + +extern struct mutex rdtgroup_mutex; + +extern struct rdt_resource rdt_resources_all[]; +extern struct rdtgroup rdtgroup_default; +DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key); + +int __init rdtgroup_init(void); + +enum { + RDT_RESOURCE_L3, + RDT_RESOURCE_L3DATA, + RDT_RESOURCE_L3CODE, + RDT_RESOURCE_L2, + RDT_RESOURCE_MBA, + + /* Must be the last */ + RDT_NUM_RESOURCES, +}; + +#define for_each_capable_rdt_resource(r) \ + for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ + r++) \ + if (r->alloc_capable || r->mon_capable) + +#define for_each_alloc_capable_rdt_resource(r) \ + for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ + r++) \ + if (r->alloc_capable) + +#define for_each_mon_capable_rdt_resource(r) \ + for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ + r++) \ + if (r->mon_capable) + +#define for_each_alloc_enabled_rdt_resource(r) \ + for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ + r++) \ + if (r->alloc_enabled) + +#define for_each_mon_enabled_rdt_resource(r) \ + for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ + r++) \ + if (r->mon_enabled) + +/* CPUID.(EAX=10H, ECX=ResID=1).EAX */ +union cpuid_0x10_1_eax { + struct { + unsigned int cbm_len:5; + } split; + unsigned int full; +}; + +/* CPUID.(EAX=10H, ECX=ResID=3).EAX */ +union cpuid_0x10_3_eax { + struct { + unsigned int max_delay:12; + } split; + unsigned int full; +}; + +/* CPUID.(EAX=10H, ECX=ResID).EDX */ +union cpuid_0x10_x_edx { + struct { + unsigned int cos_max:16; + } split; + unsigned int full; +}; + +void rdt_ctrl_update(void *arg); +struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn); +void rdtgroup_kn_unlock(struct kernfs_node *kn); +struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id, + struct list_head **pos); +ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off); +int rdtgroup_schemata_show(struct kernfs_open_file *of, + struct seq_file *s, void *v); +struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r); +int alloc_rmid(void); +void free_rmid(u32 rmid); +int rdt_get_mon_l3_config(struct rdt_resource *r); 
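
The cpuid_0x10_* unions at the end of this header are filled straight from CPUID leaf 0x10, as rdt_get_cache_alloc_cfg() does earlier in the series. A short sketch of that decode, assuming the standard x86 cpuid_count() helper; note both fields are encoded as value-minus-one:

union cpuid_0x10_1_eax eax;
union cpuid_0x10_x_edx edx;
unsigned int ebx, ecx;

cpuid_count(0x00000010, 1, &eax.full, &ebx, &ecx, &edx.full);

unsigned int cbm_len    = eax.split.cbm_len + 1;  /* capacity bitmask length */
unsigned int num_closid = edx.split.cos_max + 1;  /* number of CLOSIDs       */
/* ebx carries the shareable-bits mask used for cache.shareable_bits */
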
+void mon_event_count(void *info); +int rdtgroup_mondata_show(struct seq_file *m, void *arg); +void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, + unsigned int dom_id); +void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, + struct rdt_domain *d); +void mon_event_read(struct rmid_read *rr, struct rdt_domain *d, + struct rdtgroup *rdtgrp, int evtid, int first); +void mbm_setup_overflow_handler(struct rdt_domain *dom, + unsigned long delay_ms); +void mbm_handle_overflow(struct work_struct *work); +void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms); +void cqm_handle_limbo(struct work_struct *work); +bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d); +void __check_limbo(struct rdt_domain *d, bool force_free); + +#endif /* _ASM_X86_INTEL_RDT_H */ diff --git a/arch/x86/kernel/cpu/intel_rdt_schemata.c b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c index 406d7a6532f9..f6ea94f8954a 100644 --- a/arch/x86/kernel/cpu/intel_rdt_schemata.c +++ b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c @@ -26,7 +26,7 @@ #include <linux/kernfs.h> #include <linux/seq_file.h> #include <linux/slab.h> -#include <asm/intel_rdt.h> +#include "intel_rdt.h" /* * Check whether MBA bandwidth percentage value is correct. The value is @@ -192,7 +192,7 @@ static int rdtgroup_parse_resource(char *resname, char *tok, int closid) { struct rdt_resource *r; - for_each_enabled_rdt_resource(r) { + for_each_alloc_enabled_rdt_resource(r) { if (!strcmp(resname, r->name) && closid < r->num_closid) return parse_line(tok, r); } @@ -221,7 +221,7 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, closid = rdtgrp->closid; - for_each_enabled_rdt_resource(r) { + for_each_alloc_enabled_rdt_resource(r) { list_for_each_entry(dom, &r->domains, list) dom->have_new_ctrl = false; } @@ -237,7 +237,7 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, goto out; } - for_each_enabled_rdt_resource(r) { + for_each_alloc_enabled_rdt_resource(r) { ret = update_domains(r, closid); if (ret) goto out; @@ -269,12 +269,13 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of, { struct rdtgroup *rdtgrp; struct rdt_resource *r; - int closid, ret = 0; + int ret = 0; + u32 closid; rdtgrp = rdtgroup_kn_lock_live(of->kn); if (rdtgrp) { closid = rdtgrp->closid; - for_each_enabled_rdt_resource(r) { + for_each_alloc_enabled_rdt_resource(r) { if (closid < r->num_closid) show_doms(s, r, closid); } @@ -284,3 +285,57 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of, rdtgroup_kn_unlock(of->kn); return ret; } + +void mon_event_read(struct rmid_read *rr, struct rdt_domain *d, + struct rdtgroup *rdtgrp, int evtid, int first) +{ + /* + * setup the parameters to send to the IPI to read the data. 
+ */ + rr->rgrp = rdtgrp; + rr->evtid = evtid; + rr->d = d; + rr->val = 0; + rr->first = first; + + smp_call_function_any(&d->cpu_mask, mon_event_count, rr, 1); +} + +int rdtgroup_mondata_show(struct seq_file *m, void *arg) +{ + struct kernfs_open_file *of = m->private; + u32 resid, evtid, domid; + struct rdtgroup *rdtgrp; + struct rdt_resource *r; + union mon_data_bits md; + struct rdt_domain *d; + struct rmid_read rr; + int ret = 0; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + + md.priv = of->kn->priv; + resid = md.u.rid; + domid = md.u.domid; + evtid = md.u.evtid; + + r = &rdt_resources_all[resid]; + d = rdt_find_domain(r, domid, NULL); + if (!d) { + ret = -ENOENT; + goto out; + } + + mon_event_read(&rr, d, rdtgrp, evtid, false); + + if (rr.val & RMID_VAL_ERROR) + seq_puts(m, "Error\n"); + else if (rr.val & RMID_VAL_UNAVAIL) + seq_puts(m, "Unavailable\n"); + else + seq_printf(m, "%llu\n", rr.val * r->mon_scale); + +out: + rdtgroup_kn_unlock(of->kn); + return ret; +} diff --git a/arch/x86/kernel/cpu/intel_rdt_monitor.c b/arch/x86/kernel/cpu/intel_rdt_monitor.c new file mode 100644 index 000000000000..30827510094b --- /dev/null +++ b/arch/x86/kernel/cpu/intel_rdt_monitor.c @@ -0,0 +1,499 @@ +/* + * Resource Director Technology(RDT) + * - Monitoring code + * + * Copyright (C) 2017 Intel Corporation + * + * Author: + * Vikas Shivappa <vikas.shivappa@intel.com> + * + * This replaces the cqm.c based on perf but we reuse a lot of + * code and datastructures originally from Peter Zijlstra and Matt Fleming. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * More information about RDT be found in the Intel (R) x86 Architecture + * Software Developer Manual June 2016, volume 3, section 17.17. + */ + +#include <linux/module.h> +#include <linux/slab.h> +#include <asm/cpu_device_id.h> +#include "intel_rdt.h" + +#define MSR_IA32_QM_CTR 0x0c8e +#define MSR_IA32_QM_EVTSEL 0x0c8d + +struct rmid_entry { + u32 rmid; + int busy; + struct list_head list; +}; + +/** + * @rmid_free_lru A least recently used list of free RMIDs + * These RMIDs are guaranteed to have an occupancy less than the + * threshold occupancy + */ +static LIST_HEAD(rmid_free_lru); + +/** + * @rmid_limbo_count count of currently unused but (potentially) + * dirty RMIDs. + * This counts RMIDs that no one is currently using but that + * may have a occupancy value > intel_cqm_threshold. User can change + * the threshold occupancy value. + */ +unsigned int rmid_limbo_count; + +/** + * @rmid_entry - The entry in the limbo and free lists. + */ +static struct rmid_entry *rmid_ptrs; + +/* + * Global boolean for rdt_monitor which is true if any + * resource monitoring is enabled. + */ +bool rdt_mon_capable; + +/* + * Global to indicate which monitoring events are enabled. + */ +unsigned int rdt_mon_features; + +/* + * This is the threshold cache occupancy at which we will consider an + * RMID available for re-allocation. 
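
As rdtgroup_mondata_show() earlier in this series shows, the raw IA32_QM_CTR value is in units of mon_scale bytes, so converting a counter reading to bytes is a single multiply; mon_scale is the hardware-reported upscale factor cached in struct rdt_resource. A one-line sketch using the names from these files:

u64 occupancy_bytes = rr.val * r->mon_scale;   /* counter units -> bytes */
seq_printf(m, "%llu\n", occupancy_bytes);
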
+ */ +unsigned int intel_cqm_threshold; + +static inline struct rmid_entry *__rmid_entry(u32 rmid) +{ + struct rmid_entry *entry; + + entry = &rmid_ptrs[rmid]; + WARN_ON(entry->rmid != rmid); + + return entry; +} + +static u64 __rmid_read(u32 rmid, u32 eventid) +{ + u64 val; + + /* + * As per the SDM, when IA32_QM_EVTSEL.EvtID (bits 7:0) is configured + * with a valid event code for supported resource type and the bits + * IA32_QM_EVTSEL.RMID (bits 41:32) are configured with valid RMID, + * IA32_QM_CTR.data (bits 61:0) reports the monitored data. + * IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62) + * are error bits. + */ + wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid); + rdmsrl(MSR_IA32_QM_CTR, val); + + return val; +} + +static bool rmid_dirty(struct rmid_entry *entry) +{ + u64 val = __rmid_read(entry->rmid, QOS_L3_OCCUP_EVENT_ID); + + return val >= intel_cqm_threshold; +} + +/* + * Check the RMIDs that are marked as busy for this domain. If the + * reported LLC occupancy is below the threshold clear the busy bit and + * decrement the count. If the busy count gets to zero on an RMID, we + * free the RMID + */ +void __check_limbo(struct rdt_domain *d, bool force_free) +{ + struct rmid_entry *entry; + struct rdt_resource *r; + u32 crmid = 1, nrmid; + + r = &rdt_resources_all[RDT_RESOURCE_L3]; + + /* + * Skip RMID 0 and start from RMID 1 and check all the RMIDs that + * are marked as busy for occupancy < threshold. If the occupancy + * is less than the threshold decrement the busy counter of the + * RMID and move it to the free list when the counter reaches 0. + */ + for (;;) { + nrmid = find_next_bit(d->rmid_busy_llc, r->num_rmid, crmid); + if (nrmid >= r->num_rmid) + break; + + entry = __rmid_entry(nrmid); + if (force_free || !rmid_dirty(entry)) { + clear_bit(entry->rmid, d->rmid_busy_llc); + if (!--entry->busy) { + rmid_limbo_count--; + list_add_tail(&entry->list, &rmid_free_lru); + } + } + crmid = nrmid + 1; + } +} + +bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d) +{ + return find_first_bit(d->rmid_busy_llc, r->num_rmid) != r->num_rmid; +} + +/* + * As of now the RMIDs allocation is global. + * However we keep track of which packages the RMIDs + * are used to optimize the limbo list management. + */ +int alloc_rmid(void) +{ + struct rmid_entry *entry; + + lockdep_assert_held(&rdtgroup_mutex); + + if (list_empty(&rmid_free_lru)) + return rmid_limbo_count ? -EBUSY : -ENOSPC; + + entry = list_first_entry(&rmid_free_lru, + struct rmid_entry, list); + list_del(&entry->list); + + return entry->rmid; +} + +static void add_rmid_to_limbo(struct rmid_entry *entry) +{ + struct rdt_resource *r; + struct rdt_domain *d; + int cpu; + u64 val; + + r = &rdt_resources_all[RDT_RESOURCE_L3]; + + entry->busy = 0; + cpu = get_cpu(); + list_for_each_entry(d, &r->domains, list) { + if (cpumask_test_cpu(cpu, &d->cpu_mask)) { + val = __rmid_read(entry->rmid, QOS_L3_OCCUP_EVENT_ID); + if (val <= intel_cqm_threshold) + continue; + } + + /* + * For the first limbo RMID in the domain, + * setup up the limbo worker. 
+ */ + if (!has_busy_rmid(r, d)) + cqm_setup_limbo_handler(d, CQM_LIMBOCHECK_INTERVAL); + set_bit(entry->rmid, d->rmid_busy_llc); + entry->busy++; + } + put_cpu(); + + if (entry->busy) + rmid_limbo_count++; + else + list_add_tail(&entry->list, &rmid_free_lru); +} + +void free_rmid(u32 rmid) +{ + struct rmid_entry *entry; + + if (!rmid) + return; + + lockdep_assert_held(&rdtgroup_mutex); + + entry = __rmid_entry(rmid); + + if (is_llc_occupancy_enabled()) + add_rmid_to_limbo(entry); + else + list_add_tail(&entry->list, &rmid_free_lru); +} + +static int __mon_event_count(u32 rmid, struct rmid_read *rr) +{ + u64 chunks, shift, tval; + struct mbm_state *m; + + tval = __rmid_read(rmid, rr->evtid); + if (tval & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) { + rr->val = tval; + return -EINVAL; + } + switch (rr->evtid) { + case QOS_L3_OCCUP_EVENT_ID: + rr->val += tval; + return 0; + case QOS_L3_MBM_TOTAL_EVENT_ID: + m = &rr->d->mbm_total[rmid]; + break; + case QOS_L3_MBM_LOCAL_EVENT_ID: + m = &rr->d->mbm_local[rmid]; + break; + default: + /* + * Code would never reach here because + * an invalid event id would fail the __rmid_read. + */ + return -EINVAL; + } + + if (rr->first) { + m->prev_msr = tval; + m->chunks = 0; + return 0; + } + + shift = 64 - MBM_CNTR_WIDTH; + chunks = (tval << shift) - (m->prev_msr << shift); + chunks >>= shift; + m->chunks += chunks; + m->prev_msr = tval; + + rr->val += m->chunks; + return 0; +} + +/* + * This is called via IPI to read the CQM/MBM counters + * on a domain. + */ +void mon_event_count(void *info) +{ + struct rdtgroup *rdtgrp, *entry; + struct rmid_read *rr = info; + struct list_head *head; + + rdtgrp = rr->rgrp; + + if (__mon_event_count(rdtgrp->mon.rmid, rr)) + return; + + /* + * For Ctrl groups read data from child monitor groups. + */ + head = &rdtgrp->mon.crdtgrp_list; + + if (rdtgrp->type == RDTCTRL_GROUP) { + list_for_each_entry(entry, head, mon.crdtgrp_list) { + if (__mon_event_count(entry->mon.rmid, rr)) + return; + } + } +} + +static void mbm_update(struct rdt_domain *d, int rmid) +{ + struct rmid_read rr; + + rr.first = false; + rr.d = d; + + /* + * This is protected from concurrent reads from user + * as both the user and we hold the global mutex. + */ + if (is_mbm_total_enabled()) { + rr.evtid = QOS_L3_MBM_TOTAL_EVENT_ID; + __mon_event_count(rmid, &rr); + } + if (is_mbm_local_enabled()) { + rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID; + __mon_event_count(rmid, &rr); + } +} + +/* + * Handler to scan the limbo list and move the RMIDs + * to free list whose occupancy < threshold_occupancy. 
+ */ +void cqm_handle_limbo(struct work_struct *work) +{ + unsigned long delay = msecs_to_jiffies(CQM_LIMBOCHECK_INTERVAL); + int cpu = smp_processor_id(); + struct rdt_resource *r; + struct rdt_domain *d; + + mutex_lock(&rdtgroup_mutex); + + r = &rdt_resources_all[RDT_RESOURCE_L3]; + d = get_domain_from_cpu(cpu, r); + + if (!d) { + pr_warn_once("Failure to get domain for limbo worker\n"); + goto out_unlock; + } + + __check_limbo(d, false); + + if (has_busy_rmid(r, d)) + schedule_delayed_work_on(cpu, &d->cqm_limbo, delay); + +out_unlock: + mutex_unlock(&rdtgroup_mutex); +} + +void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms) +{ + unsigned long delay = msecs_to_jiffies(delay_ms); + struct rdt_resource *r; + int cpu; + + r = &rdt_resources_all[RDT_RESOURCE_L3]; + + cpu = cpumask_any(&dom->cpu_mask); + dom->cqm_work_cpu = cpu; + + schedule_delayed_work_on(cpu, &dom->cqm_limbo, delay); +} + +void mbm_handle_overflow(struct work_struct *work) +{ + unsigned long delay = msecs_to_jiffies(MBM_OVERFLOW_INTERVAL); + struct rdtgroup *prgrp, *crgrp; + int cpu = smp_processor_id(); + struct list_head *head; + struct rdt_domain *d; + + mutex_lock(&rdtgroup_mutex); + + if (!static_branch_likely(&rdt_enable_key)) + goto out_unlock; + + d = get_domain_from_cpu(cpu, &rdt_resources_all[RDT_RESOURCE_L3]); + if (!d) + goto out_unlock; + + list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { + mbm_update(d, prgrp->mon.rmid); + + head = &prgrp->mon.crdtgrp_list; + list_for_each_entry(crgrp, head, mon.crdtgrp_list) + mbm_update(d, crgrp->mon.rmid); + } + + schedule_delayed_work_on(cpu, &d->mbm_over, delay); + +out_unlock: + mutex_unlock(&rdtgroup_mutex); +} + +void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms) +{ + unsigned long delay = msecs_to_jiffies(delay_ms); + int cpu; + + if (!static_branch_likely(&rdt_enable_key)) + return; + cpu = cpumask_any(&dom->cpu_mask); + dom->mbm_work_cpu = cpu; + schedule_delayed_work_on(cpu, &dom->mbm_over, delay); +} + +static int dom_data_init(struct rdt_resource *r) +{ + struct rmid_entry *entry = NULL; + int i, nr_rmids; + + nr_rmids = r->num_rmid; + rmid_ptrs = kcalloc(nr_rmids, sizeof(struct rmid_entry), GFP_KERNEL); + if (!rmid_ptrs) + return -ENOMEM; + + for (i = 0; i < nr_rmids; i++) { + entry = &rmid_ptrs[i]; + INIT_LIST_HEAD(&entry->list); + + entry->rmid = i; + list_add_tail(&entry->list, &rmid_free_lru); + } + + /* + * RMID 0 is special and is always allocated. It's used for all + * tasks that are not monitored. + */ + entry = __rmid_entry(0); + list_del(&entry->list); + + return 0; +} + +static struct mon_evt llc_occupancy_event = { + .name = "llc_occupancy", + .evtid = QOS_L3_OCCUP_EVENT_ID, +}; + +static struct mon_evt mbm_total_event = { + .name = "mbm_total_bytes", + .evtid = QOS_L3_MBM_TOTAL_EVENT_ID, +}; + +static struct mon_evt mbm_local_event = { + .name = "mbm_local_bytes", + .evtid = QOS_L3_MBM_LOCAL_EVENT_ID, +}; + +/* + * Initialize the event list for the resource. + * + * Note that MBM events are also part of RDT_RESOURCE_L3 resource + * because as per the SDM the total and local memory bandwidth + * are enumerated as part of L3 monitoring. 
+ */ +static void l3_mon_evt_init(struct rdt_resource *r) +{ + INIT_LIST_HEAD(&r->evt_list); + + if (is_llc_occupancy_enabled()) + list_add_tail(&llc_occupancy_event.list, &r->evt_list); + if (is_mbm_total_enabled()) + list_add_tail(&mbm_total_event.list, &r->evt_list); + if (is_mbm_local_enabled()) + list_add_tail(&mbm_local_event.list, &r->evt_list); +} + +int rdt_get_mon_l3_config(struct rdt_resource *r) +{ + int ret; + + r->mon_scale = boot_cpu_data.x86_cache_occ_scale; + r->num_rmid = boot_cpu_data.x86_cache_max_rmid + 1; + + /* + * A reasonable upper limit on the max threshold is the number + * of lines tagged per RMID if all RMIDs have the same number of + * lines tagged in the LLC. + * + * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC. + */ + intel_cqm_threshold = boot_cpu_data.x86_cache_size * 1024 / r->num_rmid; + + /* h/w works in units of "boot_cpu_data.x86_cache_occ_scale" */ + intel_cqm_threshold /= r->mon_scale; + + ret = dom_data_init(r); + if (ret) + return ret; + + l3_mon_evt_init(r); + + r->mon_capable = true; + r->mon_enabled = true; + + return 0; +} diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index 9257bd9dc664..a869d4a073c5 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -32,17 +32,25 @@ #include <uapi/linux/magic.h> -#include <asm/intel_rdt.h> -#include <asm/intel_rdt_common.h> +#include <asm/intel_rdt_sched.h> +#include "intel_rdt.h" DEFINE_STATIC_KEY_FALSE(rdt_enable_key); -struct kernfs_root *rdt_root; +DEFINE_STATIC_KEY_FALSE(rdt_mon_enable_key); +DEFINE_STATIC_KEY_FALSE(rdt_alloc_enable_key); +static struct kernfs_root *rdt_root; struct rdtgroup rdtgroup_default; LIST_HEAD(rdt_all_groups); /* Kernel fs node for "info" directory under root */ static struct kernfs_node *kn_info; +/* Kernel fs node for "mon_groups" directory under root */ +static struct kernfs_node *kn_mongrp; + +/* Kernel fs node for "mon_data" directory under root */ +static struct kernfs_node *kn_mondata; + /* * Trivial allocator for CLOSIDs. Since h/w only supports a small number, * we can keep a bitmap of free CLOSIDs in a single integer. 
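/*
 * Editorial aside, not part of the patch above: a minimal user-space sketch
 * of the default max-threshold-occupancy arithmetic done by
 * rdt_get_mon_l3_config() in the new intel_rdt_monitor.c earlier in this
 * diff. It uses the 35MB LLC / 56 RMID example from that function's comment
 * and an assumed mon_scale of 64 bytes per counter unit (illustrative only;
 * the real upscale factor comes from CPUID leaf 0xF reported in
 * boot_cpu_data.x86_cache_occ_scale).
 */
#include <stdio.h>

int main(void)
{
	unsigned int cache_size_kb = 35 * 1024;	/* x86_cache_size is in KB */
	unsigned int num_rmid = 56;		/* x86_cache_max_rmid + 1 */
	unsigned int mon_scale = 64;		/* assumed occupancy scale */
	unsigned int threshold;

	/* Bytes per RMID if every RMID tags an equal share of the LLC */
	threshold = cache_size_kb * 1024 / num_rmid;	/* 655360 bytes, ~1.8% of LLC */

	/* Hardware reports occupancy in units of mon_scale bytes */
	threshold /= mon_scale;				/* 10240 counter units */

	printf("intel_cqm_threshold = %u units (%u bytes)\n",
	       threshold, threshold * mon_scale);
	return 0;
}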
@@ -66,7 +74,7 @@ static void closid_init(void) int rdt_min_closid = 32; /* Compute rdt_min_closid across all resources */ - for_each_enabled_rdt_resource(r) + for_each_alloc_enabled_rdt_resource(r) rdt_min_closid = min(rdt_min_closid, r->num_closid); closid_free_map = BIT_MASK(rdt_min_closid) - 1; @@ -75,9 +83,9 @@ static void closid_init(void) closid_free_map &= ~1; } -int closid_alloc(void) +static int closid_alloc(void) { - int closid = ffs(closid_free_map); + u32 closid = ffs(closid_free_map); if (closid == 0) return -ENOSPC; @@ -125,28 +133,6 @@ static int rdtgroup_add_file(struct kernfs_node *parent_kn, struct rftype *rft) return 0; } -static int rdtgroup_add_files(struct kernfs_node *kn, struct rftype *rfts, - int len) -{ - struct rftype *rft; - int ret; - - lockdep_assert_held(&rdtgroup_mutex); - - for (rft = rfts; rft < rfts + len; rft++) { - ret = rdtgroup_add_file(kn, rft); - if (ret) - goto error; - } - - return 0; -error: - pr_warn("Failed to add %s, err=%d\n", rft->name, ret); - while (--rft >= rfts) - kernfs_remove_by_name(kn, rft->name); - return ret; -} - static int rdtgroup_seqfile_show(struct seq_file *m, void *arg) { struct kernfs_open_file *of = m->private; @@ -174,6 +160,11 @@ static struct kernfs_ops rdtgroup_kf_single_ops = { .seq_show = rdtgroup_seqfile_show, }; +static struct kernfs_ops kf_mondata_ops = { + .atomic_write_len = PAGE_SIZE, + .seq_show = rdtgroup_mondata_show, +}; + static bool is_cpu_list(struct kernfs_open_file *of) { struct rftype *rft = of->kn->priv; @@ -203,13 +194,18 @@ static int rdtgroup_cpus_show(struct kernfs_open_file *of, /* * This is safe against intel_rdt_sched_in() called from __switch_to() * because __switch_to() is executed with interrupts disabled. A local call - * from rdt_update_closid() is proteced against __switch_to() because + * from update_closid_rmid() is proteced against __switch_to() because * preemption is disabled. */ -static void rdt_update_cpu_closid(void *closid) +static void update_cpu_closid_rmid(void *info) { - if (closid) - this_cpu_write(cpu_closid, *(int *)closid); + struct rdtgroup *r = info; + + if (r) { + this_cpu_write(pqr_state.default_closid, r->closid); + this_cpu_write(pqr_state.default_rmid, r->mon.rmid); + } + /* * We cannot unconditionally write the MSR because the current * executing task might have its own closid selected. Just reuse @@ -221,28 +217,128 @@ static void rdt_update_cpu_closid(void *closid) /* * Update the PGR_ASSOC MSR on all cpus in @cpu_mask, * - * Per task closids must have been set up before calling this function. - * - * The per cpu closids are updated with the smp function call, when @closid - * is not NULL. If @closid is NULL then all affected percpu closids must - * have been set up before calling this function. + * Per task closids/rmids must have been set up before calling this function. 
*/ static void -rdt_update_closid(const struct cpumask *cpu_mask, int *closid) +update_closid_rmid(const struct cpumask *cpu_mask, struct rdtgroup *r) { int cpu = get_cpu(); if (cpumask_test_cpu(cpu, cpu_mask)) - rdt_update_cpu_closid(closid); - smp_call_function_many(cpu_mask, rdt_update_cpu_closid, closid, 1); + update_cpu_closid_rmid(r); + smp_call_function_many(cpu_mask, update_cpu_closid_rmid, r, 1); put_cpu(); } +static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, + cpumask_var_t tmpmask) +{ + struct rdtgroup *prgrp = rdtgrp->mon.parent, *crgrp; + struct list_head *head; + + /* Check whether cpus belong to parent ctrl group */ + cpumask_andnot(tmpmask, newmask, &prgrp->cpu_mask); + if (cpumask_weight(tmpmask)) + return -EINVAL; + + /* Check whether cpus are dropped from this group */ + cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); + if (cpumask_weight(tmpmask)) { + /* Give any dropped cpus to parent rdtgroup */ + cpumask_or(&prgrp->cpu_mask, &prgrp->cpu_mask, tmpmask); + update_closid_rmid(tmpmask, prgrp); + } + + /* + * If we added cpus, remove them from previous group that owned them + * and update per-cpu rmid + */ + cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); + if (cpumask_weight(tmpmask)) { + head = &prgrp->mon.crdtgrp_list; + list_for_each_entry(crgrp, head, mon.crdtgrp_list) { + if (crgrp == rdtgrp) + continue; + cpumask_andnot(&crgrp->cpu_mask, &crgrp->cpu_mask, + tmpmask); + } + update_closid_rmid(tmpmask, rdtgrp); + } + + /* Done pushing/pulling - update this group with new mask */ + cpumask_copy(&rdtgrp->cpu_mask, newmask); + + return 0; +} + +static void cpumask_rdtgrp_clear(struct rdtgroup *r, struct cpumask *m) +{ + struct rdtgroup *crgrp; + + cpumask_andnot(&r->cpu_mask, &r->cpu_mask, m); + /* update the child mon group masks as well*/ + list_for_each_entry(crgrp, &r->mon.crdtgrp_list, mon.crdtgrp_list) + cpumask_and(&crgrp->cpu_mask, &r->cpu_mask, &crgrp->cpu_mask); +} + +static int cpus_ctrl_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, + cpumask_var_t tmpmask, cpumask_var_t tmpmask1) +{ + struct rdtgroup *r, *crgrp; + struct list_head *head; + + /* Check whether cpus are dropped from this group */ + cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); + if (cpumask_weight(tmpmask)) { + /* Can't drop from default group */ + if (rdtgrp == &rdtgroup_default) + return -EINVAL; + + /* Give any dropped cpus to rdtgroup_default */ + cpumask_or(&rdtgroup_default.cpu_mask, + &rdtgroup_default.cpu_mask, tmpmask); + update_closid_rmid(tmpmask, &rdtgroup_default); + } + + /* + * If we added cpus, remove them from previous group and + * the prev group's child groups that owned them + * and update per-cpu closid/rmid. + */ + cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); + if (cpumask_weight(tmpmask)) { + list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) { + if (r == rdtgrp) + continue; + cpumask_and(tmpmask1, &r->cpu_mask, tmpmask); + if (cpumask_weight(tmpmask1)) + cpumask_rdtgrp_clear(r, tmpmask1); + } + update_closid_rmid(tmpmask, rdtgrp); + } + + /* Done pushing/pulling - update this group with new mask */ + cpumask_copy(&rdtgrp->cpu_mask, newmask); + + /* + * Clear child mon group masks since there is a new parent mask + * now and update the rmid for the cpus the child lost. 
+ */ + head = &rdtgrp->mon.crdtgrp_list; + list_for_each_entry(crgrp, head, mon.crdtgrp_list) { + cpumask_and(tmpmask, &rdtgrp->cpu_mask, &crgrp->cpu_mask); + update_closid_rmid(tmpmask, rdtgrp); + cpumask_clear(&crgrp->cpu_mask); + } + + return 0; +} + static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - cpumask_var_t tmpmask, newmask; - struct rdtgroup *rdtgrp, *r; + cpumask_var_t tmpmask, newmask, tmpmask1; + struct rdtgroup *rdtgrp; int ret; if (!buf) @@ -254,6 +350,11 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, free_cpumask_var(tmpmask); return -ENOMEM; } + if (!zalloc_cpumask_var(&tmpmask1, GFP_KERNEL)) { + free_cpumask_var(tmpmask); + free_cpumask_var(newmask); + return -ENOMEM; + } rdtgrp = rdtgroup_kn_lock_live(of->kn); if (!rdtgrp) { @@ -276,41 +377,18 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, goto unlock; } - /* Check whether cpus are dropped from this group */ - cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); - if (cpumask_weight(tmpmask)) { - /* Can't drop from default group */ - if (rdtgrp == &rdtgroup_default) { - ret = -EINVAL; - goto unlock; - } - /* Give any dropped cpus to rdtgroup_default */ - cpumask_or(&rdtgroup_default.cpu_mask, - &rdtgroup_default.cpu_mask, tmpmask); - rdt_update_closid(tmpmask, &rdtgroup_default.closid); - } - - /* - * If we added cpus, remove them from previous group that owned them - * and update per-cpu closid - */ - cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); - if (cpumask_weight(tmpmask)) { - list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) { - if (r == rdtgrp) - continue; - cpumask_andnot(&r->cpu_mask, &r->cpu_mask, tmpmask); - } - rdt_update_closid(tmpmask, &rdtgrp->closid); - } - - /* Done pushing/pulling - update this group with new mask */ - cpumask_copy(&rdtgrp->cpu_mask, newmask); + if (rdtgrp->type == RDTCTRL_GROUP) + ret = cpus_ctrl_write(rdtgrp, newmask, tmpmask, tmpmask1); + else if (rdtgrp->type == RDTMON_GROUP) + ret = cpus_mon_write(rdtgrp, newmask, tmpmask); + else + ret = -EINVAL; unlock: rdtgroup_kn_unlock(of->kn); free_cpumask_var(tmpmask); free_cpumask_var(newmask); + free_cpumask_var(tmpmask1); return ret ?: nbytes; } @@ -336,6 +414,7 @@ static void move_myself(struct callback_head *head) if (atomic_dec_and_test(&rdtgrp->waitcount) && (rdtgrp->flags & RDT_DELETED)) { current->closid = 0; + current->rmid = 0; kfree(rdtgrp); } @@ -374,7 +453,20 @@ static int __rdtgroup_move_task(struct task_struct *tsk, atomic_dec(&rdtgrp->waitcount); kfree(callback); } else { - tsk->closid = rdtgrp->closid; + /* + * For ctrl_mon groups move both closid and rmid. + * For monitor groups, can move the tasks only from + * their parent CTRL group. 
+ */ + if (rdtgrp->type == RDTCTRL_GROUP) { + tsk->closid = rdtgrp->closid; + tsk->rmid = rdtgrp->mon.rmid; + } else if (rdtgrp->type == RDTMON_GROUP) { + if (rdtgrp->mon.parent->closid == tsk->closid) + tsk->rmid = rdtgrp->mon.rmid; + else + ret = -EINVAL; + } } return ret; } @@ -454,7 +546,8 @@ static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s) rcu_read_lock(); for_each_process_thread(p, t) { - if (t->closid == r->closid) + if ((r->type == RDTCTRL_GROUP && t->closid == r->closid) || + (r->type == RDTMON_GROUP && t->rmid == r->mon.rmid)) seq_printf(s, "%d\n", t->pid); } rcu_read_unlock(); @@ -476,39 +569,6 @@ static int rdtgroup_tasks_show(struct kernfs_open_file *of, return ret; } -/* Files in each rdtgroup */ -static struct rftype rdtgroup_base_files[] = { - { - .name = "cpus", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .write = rdtgroup_cpus_write, - .seq_show = rdtgroup_cpus_show, - }, - { - .name = "cpus_list", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .write = rdtgroup_cpus_write, - .seq_show = rdtgroup_cpus_show, - .flags = RFTYPE_FLAGS_CPUS_LIST, - }, - { - .name = "tasks", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .write = rdtgroup_tasks_write, - .seq_show = rdtgroup_tasks_show, - }, - { - .name = "schemata", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .write = rdtgroup_schemata_write, - .seq_show = rdtgroup_schemata_show, - }, -}; - static int rdt_num_closids_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { @@ -536,6 +596,15 @@ static int rdt_min_cbm_bits_show(struct kernfs_open_file *of, return 0; } +static int rdt_shareable_bits_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + + seq_printf(seq, "%x\n", r->cache.shareable_bits); + return 0; +} + static int rdt_min_bw_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { @@ -545,6 +614,28 @@ static int rdt_min_bw_show(struct kernfs_open_file *of, return 0; } +static int rdt_num_rmids_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + + seq_printf(seq, "%d\n", r->num_rmid); + + return 0; +} + +static int rdt_mon_features_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + struct mon_evt *mevt; + + list_for_each_entry(mevt, &r->evt_list, list) + seq_printf(seq, "%s\n", mevt->name); + + return 0; +} + static int rdt_bw_gran_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { @@ -563,74 +654,200 @@ static int rdt_delay_linear_show(struct kernfs_open_file *of, return 0; } +static int max_threshold_occ_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + + seq_printf(seq, "%u\n", intel_cqm_threshold * r->mon_scale); + + return 0; +} + +static ssize_t max_threshold_occ_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct rdt_resource *r = of->kn->parent->priv; + unsigned int bytes; + int ret; + + ret = kstrtouint(buf, 0, &bytes); + if (ret) + return ret; + + if (bytes > (boot_cpu_data.x86_cache_size * 1024)) + return -EINVAL; + + intel_cqm_threshold = bytes / r->mon_scale; + + return nbytes; +} + /* rdtgroup information files for one cache resource. 
*/ -static struct rftype res_cache_info_files[] = { +static struct rftype res_common_files[] = { { .name = "num_closids", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_num_closids_show, + .fflags = RF_CTRL_INFO, + }, + { + .name = "mon_features", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_mon_features_show, + .fflags = RF_MON_INFO, + }, + { + .name = "num_rmids", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_num_rmids_show, + .fflags = RF_MON_INFO, }, { .name = "cbm_mask", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_default_ctrl_show, + .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE, }, { .name = "min_cbm_bits", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_min_cbm_bits_show, + .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE, }, -}; - -/* rdtgroup information files for memory bandwidth. */ -static struct rftype res_mba_info_files[] = { { - .name = "num_closids", + .name = "shareable_bits", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_num_closids_show, + .seq_show = rdt_shareable_bits_show, + .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE, }, { .name = "min_bandwidth", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_min_bw_show, + .fflags = RF_CTRL_INFO | RFTYPE_RES_MB, }, { .name = "bandwidth_gran", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_bw_gran_show, + .fflags = RF_CTRL_INFO | RFTYPE_RES_MB, }, { .name = "delay_linear", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_delay_linear_show, + .fflags = RF_CTRL_INFO | RFTYPE_RES_MB, + }, + { + .name = "max_threshold_occupancy", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = max_threshold_occ_write, + .seq_show = max_threshold_occ_show, + .fflags = RF_MON_INFO | RFTYPE_RES_CACHE, + }, + { + .name = "cpus", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_cpus_write, + .seq_show = rdtgroup_cpus_show, + .fflags = RFTYPE_BASE, + }, + { + .name = "cpus_list", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_cpus_write, + .seq_show = rdtgroup_cpus_show, + .flags = RFTYPE_FLAGS_CPUS_LIST, + .fflags = RFTYPE_BASE, + }, + { + .name = "tasks", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_tasks_write, + .seq_show = rdtgroup_tasks_show, + .fflags = RFTYPE_BASE, + }, + { + .name = "schemata", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_schemata_write, + .seq_show = rdtgroup_schemata_show, + .fflags = RF_CTRL_BASE, }, }; -void rdt_get_mba_infofile(struct rdt_resource *r) +static int rdtgroup_add_files(struct kernfs_node *kn, unsigned long fflags) { - r->info_files = res_mba_info_files; - r->nr_info_files = ARRAY_SIZE(res_mba_info_files); + struct rftype *rfts, *rft; + int ret, len; + + rfts = res_common_files; + len = ARRAY_SIZE(res_common_files); + + lockdep_assert_held(&rdtgroup_mutex); + + for (rft = rfts; rft < rfts + len; rft++) { + if ((fflags & rft->fflags) == rft->fflags) { + ret = rdtgroup_add_file(kn, rft); + if (ret) + goto error; + } + } + + return 0; +error: + pr_warn("Failed to add %s, err=%d\n", rft->name, ret); + while (--rft >= rfts) { + if ((fflags & rft->fflags) == rft->fflags) + kernfs_remove_by_name(kn, rft->name); + } + return ret; } -void rdt_get_cache_infofile(struct rdt_resource *r) +static int rdtgroup_mkdir_info_resdir(struct rdt_resource *r, char *name, + unsigned long fflags) { - r->info_files = res_cache_info_files; - r->nr_info_files 
= ARRAY_SIZE(res_cache_info_files); + struct kernfs_node *kn_subdir; + int ret; + + kn_subdir = kernfs_create_dir(kn_info, name, + kn_info->mode, r); + if (IS_ERR(kn_subdir)) + return PTR_ERR(kn_subdir); + + kernfs_get(kn_subdir); + ret = rdtgroup_kn_set_ugid(kn_subdir); + if (ret) + return ret; + + ret = rdtgroup_add_files(kn_subdir, fflags); + if (!ret) + kernfs_activate(kn_subdir); + + return ret; } static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) { - struct kernfs_node *kn_subdir; - struct rftype *res_info_files; struct rdt_resource *r; - int ret, len; + unsigned long fflags; + char name[32]; + int ret; /* create the directory */ kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL); @@ -638,25 +855,19 @@ static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) return PTR_ERR(kn_info); kernfs_get(kn_info); - for_each_enabled_rdt_resource(r) { - kn_subdir = kernfs_create_dir(kn_info, r->name, - kn_info->mode, r); - if (IS_ERR(kn_subdir)) { - ret = PTR_ERR(kn_subdir); - goto out_destroy; - } - kernfs_get(kn_subdir); - ret = rdtgroup_kn_set_ugid(kn_subdir); + for_each_alloc_enabled_rdt_resource(r) { + fflags = r->fflags | RF_CTRL_INFO; + ret = rdtgroup_mkdir_info_resdir(r, r->name, fflags); if (ret) goto out_destroy; + } - res_info_files = r->info_files; - len = r->nr_info_files; - - ret = rdtgroup_add_files(kn_subdir, res_info_files, len); + for_each_mon_enabled_rdt_resource(r) { + fflags = r->fflags | RF_MON_INFO; + sprintf(name, "%s_MON", r->name); + ret = rdtgroup_mkdir_info_resdir(r, name, fflags); if (ret) goto out_destroy; - kernfs_activate(kn_subdir); } /* @@ -678,6 +889,39 @@ out_destroy: return ret; } +static int +mongroup_create_dir(struct kernfs_node *parent_kn, struct rdtgroup *prgrp, + char *name, struct kernfs_node **dest_kn) +{ + struct kernfs_node *kn; + int ret; + + /* create the directory */ + kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp); + if (IS_ERR(kn)) + return PTR_ERR(kn); + + if (dest_kn) + *dest_kn = kn; + + /* + * This extra ref will be put in kernfs_remove() and guarantees + * that @rdtgrp->kn is always accessible. 
+ */ + kernfs_get(kn); + + ret = rdtgroup_kn_set_ugid(kn); + if (ret) + goto out_destroy; + + kernfs_activate(kn); + + return 0; + +out_destroy: + kernfs_remove(kn); + return ret; +} static void l3_qos_cfg_update(void *arg) { bool *enable = arg; @@ -718,14 +962,15 @@ static int cdp_enable(void) struct rdt_resource *r_l3 = &rdt_resources_all[RDT_RESOURCE_L3]; int ret; - if (!r_l3->capable || !r_l3data->capable || !r_l3code->capable) + if (!r_l3->alloc_capable || !r_l3data->alloc_capable || + !r_l3code->alloc_capable) return -EINVAL; ret = set_l3_qos_cfg(r_l3, true); if (!ret) { - r_l3->enabled = false; - r_l3data->enabled = true; - r_l3code->enabled = true; + r_l3->alloc_enabled = false; + r_l3data->alloc_enabled = true; + r_l3code->alloc_enabled = true; } return ret; } @@ -734,11 +979,11 @@ static void cdp_disable(void) { struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3]; - r->enabled = r->capable; + r->alloc_enabled = r->alloc_capable; - if (rdt_resources_all[RDT_RESOURCE_L3DATA].enabled) { - rdt_resources_all[RDT_RESOURCE_L3DATA].enabled = false; - rdt_resources_all[RDT_RESOURCE_L3CODE].enabled = false; + if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled) { + rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled = false; + rdt_resources_all[RDT_RESOURCE_L3CODE].alloc_enabled = false; set_l3_qos_cfg(r, false); } } @@ -823,10 +1068,16 @@ void rdtgroup_kn_unlock(struct kernfs_node *kn) } } +static int mkdir_mondata_all(struct kernfs_node *parent_kn, + struct rdtgroup *prgrp, + struct kernfs_node **mon_data_kn); + static struct dentry *rdt_mount(struct file_system_type *fs_type, int flags, const char *unused_dev_name, void *data) { + struct rdt_domain *dom; + struct rdt_resource *r; struct dentry *dentry; int ret; @@ -853,15 +1104,54 @@ static struct dentry *rdt_mount(struct file_system_type *fs_type, goto out_cdp; } + if (rdt_mon_capable) { + ret = mongroup_create_dir(rdtgroup_default.kn, + NULL, "mon_groups", + &kn_mongrp); + if (ret) { + dentry = ERR_PTR(ret); + goto out_info; + } + kernfs_get(kn_mongrp); + + ret = mkdir_mondata_all(rdtgroup_default.kn, + &rdtgroup_default, &kn_mondata); + if (ret) { + dentry = ERR_PTR(ret); + goto out_mongrp; + } + kernfs_get(kn_mondata); + rdtgroup_default.mon.mon_data_kn = kn_mondata; + } + dentry = kernfs_mount(fs_type, flags, rdt_root, RDTGROUP_SUPER_MAGIC, NULL); if (IS_ERR(dentry)) - goto out_destroy; + goto out_mondata; + + if (rdt_alloc_capable) + static_branch_enable(&rdt_alloc_enable_key); + if (rdt_mon_capable) + static_branch_enable(&rdt_mon_enable_key); + + if (rdt_alloc_capable || rdt_mon_capable) + static_branch_enable(&rdt_enable_key); + + if (is_mbm_enabled()) { + r = &rdt_resources_all[RDT_RESOURCE_L3]; + list_for_each_entry(dom, &r->domains, list) + mbm_setup_overflow_handler(dom, MBM_OVERFLOW_INTERVAL); + } - static_branch_enable(&rdt_enable_key); goto out; -out_destroy: +out_mondata: + if (rdt_mon_capable) + kernfs_remove(kn_mondata); +out_mongrp: + if (rdt_mon_capable) + kernfs_remove(kn_mongrp); +out_info: kernfs_remove(kn_info); out_cdp: cdp_disable(); @@ -909,6 +1199,18 @@ static int reset_all_ctrls(struct rdt_resource *r) return 0; } +static bool is_closid_match(struct task_struct *t, struct rdtgroup *r) +{ + return (rdt_alloc_capable && + (r->type == RDTCTRL_GROUP) && (t->closid == r->closid)); +} + +static bool is_rmid_match(struct task_struct *t, struct rdtgroup *r) +{ + return (rdt_mon_capable && + (r->type == RDTMON_GROUP) && (t->rmid == r->mon.rmid)); +} + /* * Move tasks from one to the other group. 
If @from is NULL, then all tasks * in the systems are moved unconditionally (used for teardown). @@ -924,8 +1226,11 @@ static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to, read_lock(&tasklist_lock); for_each_process_thread(p, t) { - if (!from || t->closid == from->closid) { + if (!from || is_closid_match(t, from) || + is_rmid_match(t, from)) { t->closid = to->closid; + t->rmid = to->mon.rmid; + #ifdef CONFIG_SMP /* * This is safe on x86 w/o barriers as the ordering @@ -944,6 +1249,19 @@ static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to, read_unlock(&tasklist_lock); } +static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp) +{ + struct rdtgroup *sentry, *stmp; + struct list_head *head; + + head = &rdtgrp->mon.crdtgrp_list; + list_for_each_entry_safe(sentry, stmp, head, mon.crdtgrp_list) { + free_rmid(sentry->mon.rmid); + list_del(&sentry->mon.crdtgrp_list); + kfree(sentry); + } +} + /* * Forcibly remove all of subdirectories under root. */ @@ -955,6 +1273,9 @@ static void rmdir_all_sub(void) rdt_move_group_tasks(NULL, &rdtgroup_default, NULL); list_for_each_entry_safe(rdtgrp, tmp, &rdt_all_groups, rdtgroup_list) { + /* Free any child rmids */ + free_all_child_rdtgrp(rdtgrp); + /* Remove each rdtgroup other than root */ if (rdtgrp == &rdtgroup_default) continue; @@ -967,16 +1288,20 @@ static void rmdir_all_sub(void) cpumask_or(&rdtgroup_default.cpu_mask, &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); + free_rmid(rdtgrp->mon.rmid); + kernfs_remove(rdtgrp->kn); list_del(&rdtgrp->rdtgroup_list); kfree(rdtgrp); } /* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */ get_online_cpus(); - rdt_update_closid(cpu_online_mask, &rdtgroup_default.closid); + update_closid_rmid(cpu_online_mask, &rdtgroup_default); put_online_cpus(); kernfs_remove(kn_info); + kernfs_remove(kn_mongrp); + kernfs_remove(kn_mondata); } static void rdt_kill_sb(struct super_block *sb) @@ -986,10 +1311,12 @@ static void rdt_kill_sb(struct super_block *sb) mutex_lock(&rdtgroup_mutex); /*Put everything back to default values. */ - for_each_enabled_rdt_resource(r) + for_each_alloc_enabled_rdt_resource(r) reset_all_ctrls(r); cdp_disable(); rmdir_all_sub(); + static_branch_disable(&rdt_alloc_enable_key); + static_branch_disable(&rdt_mon_enable_key); static_branch_disable(&rdt_enable_key); kernfs_kill_sb(sb); mutex_unlock(&rdtgroup_mutex); @@ -1001,46 +1328,223 @@ static struct file_system_type rdt_fs_type = { .kill_sb = rdt_kill_sb, }; -static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name, - umode_t mode) +static int mon_addfile(struct kernfs_node *parent_kn, const char *name, + void *priv) { - struct rdtgroup *parent, *rdtgrp; struct kernfs_node *kn; - int ret, closid; + int ret = 0; - /* Only allow mkdir in the root directory */ - if (parent_kn != rdtgroup_default.kn) - return -EPERM; + kn = __kernfs_create_file(parent_kn, name, 0444, 0, + &kf_mondata_ops, priv, NULL, NULL); + if (IS_ERR(kn)) + return PTR_ERR(kn); - /* Do not accept '\n' to avoid unparsable situation. */ - if (strchr(name, '\n')) - return -EINVAL; + ret = rdtgroup_kn_set_ugid(kn); + if (ret) { + kernfs_remove(kn); + return ret; + } - parent = rdtgroup_kn_lock_live(parent_kn); - if (!parent) { - ret = -ENODEV; - goto out_unlock; + return ret; +} + +/* + * Remove all subdirectories of mon_data of ctrl_mon groups + * and monitor groups with given domain id. 
+ */ +void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, unsigned int dom_id) +{ + struct rdtgroup *prgrp, *crgrp; + char name[32]; + + if (!r->mon_enabled) + return; + + list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { + sprintf(name, "mon_%s_%02d", r->name, dom_id); + kernfs_remove_by_name(prgrp->mon.mon_data_kn, name); + + list_for_each_entry(crgrp, &prgrp->mon.crdtgrp_list, mon.crdtgrp_list) + kernfs_remove_by_name(crgrp->mon.mon_data_kn, name); } +} - ret = closid_alloc(); - if (ret < 0) +static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, + struct rdt_domain *d, + struct rdt_resource *r, struct rdtgroup *prgrp) +{ + union mon_data_bits priv; + struct kernfs_node *kn; + struct mon_evt *mevt; + struct rmid_read rr; + char name[32]; + int ret; + + sprintf(name, "mon_%s_%02d", r->name, d->id); + /* create the directory */ + kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp); + if (IS_ERR(kn)) + return PTR_ERR(kn); + + /* + * This extra ref will be put in kernfs_remove() and guarantees + * that kn is always accessible. + */ + kernfs_get(kn); + ret = rdtgroup_kn_set_ugid(kn); + if (ret) + goto out_destroy; + + if (WARN_ON(list_empty(&r->evt_list))) { + ret = -EPERM; + goto out_destroy; + } + + priv.u.rid = r->rid; + priv.u.domid = d->id; + list_for_each_entry(mevt, &r->evt_list, list) { + priv.u.evtid = mevt->evtid; + ret = mon_addfile(kn, mevt->name, priv.priv); + if (ret) + goto out_destroy; + + if (is_mbm_event(mevt->evtid)) + mon_event_read(&rr, d, prgrp, mevt->evtid, true); + } + kernfs_activate(kn); + return 0; + +out_destroy: + kernfs_remove(kn); + return ret; +} + +/* + * Add all subdirectories of mon_data for "ctrl_mon" groups + * and "monitor" groups with given domain id. + */ +void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, + struct rdt_domain *d) +{ + struct kernfs_node *parent_kn; + struct rdtgroup *prgrp, *crgrp; + struct list_head *head; + + if (!r->mon_enabled) + return; + + list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { + parent_kn = prgrp->mon.mon_data_kn; + mkdir_mondata_subdir(parent_kn, d, r, prgrp); + + head = &prgrp->mon.crdtgrp_list; + list_for_each_entry(crgrp, head, mon.crdtgrp_list) { + parent_kn = crgrp->mon.mon_data_kn; + mkdir_mondata_subdir(parent_kn, d, r, crgrp); + } + } +} + +static int mkdir_mondata_subdir_alldom(struct kernfs_node *parent_kn, + struct rdt_resource *r, + struct rdtgroup *prgrp) +{ + struct rdt_domain *dom; + int ret; + + list_for_each_entry(dom, &r->domains, list) { + ret = mkdir_mondata_subdir(parent_kn, dom, r, prgrp); + if (ret) + return ret; + } + + return 0; +} + +/* + * This creates a directory mon_data which contains the monitored data. + * + * mon_data has one directory for each domain whic are named + * in the format mon_<domain_name>_<domain_id>. For ex: A mon_data + * with L3 domain looks as below: + * ./mon_data: + * mon_L3_00 + * mon_L3_01 + * mon_L3_02 + * ... + * + * Each domain directory has one file per event: + * ./mon_L3_00/: + * llc_occupancy + * + */ +static int mkdir_mondata_all(struct kernfs_node *parent_kn, + struct rdtgroup *prgrp, + struct kernfs_node **dest_kn) +{ + struct rdt_resource *r; + struct kernfs_node *kn; + int ret; + + /* + * Create the mon_data directory first. + */ + ret = mongroup_create_dir(parent_kn, NULL, "mon_data", &kn); + if (ret) + return ret; + + if (dest_kn) + *dest_kn = kn; + + /* + * Create the subdirectories for each domain. 
Note that all events + * in a domain like L3 are grouped into a resource whose domain is L3 + */ + for_each_mon_enabled_rdt_resource(r) { + ret = mkdir_mondata_subdir_alldom(kn, r, prgrp); + if (ret) + goto out_destroy; + } + + return 0; + +out_destroy: + kernfs_remove(kn); + return ret; +} + +static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, + struct kernfs_node *prgrp_kn, + const char *name, umode_t mode, + enum rdt_group_type rtype, struct rdtgroup **r) +{ + struct rdtgroup *prdtgrp, *rdtgrp; + struct kernfs_node *kn; + uint files = 0; + int ret; + + prdtgrp = rdtgroup_kn_lock_live(prgrp_kn); + if (!prdtgrp) { + ret = -ENODEV; goto out_unlock; - closid = ret; + } /* allocate the rdtgroup. */ rdtgrp = kzalloc(sizeof(*rdtgrp), GFP_KERNEL); if (!rdtgrp) { ret = -ENOSPC; - goto out_closid_free; + goto out_unlock; } - rdtgrp->closid = closid; - list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups); + *r = rdtgrp; + rdtgrp->mon.parent = prdtgrp; + rdtgrp->type = rtype; + INIT_LIST_HEAD(&rdtgrp->mon.crdtgrp_list); /* kernfs creates the directory for rdtgrp */ - kn = kernfs_create_dir(parent->kn, name, mode, rdtgrp); + kn = kernfs_create_dir(parent_kn, name, mode, rdtgrp); if (IS_ERR(kn)) { ret = PTR_ERR(kn); - goto out_cancel_ref; + goto out_free_rgrp; } rdtgrp->kn = kn; @@ -1056,43 +1560,211 @@ static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name, if (ret) goto out_destroy; - ret = rdtgroup_add_files(kn, rdtgroup_base_files, - ARRAY_SIZE(rdtgroup_base_files)); + files = RFTYPE_BASE | RFTYPE_CTRL; + files = RFTYPE_BASE | BIT(RF_CTRLSHIFT + rtype); + ret = rdtgroup_add_files(kn, files); if (ret) goto out_destroy; + if (rdt_mon_capable) { + ret = alloc_rmid(); + if (ret < 0) + goto out_destroy; + rdtgrp->mon.rmid = ret; + + ret = mkdir_mondata_all(kn, rdtgrp, &rdtgrp->mon.mon_data_kn); + if (ret) + goto out_idfree; + } kernfs_activate(kn); - ret = 0; - goto out_unlock; + /* + * The caller unlocks the prgrp_kn upon success. + */ + return 0; +out_idfree: + free_rmid(rdtgrp->mon.rmid); out_destroy: kernfs_remove(rdtgrp->kn); -out_cancel_ref: - list_del(&rdtgrp->rdtgroup_list); +out_free_rgrp: kfree(rdtgrp); -out_closid_free: - closid_free(closid); out_unlock: - rdtgroup_kn_unlock(parent_kn); + rdtgroup_kn_unlock(prgrp_kn); return ret; } -static int rdtgroup_rmdir(struct kernfs_node *kn) +static void mkdir_rdt_prepare_clean(struct rdtgroup *rgrp) +{ + kernfs_remove(rgrp->kn); + free_rmid(rgrp->mon.rmid); + kfree(rgrp); +} + +/* + * Create a monitor group under "mon_groups" directory of a control + * and monitor group(ctrl_mon). This is a resource group + * to monitor a subset of tasks and cpus in its parent ctrl_mon group. + */ +static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn, + struct kernfs_node *prgrp_kn, + const char *name, + umode_t mode) +{ + struct rdtgroup *rdtgrp, *prgrp; + int ret; + + ret = mkdir_rdt_prepare(parent_kn, prgrp_kn, name, mode, RDTMON_GROUP, + &rdtgrp); + if (ret) + return ret; + + prgrp = rdtgrp->mon.parent; + rdtgrp->closid = prgrp->closid; + + /* + * Add the rdtgrp to the list of rdtgrps the parent + * ctrl_mon group has to track. + */ + list_add_tail(&rdtgrp->mon.crdtgrp_list, &prgrp->mon.crdtgrp_list); + + rdtgroup_kn_unlock(prgrp_kn); + return ret; +} + +/* + * These are rdtgroups created under the root directory. Can be used + * to allocate and monitor resources. 
+ */ +static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn, + struct kernfs_node *prgrp_kn, + const char *name, umode_t mode) { - int ret, cpu, closid = rdtgroup_default.closid; struct rdtgroup *rdtgrp; - cpumask_var_t tmpmask; + struct kernfs_node *kn; + u32 closid; + int ret; - if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) - return -ENOMEM; + ret = mkdir_rdt_prepare(parent_kn, prgrp_kn, name, mode, RDTCTRL_GROUP, + &rdtgrp); + if (ret) + return ret; - rdtgrp = rdtgroup_kn_lock_live(kn); - if (!rdtgrp) { - ret = -EPERM; - goto out; + kn = rdtgrp->kn; + ret = closid_alloc(); + if (ret < 0) + goto out_common_fail; + closid = ret; + + rdtgrp->closid = closid; + list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups); + + if (rdt_mon_capable) { + /* + * Create an empty mon_groups directory to hold the subset + * of tasks and cpus to monitor. + */ + ret = mongroup_create_dir(kn, NULL, "mon_groups", NULL); + if (ret) + goto out_id_free; } + goto out_unlock; + +out_id_free: + closid_free(closid); + list_del(&rdtgrp->rdtgroup_list); +out_common_fail: + mkdir_rdt_prepare_clean(rdtgrp); +out_unlock: + rdtgroup_kn_unlock(prgrp_kn); + return ret; +} + +/* + * We allow creating mon groups only with in a directory called "mon_groups" + * which is present in every ctrl_mon group. Check if this is a valid + * "mon_groups" directory. + * + * 1. The directory should be named "mon_groups". + * 2. The mon group itself should "not" be named "mon_groups". + * This makes sure "mon_groups" directory always has a ctrl_mon group + * as parent. + */ +static bool is_mon_groups(struct kernfs_node *kn, const char *name) +{ + return (!strcmp(kn->name, "mon_groups") && + strcmp(name, "mon_groups")); +} + +static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name, + umode_t mode) +{ + /* Do not accept '\n' to avoid unparsable situation. */ + if (strchr(name, '\n')) + return -EINVAL; + + /* + * If the parent directory is the root directory and RDT + * allocation is supported, add a control and monitoring + * subdirectory + */ + if (rdt_alloc_capable && parent_kn == rdtgroup_default.kn) + return rdtgroup_mkdir_ctrl_mon(parent_kn, parent_kn, name, mode); + + /* + * If RDT monitoring is supported and the parent directory is a valid + * "mon_groups" directory, add a monitoring subdirectory. + */ + if (rdt_mon_capable && is_mon_groups(parent_kn, name)) + return rdtgroup_mkdir_mon(parent_kn, parent_kn->parent, name, mode); + + return -EPERM; +} + +static int rdtgroup_rmdir_mon(struct kernfs_node *kn, struct rdtgroup *rdtgrp, + cpumask_var_t tmpmask) +{ + struct rdtgroup *prdtgrp = rdtgrp->mon.parent; + int cpu; + + /* Give any tasks back to the parent group */ + rdt_move_group_tasks(rdtgrp, prdtgrp, tmpmask); + + /* Update per cpu rmid of the moved CPUs first */ + for_each_cpu(cpu, &rdtgrp->cpu_mask) + per_cpu(pqr_state.default_rmid, cpu) = prdtgrp->mon.rmid; + /* + * Update the MSR on moved CPUs and CPUs which have moved + * task running on them. 
+ */ + cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask); + update_closid_rmid(tmpmask, NULL); + + rdtgrp->flags = RDT_DELETED; + free_rmid(rdtgrp->mon.rmid); + + /* + * Remove the rdtgrp from the parent ctrl_mon group's list + */ + WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list)); + list_del(&rdtgrp->mon.crdtgrp_list); + + /* + * one extra hold on this, will drop when we kfree(rdtgrp) + * in rdtgroup_kn_unlock() + */ + kernfs_get(kn); + kernfs_remove(rdtgrp->kn); + + return 0; +} + +static int rdtgroup_rmdir_ctrl(struct kernfs_node *kn, struct rdtgroup *rdtgrp, + cpumask_var_t tmpmask) +{ + int cpu; + /* Give any tasks back to the default group */ rdt_move_group_tasks(rdtgrp, &rdtgroup_default, tmpmask); @@ -1100,18 +1772,28 @@ static int rdtgroup_rmdir(struct kernfs_node *kn) cpumask_or(&rdtgroup_default.cpu_mask, &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); - /* Update per cpu closid of the moved CPUs first */ - for_each_cpu(cpu, &rdtgrp->cpu_mask) - per_cpu(cpu_closid, cpu) = closid; + /* Update per cpu closid and rmid of the moved CPUs first */ + for_each_cpu(cpu, &rdtgrp->cpu_mask) { + per_cpu(pqr_state.default_closid, cpu) = rdtgroup_default.closid; + per_cpu(pqr_state.default_rmid, cpu) = rdtgroup_default.mon.rmid; + } + /* * Update the MSR on moved CPUs and CPUs which have moved * task running on them. */ cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask); - rdt_update_closid(tmpmask, NULL); + update_closid_rmid(tmpmask, NULL); rdtgrp->flags = RDT_DELETED; closid_free(rdtgrp->closid); + free_rmid(rdtgrp->mon.rmid); + + /* + * Free all the child monitor group rmids. + */ + free_all_child_rdtgrp(rdtgrp); + list_del(&rdtgrp->rdtgroup_list); /* @@ -1120,7 +1802,41 @@ static int rdtgroup_rmdir(struct kernfs_node *kn) */ kernfs_get(kn); kernfs_remove(rdtgrp->kn); - ret = 0; + + return 0; +} + +static int rdtgroup_rmdir(struct kernfs_node *kn) +{ + struct kernfs_node *parent_kn = kn->parent; + struct rdtgroup *rdtgrp; + cpumask_var_t tmpmask; + int ret = 0; + + if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) + return -ENOMEM; + + rdtgrp = rdtgroup_kn_lock_live(kn); + if (!rdtgrp) { + ret = -EPERM; + goto out; + } + + /* + * If the rdtgroup is a ctrl_mon group and parent directory + * is the root directory, remove the ctrl_mon group. + * + * If the rdtgroup is a mon group and parent directory + * is a valid "mon_groups" directory, remove the mon group. 
+ */ + if (rdtgrp->type == RDTCTRL_GROUP && parent_kn == rdtgroup_default.kn) + ret = rdtgroup_rmdir_ctrl(kn, rdtgrp, tmpmask); + else if (rdtgrp->type == RDTMON_GROUP && + is_mon_groups(parent_kn, kn->name)) + ret = rdtgroup_rmdir_mon(kn, rdtgrp, tmpmask); + else + ret = -EPERM; + out: rdtgroup_kn_unlock(kn); free_cpumask_var(tmpmask); @@ -1129,7 +1845,7 @@ out: static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf) { - if (rdt_resources_all[RDT_RESOURCE_L3DATA].enabled) + if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled) seq_puts(seq, ",cdp"); return 0; } @@ -1153,10 +1869,13 @@ static int __init rdtgroup_setup_root(void) mutex_lock(&rdtgroup_mutex); rdtgroup_default.closid = 0; + rdtgroup_default.mon.rmid = 0; + rdtgroup_default.type = RDTCTRL_GROUP; + INIT_LIST_HEAD(&rdtgroup_default.mon.crdtgrp_list); + list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups); - ret = rdtgroup_add_files(rdt_root->kn, rdtgroup_base_files, - ARRAY_SIZE(rdtgroup_base_files)); + ret = rdtgroup_add_files(rdt_root->kn, RF_CTRL_BASE); if (ret) { kernfs_destroy_root(rdt_root); goto out; diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 9e314bcf67cc..40e28ed77fbf 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -201,8 +201,8 @@ static void smca_configure(unsigned int bank, unsigned int cpu) wrmsr(smca_config, low, high); } - /* Collect bank_info using CPU 0 for now. */ - if (cpu) + /* Return early if this bank was already initialized. */ + if (smca_banks[bank].hwid) return; if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_IPID(bank), &low, &high)) { @@ -216,11 +216,6 @@ static void smca_configure(unsigned int bank, unsigned int cpu) for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) { s_hwid = &smca_hwid_mcatypes[i]; if (hwid_mcatype == s_hwid->hwid_mcatype) { - - WARN(smca_banks[bank].hwid, - "Bank %s already initialized!\n", - smca_get_name(s_hwid->bank_type)); - smca_banks[bank].hwid = s_hwid; smca_banks[bank].id = low; smca_banks[bank].sysfs_id = s_hwid->count++; @@ -776,24 +771,12 @@ static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc) mce_log(&m); } -static inline void __smp_deferred_error_interrupt(void) -{ - inc_irq_stat(irq_deferred_error_count); - deferred_error_int_vector(); -} - asmlinkage __visible void __irq_entry smp_deferred_error_interrupt(void) { entering_irq(); - __smp_deferred_error_interrupt(); - exiting_ack_irq(); -} - -asmlinkage __visible void __irq_entry smp_trace_deferred_error_interrupt(void) -{ - entering_irq(); trace_deferred_error_apic_entry(DEFERRED_ERROR_VECTOR); - __smp_deferred_error_interrupt(); + inc_irq_stat(irq_deferred_error_count); + deferred_error_int_vector(); trace_deferred_error_apic_exit(DEFERRED_ERROR_VECTOR); exiting_ack_irq(); } diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index f7370abd33c6..2da67b70ba98 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -390,26 +390,12 @@ static void unexpected_thermal_interrupt(void) static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt; -static inline void __smp_thermal_interrupt(void) -{ - inc_irq_stat(irq_thermal_count); - smp_thermal_vector(); -} - -asmlinkage __visible void __irq_entry -smp_thermal_interrupt(struct pt_regs *regs) -{ - entering_irq(); - __smp_thermal_interrupt(); - exiting_ack_irq(); -} - -asmlinkage __visible void __irq_entry 
-smp_trace_thermal_interrupt(struct pt_regs *regs) +asmlinkage __visible void __irq_entry smp_thermal_interrupt(struct pt_regs *r) { entering_irq(); trace_thermal_apic_entry(THERMAL_APIC_VECTOR); - __smp_thermal_interrupt(); + inc_irq_stat(irq_thermal_count); + smp_thermal_vector(); trace_thermal_apic_exit(THERMAL_APIC_VECTOR); exiting_ack_irq(); } diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c index bb0e75eed10a..5e7249e42f8f 100644 --- a/arch/x86/kernel/cpu/mcheck/threshold.c +++ b/arch/x86/kernel/cpu/mcheck/threshold.c @@ -17,24 +17,12 @@ static void default_threshold_interrupt(void) void (*mce_threshold_vector)(void) = default_threshold_interrupt; -static inline void __smp_threshold_interrupt(void) -{ - inc_irq_stat(irq_threshold_count); - mce_threshold_vector(); -} - asmlinkage __visible void __irq_entry smp_threshold_interrupt(void) { entering_irq(); - __smp_threshold_interrupt(); - exiting_ack_irq(); -} - -asmlinkage __visible void __irq_entry smp_trace_threshold_interrupt(void) -{ - entering_irq(); trace_threshold_apic_entry(THRESHOLD_APIC_VECTOR); - __smp_threshold_interrupt(); + inc_irq_stat(irq_threshold_count); + mce_threshold_vector(); trace_threshold_apic_exit(THRESHOLD_APIC_VECTOR); exiting_ack_irq(); } diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 21b185793c80..c6daec4bdba5 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -400,9 +400,12 @@ static void update_cache(struct ucode_patch *new_patch) list_for_each_entry(p, µcode_cache, plist) { if (p->equiv_cpu == new_patch->equiv_cpu) { - if (p->patch_id >= new_patch->patch_id) + if (p->patch_id >= new_patch->patch_id) { /* we already have the latest patch */ + kfree(new_patch->data); + kfree(new_patch); return; + } list_replace(&p->plist, &new_patch->plist); kfree(p->data); diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 59edbe9d4ccb..8f7a9bbad514 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -146,18 +146,18 @@ static bool microcode_matches(struct microcode_header_intel *mc_header, return false; } -static struct ucode_patch *__alloc_microcode_buf(void *data, unsigned int size) +static struct ucode_patch *memdup_patch(void *data, unsigned int size) { struct ucode_patch *p; p = kzalloc(sizeof(struct ucode_patch), GFP_KERNEL); if (!p) - return ERR_PTR(-ENOMEM); + return NULL; p->data = kmemdup(data, size, GFP_KERNEL); if (!p->data) { kfree(p); - return ERR_PTR(-ENOMEM); + return NULL; } return p; @@ -183,8 +183,8 @@ static void save_microcode_patch(void *data, unsigned int size) if (mc_hdr->rev <= mc_saved_hdr->rev) continue; - p = __alloc_microcode_buf(data, size); - if (IS_ERR(p)) + p = memdup_patch(data, size); + if (!p) pr_err("Error allocating buffer %p\n", data); else list_replace(&iter->plist, &p->plist); @@ -196,24 +196,25 @@ static void save_microcode_patch(void *data, unsigned int size) * newly found. */ if (!prev_found) { - p = __alloc_microcode_buf(data, size); - if (IS_ERR(p)) + p = memdup_patch(data, size); + if (!p) pr_err("Error allocating buffer for %p\n", data); else list_add_tail(&p->plist, µcode_cache); } + if (!p) + return; + /* * Save for early loading. On 32-bit, that needs to be a physical * address as the APs are running from physical addresses, before * paging has been enabled. 
*/ - if (p) { - if (IS_ENABLED(CONFIG_X86_32)) - intel_ucode_patch = (struct microcode_intel *)__pa_nodebug(p->data); - else - intel_ucode_patch = p->data; - } + if (IS_ENABLED(CONFIG_X86_32)) + intel_ucode_patch = (struct microcode_intel *)__pa_nodebug(p->data); + else + intel_ucode_patch = p->data; } static int microcode_sanity_check(void *mc, int print_err) diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 70e717fccdd6..3b3f713e15e5 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -59,13 +59,8 @@ void hyperv_vector_handler(struct pt_regs *regs) void hv_setup_vmbus_irq(void (*handler)(void)) { vmbus_handler = handler; - /* - * Setup the IDT for hypervisor callback. Prevent reallocation - * at module reload. - */ - if (!test_bit(HYPERVISOR_CALLBACK_VECTOR, used_vectors)) - alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, - hyperv_callback_vector); + /* Setup the IDT for hypervisor callback */ + alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, hyperv_callback_vector); } void hv_remove_vmbus_irq(void) @@ -184,9 +179,15 @@ static void __init ms_hyperv_init_platform(void) ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES); ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO); - pr_info("HyperV: features 0x%x, hints 0x%x\n", + pr_info("Hyper-V: features 0x%x, hints 0x%x\n", ms_hyperv.features, ms_hyperv.hints); + ms_hyperv.max_vp_index = cpuid_eax(HVCPUID_IMPLEMENTATION_LIMITS); + ms_hyperv.max_lp_index = cpuid_ebx(HVCPUID_IMPLEMENTATION_LIMITS); + + pr_debug("Hyper-V: max %u virtual processors, %u logical processors\n", + ms_hyperv.max_vp_index, ms_hyperv.max_lp_index); + /* * Extract host information. */ @@ -219,7 +220,7 @@ static void __init ms_hyperv_init_platform(void) rdmsrl(HV_X64_MSR_APIC_FREQUENCY, hv_lapic_frequency); hv_lapic_frequency = div_u64(hv_lapic_frequency, HZ); lapic_timer_frequency = hv_lapic_frequency; - pr_info("HyperV: LAPIC Timer Frequency: %#x\n", + pr_info("Hyper-V: LAPIC Timer Frequency: %#x\n", lapic_timer_frequency); } @@ -249,11 +250,12 @@ static void __init ms_hyperv_init_platform(void) * Setup the hook to get control post apic initialization. */ x86_platform.apic_post_init = hyperv_init; + hyperv_setup_mmu_ops(); #endif } const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = { - .name = "Microsoft HyperV", + .name = "Microsoft Hyper-V", .detect = ms_hyperv_platform, .init_platform = ms_hyperv_init_platform, }; diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index dbce3cca94cb..f13b4c00a5de 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -94,6 +94,9 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, if (stack_name) printk("%s <%s>\n", log_lvl, stack_name); + if (regs && on_stack(&stack_info, regs, sizeof(*regs))) + __show_regs(regs, 0); + /* * Scan the stack, printing any text addresses we find. At the * same time, follow proper stack frames with the unwinder. @@ -118,10 +121,8 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, * Don't print regs->ip again if it was already printed * by __show_regs() below. */ - if (regs && stack == ®s->ip) { - unwind_next_frame(&state); - continue; - } + if (regs && stack == ®s->ip) + goto next; if (stack == ret_addr_p) reliable = 1; @@ -144,6 +145,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, if (!reliable) continue; +next: /* * Get the next frame from the unwinder. 
No need to
* check for an error: if anything goes wrong, the rest
@@ -153,7 +155,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
/* if the frame has entry regs, print them */
regs = unwind_get_entry_regs(&state);
- if (regs)
+ if (regs && on_stack(&stack_info, regs, sizeof(*regs)))
__show_regs(regs, 0);
}
@@ -265,7 +267,7 @@ int __die(const char *str, struct pt_regs *regs, long err)
#ifdef CONFIG_X86_32
if (user_mode(regs)) {
sp = regs->sp;
- ss = regs->ss & 0xffff;
+ ss = regs->ss;
} else {
sp = kernel_stack_pointer(regs);
savesegment(ss, ss);
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index e5f0b40e66d2..4f0481474903 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -37,7 +37,7 @@ static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info)
* This is a software stack, so 'end' can be a valid stack pointer.
* It just means the stack is empty.
*/
- if (stack < begin || stack > end)
+ if (stack <= begin || stack > end)
return false;
info->type = STACK_TYPE_IRQ;
@@ -62,7 +62,7 @@ static bool in_softirq_stack(unsigned long *stack, struct stack_info *info)
* This is a software stack, so 'end' can be a valid stack pointer.
* It just means the stack is empty.
*/
- if (stack < begin || stack > end)
+ if (stack <= begin || stack > end)
return false;
info->type = STACK_TYPE_SOFTIRQ;
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 3e1471d57487..225af4184f06 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -55,7 +55,7 @@ static bool in_exception_stack(unsigned long *stack, struct stack_info *info)
begin = end - (exception_stack_sizes[k] / sizeof(long));
regs = (struct pt_regs *)end - 1;
- if (stack < begin || stack >= end)
+ if (stack <= begin || stack >= end)
continue;
info->type = STACK_TYPE_EXCEPTION + k;
@@ -78,7 +78,7 @@ static bool in_irq_stack(unsigned long *stack, struct stack_info *info)
* This is a software stack, so 'end' can be a valid stack pointer.
* It just means the stack is empty.
*/
- if (stack < begin || stack > end)
+ if (stack <= begin || stack > end)
return false;
info->type = STACK_TYPE_IRQ;
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index d907c3d8633f..927abeaf63e2 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -12,10 +12,10 @@
#include <linux/pci.h>
#include <linux/acpi.h>
#include <linux/delay.h>
-#include <linux/dmi.h>
#include <linux/pci_ids.h>
#include <linux/bcma/bcma.h>
#include <linux/bcma/bcma_regs.h>
+#include <linux/platform_data/x86/apple.h>
#include <drm/i915_drm.h>
#include <asm/pci-direct.h>
#include <asm/dma.h>
@@ -527,6 +527,7 @@ static const struct pci_device_id intel_early_ids[] __initconst = {
INTEL_BXT_IDS(&gen9_early_ops),
INTEL_KBL_IDS(&gen9_early_ops),
INTEL_GLK_IDS(&gen9_early_ops),
+ INTEL_CNL_IDS(&gen9_early_ops),
};
static void __init
@@ -593,7 +594,7 @@ static void __init apple_airport_reset(int bus, int slot, int func)
u64 addr;
int i;
- if (!dmi_match(DMI_SYS_VENDOR, "Apple Inc."))
+ if (!x86_apple_machine)
return;
/* Card may have been put into PCI_D3hot by grub quirk */
diff --git a/arch/x86/kernel/eisa.c b/arch/x86/kernel/eisa.c
new file mode 100644
index 000000000000..f260e452e4f8
--- /dev/null
+++ b/arch/x86/kernel/eisa.c
@@ -0,0 +1,19 @@
+/*
+ * EISA specific code
+ *
+ * This file is licensed under the GPL V2
+ */
+#include <linux/ioport.h>
+#include <linux/eisa.h>
+#include <linux/io.h>
+
+static __init int eisa_bus_probe(void)
+{
+ void __iomem *p = ioremap(0x0FFFD9, 4);
+
+ if (readl(p) == 'E' + ('I'<<8) + ('S'<<16) + ('A'<<24))
+ EISA_bus = 1;
+ iounmap(p);
+ return 0;
+}
+subsys_initcall(eisa_bus_probe);
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index 538ec012b371..cf2ce063f65a 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -10,6 +10,7 @@
#include <linux/mm.h>
#include <linux/memblock.h>
+#include <asm/desc.h>
#include <asm/setup.h>
#include <asm/sections.h>
#include <asm/e820/api.h>
@@ -30,6 +31,9 @@ static void __init i386_default_early_setup(void)
asmlinkage __visible void __init i386_start_kernel(void)
{
cr4_init_shadow();
+
+ idt_setup_early_handler();
+
sanitize_boot_params(&boot_params);
x86_early_init_platform_quirks();
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 6a193b93fd95..bab4fa579450 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -311,8 +311,6 @@ static void __init copy_bootdata(char *real_mode_data)
asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
{
- int i;
-
/*
* Build-time sanity checks on the kernel image and module
* area mappings. (these are purely build-time and produce no code)
@@ -345,9 +343,7 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
kasan_early_init();
- for (i = 0; i < NUM_EXCEPTION_VECTORS; i++)
- set_intr_gate(i, early_idt_handler_array[i]);
- load_idt((const struct desc_ptr *)&idt_descr);
+ idt_setup_early_handler();
copy_bootdata(__va(real_mode_data));
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 1f85ee8f9439..9ed3074d0d27 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -155,7 +155,6 @@ ENTRY(startup_32)
jmp *%eax
.Lbad_subarch:
-WEAK(lguest_entry)
WEAK(xen_entry)
/* Unknown implementation; there's really nothing we can do at this point.
*/ @@ -165,7 +164,6 @@ WEAK(xen_entry) subarch_entries: .long .Ldefault_entry /* normal x86/PC */ - .long lguest_entry /* lguest hypervisor */ .long xen_entry /* Xen hypervisor */ .long .Ldefault_entry /* Moorestown MID */ num_subarch_entries = (. - subarch_entries) / 4 @@ -347,7 +345,6 @@ ENTRY(startup_32_smp) movl %eax,%cr0 lgdt early_gdt_descr - lidt idt_descr ljmp $(__KERNEL_CS),$1f 1: movl $(__KERNEL_DS),%eax # reload all the segment registers movl %eax,%ss # after changing gdt. @@ -380,37 +377,6 @@ ENDPROC(startup_32_smp) */ __INIT setup_once: - /* - * Set up a idt with 256 interrupt gates that push zero if there - * is no error code and then jump to early_idt_handler_common. - * It doesn't actually load the idt - that needs to be done on - * each CPU. Interrupts are enabled elsewhere, when we can be - * relatively sure everything is ok. - */ - - movl $idt_table,%edi - movl $early_idt_handler_array,%eax - movl $NUM_EXCEPTION_VECTORS,%ecx -1: - movl %eax,(%edi) - movl %eax,4(%edi) - /* interrupt gate, dpl=0, present */ - movl $(0x8E000000 + __KERNEL_CS),2(%edi) - addl $EARLY_IDT_HANDLER_SIZE,%eax - addl $8,%edi - loop 1b - - movl $256 - NUM_EXCEPTION_VECTORS,%ecx - movl $ignore_int,%edx - movl $(__KERNEL_CS << 16),%eax - movw %dx,%ax /* selector = 0x0010 = cs */ - movw $0x8E00,%dx /* interrupt gate - dpl=0, present */ -2: - movl %eax,(%edi) - movl %edx,4(%edi) - addl $8,%edi - loop 2b - #ifdef CONFIG_CC_STACKPROTECTOR /* * Configure the stack canary. The linker can't handle this by @@ -457,12 +423,9 @@ early_idt_handler_common: /* The vector number is in pt_regs->gs */ cld - pushl %fs /* pt_regs->fs */ - movw $0, 2(%esp) /* clear high bits (some CPUs leave garbage) */ - pushl %es /* pt_regs->es */ - movw $0, 2(%esp) /* clear high bits (some CPUs leave garbage) */ - pushl %ds /* pt_regs->ds */ - movw $0, 2(%esp) /* clear high bits (some CPUs leave garbage) */ + pushl %fs /* pt_regs->fs (__fsh varies by model) */ + pushl %es /* pt_regs->es (__esh varies by model) */ + pushl %ds /* pt_regs->ds (__dsh varies by model) */ pushl %eax /* pt_regs->ax */ pushl %ebp /* pt_regs->bp */ pushl %edi /* pt_regs->di */ @@ -479,9 +442,8 @@ early_idt_handler_common: /* Load the vector number into EDX */ movl PT_GS(%esp), %edx - /* Load GS into pt_regs->gs and clear high bits */ + /* Load GS into pt_regs->gs (and maybe clobber __gsh) */ movw %gs, PT_GS(%esp) - movw $0, PT_GS+2(%esp) movl %esp, %eax /* args are pt_regs (EAX), trapnr (EDX) */ call early_fixup_exception @@ -493,18 +455,17 @@ early_idt_handler_common: popl %edi /* pt_regs->di */ popl %ebp /* pt_regs->bp */ popl %eax /* pt_regs->ax */ - popl %ds /* pt_regs->ds */ - popl %es /* pt_regs->es */ - popl %fs /* pt_regs->fs */ - popl %gs /* pt_regs->gs */ + popl %ds /* pt_regs->ds (always ignores __dsh) */ + popl %es /* pt_regs->es (always ignores __esh) */ + popl %fs /* pt_regs->fs (always ignores __fsh) */ + popl %gs /* pt_regs->gs (always ignores __gsh) */ decl %ss:early_recursion_flag addl $4, %esp /* pop pt_regs->orig_ax */ iret ENDPROC(early_idt_handler_common) /* This is the default interrupt "handler" :-) */ - ALIGN -ignore_int: +ENTRY(early_ignore_irq) cld #ifdef CONFIG_PRINTK pushl %eax @@ -539,7 +500,8 @@ ignore_int: hlt_loop: hlt jmp hlt_loop -ENDPROC(ignore_int) +ENDPROC(early_ignore_irq) + __INITDATA .align 4 GLOBAL(early_recursion_flag) @@ -628,7 +590,6 @@ int_msg: .data .globl boot_gdt_descr -.globl idt_descr ALIGN # early boot GDT descriptor (must use 1:1 address mapping) @@ -637,11 +598,6 @@ boot_gdt_descr: .word __BOOT_DS+7 .long 
boot_gdt - __PAGE_OFFSET - .word 0 # 32-bit align idt_desc.address -idt_descr: - .word IDT_ENTRIES*8-1 # idt contains 256 entries - .long idt_table - # boot GDT descriptor (later on used by CPU#0): .word 0 # 32 bit align gdt_desc.address ENTRY(early_gdt_descr) diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c new file mode 100644 index 000000000000..6107ee1cb8d5 --- /dev/null +++ b/arch/x86/kernel/idt.c @@ -0,0 +1,371 @@ +/* + * Interrupt descriptor table related code + * + * This file is licensed under the GPL V2 + */ +#include <linux/interrupt.h> + +#include <asm/traps.h> +#include <asm/proto.h> +#include <asm/desc.h> + +struct idt_data { + unsigned int vector; + unsigned int segment; + struct idt_bits bits; + const void *addr; +}; + +#define DPL0 0x0 +#define DPL3 0x3 + +#define DEFAULT_STACK 0 + +#define G(_vector, _addr, _ist, _type, _dpl, _segment) \ + { \ + .vector = _vector, \ + .bits.ist = _ist, \ + .bits.type = _type, \ + .bits.dpl = _dpl, \ + .bits.p = 1, \ + .addr = _addr, \ + .segment = _segment, \ + } + +/* Interrupt gate */ +#define INTG(_vector, _addr) \ + G(_vector, _addr, DEFAULT_STACK, GATE_INTERRUPT, DPL0, __KERNEL_CS) + +/* System interrupt gate */ +#define SYSG(_vector, _addr) \ + G(_vector, _addr, DEFAULT_STACK, GATE_INTERRUPT, DPL3, __KERNEL_CS) + +/* Interrupt gate with interrupt stack */ +#define ISTG(_vector, _addr, _ist) \ + G(_vector, _addr, _ist, GATE_INTERRUPT, DPL0, __KERNEL_CS) + +/* System interrupt gate with interrupt stack */ +#define SISTG(_vector, _addr, _ist) \ + G(_vector, _addr, _ist, GATE_INTERRUPT, DPL3, __KERNEL_CS) + +/* Task gate */ +#define TSKG(_vector, _gdt) \ + G(_vector, NULL, DEFAULT_STACK, GATE_TASK, DPL0, _gdt << 3) + +/* + * Early traps running on the DEFAULT_STACK because the other interrupt + * stacks work only after cpu_init(). + */ +static const __initdata struct idt_data early_idts[] = { + INTG(X86_TRAP_DB, debug), + SYSG(X86_TRAP_BP, int3), +#ifdef CONFIG_X86_32 + INTG(X86_TRAP_PF, page_fault), +#endif +}; + +/* + * The default IDT entries which are set up in trap_init() before + * cpu_init() is invoked. Interrupt stacks cannot be used at that point and + * the traps which use them are reinitialized with IST after cpu_init() has + * set up TSS. 
+ */ +static const __initdata struct idt_data def_idts[] = { + INTG(X86_TRAP_DE, divide_error), + INTG(X86_TRAP_NMI, nmi), + INTG(X86_TRAP_BR, bounds), + INTG(X86_TRAP_UD, invalid_op), + INTG(X86_TRAP_NM, device_not_available), + INTG(X86_TRAP_OLD_MF, coprocessor_segment_overrun), + INTG(X86_TRAP_TS, invalid_TSS), + INTG(X86_TRAP_NP, segment_not_present), + INTG(X86_TRAP_SS, stack_segment), + INTG(X86_TRAP_GP, general_protection), + INTG(X86_TRAP_SPURIOUS, spurious_interrupt_bug), + INTG(X86_TRAP_MF, coprocessor_error), + INTG(X86_TRAP_AC, alignment_check), + INTG(X86_TRAP_XF, simd_coprocessor_error), + +#ifdef CONFIG_X86_32 + TSKG(X86_TRAP_DF, GDT_ENTRY_DOUBLEFAULT_TSS), +#else + INTG(X86_TRAP_DF, double_fault), +#endif + INTG(X86_TRAP_DB, debug), + INTG(X86_TRAP_NMI, nmi), + INTG(X86_TRAP_BP, int3), + +#ifdef CONFIG_X86_MCE + INTG(X86_TRAP_MC, &machine_check), +#endif + + SYSG(X86_TRAP_OF, overflow), +#if defined(CONFIG_IA32_EMULATION) + SYSG(IA32_SYSCALL_VECTOR, entry_INT80_compat), +#elif defined(CONFIG_X86_32) + SYSG(IA32_SYSCALL_VECTOR, entry_INT80_32), +#endif +}; + +/* + * The APIC and SMP idt entries + */ +static const __initdata struct idt_data apic_idts[] = { +#ifdef CONFIG_SMP + INTG(RESCHEDULE_VECTOR, reschedule_interrupt), + INTG(CALL_FUNCTION_VECTOR, call_function_interrupt), + INTG(CALL_FUNCTION_SINGLE_VECTOR, call_function_single_interrupt), + INTG(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt), + INTG(REBOOT_VECTOR, reboot_interrupt), +#endif + +#ifdef CONFIG_X86_THERMAL_VECTOR + INTG(THERMAL_APIC_VECTOR, thermal_interrupt), +#endif + +#ifdef CONFIG_X86_MCE_THRESHOLD + INTG(THRESHOLD_APIC_VECTOR, threshold_interrupt), +#endif + +#ifdef CONFIG_X86_MCE_AMD + INTG(DEFERRED_ERROR_VECTOR, deferred_error_interrupt), +#endif + +#ifdef CONFIG_X86_LOCAL_APIC + INTG(LOCAL_TIMER_VECTOR, apic_timer_interrupt), + INTG(X86_PLATFORM_IPI_VECTOR, x86_platform_ipi), +# ifdef CONFIG_HAVE_KVM + INTG(POSTED_INTR_VECTOR, kvm_posted_intr_ipi), + INTG(POSTED_INTR_WAKEUP_VECTOR, kvm_posted_intr_wakeup_ipi), + INTG(POSTED_INTR_NESTED_VECTOR, kvm_posted_intr_nested_ipi), +# endif +# ifdef CONFIG_IRQ_WORK + INTG(IRQ_WORK_VECTOR, irq_work_interrupt), +# endif + INTG(SPURIOUS_APIC_VECTOR, spurious_interrupt), + INTG(ERROR_APIC_VECTOR, error_interrupt), +#endif +}; + +#ifdef CONFIG_X86_64 +/* + * Early traps running on the DEFAULT_STACK because the other interrupt + * stacks work only after cpu_init(). + */ +static const __initdata struct idt_data early_pf_idts[] = { + INTG(X86_TRAP_PF, page_fault), +}; + +/* + * Override for the debug_idt. Same as the default, but with interrupt + * stack set to DEFAULT_STACK (0). Required for NMI trap handling. + */ +static const __initdata struct idt_data dbg_idts[] = { + INTG(X86_TRAP_DB, debug), + INTG(X86_TRAP_BP, int3), +}; +#endif + +/* Must be page-aligned because the real IDT is used in a fixmap. */ +gate_desc idt_table[IDT_ENTRIES] __page_aligned_bss; + +struct desc_ptr idt_descr __ro_after_init = { + .size = (IDT_ENTRIES * 2 * sizeof(unsigned long)) - 1, + .address = (unsigned long) idt_table, +}; + +#ifdef CONFIG_X86_64 +/* No need to be aligned, but done to keep all IDTs defined the same way. */ +gate_desc debug_idt_table[IDT_ENTRIES] __page_aligned_bss; + +/* + * The exceptions which use Interrupt stacks. They are setup after + * cpu_init() when the TSS has been initialized. 
+ */ +static const __initdata struct idt_data ist_idts[] = { + ISTG(X86_TRAP_DB, debug, DEBUG_STACK), + ISTG(X86_TRAP_NMI, nmi, NMI_STACK), + SISTG(X86_TRAP_BP, int3, DEBUG_STACK), + ISTG(X86_TRAP_DF, double_fault, DOUBLEFAULT_STACK), +#ifdef CONFIG_X86_MCE + ISTG(X86_TRAP_MC, &machine_check, MCE_STACK), +#endif +}; + +/* + * Override for the debug_idt. Same as the default, but with interrupt + * stack set to DEFAULT_STACK (0). Required for NMI trap handling. + */ +const struct desc_ptr debug_idt_descr = { + .size = IDT_ENTRIES * 16 - 1, + .address = (unsigned long) debug_idt_table, +}; +#endif + +static inline void idt_init_desc(gate_desc *gate, const struct idt_data *d) +{ + unsigned long addr = (unsigned long) d->addr; + + gate->offset_low = (u16) addr; + gate->segment = (u16) d->segment; + gate->bits = d->bits; + gate->offset_middle = (u16) (addr >> 16); +#ifdef CONFIG_X86_64 + gate->offset_high = (u32) (addr >> 32); + gate->reserved = 0; +#endif +} + +static void +idt_setup_from_table(gate_desc *idt, const struct idt_data *t, int size, bool sys) +{ + gate_desc desc; + + for (; size > 0; t++, size--) { + idt_init_desc(&desc, t); + write_idt_entry(idt, t->vector, &desc); + if (sys) + set_bit(t->vector, used_vectors); + } +} + +static void set_intr_gate(unsigned int n, const void *addr) +{ + struct idt_data data; + + BUG_ON(n > 0xFF); + + memset(&data, 0, sizeof(data)); + data.vector = n; + data.addr = addr; + data.segment = __KERNEL_CS; + data.bits.type = GATE_INTERRUPT; + data.bits.p = 1; + + idt_setup_from_table(idt_table, &data, 1, false); +} + +/** + * idt_setup_early_traps - Initialize the idt table with early traps + * + * On X8664 these traps do not use interrupt stacks as they can't work + * before cpu_init() is invoked and sets up TSS. The IST variants are + * installed after that. + */ +void __init idt_setup_early_traps(void) +{ + idt_setup_from_table(idt_table, early_idts, ARRAY_SIZE(early_idts), + true); + load_idt(&idt_descr); +} + +/** + * idt_setup_traps - Initialize the idt table with default traps + */ +void __init idt_setup_traps(void) +{ + idt_setup_from_table(idt_table, def_idts, ARRAY_SIZE(def_idts), true); +} + +#ifdef CONFIG_X86_64 +/** + * idt_setup_early_pf - Initialize the idt table with early pagefault handler + * + * On X8664 this does not use interrupt stacks as they can't work before + * cpu_init() is invoked and sets up TSS. The IST variant is installed + * after that. + * + * FIXME: Why is 32bit and 64bit installing the PF handler at different + * places in the early setup code? 
+ */ +void __init idt_setup_early_pf(void) +{ + idt_setup_from_table(idt_table, early_pf_idts, + ARRAY_SIZE(early_pf_idts), true); +} + +/** + * idt_setup_ist_traps - Initialize the idt table with traps using IST + */ +void __init idt_setup_ist_traps(void) +{ + idt_setup_from_table(idt_table, ist_idts, ARRAY_SIZE(ist_idts), true); +} + +/** + * idt_setup_debugidt_traps - Initialize the debug idt table with debug traps + */ +void __init idt_setup_debugidt_traps(void) +{ + memcpy(&debug_idt_table, &idt_table, IDT_ENTRIES * 16); + + idt_setup_from_table(debug_idt_table, dbg_idts, ARRAY_SIZE(dbg_idts), false); +} +#endif + +/** + * idt_setup_apic_and_irq_gates - Setup APIC/SMP and normal interrupt gates + */ +void __init idt_setup_apic_and_irq_gates(void) +{ + int i = FIRST_EXTERNAL_VECTOR; + void *entry; + + idt_setup_from_table(idt_table, apic_idts, ARRAY_SIZE(apic_idts), true); + + for_each_clear_bit_from(i, used_vectors, FIRST_SYSTEM_VECTOR) { + entry = irq_entries_start + 8 * (i - FIRST_EXTERNAL_VECTOR); + set_intr_gate(i, entry); + } + + for_each_clear_bit_from(i, used_vectors, NR_VECTORS) { +#ifdef CONFIG_X86_LOCAL_APIC + set_bit(i, used_vectors); + set_intr_gate(i, spurious_interrupt); +#else + entry = irq_entries_start + 8 * (i - FIRST_EXTERNAL_VECTOR); + set_intr_gate(i, entry); +#endif + } +} + +/** + * idt_setup_early_handler - Initializes the idt table with early handlers + */ +void __init idt_setup_early_handler(void) +{ + int i; + + for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) + set_intr_gate(i, early_idt_handler_array[i]); +#ifdef CONFIG_X86_32 + for ( ; i < NR_VECTORS; i++) + set_intr_gate(i, early_ignore_irq); +#endif + load_idt(&idt_descr); +} + +/** + * idt_invalidate - Invalidate interrupt descriptor table + * @addr: The virtual address of the 'invalid' IDT + */ +void idt_invalidate(void *addr) +{ + struct desc_ptr idt = { .address = (unsigned long) addr, .size = 0 }; + + load_idt(&idt); +} + +void __init update_intr_gate(unsigned int n, const void *addr) +{ + if (WARN_ON_ONCE(!test_bit(n, used_vectors))) + return; + set_intr_gate(n, addr); +} + +void alloc_intr_gate(unsigned int n, const void *addr) +{ + BUG_ON(n < FIRST_SYSTEM_VECTOR); + if (!test_and_set_bit(n, used_vectors)) + set_intr_gate(n, addr); +} diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 4ed0aba8dbc8..52089c043160 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -29,9 +29,6 @@ EXPORT_PER_CPU_SYMBOL(irq_regs); atomic_t irq_err_count; -/* Function pointer for generic interrupt vector handling */ -void (*x86_platform_ipi_callback)(void) = NULL; - /* * 'what should we do if we get a hw irq event on an illegal vector'. * each architecture has to answer this themselves. 
@@ -87,13 +84,13 @@ int arch_show_interrupts(struct seq_file *p, int prec) for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->icr_read_retry_count); seq_puts(p, " APIC ICR read retries\n"); -#endif if (x86_platform_ipi_callback) { seq_printf(p, "%*s: ", prec, "PLT"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->x86_platform_ipis); seq_puts(p, " Platform interrupts\n"); } +#endif #ifdef CONFIG_SMP seq_printf(p, "%*s: ", prec, "RES"); for_each_online_cpu(j) @@ -183,9 +180,9 @@ u64 arch_irq_stat_cpu(unsigned int cpu) sum += irq_stats(cpu)->apic_perf_irqs; sum += irq_stats(cpu)->apic_irq_work_irqs; sum += irq_stats(cpu)->icr_read_retry_count; -#endif if (x86_platform_ipi_callback) sum += irq_stats(cpu)->x86_platform_ipis; +#endif #ifdef CONFIG_SMP sum += irq_stats(cpu)->irq_resched_count; sum += irq_stats(cpu)->irq_call_count; @@ -259,26 +256,26 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs) return 1; } +#ifdef CONFIG_X86_LOCAL_APIC +/* Function pointer for generic interrupt vector handling */ +void (*x86_platform_ipi_callback)(void) = NULL; /* * Handler for X86_PLATFORM_IPI_VECTOR. */ -void __smp_x86_platform_ipi(void) -{ - inc_irq_stat(x86_platform_ipis); - - if (x86_platform_ipi_callback) - x86_platform_ipi_callback(); -} - __visible void __irq_entry smp_x86_platform_ipi(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); entering_ack_irq(); - __smp_x86_platform_ipi(); + trace_x86_platform_ipi_entry(X86_PLATFORM_IPI_VECTOR); + inc_irq_stat(x86_platform_ipis); + if (x86_platform_ipi_callback) + x86_platform_ipi_callback(); + trace_x86_platform_ipi_exit(X86_PLATFORM_IPI_VECTOR); exiting_irq(); set_irq_regs(old_regs); } +#endif #ifdef CONFIG_HAVE_KVM static void dummy_handler(void) {} @@ -334,19 +331,6 @@ __visible void smp_kvm_posted_intr_nested_ipi(struct pt_regs *regs) } #endif -__visible void __irq_entry smp_trace_x86_platform_ipi(struct pt_regs *regs) -{ - struct pt_regs *old_regs = set_irq_regs(regs); - - entering_ack_irq(); - trace_x86_platform_ipi_entry(X86_PLATFORM_IPI_VECTOR); - __smp_x86_platform_ipi(); - trace_x86_platform_ipi_exit(X86_PLATFORM_IPI_VECTOR); - exiting_irq(); - set_irq_regs(old_regs); -} - -EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq); #ifdef CONFIG_HOTPLUG_CPU @@ -431,7 +415,7 @@ int check_irq_vectors_for_cpu_disable(void) * this w/o holding vector_lock. 
*/ for (vector = FIRST_EXTERNAL_VECTOR; - vector < first_system_vector; vector++) { + vector < FIRST_SYSTEM_VECTOR; vector++) { if (!test_bit(vector, used_vectors) && IS_ERR_OR_NULL(per_cpu(vector_irq, cpu)[vector])) { if (++count == this_count) diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c index 275487872be2..70dee056f92b 100644 --- a/arch/x86/kernel/irq_work.c +++ b/arch/x86/kernel/irq_work.c @@ -11,35 +11,23 @@ #include <asm/trace/irq_vectors.h> #include <linux/interrupt.h> -static inline void __smp_irq_work_interrupt(void) -{ - inc_irq_stat(apic_irq_work_irqs); - irq_work_run(); -} - +#ifdef CONFIG_X86_LOCAL_APIC __visible void __irq_entry smp_irq_work_interrupt(struct pt_regs *regs) { ipi_entering_ack_irq(); - __smp_irq_work_interrupt(); - exiting_irq(); -} - -__visible void __irq_entry smp_trace_irq_work_interrupt(struct pt_regs *regs) -{ - ipi_entering_ack_irq(); trace_irq_work_entry(IRQ_WORK_VECTOR); - __smp_irq_work_interrupt(); + inc_irq_stat(apic_irq_work_irqs); + irq_work_run(); trace_irq_work_exit(IRQ_WORK_VECTOR); exiting_irq(); } void arch_irq_work_raise(void) { -#ifdef CONFIG_X86_LOCAL_APIC if (!arch_irq_work_has_interrupt()) return; apic->send_IPI_self(IRQ_WORK_VECTOR); apic_wait_icr_idle(); -#endif } +#endif diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index c7fd18526c3e..1add9e08e83e 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -55,18 +55,6 @@ DEFINE_PER_CPU(vector_irq_t, vector_irq) = { [0 ... NR_VECTORS - 1] = VECTOR_UNUSED, }; -int vector_used_by_percpu_irq(unsigned int vector) -{ - int cpu; - - for_each_online_cpu(cpu) { - if (!IS_ERR_OR_NULL(per_cpu(vector_irq, cpu)[vector])) - return 1; - } - - return 0; -} - void __init init_ISA_irqs(void) { struct irq_chip *chip = legacy_pic->chip; @@ -99,100 +87,12 @@ void __init init_IRQ(void) x86_init.irqs.intr_init(); } -static void __init smp_intr_init(void) -{ -#ifdef CONFIG_SMP - /* - * The reschedule interrupt is a CPU-to-CPU reschedule-helper - * IPI, driven by wakeup. 
- */ - alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); - - /* IPI for generic function call */ - alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); - - /* IPI for generic single function call */ - alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, - call_function_single_interrupt); - - /* Low priority IPI to cleanup after moving an irq */ - set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); - set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors); - - /* IPI used for rebooting/stopping */ - alloc_intr_gate(REBOOT_VECTOR, reboot_interrupt); -#endif /* CONFIG_SMP */ -} - -static void __init apic_intr_init(void) -{ - smp_intr_init(); - -#ifdef CONFIG_X86_THERMAL_VECTOR - alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); -#endif -#ifdef CONFIG_X86_MCE_THRESHOLD - alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); -#endif - -#ifdef CONFIG_X86_MCE_AMD - alloc_intr_gate(DEFERRED_ERROR_VECTOR, deferred_error_interrupt); -#endif - -#ifdef CONFIG_X86_LOCAL_APIC - /* self generated IPI for local APIC timer */ - alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); - - /* IPI for X86 platform specific use */ - alloc_intr_gate(X86_PLATFORM_IPI_VECTOR, x86_platform_ipi); -#ifdef CONFIG_HAVE_KVM - /* IPI for KVM to deliver posted interrupt */ - alloc_intr_gate(POSTED_INTR_VECTOR, kvm_posted_intr_ipi); - /* IPI for KVM to deliver interrupt to wake up tasks */ - alloc_intr_gate(POSTED_INTR_WAKEUP_VECTOR, kvm_posted_intr_wakeup_ipi); - /* IPI for KVM to deliver nested posted interrupt */ - alloc_intr_gate(POSTED_INTR_NESTED_VECTOR, kvm_posted_intr_nested_ipi); -#endif - - /* IPI vectors for APIC spurious and error interrupts */ - alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); - alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); - - /* IRQ work interrupts: */ -# ifdef CONFIG_IRQ_WORK - alloc_intr_gate(IRQ_WORK_VECTOR, irq_work_interrupt); -# endif - -#endif -} - void __init native_init_IRQ(void) { - int i; - /* Execute any quirks before the call gates are initialised: */ x86_init.irqs.pre_vector_init(); - apic_intr_init(); - - /* - * Cover the whole vector space, no vector can escape - * us. (some of these will be overridden and become - * 'special' SMP interrupts) - */ - i = FIRST_EXTERNAL_VECTOR; -#ifndef CONFIG_X86_LOCAL_APIC -#define first_system_vector NR_VECTORS -#endif - for_each_clear_bit_from(i, used_vectors, first_system_vector) { - /* IA32_SYSCALL_VECTOR could be used in trap_init already. */ - set_intr_gate(i, irq_entries_start + - 8 * (i - FIRST_EXTERNAL_VECTOR)); - } -#ifdef CONFIG_X86_LOCAL_APIC - for_each_clear_bit_from(i, used_vectors, NR_VECTORS) - set_intr_gate(i, spurious_interrupt); -#endif + idt_setup_apic_and_irq_gates(); if (!acpi_ioapic && !of_ioapic && nr_legacy_irqs()) setup_irq(2, &irq2); diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 69ea0bc1cfa3..4f98aad38237 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -39,6 +39,7 @@ #include <asm/insn.h> #include <asm/debugreg.h> #include <asm/set_memory.h> +#include <asm/sections.h> #include "common.h" @@ -251,10 +252,12 @@ static int can_optimize(unsigned long paddr) /* * Do not optimize in the entry code due to the unstable - * stack handling. + * stack handling and registers setup. 
*/ - if ((paddr >= (unsigned long)__entry_text_start) && - (paddr < (unsigned long)__entry_text_end)) + if (((paddr >= (unsigned long)__entry_text_start) && + (paddr < (unsigned long)__entry_text_end)) || + ((paddr >= (unsigned long)__irqentry_text_start) && + (paddr < (unsigned long)__irqentry_text_end))) return 0; /* Check there is enough space for a relative jump. */ diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index d04e30e3c0ff..874827b0d7ca 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -263,7 +263,7 @@ do_async_page_fault(struct pt_regs *regs, unsigned long error_code) switch (kvm_read_and_reset_pf_reason()) { default: - trace_do_page_fault(regs, error_code); + do_page_fault(regs, error_code); break; case KVM_PV_REASON_PAGE_NOT_PRESENT: /* page is swapped out by the host. */ @@ -455,7 +455,7 @@ static int kvm_cpu_down_prepare(unsigned int cpu) static void __init kvm_apf_trap_init(void) { - set_intr_gate(14, async_page_fault); + update_intr_gate(X86_TRAP_PF, async_page_fault); } void __init kvm_guest_init(void) diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index a870910c8565..f0e64db18ac8 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -21,6 +21,25 @@ #include <asm/mmu_context.h> #include <asm/syscalls.h> +static void refresh_ldt_segments(void) +{ +#ifdef CONFIG_X86_64 + unsigned short sel; + + /* + * Make sure that the cached DS and ES descriptors match the updated + * LDT. + */ + savesegment(ds, sel); + if ((sel & SEGMENT_TI_MASK) == SEGMENT_LDT) + loadsegment(ds, sel); + + savesegment(es, sel); + if ((sel & SEGMENT_TI_MASK) == SEGMENT_LDT) + loadsegment(es, sel); +#endif +} + /* context.lock is held for us, so we don't need any locking. */ static void flush_ldt(void *__mm) { @@ -32,6 +51,8 @@ static void flush_ldt(void *__mm) pc = &mm->context; set_ldt(pc->ldt->entries, pc->ldt->nr_entries); + + refresh_ldt_segments(); } /* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */ diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index 8c53c5d7a1bc..00bc751c861c 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -26,18 +26,6 @@ #include <asm/set_memory.h> #include <asm/debugreg.h> -static void set_idt(void *newidt, __u16 limit) -{ - struct desc_ptr curidt; - - /* ia32 supports unaliged loads & stores */ - curidt.size = limit; - curidt.address = (unsigned long)newidt; - - load_idt(&curidt); -} - - static void set_gdt(void *newgdt, __u16 limit) { struct desc_ptr curgdt; @@ -245,7 +233,7 @@ void machine_kexec(struct kimage *image) * If you want to load them you must set up your own idt & gdt. */ set_gdt(phys_to_virt(0), 0); - set_idt(phys_to_virt(0), 0); + idt_invalidate(phys_to_virt(0)); /* now call it */ image->start = relocate_kernel_ptr((unsigned long)image->head, diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index f67bd3205df7..62e7d70aadd5 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -35,6 +35,7 @@ #include <asm/page.h> #include <asm/pgtable.h> #include <asm/setup.h> +#include <asm/unwind.h> #if 0 #define DEBUGP(fmt, ...) 
\ @@ -213,7 +214,7 @@ int module_finalize(const Elf_Ehdr *hdr, struct module *me) { const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL, - *para = NULL; + *para = NULL, *orc = NULL, *orc_ip = NULL; char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { @@ -225,6 +226,10 @@ int module_finalize(const Elf_Ehdr *hdr, locks = s; if (!strcmp(".parainstructions", secstrings + s->sh_name)) para = s; + if (!strcmp(".orc_unwind", secstrings + s->sh_name)) + orc = s; + if (!strcmp(".orc_unwind_ip", secstrings + s->sh_name)) + orc_ip = s; } if (alt) { @@ -248,6 +253,10 @@ int module_finalize(const Elf_Ehdr *hdr, /* make jump label nops */ jump_label_apply_nops(me); + if (orc && orc_ip) + unwind_module_init(me, (void *)orc_ip->sh_addr, orc_ip->sh_size, + (void *)orc->sh_addr, orc->sh_size); + return 0; } diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 446c8aa09b9b..35aafc95e4b8 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -39,26 +39,26 @@ #include <trace/events/nmi.h> struct nmi_desc { - spinlock_t lock; + raw_spinlock_t lock; struct list_head head; }; static struct nmi_desc nmi_desc[NMI_MAX] = { { - .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[0].lock), + .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[0].lock), .head = LIST_HEAD_INIT(nmi_desc[0].head), }, { - .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[1].lock), + .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[1].lock), .head = LIST_HEAD_INIT(nmi_desc[1].head), }, { - .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[2].lock), + .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[2].lock), .head = LIST_HEAD_INIT(nmi_desc[2].head), }, { - .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[3].lock), + .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[3].lock), .head = LIST_HEAD_INIT(nmi_desc[3].head), }, @@ -163,7 +163,7 @@ int __register_nmi_handler(unsigned int type, struct nmiaction *action) init_irq_work(&action->irq_work, nmi_max_handler); - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); /* * Indicate if there are multiple registrations on the @@ -181,7 +181,7 @@ int __register_nmi_handler(unsigned int type, struct nmiaction *action) else list_add_tail_rcu(&action->list, &desc->head); - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); return 0; } EXPORT_SYMBOL(__register_nmi_handler); @@ -192,7 +192,7 @@ void unregister_nmi_handler(unsigned int type, const char *name) struct nmiaction *n; unsigned long flags; - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); list_for_each_entry_rcu(n, &desc->head, list) { /* @@ -207,7 +207,7 @@ void unregister_nmi_handler(unsigned int type, const char *name) } } - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); synchronize_rcu(); } EXPORT_SYMBOL_GPL(unregister_nmi_handler); diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index bc0a849589bb..a14df9eecfed 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -319,9 +319,6 @@ __visible struct pv_irq_ops pv_irq_ops = { .irq_enable = __PV_IS_CALLEE_SAVE(native_irq_enable), .safe_halt = native_safe_halt, .halt = native_halt, -#ifdef CONFIG_X86_64 - .adjust_exception_frame = paravirt_nop, -#endif }; __visible struct pv_cpu_ops pv_cpu_ops = { diff --git a/arch/x86/kernel/platform-quirks.c b/arch/x86/kernel/platform-quirks.c index 91271122f0df..502a77d0adb0 100644 --- a/arch/x86/kernel/platform-quirks.c +++ 
b/arch/x86/kernel/platform-quirks.c @@ -16,7 +16,6 @@ void __init x86_early_init_platform_quirks(void) x86_platform.legacy.reserve_bios_regions = 1; break; case X86_SUBARCH_XEN: - case X86_SUBARCH_LGUEST: x86_platform.legacy.devices.pnpbios = 0; x86_platform.legacy.rtc = 0; break; diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index c6d6dc5f8bb2..11966251cd42 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -56,7 +56,7 @@ #include <asm/debugreg.h> #include <asm/switch_to.h> #include <asm/vm86.h> -#include <asm/intel_rdt.h> +#include <asm/intel_rdt_sched.h> #include <asm/proto.h> void __show_regs(struct pt_regs *regs, int all) @@ -68,7 +68,7 @@ void __show_regs(struct pt_regs *regs, int all) if (user_mode(regs)) { sp = regs->sp; - ss = regs->ss & 0xffff; + ss = regs->ss; gs = get_user_gs(regs); } else { sp = kernel_stack_pointer(regs); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index c3169be4c596..302e7b2572d1 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -52,7 +52,7 @@ #include <asm/switch_to.h> #include <asm/xen/hypervisor.h> #include <asm/vdso.h> -#include <asm/intel_rdt.h> +#include <asm/intel_rdt_sched.h> #include <asm/unistd.h> #ifdef CONFIG_IA32_EMULATION /* Not included via unistd.h */ @@ -69,8 +69,7 @@ void __show_regs(struct pt_regs *regs, int all) unsigned int fsindex, gsindex; unsigned int ds, cs, es; - printk(KERN_DEFAULT "RIP: %04lx:%pS\n", regs->cs & 0xffff, - (void *)regs->ip); + printk(KERN_DEFAULT "RIP: %04lx:%pS\n", regs->cs, (void *)regs->ip); printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx", regs->ss, regs->sp, regs->flags); if (regs->orig_ax != -1) @@ -149,6 +148,123 @@ void release_thread(struct task_struct *dead_task) } } +enum which_selector { + FS, + GS +}; + +/* + * Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are + * not available. The goal is to be reasonably fast on non-FSGSBASE systems. + * It's forcibly inlined because it'll generate better code and this function + * is hot. + */ +static __always_inline void save_base_legacy(struct task_struct *prev_p, + unsigned short selector, + enum which_selector which) +{ + if (likely(selector == 0)) { + /* + * On Intel (without X86_BUG_NULL_SEG), the segment base could + * be the pre-existing saved base or it could be zero. On AMD + * (with X86_BUG_NULL_SEG), the segment base could be almost + * anything. + * + * This branch is very hot (it's hit twice on almost every + * context switch between 64-bit programs), and avoiding + * the RDMSR helps a lot, so we just assume that whatever + * value is already saved is correct. This matches historical + * Linux behavior, so it won't break existing applications. + * + * To avoid leaking state, on non-X86_BUG_NULL_SEG CPUs, if we + * report that the base is zero, it needs to actually be zero: + * see the corresponding logic in load_seg_legacy. + */ + } else { + /* + * If the selector is 1, 2, or 3, then the base is zero on + * !X86_BUG_NULL_SEG CPUs and could be anything on + * X86_BUG_NULL_SEG CPUs. In the latter case, Linux + * has never attempted to preserve the base across context + * switches. + * + * If selector > 3, then it refers to a real segment, and + * saving the base isn't necessary. 
+ */ + if (which == FS) + prev_p->thread.fsbase = 0; + else + prev_p->thread.gsbase = 0; + } +} + +static __always_inline void save_fsgs(struct task_struct *task) +{ + savesegment(fs, task->thread.fsindex); + savesegment(gs, task->thread.gsindex); + save_base_legacy(task, task->thread.fsindex, FS); + save_base_legacy(task, task->thread.gsindex, GS); +} + +static __always_inline void loadseg(enum which_selector which, + unsigned short sel) +{ + if (which == FS) + loadsegment(fs, sel); + else + load_gs_index(sel); +} + +static __always_inline void load_seg_legacy(unsigned short prev_index, + unsigned long prev_base, + unsigned short next_index, + unsigned long next_base, + enum which_selector which) +{ + if (likely(next_index <= 3)) { + /* + * The next task is using 64-bit TLS, is not using this + * segment at all, or is having fun with arcane CPU features. + */ + if (next_base == 0) { + /* + * Nasty case: on AMD CPUs, we need to forcibly zero + * the base. + */ + if (static_cpu_has_bug(X86_BUG_NULL_SEG)) { + loadseg(which, __USER_DS); + loadseg(which, next_index); + } else { + /* + * We could try to exhaustively detect cases + * under which we can skip the segment load, + * but there's really only one case that matters + * for performance: if both the previous and + * next states are fully zeroed, we can skip + * the load. + * + * (This assumes that prev_base == 0 has no + * false positives. This is the case on + * Intel-style CPUs.) + */ + if (likely(prev_index | next_index | prev_base)) + loadseg(which, next_index); + } + } else { + if (prev_index != next_index) + loadseg(which, next_index); + wrmsrl(which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE, + next_base); + } + } else { + /* + * The next task is using a real segment. Loading the selector + * is sufficient. + */ + loadseg(which, next_index); + } +} + int copy_thread_tls(unsigned long clone_flags, unsigned long sp, unsigned long arg, struct task_struct *p, unsigned long tls) { @@ -229,10 +345,19 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp, unsigned int _cs, unsigned int _ss, unsigned int _ds) { + WARN_ON_ONCE(regs != current_pt_regs()); + + if (static_cpu_has(X86_BUG_NULL_SEG)) { + /* Loading zero below won't clear the base. */ + loadsegment(fs, __USER_DS); + load_gs_index(__USER_DS); + } + loadsegment(fs, 0); loadsegment(es, _ds); loadsegment(ds, _ds); load_gs_index(0); + regs->ip = new_ip; regs->sp = new_sp; regs->cs = _cs; @@ -277,7 +402,9 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) struct fpu *next_fpu = &next->fpu; int cpu = smp_processor_id(); struct tss_struct *tss = &per_cpu(cpu_tss, cpu); - unsigned prev_fsindex, prev_gsindex; + + WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) && + this_cpu_read(irq_count) != -1); switch_fpu_prepare(prev_fpu, cpu); @@ -286,8 +413,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * * (e.g. xen_load_tls()) */ - savesegment(fs, prev_fsindex); - savesegment(gs, prev_gsindex); + save_fsgs(prev_p); /* * Load TLS before restoring any segments so that segment loads @@ -326,108 +452,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) if (unlikely(next->ds | prev->ds)) loadsegment(ds, next->ds); - /* - * Switch FS and GS. - * - * These are even more complicated than DS and ES: they have - * 64-bit bases are that controlled by arch_prctl. The bases - * don't necessarily match the selectors, as user code can do - * any number of things to cause them to be inconsistent. 
- * - * We don't promise to preserve the bases if the selectors are - * nonzero. We also don't promise to preserve the base if the - * selector is zero and the base doesn't match whatever was - * most recently passed to ARCH_SET_FS/GS. (If/when the - * FSGSBASE instructions are enabled, we'll need to offer - * stronger guarantees.) - * - * As an invariant, - * (fsbase != 0 && fsindex != 0) || (gsbase != 0 && gsindex != 0) is - * impossible. - */ - if (next->fsindex) { - /* Loading a nonzero value into FS sets the index and base. */ - loadsegment(fs, next->fsindex); - } else { - if (next->fsbase) { - /* Next index is zero but next base is nonzero. */ - if (prev_fsindex) - loadsegment(fs, 0); - wrmsrl(MSR_FS_BASE, next->fsbase); - } else { - /* Next base and index are both zero. */ - if (static_cpu_has_bug(X86_BUG_NULL_SEG)) { - /* - * We don't know the previous base and can't - * find out without RDMSR. Forcibly clear it. - */ - loadsegment(fs, __USER_DS); - loadsegment(fs, 0); - } else { - /* - * If the previous index is zero and ARCH_SET_FS - * didn't change the base, then the base is - * also zero and we don't need to do anything. - */ - if (prev->fsbase || prev_fsindex) - loadsegment(fs, 0); - } - } - } - /* - * Save the old state and preserve the invariant. - * NB: if prev_fsindex == 0, then we can't reliably learn the base - * without RDMSR because Intel user code can zero it without telling - * us and AMD user code can program any 32-bit value without telling - * us. - */ - if (prev_fsindex) - prev->fsbase = 0; - prev->fsindex = prev_fsindex; - - if (next->gsindex) { - /* Loading a nonzero value into GS sets the index and base. */ - load_gs_index(next->gsindex); - } else { - if (next->gsbase) { - /* Next index is zero but next base is nonzero. */ - if (prev_gsindex) - load_gs_index(0); - wrmsrl(MSR_KERNEL_GS_BASE, next->gsbase); - } else { - /* Next base and index are both zero. */ - if (static_cpu_has_bug(X86_BUG_NULL_SEG)) { - /* - * We don't know the previous base and can't - * find out without RDMSR. Forcibly clear it. - * - * This contains a pointless SWAPGS pair. - * Fixing it would involve an explicit check - * for Xen or a new pvop. - */ - load_gs_index(__USER_DS); - load_gs_index(0); - } else { - /* - * If the previous index is zero and ARCH_SET_GS - * didn't change the base, then the base is - * also zero and we don't need to do anything. - */ - if (prev->gsbase || prev_gsindex) - load_gs_index(0); - } - } - } - /* - * Save the old state and preserve the invariant. - * NB: if prev_gsindex == 0, then we can't reliably learn the base - * without RDMSR because Intel user code can zero it without telling - * us and AMD user code can program any 32-bit value without telling - * us. - */ - if (prev_gsindex) - prev->gsbase = 0; - prev->gsindex = prev_gsindex; + load_seg_legacy(prev->fsindex, prev->fsbase, + next->fsindex, next->fsbase, FS); + load_seg_legacy(prev->gsindex, prev->gsbase, + next->gsindex, next->gsbase, GS); switch_fpu_finish(next_fpu, cpu); diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 0bee04d41bed..eaa591cfd98b 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -1,6 +1,7 @@ /* * This file contains work-arounds for x86 and x86_64 platform bugs. 
*/ +#include <linux/dmi.h> #include <linux/pci.h> #include <linux/irq.h> @@ -656,3 +657,12 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fc0, quirk_intel_brickland_xeon_ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2083, quirk_intel_purley_xeon_ras_cap); #endif #endif + +bool x86_apple_machine; +EXPORT_SYMBOL(x86_apple_machine); + +void __init early_platform_quirks(void) +{ + x86_apple_machine = dmi_match(DMI_SYS_VENDOR, "Apple Inc.") || + dmi_match(DMI_SYS_VENDOR, "Apple Computer, Inc."); +} diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index a56bf6051f4e..54984b142641 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -38,8 +38,6 @@ void (*pm_power_off)(void); EXPORT_SYMBOL(pm_power_off); -static const struct desc_ptr no_idt = {}; - /* * This is set if we need to go through the 'emergency' path. * When machine_emergency_restart() is called, we may be on @@ -638,7 +636,7 @@ static void native_machine_emergency_restart(void) break; case BOOT_TRIPLE: - load_idt(&no_idt); + idt_invalidate(NULL); __asm__ __volatile__("int3"); /* We're probably dead after this, but... */ diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 0bfe0c1628f6..d84afb0a322d 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -116,6 +116,7 @@ #include <asm/microcode.h> #include <asm/mmu_context.h> #include <asm/kaslr.h> +#include <asm/unwind.h> /* * max_low_pfn_mapped: highest direct mapped pfn under 4GB @@ -899,7 +900,7 @@ void __init setup_arch(char **cmdline_p) */ olpc_ofw_detect(); - early_trap_init(); + idt_setup_early_traps(); early_cpu_init(); early_ioremap_init(); @@ -1170,7 +1171,7 @@ void __init setup_arch(char **cmdline_p) init_mem_mapping(); - early_trap_pf_init(); + idt_setup_early_pf(); /* * Update mmu_cr4_features (and, indirectly, trampoline_cr4_features) @@ -1215,6 +1216,8 @@ void __init setup_arch(char **cmdline_p) io_delay_init(); + early_platform_quirks(); + /* * Parse the ACPI tables for possible boot-time SMP configuration. */ @@ -1319,6 +1322,8 @@ void __init setup_arch(char **cmdline_p) if (efi_enabled(EFI_BOOT)) efi_apply_memmap_quirks(); #endif + + unwind_init(); } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 10edd1e69a68..6e8fcb6f7e1e 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -155,13 +155,10 @@ static void __init pcpup_populate_pte(unsigned long addr) static inline void setup_percpu_segment(int cpu) { #ifdef CONFIG_X86_32 - struct desc_struct gdt; + struct desc_struct d = GDT_ENTRY_INIT(0x8092, per_cpu_offset(cpu), + 0xFFFFF); - pack_descriptor(&gdt, per_cpu_offset(cpu), 0xFFFFF, - 0x2 | DESCTYPE_S, 0x8); - gdt.s = 1; - write_gdt_entry(get_cpu_gdt_rw(cpu), - GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S); + write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_PERCPU, &d, DESCTYPE_S); #endif } diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index cc30a74e4adb..e04442345fc0 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -256,7 +256,7 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, sp = current->sas_ss_sp + current->sas_ss_size; } else if (IS_ENABLED(CONFIG_X86_32) && !onsigstack && - (regs->ss & 0xffff) != __USER_DS && + regs->ss != __USER_DS && !(ka->sa.sa_flags & SA_RESTORER) && ka->sa.sa_restorer) { /* This is the legacy signal stack switching. 
*/ diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index d798c0da451c..5c574dff4c1a 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -254,84 +254,45 @@ finish: } /* - * Reschedule call back. + * Reschedule call back. KVM uses this interrupt to force a cpu out of + * guest mode */ -static inline void __smp_reschedule_interrupt(void) -{ - inc_irq_stat(irq_resched_count); - scheduler_ipi(); -} - __visible void __irq_entry smp_reschedule_interrupt(struct pt_regs *regs) { ack_APIC_irq(); - __smp_reschedule_interrupt(); - /* - * KVM uses this interrupt to force a cpu out of guest mode - */ -} - -__visible void __irq_entry smp_trace_reschedule_interrupt(struct pt_regs *regs) -{ - /* - * Need to call irq_enter() before calling the trace point. - * __smp_reschedule_interrupt() calls irq_enter/exit() too (in - * scheduler_ipi(). This is OK, since those functions are allowed - * to nest. - */ - ipi_entering_ack_irq(); - trace_reschedule_entry(RESCHEDULE_VECTOR); - __smp_reschedule_interrupt(); - trace_reschedule_exit(RESCHEDULE_VECTOR); - exiting_irq(); - /* - * KVM uses this interrupt to force a cpu out of guest mode - */ -} + inc_irq_stat(irq_resched_count); -static inline void __smp_call_function_interrupt(void) -{ - generic_smp_call_function_interrupt(); - inc_irq_stat(irq_call_count); + if (trace_resched_ipi_enabled()) { + /* + * scheduler_ipi() might call irq_enter() as well, but + * nested calls are fine. + */ + irq_enter(); + trace_reschedule_entry(RESCHEDULE_VECTOR); + scheduler_ipi(); + trace_reschedule_exit(RESCHEDULE_VECTOR); + irq_exit(); + return; + } + scheduler_ipi(); } __visible void __irq_entry smp_call_function_interrupt(struct pt_regs *regs) { ipi_entering_ack_irq(); - __smp_call_function_interrupt(); - exiting_irq(); -} - -__visible void __irq_entry -smp_trace_call_function_interrupt(struct pt_regs *regs) -{ - ipi_entering_ack_irq(); trace_call_function_entry(CALL_FUNCTION_VECTOR); - __smp_call_function_interrupt(); - trace_call_function_exit(CALL_FUNCTION_VECTOR); - exiting_irq(); -} - -static inline void __smp_call_function_single_interrupt(void) -{ - generic_smp_call_function_single_interrupt(); inc_irq_stat(irq_call_count); -} - -__visible void __irq_entry -smp_call_function_single_interrupt(struct pt_regs *regs) -{ - ipi_entering_ack_irq(); - __smp_call_function_single_interrupt(); + generic_smp_call_function_interrupt(); + trace_call_function_exit(CALL_FUNCTION_VECTOR); exiting_irq(); } -__visible void __irq_entry -smp_trace_call_function_single_interrupt(struct pt_regs *regs) +__visible void __irq_entry smp_call_function_single_interrupt(struct pt_regs *r) { ipi_entering_ack_irq(); trace_call_function_single_entry(CALL_FUNCTION_SINGLE_VECTOR); - __smp_call_function_single_interrupt(); + inc_irq_stat(irq_call_count); + generic_smp_call_function_single_interrupt(); trace_call_function_single_exit(CALL_FUNCTION_SINGLE_VECTOR); exiting_irq(); } diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c index 5f25cfbd952e..5ee663836c08 100644 --- a/arch/x86/kernel/step.c +++ b/arch/x86/kernel/step.c @@ -13,7 +13,7 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re unsigned long addr, seg; addr = regs->ip; - seg = regs->cs & 0xffff; + seg = regs->cs; if (v8086_mode(regs)) { addr = (addr & 0xffff) + (seg << 4); return addr; diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c index dcd699baea1b..a106b9719c58 100644 --- a/arch/x86/kernel/tls.c +++ b/arch/x86/kernel/tls.c @@ -93,7 +93,7 @@ static void 
set_tls_desc(struct task_struct *p, int idx, while (n-- > 0) { if (LDT_empty(info) || LDT_zero(info)) { - desc->a = desc->b = 0; + memset(desc, 0, sizeof(*desc)); } else { fill_ldt(desc, info); diff --git a/arch/x86/kernel/tracepoint.c b/arch/x86/kernel/tracepoint.c index 15515132bf0d..c6636d1f60b9 100644 --- a/arch/x86/kernel/tracepoint.c +++ b/arch/x86/kernel/tracepoint.c @@ -4,57 +4,38 @@ * Copyright (C) 2013 Seiji Aguchi <seiji.aguchi@hds.com> * */ -#include <asm/hw_irq.h> -#include <asm/desc.h> +#include <linux/jump_label.h> #include <linux/atomic.h> -atomic_t trace_idt_ctr = ATOMIC_INIT(0); -struct desc_ptr trace_idt_descr = { NR_VECTORS * 16 - 1, - (unsigned long) trace_idt_table }; - -/* No need to be aligned, but done to keep all IDTs defined the same way. */ -gate_desc trace_idt_table[NR_VECTORS] __page_aligned_bss; +#include <asm/hw_irq.h> +#include <asm/desc.h> -static int trace_irq_vector_refcount; -static DEFINE_MUTEX(irq_vector_mutex); +DEFINE_STATIC_KEY_FALSE(trace_pagefault_key); -static void set_trace_idt_ctr(int val) +int trace_pagefault_reg(void) { - atomic_set(&trace_idt_ctr, val); - /* Ensure the trace_idt_ctr is set before sending IPI */ - wmb(); + static_branch_inc(&trace_pagefault_key); + return 0; } -static void switch_idt(void *arg) +void trace_pagefault_unreg(void) { - unsigned long flags; - - local_irq_save(flags); - load_current_idt(); - local_irq_restore(flags); + static_branch_dec(&trace_pagefault_key); } -int trace_irq_vector_regfunc(void) +#ifdef CONFIG_SMP + +DEFINE_STATIC_KEY_FALSE(trace_resched_ipi_key); + +int trace_resched_ipi_reg(void) { - mutex_lock(&irq_vector_mutex); - if (!trace_irq_vector_refcount) { - set_trace_idt_ctr(1); - smp_call_function(switch_idt, NULL, 0); - switch_idt(NULL); - } - trace_irq_vector_refcount++; - mutex_unlock(&irq_vector_mutex); + static_branch_inc(&trace_resched_ipi_key); return 0; } -void trace_irq_vector_unregfunc(void) +void trace_resched_ipi_unreg(void) { - mutex_lock(&irq_vector_mutex); - trace_irq_vector_refcount--; - if (!trace_irq_vector_refcount) { - set_trace_idt_ctr(0); - smp_call_function(switch_idt, NULL, 0); - switch_idt(NULL); - } - mutex_unlock(&irq_vector_mutex); + static_branch_dec(&trace_resched_ipi_key); } + +#endif diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index bf54309b85da..34ea3651362e 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -38,11 +38,6 @@ #include <linux/smp.h> #include <linux/io.h> -#ifdef CONFIG_EISA -#include <linux/ioport.h> -#include <linux/eisa.h> -#endif - #if defined(CONFIG_EDAC) #include <linux/edac.h> #endif @@ -70,20 +65,13 @@ #include <asm/x86_init.h> #include <asm/pgalloc.h> #include <asm/proto.h> - -/* No need to be aligned, but done to keep all IDTs defined the same way. */ -gate_desc debug_idt_table[NR_VECTORS] __page_aligned_bss; #else #include <asm/processor-flags.h> #include <asm/setup.h> #include <asm/proto.h> #endif -/* Must be page-aligned because the real IDT is used in a fixmap. */ -gate_desc idt_table[NR_VECTORS] __page_aligned_bss; - DECLARE_BITMAP(used_vectors, NR_VECTORS); -EXPORT_SYMBOL_GPL(used_vectors); static inline void cond_local_irq_enable(struct pt_regs *regs) { @@ -935,87 +923,9 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) } #endif -/* Set of traps needed for early debugging. */ -void __init early_trap_init(void) -{ - /* - * Don't use IST to set DEBUG_STACK as it doesn't work until TSS - * is ready in cpu_init() <-- trap_init(). 
Before trap_init(), - * CPU runs at ring 0 so it is impossible to hit an invalid - * stack. Using the original stack works well enough at this - * early stage. DEBUG_STACK will be equipped after cpu_init() in - * trap_init(). - * - * We don't need to set trace_idt_table like set_intr_gate(), - * since we don't have trace_debug and it will be reset to - * 'debug' in trap_init() by set_intr_gate_ist(). - */ - set_intr_gate_notrace(X86_TRAP_DB, debug); - /* int3 can be called from all */ - set_system_intr_gate(X86_TRAP_BP, &int3); -#ifdef CONFIG_X86_32 - set_intr_gate(X86_TRAP_PF, page_fault); -#endif - load_idt(&idt_descr); -} - -void __init early_trap_pf_init(void) -{ -#ifdef CONFIG_X86_64 - set_intr_gate(X86_TRAP_PF, page_fault); -#endif -} - void __init trap_init(void) { - int i; - -#ifdef CONFIG_EISA - void __iomem *p = early_ioremap(0x0FFFD9, 4); - - if (readl(p) == 'E' + ('I'<<8) + ('S'<<16) + ('A'<<24)) - EISA_bus = 1; - early_iounmap(p, 4); -#endif - - set_intr_gate(X86_TRAP_DE, divide_error); - set_intr_gate_ist(X86_TRAP_NMI, &nmi, NMI_STACK); - /* int4 can be called from all */ - set_system_intr_gate(X86_TRAP_OF, &overflow); - set_intr_gate(X86_TRAP_BR, bounds); - set_intr_gate(X86_TRAP_UD, invalid_op); - set_intr_gate(X86_TRAP_NM, device_not_available); -#ifdef CONFIG_X86_32 - set_task_gate(X86_TRAP_DF, GDT_ENTRY_DOUBLEFAULT_TSS); -#else - set_intr_gate_ist(X86_TRAP_DF, &double_fault, DOUBLEFAULT_STACK); -#endif - set_intr_gate(X86_TRAP_OLD_MF, coprocessor_segment_overrun); - set_intr_gate(X86_TRAP_TS, invalid_TSS); - set_intr_gate(X86_TRAP_NP, segment_not_present); - set_intr_gate(X86_TRAP_SS, stack_segment); - set_intr_gate(X86_TRAP_GP, general_protection); - set_intr_gate(X86_TRAP_SPURIOUS, spurious_interrupt_bug); - set_intr_gate(X86_TRAP_MF, coprocessor_error); - set_intr_gate(X86_TRAP_AC, alignment_check); -#ifdef CONFIG_X86_MCE - set_intr_gate_ist(X86_TRAP_MC, &machine_check, MCE_STACK); -#endif - set_intr_gate(X86_TRAP_XF, simd_coprocessor_error); - - /* Reserve all the builtin and the syscall vector: */ - for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) - set_bit(i, used_vectors); - -#ifdef CONFIG_IA32_EMULATION - set_system_intr_gate(IA32_SYSCALL_VECTOR, entry_INT80_compat); - set_bit(IA32_SYSCALL_VECTOR, used_vectors); -#endif - -#ifdef CONFIG_X86_32 - set_system_intr_gate(IA32_SYSCALL_VECTOR, entry_INT80_32); - set_bit(IA32_SYSCALL_VECTOR, used_vectors); -#endif + idt_setup_traps(); /* * Set the IDT descriptor to a fixed read-only location, so that the @@ -1030,20 +940,9 @@ void __init trap_init(void) */ cpu_init(); - /* - * X86_TRAP_DB and X86_TRAP_BP have been set - * in early_trap_init(). However, ITS works only after - * cpu_init() loads TSS. See comments in early_trap_init(). 
- */ - set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK); - /* int3 can be called from all */ - set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK); + idt_setup_ist_traps(); x86_init.irqs.trap_init(); -#ifdef CONFIG_X86_64 - memcpy(&debug_idt_table, &idt_table, IDT_ENTRIES * 16); - set_nmi_gate(X86_TRAP_DB, &debug); - set_nmi_gate(X86_TRAP_BP, &int3); -#endif + idt_setup_debugidt_traps(); } diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c index b9389d72b2f7..d145a0b1f529 100644 --- a/arch/x86/kernel/unwind_frame.c +++ b/arch/x86/kernel/unwind_frame.c @@ -10,20 +10,22 @@ #define FRAME_HEADER_SIZE (sizeof(long) * 2) -/* - * This disables KASAN checking when reading a value from another task's stack, - * since the other task could be running on another CPU and could have poisoned - * the stack in the meantime. - */ -#define READ_ONCE_TASK_STACK(task, x) \ -({ \ - unsigned long val; \ - if (task == current) \ - val = READ_ONCE(x); \ - else \ - val = READ_ONCE_NOCHECK(x); \ - val; \ -}) +unsigned long unwind_get_return_address(struct unwind_state *state) +{ + if (unwind_done(state)) + return 0; + + return __kernel_text_address(state->ip) ? state->ip : 0; +} +EXPORT_SYMBOL_GPL(unwind_get_return_address); + +unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) +{ + if (unwind_done(state)) + return NULL; + + return state->regs ? &state->regs->ip : state->bp + 1; +} static void unwind_dump(struct unwind_state *state) { @@ -66,15 +68,6 @@ static void unwind_dump(struct unwind_state *state) } } -unsigned long unwind_get_return_address(struct unwind_state *state) -{ - if (unwind_done(state)) - return 0; - - return __kernel_text_address(state->ip) ? state->ip : 0; -} -EXPORT_SYMBOL_GPL(unwind_get_return_address); - static size_t regs_size(struct pt_regs *regs) { /* x86_32 regs from kernel mode are two words shorter: */ @@ -91,10 +84,8 @@ static bool in_entry_code(unsigned long ip) if (addr >= __entry_text_start && addr < __entry_text_end) return true; -#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN) if (addr >= __irqentry_text_start && addr < __irqentry_text_end) return true; -#endif return false; } diff --git a/arch/x86/kernel/unwind_guess.c b/arch/x86/kernel/unwind_guess.c index 039f36738e49..4f0e17b90463 100644 --- a/arch/x86/kernel/unwind_guess.c +++ b/arch/x86/kernel/unwind_guess.c @@ -19,6 +19,11 @@ unsigned long unwind_get_return_address(struct unwind_state *state) } EXPORT_SYMBOL_GPL(unwind_get_return_address); +unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) +{ + return NULL; +} + bool unwind_next_frame(struct unwind_state *state) { struct stack_info *info = &state->stack_info; diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c new file mode 100644 index 000000000000..570b70d3f604 --- /dev/null +++ b/arch/x86/kernel/unwind_orc.c @@ -0,0 +1,582 @@ +#include <linux/module.h> +#include <linux/sort.h> +#include <asm/ptrace.h> +#include <asm/stacktrace.h> +#include <asm/unwind.h> +#include <asm/orc_types.h> +#include <asm/orc_lookup.h> +#include <asm/sections.h> + +#define orc_warn(fmt, ...) 
\ + printk_deferred_once(KERN_WARNING pr_fmt("WARNING: " fmt), ##__VA_ARGS__) + +extern int __start_orc_unwind_ip[]; +extern int __stop_orc_unwind_ip[]; +extern struct orc_entry __start_orc_unwind[]; +extern struct orc_entry __stop_orc_unwind[]; + +static DEFINE_MUTEX(sort_mutex); +int *cur_orc_ip_table = __start_orc_unwind_ip; +struct orc_entry *cur_orc_table = __start_orc_unwind; + +unsigned int lookup_num_blocks; +bool orc_init; + +static inline unsigned long orc_ip(const int *ip) +{ + return (unsigned long)ip + *ip; +} + +static struct orc_entry *__orc_find(int *ip_table, struct orc_entry *u_table, + unsigned int num_entries, unsigned long ip) +{ + int *first = ip_table; + int *last = ip_table + num_entries - 1; + int *mid = first, *found = first; + + if (!num_entries) + return NULL; + + /* + * Do a binary range search to find the rightmost duplicate of a given + * starting address. Some entries are section terminators which are + * "weak" entries for ensuring there are no gaps. They should be + * ignored when they conflict with a real entry. + */ + while (first <= last) { + mid = first + ((last - first) / 2); + + if (orc_ip(mid) <= ip) { + found = mid; + first = mid + 1; + } else + last = mid - 1; + } + + return u_table + (found - ip_table); +} + +#ifdef CONFIG_MODULES +static struct orc_entry *orc_module_find(unsigned long ip) +{ + struct module *mod; + + mod = __module_address(ip); + if (!mod || !mod->arch.orc_unwind || !mod->arch.orc_unwind_ip) + return NULL; + return __orc_find(mod->arch.orc_unwind_ip, mod->arch.orc_unwind, + mod->arch.num_orcs, ip); +} +#else +static struct orc_entry *orc_module_find(unsigned long ip) +{ + return NULL; +} +#endif + +static struct orc_entry *orc_find(unsigned long ip) +{ + if (!orc_init) + return NULL; + + /* For non-init vmlinux addresses, use the fast lookup table: */ + if (ip >= LOOKUP_START_IP && ip < LOOKUP_STOP_IP) { + unsigned int idx, start, stop; + + idx = (ip - LOOKUP_START_IP) / LOOKUP_BLOCK_SIZE; + + if (unlikely((idx >= lookup_num_blocks-1))) { + orc_warn("WARNING: bad lookup idx: idx=%u num=%u ip=%lx\n", + idx, lookup_num_blocks, ip); + return NULL; + } + + start = orc_lookup[idx]; + stop = orc_lookup[idx + 1] + 1; + + if (unlikely((__start_orc_unwind + start >= __stop_orc_unwind) || + (__start_orc_unwind + stop > __stop_orc_unwind))) { + orc_warn("WARNING: bad lookup value: idx=%u num=%u start=%u stop=%u ip=%lx\n", + idx, lookup_num_blocks, start, stop, ip); + return NULL; + } + + return __orc_find(__start_orc_unwind_ip + start, + __start_orc_unwind + start, stop - start, ip); + } + + /* vmlinux .init slow lookup: */ + if (ip >= (unsigned long)_sinittext && ip < (unsigned long)_einittext) + return __orc_find(__start_orc_unwind_ip, __start_orc_unwind, + __stop_orc_unwind_ip - __start_orc_unwind_ip, ip); + + /* Module lookup: */ + return orc_module_find(ip); +} + +static void orc_sort_swap(void *_a, void *_b, int size) +{ + struct orc_entry *orc_a, *orc_b; + struct orc_entry orc_tmp; + int *a = _a, *b = _b, tmp; + int delta = _b - _a; + + /* Swap the .orc_unwind_ip entries: */ + tmp = *a; + *a = *b + delta; + *b = tmp - delta; + + /* Swap the corresponding .orc_unwind entries: */ + orc_a = cur_orc_table + (a - cur_orc_ip_table); + orc_b = cur_orc_table + (b - cur_orc_ip_table); + orc_tmp = *orc_a; + *orc_a = *orc_b; + *orc_b = orc_tmp; +} + +static int orc_sort_cmp(const void *_a, const void *_b) +{ + struct orc_entry *orc_a; + const int *a = _a, *b = _b; + unsigned long a_val = orc_ip(a); + unsigned long b_val = orc_ip(b); + + if 
(a_val > b_val) + return 1; + if (a_val < b_val) + return -1; + + /* + * The "weak" section terminator entries need to always be on the left + * to ensure the lookup code skips them in favor of real entries. + * These terminator entries exist to handle any gaps created by + * whitelisted .o files which didn't get objtool generation. + */ + orc_a = cur_orc_table + (a - cur_orc_ip_table); + return orc_a->sp_reg == ORC_REG_UNDEFINED ? -1 : 1; +} + +#ifdef CONFIG_MODULES +void unwind_module_init(struct module *mod, void *_orc_ip, size_t orc_ip_size, + void *_orc, size_t orc_size) +{ + int *orc_ip = _orc_ip; + struct orc_entry *orc = _orc; + unsigned int num_entries = orc_ip_size / sizeof(int); + + WARN_ON_ONCE(orc_ip_size % sizeof(int) != 0 || + orc_size % sizeof(*orc) != 0 || + num_entries != orc_size / sizeof(*orc)); + + /* + * The 'cur_orc_*' globals allow the orc_sort_swap() callback to + * associate an .orc_unwind_ip table entry with its corresponding + * .orc_unwind entry so they can both be swapped. + */ + mutex_lock(&sort_mutex); + cur_orc_ip_table = orc_ip; + cur_orc_table = orc; + sort(orc_ip, num_entries, sizeof(int), orc_sort_cmp, orc_sort_swap); + mutex_unlock(&sort_mutex); + + mod->arch.orc_unwind_ip = orc_ip; + mod->arch.orc_unwind = orc; + mod->arch.num_orcs = num_entries; +} +#endif + +void __init unwind_init(void) +{ + size_t orc_ip_size = (void *)__stop_orc_unwind_ip - (void *)__start_orc_unwind_ip; + size_t orc_size = (void *)__stop_orc_unwind - (void *)__start_orc_unwind; + size_t num_entries = orc_ip_size / sizeof(int); + struct orc_entry *orc; + int i; + + if (!num_entries || orc_ip_size % sizeof(int) != 0 || + orc_size % sizeof(struct orc_entry) != 0 || + num_entries != orc_size / sizeof(struct orc_entry)) { + orc_warn("WARNING: Bad or missing .orc_unwind table. Disabling unwinder.\n"); + return; + } + + /* Sort the .orc_unwind and .orc_unwind_ip tables: */ + sort(__start_orc_unwind_ip, num_entries, sizeof(int), orc_sort_cmp, + orc_sort_swap); + + /* Initialize the fast lookup table: */ + lookup_num_blocks = orc_lookup_end - orc_lookup; + for (i = 0; i < lookup_num_blocks-1; i++) { + orc = __orc_find(__start_orc_unwind_ip, __start_orc_unwind, + num_entries, + LOOKUP_START_IP + (LOOKUP_BLOCK_SIZE * i)); + if (!orc) { + orc_warn("WARNING: Corrupt .orc_unwind table. Disabling unwinder.\n"); + return; + } + + orc_lookup[i] = orc - __start_orc_unwind; + } + + /* Initialize the ending block: */ + orc = __orc_find(__start_orc_unwind_ip, __start_orc_unwind, num_entries, + LOOKUP_STOP_IP); + if (!orc) { + orc_warn("WARNING: Corrupt .orc_unwind table. Disabling unwinder.\n"); + return; + } + orc_lookup[lookup_num_blocks-1] = orc - __start_orc_unwind; + + orc_init = true; +} + +unsigned long unwind_get_return_address(struct unwind_state *state) +{ + if (unwind_done(state)) + return 0; + + return __kernel_text_address(state->ip) ? state->ip : 0; +} +EXPORT_SYMBOL_GPL(unwind_get_return_address); + +unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) +{ + if (unwind_done(state)) + return NULL; + + if (state->regs) + return &state->regs->ip; + + if (state->sp) + return (unsigned long *)state->sp - 1; + + return NULL; +} + +static bool stack_access_ok(struct unwind_state *state, unsigned long addr, + size_t len) +{ + struct stack_info *info = &state->stack_info; + + /* + * If the address isn't on the current stack, switch to the next one. 
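
[Editor's note] The __orc_find() binary range search added above returns the rightmost table entry whose start address is <= the target IP, so that the "weak" terminator entries sorted to its left are skipped. A minimal, self-contained sketch of that search over a plain sorted array (illustrative only; the real code works on the relative-offset .orc_unwind_ip encoding via orc_ip(), which is not reproduced here):

	/* Sketch: rightmost index i with tbl[i] <= ip, or -1 if none (tbl sorted ascending). */
	static int find_rightmost_le(const unsigned long *tbl, unsigned int n,
				     unsigned long ip)
	{
		int first = 0, last = (int)n - 1, found = -1;

		while (first <= last) {
			int mid = first + (last - first) / 2;

			if (tbl[mid] <= ip) {
				found = mid;	/* candidate; keep looking to the right */
				first = mid + 1;
			} else {
				last = mid - 1;
			}
		}
		return found;
	}
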
+ * + * We may have to traverse multiple stacks to deal with the possibility + * that info->next_sp could point to an empty stack and the address + * could be on a subsequent stack. + */ + while (!on_stack(info, (void *)addr, len)) + if (get_stack_info(info->next_sp, state->task, info, + &state->stack_mask)) + return false; + + return true; +} + +static bool deref_stack_reg(struct unwind_state *state, unsigned long addr, + unsigned long *val) +{ + if (!stack_access_ok(state, addr, sizeof(long))) + return false; + + *val = READ_ONCE_TASK_STACK(state->task, *(unsigned long *)addr); + return true; +} + +#define REGS_SIZE (sizeof(struct pt_regs)) +#define SP_OFFSET (offsetof(struct pt_regs, sp)) +#define IRET_REGS_SIZE (REGS_SIZE - offsetof(struct pt_regs, ip)) +#define IRET_SP_OFFSET (SP_OFFSET - offsetof(struct pt_regs, ip)) + +static bool deref_stack_regs(struct unwind_state *state, unsigned long addr, + unsigned long *ip, unsigned long *sp, bool full) +{ + size_t regs_size = full ? REGS_SIZE : IRET_REGS_SIZE; + size_t sp_offset = full ? SP_OFFSET : IRET_SP_OFFSET; + struct pt_regs *regs = (struct pt_regs *)(addr + regs_size - REGS_SIZE); + + if (IS_ENABLED(CONFIG_X86_64)) { + if (!stack_access_ok(state, addr, regs_size)) + return false; + + *ip = regs->ip; + *sp = regs->sp; + + return true; + } + + if (!stack_access_ok(state, addr, sp_offset)) + return false; + + *ip = regs->ip; + + if (user_mode(regs)) { + if (!stack_access_ok(state, addr + sp_offset, + REGS_SIZE - SP_OFFSET)) + return false; + + *sp = regs->sp; + } else + *sp = (unsigned long)®s->sp; + + return true; +} + +bool unwind_next_frame(struct unwind_state *state) +{ + unsigned long ip_p, sp, orig_ip, prev_sp = state->sp; + enum stack_type prev_type = state->stack_info.type; + struct orc_entry *orc; + struct pt_regs *ptregs; + bool indirect = false; + + if (unwind_done(state)) + return false; + + /* Don't let modules unload while we're reading their ORC data. */ + preempt_disable(); + + /* Have we reached the end? */ + if (state->regs && user_mode(state->regs)) + goto done; + + /* + * Find the orc_entry associated with the text address. + * + * Decrement call return addresses by one so they work for sibling + * calls and calls to noreturn functions. + */ + orc = orc_find(state->signal ? 
state->ip : state->ip - 1); + if (!orc || orc->sp_reg == ORC_REG_UNDEFINED) + goto done; + orig_ip = state->ip; + + /* Find the previous frame's stack: */ + switch (orc->sp_reg) { + case ORC_REG_SP: + sp = state->sp + orc->sp_offset; + break; + + case ORC_REG_BP: + sp = state->bp + orc->sp_offset; + break; + + case ORC_REG_SP_INDIRECT: + sp = state->sp + orc->sp_offset; + indirect = true; + break; + + case ORC_REG_BP_INDIRECT: + sp = state->bp + orc->sp_offset; + indirect = true; + break; + + case ORC_REG_R10: + if (!state->regs || !state->full_regs) { + orc_warn("missing regs for base reg R10 at ip %p\n", + (void *)state->ip); + goto done; + } + sp = state->regs->r10; + break; + + case ORC_REG_R13: + if (!state->regs || !state->full_regs) { + orc_warn("missing regs for base reg R13 at ip %p\n", + (void *)state->ip); + goto done; + } + sp = state->regs->r13; + break; + + case ORC_REG_DI: + if (!state->regs || !state->full_regs) { + orc_warn("missing regs for base reg DI at ip %p\n", + (void *)state->ip); + goto done; + } + sp = state->regs->di; + break; + + case ORC_REG_DX: + if (!state->regs || !state->full_regs) { + orc_warn("missing regs for base reg DX at ip %p\n", + (void *)state->ip); + goto done; + } + sp = state->regs->dx; + break; + + default: + orc_warn("unknown SP base reg %d for ip %p\n", + orc->sp_reg, (void *)state->ip); + goto done; + } + + if (indirect) { + if (!deref_stack_reg(state, sp, &sp)) + goto done; + } + + /* Find IP, SP and possibly regs: */ + switch (orc->type) { + case ORC_TYPE_CALL: + ip_p = sp - sizeof(long); + + if (!deref_stack_reg(state, ip_p, &state->ip)) + goto done; + + state->ip = ftrace_graph_ret_addr(state->task, &state->graph_idx, + state->ip, (void *)ip_p); + + state->sp = sp; + state->regs = NULL; + state->signal = false; + break; + + case ORC_TYPE_REGS: + if (!deref_stack_regs(state, sp, &state->ip, &state->sp, true)) { + orc_warn("can't dereference registers at %p for ip %p\n", + (void *)sp, (void *)orig_ip); + goto done; + } + + state->regs = (struct pt_regs *)sp; + state->full_regs = true; + state->signal = true; + break; + + case ORC_TYPE_REGS_IRET: + if (!deref_stack_regs(state, sp, &state->ip, &state->sp, false)) { + orc_warn("can't dereference iret registers at %p for ip %p\n", + (void *)sp, (void *)orig_ip); + goto done; + } + + ptregs = container_of((void *)sp, struct pt_regs, ip); + if ((unsigned long)ptregs >= prev_sp && + on_stack(&state->stack_info, ptregs, REGS_SIZE)) { + state->regs = ptregs; + state->full_regs = false; + } else + state->regs = NULL; + + state->signal = true; + break; + + default: + orc_warn("unknown .orc_unwind entry type %d\n", orc->type); + break; + } + + /* Find BP: */ + switch (orc->bp_reg) { + case ORC_REG_UNDEFINED: + if (state->regs && state->full_regs) + state->bp = state->regs->bp; + break; + + case ORC_REG_PREV_SP: + if (!deref_stack_reg(state, sp + orc->bp_offset, &state->bp)) + goto done; + break; + + case ORC_REG_BP: + if (!deref_stack_reg(state, state->bp + orc->bp_offset, &state->bp)) + goto done; + break; + + default: + orc_warn("unknown BP base reg %d for ip %p\n", + orc->bp_reg, (void *)orig_ip); + goto done; + } + + /* Prevent a recursive loop due to bad ORC data: */ + if (state->stack_info.type == prev_type && + on_stack(&state->stack_info, (void *)state->sp, sizeof(long)) && + state->sp <= prev_sp) { + orc_warn("stack going in the wrong direction? 
ip=%p\n", + (void *)orig_ip); + goto done; + } + + preempt_enable(); + return true; + +done: + preempt_enable(); + state->stack_info.type = STACK_TYPE_UNKNOWN; + return false; +} +EXPORT_SYMBOL_GPL(unwind_next_frame); + +void __unwind_start(struct unwind_state *state, struct task_struct *task, + struct pt_regs *regs, unsigned long *first_frame) +{ + memset(state, 0, sizeof(*state)); + state->task = task; + + /* + * Refuse to unwind the stack of a task while it's executing on another + * CPU. This check is racy, but that's ok: the unwinder has other + * checks to prevent it from going off the rails. + */ + if (task_on_another_cpu(task)) + goto done; + + if (regs) { + if (user_mode(regs)) + goto done; + + state->ip = regs->ip; + state->sp = kernel_stack_pointer(regs); + state->bp = regs->bp; + state->regs = regs; + state->full_regs = true; + state->signal = true; + + } else if (task == current) { + asm volatile("lea (%%rip), %0\n\t" + "mov %%rsp, %1\n\t" + "mov %%rbp, %2\n\t" + : "=r" (state->ip), "=r" (state->sp), + "=r" (state->bp)); + + } else { + struct inactive_task_frame *frame = (void *)task->thread.sp; + + state->sp = task->thread.sp; + state->bp = READ_ONCE_NOCHECK(frame->bp); + state->ip = READ_ONCE_NOCHECK(frame->ret_addr); + } + + if (get_stack_info((unsigned long *)state->sp, state->task, + &state->stack_info, &state->stack_mask)) + return; + + /* + * The caller can provide the address of the first frame directly + * (first_frame) or indirectly (regs->sp) to indicate which stack frame + * to start unwinding at. Skip ahead until we reach it. + */ + + /* When starting from regs, skip the regs frame: */ + if (regs) { + unwind_next_frame(state); + return; + } + + /* Otherwise, skip ahead to the user-specified starting frame: */ + while (!unwind_done(state) && + (!on_stack(&state->stack_info, first_frame, sizeof(long)) || + state->sp <= (unsigned long)first_frame)) + unwind_next_frame(state); + + return; + +done: + state->stack_info.type = STACK_TYPE_UNKNOWN; + return; +} +EXPORT_SYMBOL_GPL(__unwind_start); diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index c8a3b61be0aa..f05f00acac89 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -24,6 +24,7 @@ #include <asm/asm-offsets.h> #include <asm/thread_info.h> #include <asm/page_types.h> +#include <asm/orc_lookup.h> #include <asm/cache.h> #include <asm/boot.h> @@ -148,6 +149,8 @@ SECTIONS BUG_TABLE + ORC_UNWIND_TABLE + . = ALIGN(PAGE_SIZE); __vvar_page = .; diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 2688c7dc5323..3ea624452f93 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -89,6 +89,5 @@ config KVM_MMU_AUDIT # OK, it's a little counter-intuitive to do this, but it puts it neatly under # the virtualization menu. source drivers/vhost/Kconfig -source drivers/lguest/Kconfig endif # VIRTUALIZATION diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 59ca2eea522c..19adbb418443 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -469,7 +469,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->ecx &= kvm_cpuid_7_0_ecx_x86_features; cpuid_mask(&entry->ecx, CPUID_7_ECX); /* PKU is not yet implemented for shadow paging. 
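
[Editor's note] The __unwind_start()/unwind_next_frame() pair added above is consumed by callers as an iteration over stack frames; roughly the following pattern (a sketch only; the helper name dump_return_addresses and the printk are illustrative, unwind_start() is the inline wrapper around __unwind_start() from <asm/unwind.h>):

	/* Sketch of a typical unwinder consumer. */
	static void dump_return_addresses(struct task_struct *task,
					  struct pt_regs *regs)
	{
		struct unwind_state state;

		for (unwind_start(&state, task, regs, NULL); !unwind_done(&state);
		     unwind_next_frame(&state)) {
			unsigned long addr = unwind_get_return_address(&state);

			if (!addr)
				break;
			printk("%pS\n", (void *)addr);
		}
	}
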
*/ - if (!tdp_enabled) + if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE)) entry->ecx &= ~F(PKU); entry->edx &= kvm_cpuid_7_0_edx_x86_features; entry->edx &= get_scattered_cpuid_leaf(7, 0, CPUID_EDX); diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index 762cdf2595f9..e1e89ee4af75 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -84,11 +84,6 @@ static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu) | ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32); } -static inline u32 kvm_read_pkru(struct kvm_vcpu *vcpu) -{ - return kvm_x86_ops->get_pkru(vcpu); -} - static inline void enter_guest_mode(struct kvm_vcpu *vcpu) { vcpu->arch.hflags |= HF_GUEST_MASK; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ccb70b8d16cc..04d750813c9d 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -4109,16 +4109,28 @@ void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context) { bool uses_nx = context->nx || context->base_role.smep_andnot_wp; + struct rsvd_bits_validate *shadow_zero_check; + int i; /* * Passing "true" to the last argument is okay; it adds a check * on bit 8 of the SPTEs which KVM doesn't use anyway. */ - __reset_rsvds_bits_mask(vcpu, &context->shadow_zero_check, + shadow_zero_check = &context->shadow_zero_check; + __reset_rsvds_bits_mask(vcpu, shadow_zero_check, boot_cpu_data.x86_phys_bits, context->shadow_root_level, uses_nx, guest_cpuid_has_gbpages(vcpu), is_pse(vcpu), true); + + if (!shadow_me_mask) + return; + + for (i = context->shadow_root_level; --i >= 0;) { + shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask; + shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask; + } + } EXPORT_SYMBOL_GPL(reset_shadow_zero_bits_mask); @@ -4136,17 +4148,29 @@ static void reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context) { + struct rsvd_bits_validate *shadow_zero_check; + int i; + + shadow_zero_check = &context->shadow_zero_check; + if (boot_cpu_is_amd()) - __reset_rsvds_bits_mask(vcpu, &context->shadow_zero_check, + __reset_rsvds_bits_mask(vcpu, shadow_zero_check, boot_cpu_data.x86_phys_bits, context->shadow_root_level, false, boot_cpu_has(X86_FEATURE_GBPAGES), true, true); else - __reset_rsvds_bits_mask_ept(&context->shadow_zero_check, + __reset_rsvds_bits_mask_ept(shadow_zero_check, boot_cpu_data.x86_phys_bits, false); + if (!shadow_me_mask) + return; + + for (i = context->shadow_root_level; --i >= 0;) { + shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask; + shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask; + } } /* diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 3cc725590ab9..4b9a3ae6b725 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -48,7 +48,7 @@ static inline u64 rsvd_bits(int s, int e) { - return __sme_clr(((1ULL << (e - s + 1)) - 1) << s); + return ((1ULL << (e - s + 1)) - 1) << s; } void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value); @@ -185,7 +185,7 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, * index of the protection domain, so pte_pkey * 2 is * is the index of the first bit for the domain. */ - pkru_bits = (kvm_read_pkru(vcpu) >> (pte_pkey * 2)) & 3; + pkru_bits = (vcpu->arch.pkru >> (pte_pkey * 2)) & 3; /* clear present bit, replace PFEC.RSVD with ACC_USER_MASK. 
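
[Editor's note] For context on the pkru_bits extraction above: PKRU packs two bits per protection key, access-disable at bit 2*key and write-disable at bit 2*key+1, so shifting by pte_pkey * 2 and masking with 3 yields that key's pair. A small illustrative helper (hypothetical, not part of the patch):

	/* Hypothetical helper: return the AD/WD bit pair for @pkey from a PKRU value. */
	static inline unsigned int pkru_ad_wd(u32 pkru, unsigned int pkey)
	{
		return (pkru >> (pkey * 2)) & 3;	/* bit 0: AD, bit 1: WD */
	}
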
*/ offset = (pfec & ~1) + diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 099ff08b4aff..8dbd8dbc83eb 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1777,11 +1777,6 @@ static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) to_svm(vcpu)->vmcb->save.rflags = rflags; } -static u32 svm_get_pkru(struct kvm_vcpu *vcpu) -{ - return 0; -} - static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) { switch (reg) { @@ -5414,8 +5409,6 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .get_rflags = svm_get_rflags, .set_rflags = svm_set_rflags, - .get_pkru = svm_get_pkru, - .tlb_flush = svm_flush_tlb, .run = svm_vcpu_run, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 416d5ed320b6..70b90c0810d0 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -636,8 +636,6 @@ struct vcpu_vmx { u64 current_tsc_ratio; - bool guest_pkru_valid; - u32 guest_pkru; u32 host_pkru; /* @@ -2383,11 +2381,6 @@ static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) to_vmx(vcpu)->emulation_required = emulation_required(vcpu); } -static u32 vmx_get_pkru(struct kvm_vcpu *vcpu) -{ - return to_vmx(vcpu)->guest_pkru; -} - static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu) { u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); @@ -8786,7 +8779,7 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) vector = exit_intr_info & INTR_INFO_VECTOR_MASK; desc = (gate_desc *)vmx->host_idt_base + vector; - entry = gate_offset(*desc); + entry = gate_offset(desc); asm volatile( #ifdef CONFIG_X86_64 "mov %%" _ASM_SP ", %[sp]\n\t" @@ -9020,8 +9013,10 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) vmx_set_interrupt_shadow(vcpu, 0); - if (vmx->guest_pkru_valid) - __write_pkru(vmx->guest_pkru); + if (static_cpu_has(X86_FEATURE_PKU) && + kvm_read_cr4_bits(vcpu, X86_CR4_PKE) && + vcpu->arch.pkru != vmx->host_pkru) + __write_pkru(vcpu->arch.pkru); atomic_switch_perf_msrs(vmx); debugctlmsr = get_debugctlmsr(); @@ -9169,13 +9164,11 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) * back on host, so it is safe to read guest PKRU from current * XSAVE. 
*/ - if (boot_cpu_has(X86_FEATURE_OSPKE)) { - vmx->guest_pkru = __read_pkru(); - if (vmx->guest_pkru != vmx->host_pkru) { - vmx->guest_pkru_valid = true; + if (static_cpu_has(X86_FEATURE_PKU) && + kvm_read_cr4_bits(vcpu, X86_CR4_PKE)) { + vcpu->arch.pkru = __read_pkru(); + if (vcpu->arch.pkru != vmx->host_pkru) __write_pkru(vmx->host_pkru); - } else - vmx->guest_pkru_valid = false; } /* @@ -11682,8 +11675,6 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .get_rflags = vmx_get_rflags, .set_rflags = vmx_set_rflags, - .get_pkru = vmx_get_pkru, - .tlb_flush = vmx_flush_tlb, .run = vmx_vcpu_run, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index eda4bdbd7e5e..ef5102f80497 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3246,7 +3246,12 @@ static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu) u32 size, offset, ecx, edx; cpuid_count(XSTATE_CPUID, index, &size, &offset, &ecx, &edx); - memcpy(dest + offset, src, size); + if (feature == XFEATURE_MASK_PKRU) + memcpy(dest + offset, &vcpu->arch.pkru, + sizeof(vcpu->arch.pkru)); + else + memcpy(dest + offset, src, size); + } valid -= feature; @@ -3284,7 +3289,11 @@ static void load_xsave(struct kvm_vcpu *vcpu, u8 *src) u32 size, offset, ecx, edx; cpuid_count(XSTATE_CPUID, index, &size, &offset, &ecx, &edx); - memcpy(dest, src + offset, size); + if (feature == XFEATURE_MASK_PKRU) + memcpy(&vcpu->arch.pkru, src + offset, + sizeof(vcpu->arch.pkru)); + else + memcpy(dest, src + offset, size); } valid -= feature; @@ -6726,17 +6735,6 @@ void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_vcpu_reload_apic_access_page); -void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm, - unsigned long address) -{ - /* - * The physical address of apic access page is stored in the VMCS. - * Update it when it becomes invalid. - */ - if (address == gfn_to_hva(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT)) - kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD); -} - /* * Returns 1 to let vcpu_run() continue the guest execution loop without * exiting to the userspace. Otherwise, the value will be returned to the @@ -7634,7 +7632,9 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) */ vcpu->guest_fpu_loaded = 1; __kernel_fpu_begin(); - __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu.state); + /* PKRU is separately restored in kvm_x86_ops->run. */ + __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu.state, + ~XFEATURE_MASK_PKRU); trace_kvm_fpu(1); } diff --git a/arch/x86/lguest/Kconfig b/arch/x86/lguest/Kconfig deleted file mode 100644 index 08f41caada45..000000000000 --- a/arch/x86/lguest/Kconfig +++ /dev/null @@ -1,14 +0,0 @@ -config LGUEST_GUEST - bool "Lguest guest support" - depends on X86_32 && PARAVIRT && PCI - select TTY - select VIRTUALIZATION - select VIRTIO - select VIRTIO_CONSOLE - help - Lguest is a tiny in-kernel hypervisor. Selecting this will - allow your kernel to boot under lguest. This option will increase - your kernel size by about 10k. If in doubt, say N. - - If you say Y here, make sure you say Y (or M) to the virtio block - and net drivers which lguest needs. 
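
[Editor's note] The vmx_vcpu_run() hunks above replace the guest_pkru_valid bookkeeping with direct reads and writes keyed off vcpu->arch.pkru. Reduced to a sketch with hypothetical helper names (the real logic stays inline in vmx_vcpu_run and assumes PKU is enabled for the guest, i.e. CR4.PKE is set):

	/* Sketch of the PKRU swap around VM entry/exit; helper names are hypothetical. */
	static void pkru_switch_to_guest(struct kvm_vcpu *vcpu, u32 host_pkru)
	{
		if (vcpu->arch.pkru != host_pkru)
			__write_pkru(vcpu->arch.pkru);
	}

	static void pkru_switch_to_host(struct kvm_vcpu *vcpu, u32 host_pkru)
	{
		vcpu->arch.pkru = __read_pkru();
		if (vcpu->arch.pkru != host_pkru)
			__write_pkru(host_pkru);
	}

The point of the change is that the guest value now lives in vcpu->arch.pkru (and is exposed through the xsave interface), rather than in a vmx-private cache guarded by a validity flag.
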
diff --git a/arch/x86/lguest/Makefile b/arch/x86/lguest/Makefile deleted file mode 100644 index 8f38d577a2fa..000000000000 --- a/arch/x86/lguest/Makefile +++ /dev/null @@ -1,2 +0,0 @@ -obj-y := head_32.o boot.o -CFLAGS_boot.o := $(call cc-option, -fno-stack-protector) diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c deleted file mode 100644 index 99472698c931..000000000000 --- a/arch/x86/lguest/boot.c +++ /dev/null @@ -1,1558 +0,0 @@ -/*P:010 - * A hypervisor allows multiple Operating Systems to run on a single machine. - * To quote David Wheeler: "Any problem in computer science can be solved with - * another layer of indirection." - * - * We keep things simple in two ways. First, we start with a normal Linux - * kernel and insert a module (lg.ko) which allows us to run other Linux - * kernels the same way we'd run processes. We call the first kernel the Host, - * and the others the Guests. The program which sets up and configures Guests - * (such as the example in tools/lguest/lguest.c) is called the Launcher. - * - * Secondly, we only run specially modified Guests, not normal kernels: setting - * CONFIG_LGUEST_GUEST to "y" compiles this file into the kernel so it knows - * how to be a Guest at boot time. This means that you can use the same kernel - * you boot normally (ie. as a Host) as a Guest. - * - * These Guests know that they cannot do privileged operations, such as disable - * interrupts, and that they have to ask the Host to do such things explicitly. - * This file consists of all the replacements for such low-level native - * hardware operations: these special Guest versions call the Host. - * - * So how does the kernel know it's a Guest? We'll see that later, but let's - * just say that we end up here where we replace the native functions various - * "paravirt" structures with our Guest versions, then boot like normal. -:*/ - -/* - * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
- */ -#include <linux/kernel.h> -#include <linux/start_kernel.h> -#include <linux/string.h> -#include <linux/console.h> -#include <linux/screen_info.h> -#include <linux/irq.h> -#include <linux/interrupt.h> -#include <linux/clocksource.h> -#include <linux/clockchips.h> -#include <linux/lguest.h> -#include <linux/lguest_launcher.h> -#include <linux/virtio_console.h> -#include <linux/pm.h> -#include <linux/export.h> -#include <linux/pci.h> -#include <linux/virtio_pci.h> -#include <asm/acpi.h> -#include <asm/apic.h> -#include <asm/lguest.h> -#include <asm/paravirt.h> -#include <asm/param.h> -#include <asm/page.h> -#include <asm/pgtable.h> -#include <asm/desc.h> -#include <asm/setup.h> -#include <asm/e820/api.h> -#include <asm/mce.h> -#include <asm/io.h> -#include <asm/fpu/api.h> -#include <asm/stackprotector.h> -#include <asm/reboot.h> /* for struct machine_ops */ -#include <asm/kvm_para.h> -#include <asm/pci_x86.h> -#include <asm/pci-direct.h> - -/*G:010 - * Welcome to the Guest! - * - * The Guest in our tale is a simple creature: identical to the Host but - * behaving in simplified but equivalent ways. In particular, the Guest is the - * same kernel as the Host (or at least, built from the same source code). -:*/ - -struct lguest_data lguest_data = { - .hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF }, - .noirq_iret = (u32)lguest_noirq_iret, - .kernel_address = PAGE_OFFSET, - .blocked_interrupts = { 1 }, /* Block timer interrupts */ - .syscall_vec = IA32_SYSCALL_VECTOR, -}; - -/*G:037 - * async_hcall() is pretty simple: I'm quite proud of it really. We have a - * ring buffer of stored hypercalls which the Host will run though next time we - * do a normal hypercall. Each entry in the ring has 5 slots for the hypercall - * arguments, and a "hcall_status" word which is 0 if the call is ready to go, - * and 255 once the Host has finished with it. - * - * If we come around to a slot which hasn't been finished, then the table is - * full and we just make the hypercall directly. This has the nice side - * effect of causing the Host to run all the stored calls in the ring buffer - * which empties it for next time! - */ -static void async_hcall(unsigned long call, unsigned long arg1, - unsigned long arg2, unsigned long arg3, - unsigned long arg4) -{ - /* Note: This code assumes we're uniprocessor. */ - static unsigned int next_call; - unsigned long flags; - - /* - * Disable interrupts if not already disabled: we don't want an - * interrupt handler making a hypercall while we're already doing - * one! - */ - local_irq_save(flags); - if (lguest_data.hcall_status[next_call] != 0xFF) { - /* Table full, so do normal hcall which will flush table. */ - hcall(call, arg1, arg2, arg3, arg4); - } else { - lguest_data.hcalls[next_call].arg0 = call; - lguest_data.hcalls[next_call].arg1 = arg1; - lguest_data.hcalls[next_call].arg2 = arg2; - lguest_data.hcalls[next_call].arg3 = arg3; - lguest_data.hcalls[next_call].arg4 = arg4; - /* Arguments must all be written before we mark it to go */ - wmb(); - lguest_data.hcall_status[next_call] = 0; - if (++next_call == LHCALL_RING_SIZE) - next_call = 0; - } - local_irq_restore(flags); -} - -/*G:035 - * Notice the lazy_hcall() above, rather than hcall(). This is our first real - * optimization trick! - * - * When lazy_mode is set, it means we're allowed to defer all hypercalls and do - * them as a batch when lazy_mode is eventually turned off. Because hypercalls - * are reasonably expensive, batching them up makes sense. 
For example, a - * large munmap might update dozens of page table entries: that code calls - * paravirt_enter_lazy_mmu(), does the dozen updates, then calls - * lguest_leave_lazy_mode(). - * - * So, when we're in lazy mode, we call async_hcall() to store the call for - * future processing: - */ -static void lazy_hcall1(unsigned long call, unsigned long arg1) -{ - if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) - hcall(call, arg1, 0, 0, 0); - else - async_hcall(call, arg1, 0, 0, 0); -} - -/* You can imagine what lazy_hcall2, 3 and 4 look like. :*/ -static void lazy_hcall2(unsigned long call, - unsigned long arg1, - unsigned long arg2) -{ - if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) - hcall(call, arg1, arg2, 0, 0); - else - async_hcall(call, arg1, arg2, 0, 0); -} - -static void lazy_hcall3(unsigned long call, - unsigned long arg1, - unsigned long arg2, - unsigned long arg3) -{ - if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) - hcall(call, arg1, arg2, arg3, 0); - else - async_hcall(call, arg1, arg2, arg3, 0); -} - -#ifdef CONFIG_X86_PAE -static void lazy_hcall4(unsigned long call, - unsigned long arg1, - unsigned long arg2, - unsigned long arg3, - unsigned long arg4) -{ - if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) - hcall(call, arg1, arg2, arg3, arg4); - else - async_hcall(call, arg1, arg2, arg3, arg4); -} -#endif - -/*G:036 - * When lazy mode is turned off, we issue the do-nothing hypercall to - * flush any stored calls, and call the generic helper to reset the - * per-cpu lazy mode variable. - */ -static void lguest_leave_lazy_mmu_mode(void) -{ - hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0); - paravirt_leave_lazy_mmu(); -} - -/* - * We also catch the end of context switch; we enter lazy mode for much of - * that too, so again we need to flush here. - * - * (Technically, this is lazy CPU mode, and normally we're in lazy MMU - * mode, but unlike Xen, lguest doesn't care about the difference). - */ -static void lguest_end_context_switch(struct task_struct *next) -{ - hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0); - paravirt_end_context_switch(next); -} - -/*G:032 - * After that diversion we return to our first native-instruction - * replacements: four functions for interrupt control. - * - * The simplest way of implementing these would be to have "turn interrupts - * off" and "turn interrupts on" hypercalls. Unfortunately, this is too slow: - * these are by far the most commonly called functions of those we override. - * - * So instead we keep an "irq_enabled" field inside our "struct lguest_data", - * which the Guest can update with a single instruction. The Host knows to - * check there before it tries to deliver an interrupt. - */ - -/* - * save_flags() is expected to return the processor state (ie. "flags"). The - * flags word contains all kind of stuff, but in practice Linux only cares - * about the interrupt flag. Our "save_flags()" just returns that. - */ -asmlinkage __visible unsigned long lguest_save_fl(void) -{ - return lguest_data.irq_enabled; -} - -/* Interrupts go off... */ -asmlinkage __visible void lguest_irq_disable(void) -{ - lguest_data.irq_enabled = 0; -} - -/* - * Let's pause a moment. Remember how I said these are called so often? - * Jeremy Fitzhardinge optimized them so hard early in 2009 that he had to - * break some rules. In particular, these functions are assumed to save their - * own registers if they need to: normal C functions assume they can trash the - * eax register. 
To use normal C functions, we use - * PV_CALLEE_SAVE_REGS_THUNK(), which pushes %eax onto the stack, calls the - * C function, then restores it. - */ -PV_CALLEE_SAVE_REGS_THUNK(lguest_save_fl); -PV_CALLEE_SAVE_REGS_THUNK(lguest_irq_disable); -/*:*/ - -/* These are in head_32.S */ -extern void lg_irq_enable(void); -extern void lg_restore_fl(unsigned long flags); - -/*M:003 - * We could be more efficient in our checking of outstanding interrupts, rather - * than using a branch. One way would be to put the "irq_enabled" field in a - * page by itself, and have the Host write-protect it when an interrupt comes - * in when irqs are disabled. There will then be a page fault as soon as - * interrupts are re-enabled. - * - * A better method is to implement soft interrupt disable generally for x86: - * instead of disabling interrupts, we set a flag. If an interrupt does come - * in, we then disable them for real. This is uncommon, so we could simply use - * a hypercall for interrupt control and not worry about efficiency. -:*/ - -/*G:034 - * The Interrupt Descriptor Table (IDT). - * - * The IDT tells the processor what to do when an interrupt comes in. Each - * entry in the table is a 64-bit descriptor: this holds the privilege level, - * address of the handler, and... well, who cares? The Guest just asks the - * Host to make the change anyway, because the Host controls the real IDT. - */ -static void lguest_write_idt_entry(gate_desc *dt, - int entrynum, const gate_desc *g) -{ - /* - * The gate_desc structure is 8 bytes long: we hand it to the Host in - * two 32-bit chunks. The whole 32-bit kernel used to hand descriptors - * around like this; typesafety wasn't a big concern in Linux's early - * years. - */ - u32 *desc = (u32 *)g; - /* Keep the local copy up to date. */ - native_write_idt_entry(dt, entrynum, g); - /* Tell Host about this new entry. */ - hcall(LHCALL_LOAD_IDT_ENTRY, entrynum, desc[0], desc[1], 0); -} - -/* - * Changing to a different IDT is very rare: we keep the IDT up-to-date every - * time it is written, so we can simply loop through all entries and tell the - * Host about them. - */ -static void lguest_load_idt(const struct desc_ptr *desc) -{ - unsigned int i; - struct desc_struct *idt = (void *)desc->address; - - for (i = 0; i < (desc->size+1)/8; i++) - hcall(LHCALL_LOAD_IDT_ENTRY, i, idt[i].a, idt[i].b, 0); -} - -/* - * The Global Descriptor Table. - * - * The Intel architecture defines another table, called the Global Descriptor - * Table (GDT). You tell the CPU where it is (and its size) using the "lgdt" - * instruction, and then several other instructions refer to entries in the - * table. There are three entries which the Switcher needs, so the Host simply - * controls the entire thing and the Guest asks it to make changes using the - * LOAD_GDT hypercall. - * - * This is the exactly like the IDT code. - */ -static void lguest_load_gdt(const struct desc_ptr *desc) -{ - unsigned int i; - struct desc_struct *gdt = (void *)desc->address; - - for (i = 0; i < (desc->size+1)/8; i++) - hcall(LHCALL_LOAD_GDT_ENTRY, i, gdt[i].a, gdt[i].b, 0); -} - -/* - * For a single GDT entry which changes, we simply change our copy and - * then tell the host about it. - */ -static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum, - const void *desc, int type) -{ - native_write_gdt_entry(dt, entrynum, desc, type); - /* Tell Host about this new entry. 
*/ - hcall(LHCALL_LOAD_GDT_ENTRY, entrynum, - dt[entrynum].a, dt[entrynum].b, 0); -} - -/* - * There are three "thread local storage" GDT entries which change - * on every context switch (these three entries are how glibc implements - * __thread variables). As an optimization, we have a hypercall - * specifically for this case. - * - * Wouldn't it be nicer to have a general LOAD_GDT_ENTRIES hypercall - * which took a range of entries? - */ -static void lguest_load_tls(struct thread_struct *t, unsigned int cpu) -{ - /* - * There's one problem which normal hardware doesn't have: the Host - * can't handle us removing entries we're currently using. So we clear - * the GS register here: if it's needed it'll be reloaded anyway. - */ - lazy_load_gs(0); - lazy_hcall2(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu); -} - -/*G:038 - * That's enough excitement for now, back to ploughing through each of the - * different pv_ops structures (we're about 1/3 of the way through). - * - * This is the Local Descriptor Table, another weird Intel thingy. Linux only - * uses this for some strange applications like Wine. We don't do anything - * here, so they'll get an informative and friendly Segmentation Fault. - */ -static void lguest_set_ldt(const void *addr, unsigned entries) -{ -} - -/* - * This loads a GDT entry into the "Task Register": that entry points to a - * structure called the Task State Segment. Some comments scattered though the - * kernel code indicate that this used for task switching in ages past, along - * with blood sacrifice and astrology. - * - * Now there's nothing interesting in here that we don't get told elsewhere. - * But the native version uses the "ltr" instruction, which makes the Host - * complain to the Guest about a Segmentation Fault and it'll oops. So we - * override the native version with a do-nothing version. - */ -static void lguest_load_tr_desc(void) -{ -} - -/* - * The "cpuid" instruction is a way of querying both the CPU identity - * (manufacturer, model, etc) and its features. It was introduced before the - * Pentium in 1993 and keeps getting extended by both Intel, AMD and others. - * As you might imagine, after a decade and a half this treatment, it is now a - * giant ball of hair. Its entry in the current Intel manual runs to 28 pages. - * - * This instruction even it has its own Wikipedia entry. The Wikipedia entry - * has been translated into 6 languages. I am not making this up! - * - * We could get funky here and identify ourselves as "GenuineLguest", but - * instead we just use the real "cpuid" instruction. Then I pretty much turned - * off feature bits until the Guest booted. (Don't say that: you'll damage - * lguest sales!) Shut up, inner voice! (Hey, just pointing out that this is - * hardly future proof.) No one's listening! They don't like you anyway, - * parenthetic weirdo! - * - * Replacing the cpuid so we can turn features off is great for the kernel, but - * anyone (including userspace) can just use the raw "cpuid" instruction and - * the Host won't even notice since it isn't privileged. So we try not to get - * too worked up about it. - */ -static void lguest_cpuid(unsigned int *ax, unsigned int *bx, - unsigned int *cx, unsigned int *dx) -{ - int function = *ax; - - native_cpuid(ax, bx, cx, dx); - switch (function) { - /* - * CPUID 0 gives the highest legal CPUID number (and the ID string). - * We futureproof our code a little by sticking to known CPUID values. 
- */ - case 0: - if (*ax > 5) - *ax = 5; - break; - - /* - * CPUID 1 is a basic feature request. - * - * CX: we only allow kernel to see SSE3, CMPXCHG16B and SSSE3 - * DX: SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU and PAE. - */ - case 1: - *cx &= 0x00002201; - *dx &= 0x07808151; - /* - * The Host can do a nice optimization if it knows that the - * kernel mappings (addresses above 0xC0000000 or whatever - * PAGE_OFFSET is set to) haven't changed. But Linux calls - * flush_tlb_user() for both user and kernel mappings unless - * the Page Global Enable (PGE) feature bit is set. - */ - *dx |= 0x00002000; - /* - * We also lie, and say we're family id 5. 6 or greater - * leads to a rdmsr in early_init_intel which we can't handle. - * Family ID is returned as bits 8-12 in ax. - */ - *ax &= 0xFFFFF0FF; - *ax |= 0x00000500; - break; - - /* - * This is used to detect if we're running under KVM. We might be, - * but that's a Host matter, not us. So say we're not. - */ - case KVM_CPUID_SIGNATURE: - *bx = *cx = *dx = 0; - break; - - /* - * 0x80000000 returns the highest Extended Function, so we futureproof - * like we do above by limiting it to known fields. - */ - case 0x80000000: - if (*ax > 0x80000008) - *ax = 0x80000008; - break; - - /* - * PAE systems can mark pages as non-executable. Linux calls this the - * NX bit. Intel calls it XD (eXecute Disable), AMD EVP (Enhanced - * Virus Protection). We just switch it off here, since we don't - * support it. - */ - case 0x80000001: - *dx &= ~(1 << 20); - break; - } -} - -/* - * Intel has four control registers, imaginatively named cr0, cr2, cr3 and cr4. - * I assume there's a cr1, but it hasn't bothered us yet, so we'll not bother - * it. The Host needs to know when the Guest wants to change them, so we have - * a whole series of functions like read_cr0() and write_cr0(). - * - * We start with cr0. cr0 allows you to turn on and off all kinds of basic - * features, but the only cr0 bit that Linux ever used at runtime was the - * horrifically-named Task Switched (TS) bit at bit 3 (ie. 8) - * - * What does the TS bit do? Well, it causes the CPU to trap (interrupt 7) if - * the floating point unit is used. Which allows us to restore FPU state - * lazily after a task switch if we wanted to, but wouldn't a name like - * "FPUTRAP bit" be a little less cryptic? - * - * Fortunately, Linux keeps it simple and doesn't use TS, so we can ignore - * cr0. - */ -static void lguest_write_cr0(unsigned long val) -{ -} - -static unsigned long lguest_read_cr0(void) -{ - return 0; -} - -/* - * cr2 is the virtual address of the last page fault, which the Guest only ever - * reads. The Host kindly writes this into our "struct lguest_data", so we - * just read it out of there. - */ -static unsigned long lguest_read_cr2(void) -{ - return lguest_data.cr2; -} - -/* See lguest_set_pte() below. */ -static bool cr3_changed = false; -static unsigned long current_cr3; - -/* - * cr3 is the current toplevel pagetable page: the principle is the same as - * cr0. Keep a local copy, and tell the Host when it changes. - */ -static void lguest_write_cr3(unsigned long cr3) -{ - lazy_hcall1(LHCALL_NEW_PGTABLE, cr3); - current_cr3 = cr3; - - /* These two page tables are simple, linear, and used during boot */ - if (cr3 != __pa_symbol(swapper_pg_dir) && - cr3 != __pa_symbol(initial_page_table)) - cr3_changed = true; -} - -static unsigned long lguest_read_cr3(void) -{ - return current_cr3; -} - -/* cr4 is used to enable and disable PGE, but we don't care. 
*/ -static unsigned long lguest_read_cr4(void) -{ - return 0; -} - -static void lguest_write_cr4(unsigned long val) -{ -} - -/* - * Page Table Handling. - * - * Now would be a good time to take a rest and grab a coffee or similarly - * relaxing stimulant. The easy parts are behind us, and the trek gradually - * winds uphill from here. - * - * Quick refresher: memory is divided into "pages" of 4096 bytes each. The CPU - * maps virtual addresses to physical addresses using "page tables". We could - * use one huge index of 1 million entries: each address is 4 bytes, so that's - * 1024 pages just to hold the page tables. But since most virtual addresses - * are unused, we use a two level index which saves space. The cr3 register - * contains the physical address of the top level "page directory" page, which - * contains physical addresses of up to 1024 second-level pages. Each of these - * second level pages contains up to 1024 physical addresses of actual pages, - * or Page Table Entries (PTEs). - * - * Here's a diagram, where arrows indicate physical addresses: - * - * cr3 ---> +---------+ - * | --------->+---------+ - * | | | PADDR1 | - * Mid-level | | PADDR2 | - * (PMD) page | | | - * | | Lower-level | - * | | (PTE) page | - * | | | | - * .... .... - * - * So to convert a virtual address to a physical address, we look up the top - * level, which points us to the second level, which gives us the physical - * address of that page. If the top level entry was not present, or the second - * level entry was not present, then the virtual address is invalid (we - * say "the page was not mapped"). - * - * Put another way, a 32-bit virtual address is divided up like so: - * - * 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 - * |<---- 10 bits ---->|<---- 10 bits ---->|<------ 12 bits ------>| - * Index into top Index into second Offset within page - * page directory page pagetable page - * - * Now, unfortunately, this isn't the whole story: Intel added Physical Address - * Extension (PAE) to allow 32 bit systems to use 64GB of memory (ie. 36 bits). - * These are held in 64-bit page table entries, so we can now only fit 512 - * entries in a page, and the neat three-level tree breaks down. - * - * The result is a four level page table: - * - * cr3 --> [ 4 Upper ] - * [ Level ] - * [ Entries ] - * [(PUD Page)]---> +---------+ - * | --------->+---------+ - * | | | PADDR1 | - * Mid-level | | PADDR2 | - * (PMD) page | | | - * | | Lower-level | - * | | (PTE) page | - * | | | | - * .... .... - * - * - * And the virtual address is decoded as: - * - * 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 - * |<-2->|<--- 9 bits ---->|<---- 9 bits --->|<------ 12 bits ------>| - * Index into Index into mid Index into lower Offset within page - * top entries directory page pagetable page - * - * It's too hard to switch between these two formats at runtime, so Linux only - * supports one or the other depending on whether CONFIG_X86_PAE is set. Many - * distributions turn it on, and not just for people with silly amounts of - * memory: the larger PTE entries allow room for the NX bit, which lets the - * kernel disable execution of pages and increase security. - * - * This was a problem for lguest, which couldn't run on these distributions; - * then Matias Zabaljauregui figured it all out and implemented it, and only a - * handful of puppies were crushed in the process! 
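
[Editor's note] As a concrete instance of the 10/10/12 split drawn above (non-PAE case): the top 10 bits of a 32-bit virtual address index the page directory, the next 10 bits index the page table, and the low 12 bits are the offset within the page. A small standalone worked sketch (the address 0xC0100234 is just an example):

	/* Sketch: decode a 32-bit, non-PAE virtual address into its table indices. */
	#include <stdio.h>

	int main(void)
	{
		unsigned int vaddr   = 0xC0100234;		/* example kernel address */
		unsigned int pgd_idx = vaddr >> 22;		/* top 10 bits    */
		unsigned int pte_idx = (vaddr >> 12) & 0x3FF;	/* middle 10 bits */
		unsigned int offset  = vaddr & 0xFFF;		/* low 12 bits    */

		printf("pgd=%u pte=%u offset=0x%x\n", pgd_idx, pte_idx, offset);
		return 0;
	}

For 0xC0100234 this prints pgd=768 pte=256 offset=0x234, i.e. entry 768 of the page directory (the first entry at or above PAGE_OFFSET = 0xC0000000), entry 256 of that page table, byte 0x234 within the page.
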
- * - * Back to our point: the kernel spends a lot of time changing both the - * top-level page directory and lower-level pagetable pages. The Guest doesn't - * know physical addresses, so while it maintains these page tables exactly - * like normal, it also needs to keep the Host informed whenever it makes a - * change: the Host will create the real page tables based on the Guests'. - */ - -/* - * The Guest calls this after it has set a second-level entry (pte), ie. to map - * a page into a process' address space. We tell the Host the toplevel and - * address this corresponds to. The Guest uses one pagetable per process, so - * we need to tell the Host which one we're changing (mm->pgd). - */ -static void lguest_pte_update(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) -{ -#ifdef CONFIG_X86_PAE - /* PAE needs to hand a 64 bit page table entry, so it uses two args. */ - lazy_hcall4(LHCALL_SET_PTE, __pa(mm->pgd), addr, - ptep->pte_low, ptep->pte_high); -#else - lazy_hcall3(LHCALL_SET_PTE, __pa(mm->pgd), addr, ptep->pte_low); -#endif -} - -/* This is the "set and update" combo-meal-deal version. */ -static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pteval) -{ - native_set_pte(ptep, pteval); - lguest_pte_update(mm, addr, ptep); -} - -/* - * The Guest calls lguest_set_pud to set a top-level entry and lguest_set_pmd - * to set a middle-level entry when PAE is activated. - * - * Again, we set the entry then tell the Host which page we changed, - * and the index of the entry we changed. - */ -#ifdef CONFIG_X86_PAE -static void lguest_set_pud(pud_t *pudp, pud_t pudval) -{ - native_set_pud(pudp, pudval); - - /* 32 bytes aligned pdpt address and the index. */ - lazy_hcall2(LHCALL_SET_PGD, __pa(pudp) & 0xFFFFFFE0, - (__pa(pudp) & 0x1F) / sizeof(pud_t)); -} - -static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) -{ - native_set_pmd(pmdp, pmdval); - lazy_hcall2(LHCALL_SET_PMD, __pa(pmdp) & PAGE_MASK, - (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t)); -} -#else - -/* The Guest calls lguest_set_pmd to set a top-level entry when !PAE. */ -static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) -{ - native_set_pmd(pmdp, pmdval); - lazy_hcall2(LHCALL_SET_PGD, __pa(pmdp) & PAGE_MASK, - (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t)); -} -#endif - -/* - * There are a couple of legacy places where the kernel sets a PTE, but we - * don't know the top level any more. This is useless for us, since we don't - * know which pagetable is changing or what address, so we just tell the Host - * to forget all of them. Fortunately, this is very rare. - * - * ... except in early boot when the kernel sets up the initial pagetables, - * which makes booting astonishingly slow: 48 seconds! So we don't even tell - * the Host anything changed until we've done the first real page table switch, - * which brings boot back to 4.3 seconds. - */ -static void lguest_set_pte(pte_t *ptep, pte_t pteval) -{ - native_set_pte(ptep, pteval); - if (cr3_changed) - lazy_hcall1(LHCALL_FLUSH_TLB, 1); -} - -#ifdef CONFIG_X86_PAE -/* - * With 64-bit PTE values, we need to be careful setting them: if we set 32 - * bits at a time, the hardware could see a weird half-set entry. These - * versions ensure we update all 64 bits at once. 
- */ -static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte) -{ - native_set_pte_atomic(ptep, pte); - if (cr3_changed) - lazy_hcall1(LHCALL_FLUSH_TLB, 1); -} - -static void lguest_pte_clear(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) -{ - native_pte_clear(mm, addr, ptep); - lguest_pte_update(mm, addr, ptep); -} - -static void lguest_pmd_clear(pmd_t *pmdp) -{ - lguest_set_pmd(pmdp, __pmd(0)); -} -#endif - -/* - * Unfortunately for Lguest, the pv_mmu_ops for page tables were based on - * native page table operations. On native hardware you can set a new page - * table entry whenever you want, but if you want to remove one you have to do - * a TLB flush (a TLB is a little cache of page table entries kept by the CPU). - * - * So the lguest_set_pte_at() and lguest_set_pmd() functions above are only - * called when a valid entry is written, not when it's removed (ie. marked not - * present). Instead, this is where we come when the Guest wants to remove a - * page table entry: we tell the Host to set that entry to 0 (ie. the present - * bit is zero). - */ -static void lguest_flush_tlb_single(unsigned long addr) -{ - /* Simply set it to zero: if it was not, it will fault back in. */ - lazy_hcall3(LHCALL_SET_PTE, current_cr3, addr, 0); -} - -/* - * This is what happens after the Guest has removed a large number of entries. - * This tells the Host that any of the page table entries for userspace might - * have changed, ie. virtual addresses below PAGE_OFFSET. - */ -static void lguest_flush_tlb_user(void) -{ - lazy_hcall1(LHCALL_FLUSH_TLB, 0); -} - -/* - * This is called when the kernel page tables have changed. That's not very - * common (unless the Guest is using highmem, which makes the Guest extremely - * slow), so it's worth separating this from the user flushing above. - */ -static void lguest_flush_tlb_kernel(void) -{ - lazy_hcall1(LHCALL_FLUSH_TLB, 1); -} - -/* - * The Unadvanced Programmable Interrupt Controller. - * - * This is an attempt to implement the simplest possible interrupt controller. - * I spent some time looking though routines like set_irq_chip_and_handler, - * set_irq_chip_and_handler_name, set_irq_chip_data and set_phasers_to_stun and - * I *think* this is as simple as it gets. - * - * We can tell the Host what interrupts we want blocked ready for using the - * lguest_data.interrupts bitmap, so disabling (aka "masking") them is as - * simple as setting a bit. We don't actually "ack" interrupts as such, we - * just mask and unmask them. I wonder if we should be cleverer? - */ -static void disable_lguest_irq(struct irq_data *data) -{ - set_bit(data->irq, lguest_data.blocked_interrupts); -} - -static void enable_lguest_irq(struct irq_data *data) -{ - clear_bit(data->irq, lguest_data.blocked_interrupts); -} - -/* This structure describes the lguest IRQ controller. */ -static struct irq_chip lguest_irq_controller = { - .name = "lguest", - .irq_mask = disable_lguest_irq, - .irq_mask_ack = disable_lguest_irq, - .irq_unmask = enable_lguest_irq, -}; - -/* - * Interrupt descriptors are allocated as-needed, but low-numbered ones are - * reserved by the generic x86 code. So we ignore irq_alloc_desc_at if it - * tells us the irq is already used: other errors (ie. ENOMEM) we take - * seriously. - */ -static int lguest_setup_irq(unsigned int irq) -{ - struct irq_desc *desc; - int err; - - /* Returns -ve error or vector number. 
*/ - err = irq_alloc_desc_at(irq, 0); - if (err < 0 && err != -EEXIST) - return err; - - /* - * Tell the Linux infrastructure that the interrupt is - * controlled by our level-based lguest interrupt controller. - */ - irq_set_chip_and_handler_name(irq, &lguest_irq_controller, - handle_level_irq, "level"); - - /* Some systems map "vectors" to interrupts weirdly. Not us! */ - desc = irq_to_desc(irq); - __this_cpu_write(vector_irq[FIRST_EXTERNAL_VECTOR + irq], desc); - return 0; -} - -static int lguest_enable_irq(struct pci_dev *dev) -{ - int err; - u8 line = 0; - - /* We literally use the PCI interrupt line as the irq number. */ - pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &line); - err = lguest_setup_irq(line); - if (!err) - dev->irq = line; - return err; -} - -/* We don't do hotplug PCI, so this shouldn't be called. */ -static void lguest_disable_irq(struct pci_dev *dev) -{ - WARN_ON(1); -} - -/* - * This sets up the Interrupt Descriptor Table (IDT) entry for each hardware - * interrupt (except 128, which is used for system calls). - */ -static void __init lguest_init_IRQ(void) -{ - unsigned int i; - - for (i = FIRST_EXTERNAL_VECTOR; i < FIRST_SYSTEM_VECTOR; i++) { - if (i != IA32_SYSCALL_VECTOR) - set_intr_gate(i, irq_entries_start + - 8 * (i - FIRST_EXTERNAL_VECTOR)); - } - - /* - * This call is required to set up for 4k stacks, where we have - * separate stacks for hard and soft interrupts. - */ - irq_ctx_init(smp_processor_id()); -} - -/* - * Time. - * - * It would be far better for everyone if the Guest had its own clock, but - * until then the Host gives us the time on every interrupt. - */ -static void lguest_get_wallclock(struct timespec *now) -{ - *now = lguest_data.time; -} - -/* - * The TSC is an Intel thing called the Time Stamp Counter. The Host tells us - * what speed it runs at, or 0 if it's unusable as a reliable clock source. - * This matches what we want here: if we return 0 from this function, the x86 - * TSC clock will give up and not register itself. - */ -static unsigned long lguest_tsc_khz(void) -{ - return lguest_data.tsc_khz; -} - -/* - * If we can't use the TSC, the kernel falls back to our lower-priority - * "lguest_clock", where we read the time value given to us by the Host. - */ -static u64 lguest_clock_read(struct clocksource *cs) -{ - unsigned long sec, nsec; - - /* - * Since the time is in two parts (seconds and nanoseconds), we risk - * reading it just as it's changing from 99 & 0.999999999 to 100 and 0, - * and getting 99 and 0. As Linux tends to come apart under the stress - * of time travel, we must be careful: - */ - do { - /* First we read the seconds part. */ - sec = lguest_data.time.tv_sec; - /* - * This read memory barrier tells the compiler and the CPU that - * this can't be reordered: we have to complete the above - * before going on. - */ - rmb(); - /* Now we read the nanoseconds part. */ - nsec = lguest_data.time.tv_nsec; - /* Make sure we've done that. */ - rmb(); - /* Now if the seconds part has changed, try again. */ - } while (unlikely(lguest_data.time.tv_sec != sec)); - - /* Our lguest clock is in real nanoseconds. */ - return sec*1000000000ULL + nsec; -} - -/* This is the fallback clocksource: lower priority than the TSC clocksource. */ -static struct clocksource lguest_clock = { - .name = "lguest", - .rating = 200, - .read = lguest_clock_read, - .mask = CLOCKSOURCE_MASK(64), - .flags = CLOCK_SOURCE_IS_CONTINUOUS, -}; - -/* - * We also need a "struct clock_event_device": Linux asks us to set it to go - * off some time in the future. 
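[Editor's aside, not part of the patch: a hedged userspace sketch of the consistent-read retry loop used by lguest_clock_read() above. The struct and names are invented; the acquire fences play the role of the rmb() calls, and like the original the loop simply retries until the seconds value is stable across reading the nanoseconds.]

#include <stdint.h>

/* Hypothetical two-word clock shared with a host, as in lguest_data.time. */
struct split_time {
        volatile uint64_t sec;
        volatile uint32_t nsec;
};

/* Re-read until 'sec' is unchanged, so sec and nsec belong to the same tick. */
static uint64_t read_clock_ns(const struct split_time *t)
{
        uint64_t sec;
        uint32_t nsec;

        do {
                sec = t->sec;
                __atomic_thread_fence(__ATOMIC_ACQUIRE);
                nsec = t->nsec;
                __atomic_thread_fence(__ATOMIC_ACQUIRE);
        } while (t->sec != sec);

        return sec * 1000000000ULL + nsec;
}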
Actually, James Morris figured all this out, I - * just applied the patch. - */ -static int lguest_clockevent_set_next_event(unsigned long delta, - struct clock_event_device *evt) -{ - /* FIXME: I don't think this can ever happen, but James tells me he had - * to put this code in. Maybe we should remove it now. Anyone? */ - if (delta < LG_CLOCK_MIN_DELTA) { - if (printk_ratelimit()) - printk(KERN_DEBUG "%s: small delta %lu ns\n", - __func__, delta); - return -ETIME; - } - - /* Please wake us this far in the future. */ - hcall(LHCALL_SET_CLOCKEVENT, delta, 0, 0, 0); - return 0; -} - -static int lguest_clockevent_shutdown(struct clock_event_device *evt) -{ - /* A 0 argument shuts the clock down. */ - hcall(LHCALL_SET_CLOCKEVENT, 0, 0, 0, 0); - return 0; -} - -/* This describes our primitive timer chip. */ -static struct clock_event_device lguest_clockevent = { - .name = "lguest", - .features = CLOCK_EVT_FEAT_ONESHOT, - .set_next_event = lguest_clockevent_set_next_event, - .set_state_shutdown = lguest_clockevent_shutdown, - .rating = INT_MAX, - .mult = 1, - .shift = 0, - .min_delta_ns = LG_CLOCK_MIN_DELTA, - .min_delta_ticks = LG_CLOCK_MIN_DELTA, - .max_delta_ns = LG_CLOCK_MAX_DELTA, - .max_delta_ticks = LG_CLOCK_MAX_DELTA, -}; - -/* - * This is the Guest timer interrupt handler (hardware interrupt 0). We just - * call the clockevent infrastructure and it does whatever needs doing. - */ -static void lguest_time_irq(struct irq_desc *desc) -{ - unsigned long flags; - - /* Don't interrupt us while this is running. */ - local_irq_save(flags); - lguest_clockevent.event_handler(&lguest_clockevent); - local_irq_restore(flags); -} - -/* - * At some point in the boot process, we get asked to set up our timing - * infrastructure. The kernel doesn't expect timer interrupts before this, but - * we cleverly initialized the "blocked_interrupts" field of "struct - * lguest_data" so that timer interrupts were blocked until now. - */ -static void lguest_time_init(void) -{ - /* Set up the timer interrupt (0) to go to our simple timer routine */ - if (lguest_setup_irq(0) != 0) - panic("Could not set up timer irq"); - irq_set_handler(0, lguest_time_irq); - - clocksource_register_hz(&lguest_clock, NSEC_PER_SEC); - - /* We can't set cpumask in the initializer: damn C limitations! Set it - * here and register our timer device. */ - lguest_clockevent.cpumask = cpumask_of(0); - clockevents_register_device(&lguest_clockevent); - - /* Finally, we unblock the timer interrupt. */ - clear_bit(0, lguest_data.blocked_interrupts); -} - -/* - * Miscellaneous bits and pieces. - * - * Here is an oddball collection of functions which the Guest needs for things - * to work. They're pretty simple. - */ - -/* - * The Guest needs to tell the Host what stack it expects traps to use. For - * native hardware, this is part of the Task State Segment mentioned above in - * lguest_load_tr_desc(), but to help hypervisors there's this special call. - * - * We tell the Host the segment we want to use (__KERNEL_DS is the kernel data - * segment), the privilege level (we're privilege level 1, the Host is 0 and - * will not tolerate us trying to use that), the stack pointer, and the number - * of pages in the stack. - */ -static void lguest_load_sp0(struct tss_struct *tss, - struct thread_struct *thread) -{ - lazy_hcall3(LHCALL_SET_STACK, __KERNEL_DS | 0x1, thread->sp0, - THREAD_SIZE / PAGE_SIZE); - tss->x86_tss.sp0 = thread->sp0; -} - -/* Let's just say, I wouldn't do debugging under a Guest. 
*/ -static unsigned long lguest_get_debugreg(int regno) -{ - /* FIXME: Implement */ - return 0; -} - -static void lguest_set_debugreg(int regno, unsigned long value) -{ - /* FIXME: Implement */ -} - -/* - * There are times when the kernel wants to make sure that no memory writes are - * caught in the cache (that they've all reached real hardware devices). This - * doesn't matter for the Guest which has virtual hardware. - * - * On the Pentium 4 and above, cpuid() indicates that the Cache Line Flush - * (clflush) instruction is available and the kernel uses that. Otherwise, it - * uses the older "Write Back and Invalidate Cache" (wbinvd) instruction. - * Unlike clflush, wbinvd can only be run at privilege level 0. So we can - * ignore clflush, but replace wbinvd. - */ -static void lguest_wbinvd(void) -{ -} - -/* - * If the Guest expects to have an Advanced Programmable Interrupt Controller, - * we play dumb by ignoring writes and returning 0 for reads. So it's no - * longer Programmable nor Controlling anything, and I don't think 8 lines of - * code qualifies for Advanced. It will also never interrupt anything. It - * does, however, allow us to get through the Linux boot code. - */ -#ifdef CONFIG_X86_LOCAL_APIC -static void lguest_apic_write(u32 reg, u32 v) -{ -} - -static u32 lguest_apic_read(u32 reg) -{ - return 0; -} - -static u64 lguest_apic_icr_read(void) -{ - return 0; -} - -static void lguest_apic_icr_write(u32 low, u32 id) -{ - /* Warn to see if there's any stray references */ - WARN_ON(1); -} - -static void lguest_apic_wait_icr_idle(void) -{ - return; -} - -static u32 lguest_apic_safe_wait_icr_idle(void) -{ - return 0; -} - -static void set_lguest_basic_apic_ops(void) -{ - apic->read = lguest_apic_read; - apic->write = lguest_apic_write; - apic->icr_read = lguest_apic_icr_read; - apic->icr_write = lguest_apic_icr_write; - apic->wait_icr_idle = lguest_apic_wait_icr_idle; - apic->safe_wait_icr_idle = lguest_apic_safe_wait_icr_idle; -}; -#endif - -/* STOP! Until an interrupt comes in. */ -static void lguest_safe_halt(void) -{ - hcall(LHCALL_HALT, 0, 0, 0, 0); -} - -/* - * The SHUTDOWN hypercall takes a string to describe what's happening, and - * an argument which says whether this to restart (reboot) the Guest or not. - * - * Note that the Host always prefers that the Guest speak in physical addresses - * rather than virtual addresses, so we use __pa() here. - */ -static void lguest_power_off(void) -{ - hcall(LHCALL_SHUTDOWN, __pa("Power down"), - LGUEST_SHUTDOWN_POWEROFF, 0, 0); -} - -/* - * Panicing. - * - * Don't. But if you did, this is what happens. - */ -static int lguest_panic(struct notifier_block *nb, unsigned long l, void *p) -{ - hcall(LHCALL_SHUTDOWN, __pa(p), LGUEST_SHUTDOWN_POWEROFF, 0, 0); - /* The hcall won't return, but to keep gcc happy, we're "done". */ - return NOTIFY_DONE; -} - -static struct notifier_block paniced = { - .notifier_call = lguest_panic -}; - -/* Setting up memory is fairly easy. */ -static __init char *lguest_memory_setup(void) -{ - /* - * The Linux bootloader header contains an "e820" memory map: the - * Launcher populated the first entry with our memory limit. - */ - e820__range_add(boot_params.e820_table[0].addr, - boot_params.e820_table[0].size, - boot_params.e820_table[0].type); - - /* This string is for the boot messages. */ - return "LGUEST"; -} - -/* Offset within PCI config space of BAR access capability. 
*/ -static int console_cfg_offset = 0; -static int console_access_cap; - -/* Set up so that we access off in bar0 (on bus 0, device 1, function 0) */ -static void set_cfg_window(u32 cfg_offset, u32 off) -{ - write_pci_config_byte(0, 1, 0, - cfg_offset + offsetof(struct virtio_pci_cap, bar), - 0); - write_pci_config(0, 1, 0, - cfg_offset + offsetof(struct virtio_pci_cap, length), - 4); - write_pci_config(0, 1, 0, - cfg_offset + offsetof(struct virtio_pci_cap, offset), - off); -} - -static void write_bar_via_cfg(u32 cfg_offset, u32 off, u32 val) -{ - /* - * We could set this up once, then leave it; nothing else in the * - * kernel should touch these registers. But if it went wrong, that - * would be a horrible bug to find. - */ - set_cfg_window(cfg_offset, off); - write_pci_config(0, 1, 0, - cfg_offset + sizeof(struct virtio_pci_cap), val); -} - -static void probe_pci_console(void) -{ - u8 cap, common_cap = 0, device_cap = 0; - u32 device_len; - - /* Avoid recursive printk into here. */ - console_cfg_offset = -1; - - if (!early_pci_allowed()) { - printk(KERN_ERR "lguest: early PCI access not allowed!\n"); - return; - } - - /* We expect a console PCI device at BUS0, slot 1. */ - if (read_pci_config(0, 1, 0, 0) != 0x10431AF4) { - printk(KERN_ERR "lguest: PCI device is %#x!\n", - read_pci_config(0, 1, 0, 0)); - return; - } - - /* Find the capabilities we need (must be in bar0) */ - cap = read_pci_config_byte(0, 1, 0, PCI_CAPABILITY_LIST); - while (cap) { - u8 vndr = read_pci_config_byte(0, 1, 0, cap); - if (vndr == PCI_CAP_ID_VNDR) { - u8 type, bar; - - type = read_pci_config_byte(0, 1, 0, - cap + offsetof(struct virtio_pci_cap, cfg_type)); - bar = read_pci_config_byte(0, 1, 0, - cap + offsetof(struct virtio_pci_cap, bar)); - - switch (type) { - case VIRTIO_PCI_CAP_DEVICE_CFG: - if (bar == 0) - device_cap = cap; - break; - case VIRTIO_PCI_CAP_PCI_CFG: - console_access_cap = cap; - break; - } - } - cap = read_pci_config_byte(0, 1, 0, cap + PCI_CAP_LIST_NEXT); - } - if (!device_cap || !console_access_cap) { - printk(KERN_ERR "lguest: No caps (%u/%u/%u) in console!\n", - common_cap, device_cap, console_access_cap); - return; - } - - /* - * Note that we can't check features, until we've set the DRIVER - * status bit. We don't want to do that until we have a real driver, - * so we just check that the device-specific config has room for - * emerg_wr. If it doesn't support VIRTIO_CONSOLE_F_EMERG_WRITE - * it should ignore the access. - */ - device_len = read_pci_config(0, 1, 0, - device_cap + offsetof(struct virtio_pci_cap, length)); - if (device_len < (offsetof(struct virtio_console_config, emerg_wr) - + sizeof(u32))) { - printk(KERN_ERR "lguest: console missing emerg_wr field\n"); - return; - } - - console_cfg_offset = read_pci_config(0, 1, 0, - device_cap + offsetof(struct virtio_pci_cap, offset)); - printk(KERN_INFO "lguest: Console via virtio-pci emerg_wr\n"); -} - -/* - * We will eventually use the virtio console device to produce console output, - * but before that is set up we use the virtio PCI console's backdoor mmio - * access and the "emergency" write facility (which is legal even before the - * device is configured). - */ -static __init int early_put_chars(u32 vtermno, const char *buf, int count) -{ - /* If we couldn't find PCI console, forget it. 
*/ - if (console_cfg_offset < 0) - return count; - - if (unlikely(!console_cfg_offset)) { - probe_pci_console(); - if (console_cfg_offset < 0) - return count; - } - - write_bar_via_cfg(console_access_cap, - console_cfg_offset - + offsetof(struct virtio_console_config, emerg_wr), - buf[0]); - return 1; -} - -/* - * Rebooting also tells the Host we're finished, but the RESTART flag tells the - * Launcher to reboot us. - */ -static void lguest_restart(char *reason) -{ - hcall(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART, 0, 0); -} - -/*G:050 - * Patching (Powerfully Placating Performance Pedants) - * - * We have already seen that pv_ops structures let us replace simple native - * instructions with calls to the appropriate back end all throughout the - * kernel. This allows the same kernel to run as a Guest and as a native - * kernel, but it's slow because of all the indirect branches. - * - * Remember that David Wheeler quote about "Any problem in computer science can - * be solved with another layer of indirection"? The rest of that quote is - * "... But that usually will create another problem." This is the first of - * those problems. - * - * Our current solution is to allow the paravirt back end to optionally patch - * over the indirect calls to replace them with something more efficient. We - * patch two of the simplest of the most commonly called functions: disable - * interrupts and save interrupts. We usually have 6 or 10 bytes to patch - * into: the Guest versions of these operations are small enough that we can - * fit comfortably. - * - * First we need assembly templates of each of the patchable Guest operations, - * and these are in head_32.S. - */ - -/*G:060 We construct a table from the assembler templates: */ -static const struct lguest_insns -{ - const char *start, *end; -} lguest_insns[] = { - [PARAVIRT_PATCH(pv_irq_ops.irq_disable)] = { lgstart_cli, lgend_cli }, - [PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf }, -}; - -/* - * Now our patch routine is fairly simple (based on the native one in - * paravirt.c). If we have a replacement, we copy it in and return how much of - * the available space we used. - */ -static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf, - unsigned long addr, unsigned len) -{ - unsigned int insn_len; - - /* Don't do anything special if we don't have a replacement */ - if (type >= ARRAY_SIZE(lguest_insns) || !lguest_insns[type].start) - return paravirt_patch_default(type, clobber, ibuf, addr, len); - - insn_len = lguest_insns[type].end - lguest_insns[type].start; - - /* Similarly if it can't fit (doesn't happen, but let's be thorough). */ - if (len < insn_len) - return paravirt_patch_default(type, clobber, ibuf, addr, len); - - /* Copy in our instructions. */ - memcpy(ibuf, lguest_insns[type].start, insn_len); - return insn_len; -} - -/*G:029 - * Once we get to lguest_init(), we know we're a Guest. The various - * pv_ops structures in the kernel provide points for (almost) every routine we - * have to override to avoid privileged instructions. - */ -__init void lguest_init(void) -{ - /* We're under lguest. */ - pv_info.name = "lguest"; - /* We're running at privilege level 1, not 0 as normal. */ - pv_info.kernel_rpl = 1; - /* Everyone except Xen runs with this set. */ - pv_info.shared_kernel_pmd = 1; - - /* - * We set up all the lguest overrides for sensitive operations. These - * are detailed with the operations themselves. 
- */ - - /* Interrupt-related operations */ - pv_irq_ops.save_fl = PV_CALLEE_SAVE(lguest_save_fl); - pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl); - pv_irq_ops.irq_disable = PV_CALLEE_SAVE(lguest_irq_disable); - pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(lg_irq_enable); - pv_irq_ops.safe_halt = lguest_safe_halt; - - /* Setup operations */ - pv_init_ops.patch = lguest_patch; - - /* Intercepts of various CPU instructions */ - pv_cpu_ops.load_gdt = lguest_load_gdt; - pv_cpu_ops.cpuid = lguest_cpuid; - pv_cpu_ops.load_idt = lguest_load_idt; - pv_cpu_ops.iret = lguest_iret; - pv_cpu_ops.load_sp0 = lguest_load_sp0; - pv_cpu_ops.load_tr_desc = lguest_load_tr_desc; - pv_cpu_ops.set_ldt = lguest_set_ldt; - pv_cpu_ops.load_tls = lguest_load_tls; - pv_cpu_ops.get_debugreg = lguest_get_debugreg; - pv_cpu_ops.set_debugreg = lguest_set_debugreg; - pv_cpu_ops.read_cr0 = lguest_read_cr0; - pv_cpu_ops.write_cr0 = lguest_write_cr0; - pv_cpu_ops.read_cr4 = lguest_read_cr4; - pv_cpu_ops.write_cr4 = lguest_write_cr4; - pv_cpu_ops.write_gdt_entry = lguest_write_gdt_entry; - pv_cpu_ops.write_idt_entry = lguest_write_idt_entry; - pv_cpu_ops.wbinvd = lguest_wbinvd; - pv_cpu_ops.start_context_switch = paravirt_start_context_switch; - pv_cpu_ops.end_context_switch = lguest_end_context_switch; - - /* Pagetable management */ - pv_mmu_ops.write_cr3 = lguest_write_cr3; - pv_mmu_ops.flush_tlb_user = lguest_flush_tlb_user; - pv_mmu_ops.flush_tlb_single = lguest_flush_tlb_single; - pv_mmu_ops.flush_tlb_kernel = lguest_flush_tlb_kernel; - pv_mmu_ops.set_pte = lguest_set_pte; - pv_mmu_ops.set_pte_at = lguest_set_pte_at; - pv_mmu_ops.set_pmd = lguest_set_pmd; -#ifdef CONFIG_X86_PAE - pv_mmu_ops.set_pte_atomic = lguest_set_pte_atomic; - pv_mmu_ops.pte_clear = lguest_pte_clear; - pv_mmu_ops.pmd_clear = lguest_pmd_clear; - pv_mmu_ops.set_pud = lguest_set_pud; -#endif - pv_mmu_ops.read_cr2 = lguest_read_cr2; - pv_mmu_ops.read_cr3 = lguest_read_cr3; - pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu; - pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mmu_mode; - pv_mmu_ops.lazy_mode.flush = paravirt_flush_lazy_mmu; - pv_mmu_ops.pte_update = lguest_pte_update; - -#ifdef CONFIG_X86_LOCAL_APIC - /* APIC read/write intercepts */ - set_lguest_basic_apic_ops(); -#endif - - x86_init.resources.memory_setup = lguest_memory_setup; - x86_init.irqs.intr_init = lguest_init_IRQ; - x86_init.timers.timer_init = lguest_time_init; - x86_platform.calibrate_tsc = lguest_tsc_khz; - x86_platform.get_wallclock = lguest_get_wallclock; - - /* - * Now is a good time to look at the implementations of these functions - * before returning to the rest of lguest_init(). - */ - - /*G:070 - * Now we've seen all the paravirt_ops, we return to - * lguest_init() where the rest of the fairly chaotic boot setup - * occurs. - */ - - /* - * The stack protector is a weird thing where gcc places a canary - * value on the stack and then checks it on return. This file is - * compiled with -fno-stack-protector it, so we got this far without - * problems. The value of the canary is kept at offset 20 from the - * %gs register, so we need to set that up before calling C functions - * in other files. - */ - setup_stack_canary_segment(0); - - /* - * We could just call load_stack_canary_segment(), but we might as well - * call switch_to_new_gdt() which loads the whole table and sets up the - * per-cpu segment descriptor register %fs as well. 
- */ - switch_to_new_gdt(0); - - /* - * The Host<->Guest Switcher lives at the top of our address space, and - * the Host told us how big it is when we made LGUEST_INIT hypercall: - * it put the answer in lguest_data.reserve_mem - */ - reserve_top_address(lguest_data.reserve_mem); - - /* Hook in our special panic hypercall code. */ - atomic_notifier_chain_register(&panic_notifier_list, &paniced); - - /* - * This is messy CPU setup stuff which the native boot code does before - * start_kernel, so we have to do, too: - */ - cpu_detect(&new_cpu_data); - /* head.S usually sets up the first capability word, so do it here. */ - new_cpu_data.x86_capability[CPUID_1_EDX] = cpuid_edx(1); - - /* Math is always hard! */ - set_cpu_cap(&new_cpu_data, X86_FEATURE_FPU); - - /* We don't have features. We have puppies! Puppies! */ -#ifdef CONFIG_X86_MCE - mca_cfg.disabled = true; -#endif -#ifdef CONFIG_ACPI - acpi_disabled = 1; -#endif - - /* - * We set the preferred console to "hvc". This is the "hypervisor - * virtual console" driver written by the PowerPC people, which we also - * adapted for lguest's use. - */ - add_preferred_console("hvc", 0, NULL); - - /* Register our very early console. */ - virtio_cons_early_init(early_put_chars); - - /* Don't let ACPI try to control our PCI interrupts. */ - disable_acpi(); - - /* We control them ourselves, by overriding these two hooks. */ - pcibios_enable_irq = lguest_enable_irq; - pcibios_disable_irq = lguest_disable_irq; - - /* - * Last of all, we set the power management poweroff hook to point to - * the Guest routine to power off, and the reboot hook to our restart - * routine. - */ - pm_power_off = lguest_power_off; - machine_ops.restart = lguest_restart; - - /* - * Now we're set up, call i386_start_kernel() in head32.c and we proceed - * to boot as normal. It never returns. - */ - i386_start_kernel(); -} -/* - * This marks the end of stage II of our journey, The Guest. - * - * It is now time for us to explore the layer of virtual drivers and complete - * our understanding of the Guest in "make Drivers". - */ diff --git a/arch/x86/lguest/head_32.S b/arch/x86/lguest/head_32.S deleted file mode 100644 index d5ae63f5ec5d..000000000000 --- a/arch/x86/lguest/head_32.S +++ /dev/null @@ -1,192 +0,0 @@ -#include <linux/linkage.h> -#include <linux/lguest.h> -#include <asm/lguest_hcall.h> -#include <asm/asm-offsets.h> -#include <asm/thread_info.h> -#include <asm/processor-flags.h> - -/*G:020 - - * Our story starts with the bzImage: booting starts at startup_32 in - * arch/x86/boot/compressed/head_32.S. This merely uncompresses the real - * kernel in place and then jumps into it: startup_32 in - * arch/x86/kernel/head_32.S. Both routines expects a boot header in the %esi - * register, which is created by the bootloader (the Launcher in our case). - * - * The startup_32 function does very little: it clears the uninitialized global - * C variables which we expect to be zero (ie. BSS) and then copies the boot - * header and kernel command line somewhere safe, and populates some initial - * page tables. Finally it checks the 'hardware_subarch' field. This was - * introduced in 2.6.24 for lguest and Xen: if it's set to '1' (lguest's - * assigned number), then it calls us here. - * - * WARNING: be very careful here! We're running at addresses equal to physical - * addresses (around 0), not above PAGE_OFFSET as most code expects - * (eg. 0xC0000000). Jumps are relative, so they're OK, but we can't touch any - * data without remembering to subtract __PAGE_OFFSET! 
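[Editor's aside, not part of the patch: the address arithmetic this warning refers to, spelled out with the conventional 32-bit split as an assumed constant. The helper names are illustrative, not the kernel's.]

#include <stdint.h>
#include <stdio.h>

/* Conventional 32-bit layout assumed: kernel virtual = physical + PAGE_OFFSET. */
#define PAGE_OFFSET 0xC0000000UL

static uintptr_t virt_to_phys(uintptr_t vaddr) { return vaddr - PAGE_OFFSET; }
static uintptr_t phys_to_virt(uintptr_t paddr) { return paddr + PAGE_OFFSET; }

int main(void)
{
        /* A symbol linked at 0xC0100000 sits at physical 0x00100000, which is
         * why early code must subtract __PAGE_OFFSET before touching data. */
        printf("%#lx\n", (unsigned long)virt_to_phys(0xC0100000UL));
        return 0;
}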
- * - * The .section line puts this code in .init.text so it will be discarded after - * boot. - */ -.section .init.text, "ax", @progbits -ENTRY(lguest_entry) - /* - * We make the "initialization" hypercall now to tell the Host where - * our lguest_data struct is. - */ - movl $LHCALL_LGUEST_INIT, %eax - movl $lguest_data - __PAGE_OFFSET, %ebx - int $LGUEST_TRAP_ENTRY - - /* Now turn our pagetables on; setup by arch/x86/kernel/head_32.S. */ - movl $LHCALL_NEW_PGTABLE, %eax - movl $(initial_page_table - __PAGE_OFFSET), %ebx - int $LGUEST_TRAP_ENTRY - - /* Set up the initial stack so we can run C code. */ - movl $(init_thread_union+THREAD_SIZE),%esp - - /* Jumps are relative: we're running __PAGE_OFFSET too low. */ - jmp lguest_init+__PAGE_OFFSET - -/*G:055 - * We create a macro which puts the assembler code between lgstart_ and lgend_ - * markers. These templates are put in the .text section: they can't be - * discarded after boot as we may need to patch modules, too. - */ -.text -#define LGUEST_PATCH(name, insns...) \ - lgstart_##name: insns; lgend_##name:; \ - .globl lgstart_##name; .globl lgend_##name - -LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled) -LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax) - -/*G:033 - * But using those wrappers is inefficient (we'll see why that doesn't matter - * for save_fl and irq_disable later). If we write our routines carefully in - * assembler, we can avoid clobbering any registers and avoid jumping through - * the wrapper functions. - * - * I skipped over our first piece of assembler, but this one is worth studying - * in a bit more detail so I'll describe in easy stages. First, the routine to - * enable interrupts: - */ -ENTRY(lg_irq_enable) - /* - * The reverse of irq_disable, this sets lguest_data.irq_enabled to - * X86_EFLAGS_IF (ie. "Interrupts enabled"). - */ - movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled - /* - * But now we need to check if the Host wants to know: there might have - * been interrupts waiting to be delivered, in which case it will have - * set lguest_data.irq_pending to X86_EFLAGS_IF. If it's not zero, we - * jump to send_interrupts, otherwise we're done. - */ - cmpl $0, lguest_data+LGUEST_DATA_irq_pending - jnz send_interrupts - /* - * One cool thing about x86 is that you can do many things without using - * a register. In this case, the normal path hasn't needed to save or - * restore any registers at all! - */ - ret -send_interrupts: - /* - * OK, now we need a register: eax is used for the hypercall number, - * which is LHCALL_SEND_INTERRUPTS. - * - * We used not to bother with this pending detection at all, which was - * much simpler. Sooner or later the Host would realize it had to - * send us an interrupt. But that turns out to make performance 7 - * times worse on a simple tcp benchmark. So now we do this the hard - * way. - */ - pushl %eax - movl $LHCALL_SEND_INTERRUPTS, %eax - /* This is the actual hypercall trap. */ - int $LGUEST_TRAP_ENTRY - /* Put eax back the way we found it. */ - popl %eax - ret - -/* - * Finally, the "popf" or "restore flags" routine. The %eax register holds the - * flags (in practice, either X86_EFLAGS_IF or 0): if it's X86_EFLAGS_IF we're - * enabling interrupts again, if it's 0 we're leaving them off. 
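[Editor's aside, not part of the patch: a hypothetical C rendering of the flag handling described here for lg_irq_enable above and lg_restore_fl below. Interrupt state is just a word in shared memory, and the Host is only poked when something is pending; the struct and function names are stand-ins.]

#include <stdint.h>

#define X86_EFLAGS_IF 0x200u

/* Stand-ins for the lguest_data fields the assembly manipulates. */
struct irq_state {
        volatile uint32_t irq_enabled;
        volatile uint32_t irq_pending;
};

static void send_interrupts_hcall(void)
{
        /* placeholder for hcall(LHCALL_SEND_INTERRUPTS, ...) */
}

/* lg_irq_enable in C: enable, then deliver anything that queued up. */
static void irq_enable(struct irq_state *s)
{
        s->irq_enabled = X86_EFLAGS_IF;
        if (s->irq_pending)
                send_interrupts_hcall();
}

/* lg_restore_fl in C: 'flags' is either X86_EFLAGS_IF or 0, so ANDing it with
 * irq_pending only fires when both are set -- the testl trick used above. */
static void restore_fl(struct irq_state *s, uint32_t flags)
{
        s->irq_enabled = flags;
        if (s->irq_pending & flags)
                send_interrupts_hcall();
}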
- */ -ENTRY(lg_restore_fl) - /* This is just "lguest_data.irq_enabled = flags;" */ - movl %eax, lguest_data+LGUEST_DATA_irq_enabled - /* - * Now, if the %eax value has enabled interrupts and - * lguest_data.irq_pending is set, we want to tell the Host so it can - * deliver any outstanding interrupts. Fortunately, both values will - * be X86_EFLAGS_IF (ie. 512) in that case, and the "testl" - * instruction will AND them together for us. If both are set, we - * jump to send_interrupts. - */ - testl lguest_data+LGUEST_DATA_irq_pending, %eax - jnz send_interrupts - /* Again, the normal path has used no extra registers. Clever, huh? */ - ret -/*:*/ - -/* These demark the EIP where host should never deliver interrupts. */ -.global lguest_noirq_iret - -/*M:004 - * When the Host reflects a trap or injects an interrupt into the Guest, it - * sets the eflags interrupt bit on the stack based on lguest_data.irq_enabled, - * so the Guest iret logic does the right thing when restoring it. However, - * when the Host sets the Guest up for direct traps, such as system calls, the - * processor is the one to push eflags onto the stack, and the interrupt bit - * will be 1 (in reality, interrupts are always enabled in the Guest). - * - * This turns out to be harmless: the only trap which should happen under Linux - * with interrupts disabled is Page Fault (due to our lazy mapping of vmalloc - * regions), which has to be reflected through the Host anyway. If another - * trap *does* go off when interrupts are disabled, the Guest will panic, and - * we'll never get to this iret! -:*/ - -/*G:045 - * There is one final paravirt_op that the Guest implements, and glancing at it - * you can see why I left it to last. It's *cool*! It's in *assembler*! - * - * The "iret" instruction is used to return from an interrupt or trap. The - * stack looks like this: - * old address - * old code segment & privilege level - * old processor flags ("eflags") - * - * The "iret" instruction pops those values off the stack and restores them all - * at once. The only problem is that eflags includes the Interrupt Flag which - * the Guest can't change: the CPU will simply ignore it when we do an "iret". - * So we have to copy eflags from the stack to lguest_data.irq_enabled before - * we do the "iret". - * - * There are two problems with this: firstly, we can't clobber any registers - * and secondly, the whole thing needs to be atomic. The first problem - * is solved by using "push memory"/"pop memory" instruction pair for copying. - * - * The second is harder: copying eflags to lguest_data.irq_enabled will turn - * interrupts on before we're finished, so we could be interrupted before we - * return to userspace or wherever. Our solution to this is to tell the - * Host that it is *never* to interrupt us there, even if interrupts seem to be - * enabled. (It's not necessary to protect pop instruction, since - * data gets updated only after it completes, so we only need to protect - * one instruction, iret). - */ -ENTRY(lguest_iret) - pushl 2*4(%esp) - /* - * Note the %ss: segment prefix here. Normal data accesses use the - * "ds" segment, but that will have already been restored for whatever - * we're returning to (such as userspace): we can't trust it. The %ss: - * prefix makes sure we use the stack segment, which is still valid. 
- */ - popl %ss:lguest_data+LGUEST_DATA_irq_enabled -lguest_noirq_iret: - iret diff --git a/arch/x86/math-emu/div_Xsig.S b/arch/x86/math-emu/div_Xsig.S index f77ba3058b31..066996dba6a2 100644 --- a/arch/x86/math-emu/div_Xsig.S +++ b/arch/x86/math-emu/div_Xsig.S @@ -363,3 +363,4 @@ L_bugged_2: pop %ebx jmp L_exit #endif /* PARANOID */ +ENDPROC(div_Xsig) diff --git a/arch/x86/math-emu/div_small.S b/arch/x86/math-emu/div_small.S index 47099628fa4c..2c71527bd917 100644 --- a/arch/x86/math-emu/div_small.S +++ b/arch/x86/math-emu/div_small.S @@ -44,4 +44,4 @@ ENTRY(FPU_div_small) leave ret - +ENDPROC(FPU_div_small) diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c index 0203baefb5c0..d4a7df2205b8 100644 --- a/arch/x86/math-emu/fpu_entry.c +++ b/arch/x86/math-emu/fpu_entry.c @@ -147,7 +147,7 @@ void math_emulate(struct math_emu_info *info) } code_descriptor = FPU_get_ldt_descriptor(FPU_CS); - if (SEG_D_SIZE(code_descriptor)) { + if (code_descriptor.d) { /* The above test may be wrong, the book is not clear */ /* Segmented 32 bit protected mode */ addr_modes.default_mode = SEG32; @@ -155,11 +155,10 @@ void math_emulate(struct math_emu_info *info) /* 16 bit protected mode */ addr_modes.default_mode = PM16; } - FPU_EIP += code_base = SEG_BASE_ADDR(code_descriptor); - code_limit = code_base - + (SEG_LIMIT(code_descriptor) + - 1) * SEG_GRANULARITY(code_descriptor) - - 1; + FPU_EIP += code_base = seg_get_base(&code_descriptor); + code_limit = seg_get_limit(&code_descriptor) + 1; + code_limit *= seg_get_granularity(&code_descriptor); + code_limit += code_base - 1; if (code_limit < code_base) code_limit = 0xffffffff; } diff --git a/arch/x86/math-emu/fpu_system.h b/arch/x86/math-emu/fpu_system.h index a179254a5122..699f329f1d40 100644 --- a/arch/x86/math-emu/fpu_system.h +++ b/arch/x86/math-emu/fpu_system.h @@ -34,17 +34,43 @@ static inline struct desc_struct FPU_get_ldt_descriptor(unsigned seg) return ret; } -#define SEG_D_SIZE(x) ((x).b & (3 << 21)) -#define SEG_G_BIT(x) ((x).b & (1 << 23)) -#define SEG_GRANULARITY(x) (((x).b & (1 << 23)) ? 4096 : 1) -#define SEG_286_MODE(x) ((x).b & ( 0xff000000 | 0xf0000 | (1 << 23))) -#define SEG_BASE_ADDR(s) (((s).b & 0xff000000) \ - | (((s).b & 0xff) << 16) | ((s).a >> 16)) -#define SEG_LIMIT(s) (((s).b & 0xff0000) | ((s).a & 0xffff)) -#define SEG_EXECUTE_ONLY(s) (((s).b & ((1 << 11) | (1 << 9))) == (1 << 11)) -#define SEG_WRITE_PERM(s) (((s).b & ((1 << 11) | (1 << 9))) == (1 << 9)) -#define SEG_EXPAND_DOWN(s) (((s).b & ((1 << 11) | (1 << 10))) \ - == (1 << 10)) +#define SEG_TYPE_WRITABLE (1U << 1) +#define SEG_TYPE_EXPANDS_DOWN (1U << 2) +#define SEG_TYPE_EXECUTE (1U << 3) +#define SEG_TYPE_EXPAND_MASK (SEG_TYPE_EXPANDS_DOWN | SEG_TYPE_EXECUTE) +#define SEG_TYPE_EXECUTE_MASK (SEG_TYPE_WRITABLE | SEG_TYPE_EXECUTE) + +static inline unsigned long seg_get_base(struct desc_struct *d) +{ + unsigned long base = (unsigned long)d->base2 << 24; + + return base | ((unsigned long)d->base1 << 16) | d->base0; +} + +static inline unsigned long seg_get_limit(struct desc_struct *d) +{ + return ((unsigned long)d->limit1 << 16) | d->limit0; +} + +static inline unsigned long seg_get_granularity(struct desc_struct *d) +{ + return d->g ? 
4096 : 1; +} + +static inline bool seg_expands_down(struct desc_struct *d) +{ + return (d->type & SEG_TYPE_EXPAND_MASK) == SEG_TYPE_EXPANDS_DOWN; +} + +static inline bool seg_execute_only(struct desc_struct *d) +{ + return (d->type & SEG_TYPE_EXECUTE_MASK) == SEG_TYPE_EXECUTE; +} + +static inline bool seg_writable(struct desc_struct *d) +{ + return (d->type & SEG_TYPE_EXECUTE_MASK) == SEG_TYPE_WRITABLE; +} #define I387 (&current->thread.fpu.state) #define FPU_info (I387->soft.info) diff --git a/arch/x86/math-emu/get_address.c b/arch/x86/math-emu/get_address.c index b8ef9f9d2ffc..c48967c6a0e2 100644 --- a/arch/x86/math-emu/get_address.c +++ b/arch/x86/math-emu/get_address.c @@ -159,17 +159,18 @@ static long pm_address(u_char FPU_modrm, u_char segment, } descriptor = FPU_get_ldt_descriptor(addr->selector); - base_address = SEG_BASE_ADDR(descriptor); + base_address = seg_get_base(&descriptor); address = base_address + offset; - limit = base_address - + (SEG_LIMIT(descriptor) + 1) * SEG_GRANULARITY(descriptor) - 1; + limit = seg_get_limit(&descriptor) + 1; + limit *= seg_get_granularity(&descriptor); + limit += base_address - 1; if (limit < base_address) limit = 0xffffffff; - if (SEG_EXPAND_DOWN(descriptor)) { - if (SEG_G_BIT(descriptor)) + if (seg_expands_down(&descriptor)) { + if (descriptor.g) { seg_top = 0xffffffff; - else { + } else { seg_top = base_address + (1 << 20); if (seg_top < base_address) seg_top = 0xffffffff; @@ -182,8 +183,8 @@ static long pm_address(u_char FPU_modrm, u_char segment, (address > limit) || (address < base_address) ? 0 : ((limit - address) >= 254 ? 255 : limit - address + 1); } - if (SEG_EXECUTE_ONLY(descriptor) || - (!SEG_WRITE_PERM(descriptor) && (FPU_modrm & FPU_WRITE_BIT))) { + if (seg_execute_only(&descriptor) || - (!seg_writable(&descriptor) && (FPU_modrm & FPU_WRITE_BIT))) { access_limit = 0; } return address; diff --git a/arch/x86/math-emu/mul_Xsig.S b/arch/x86/math-emu/mul_Xsig.S index 717785a53eb4..22e0631bb85a 100644 --- a/arch/x86/math-emu/mul_Xsig.S +++ b/arch/x86/math-emu/mul_Xsig.S @@ -62,6 +62,7 @@ ENTRY(mul32_Xsig) popl %esi leave ret +ENDPROC(mul32_Xsig) ENTRY(mul64_Xsig) @@ -114,6 +115,7 @@ ENTRY(mul64_Xsig) popl %esi leave ret +ENDPROC(mul64_Xsig) @@ -173,4 +175,4 @@ ENTRY(mul_Xsig_Xsig) popl %esi leave ret - +ENDPROC(mul_Xsig_Xsig) diff --git a/arch/x86/math-emu/polynom_Xsig.S b/arch/x86/math-emu/polynom_Xsig.S index 17315c89ff3d..a9aaf414135d 100644 --- a/arch/x86/math-emu/polynom_Xsig.S +++ b/arch/x86/math-emu/polynom_Xsig.S @@ -133,3 +133,4 @@ L_accum_done: popl %esi leave ret +ENDPROC(polynomial_Xsig) diff --git a/arch/x86/math-emu/reg_norm.S b/arch/x86/math-emu/reg_norm.S index 8b6352efceef..53ac1a343c69 100644 --- a/arch/x86/math-emu/reg_norm.S +++ b/arch/x86/math-emu/reg_norm.S @@ -94,6 +94,7 @@ L_overflow: call arith_overflow pop %ebx jmp L_exit +ENDPROC(FPU_normalize) @@ -145,3 +146,4 @@ L_exit_nuo_zero: popl %ebx leave ret +ENDPROC(FPU_normalize_nuo) diff --git a/arch/x86/math-emu/reg_round.S b/arch/x86/math-emu/reg_round.S index d1d4e48b4f67..41af5b208d88 100644 --- a/arch/x86/math-emu/reg_round.S +++ b/arch/x86/math-emu/reg_round.S @@ -706,3 +706,5 @@ L_exception_exit: mov $-1,%eax jmp fpu_reg_round_special_exit #endif /* PARANOID */ + +ENDPROC(FPU_round) diff --git a/arch/x86/math-emu/reg_u_add.S b/arch/x86/math-emu/reg_u_add.S index 47c4c2434d85..3b1bc5e9b2f6 100644 --- a/arch/x86/math-emu/reg_u_add.S +++ b/arch/x86/math-emu/reg_u_add.S @@ -165,3 +165,4 @@ L_exit: leave ret #endif /* PARANOID */ +ENDPROC(FPU_u_add) diff --git 
a/arch/x86/math-emu/reg_u_div.S b/arch/x86/math-emu/reg_u_div.S index cc00654b6f9a..796eb5ab921b 100644 --- a/arch/x86/math-emu/reg_u_div.S +++ b/arch/x86/math-emu/reg_u_div.S @@ -469,3 +469,5 @@ L_exit: leave ret #endif /* PARANOID */ + +ENDPROC(FPU_u_div) diff --git a/arch/x86/math-emu/reg_u_mul.S b/arch/x86/math-emu/reg_u_mul.S index 973f12af97df..6196f68cf3c1 100644 --- a/arch/x86/math-emu/reg_u_mul.S +++ b/arch/x86/math-emu/reg_u_mul.S @@ -146,3 +146,4 @@ L_exit: ret #endif /* PARANOID */ +ENDPROC(FPU_u_mul) diff --git a/arch/x86/math-emu/reg_u_sub.S b/arch/x86/math-emu/reg_u_sub.S index 1b6c24801d22..d115b900919a 100644 --- a/arch/x86/math-emu/reg_u_sub.S +++ b/arch/x86/math-emu/reg_u_sub.S @@ -270,3 +270,4 @@ L_exit: popl %esi leave ret +ENDPROC(FPU_u_sub) diff --git a/arch/x86/math-emu/round_Xsig.S b/arch/x86/math-emu/round_Xsig.S index bbe0e87718e4..87c99749a495 100644 --- a/arch/x86/math-emu/round_Xsig.S +++ b/arch/x86/math-emu/round_Xsig.S @@ -78,7 +78,7 @@ L_exit: popl %ebx leave ret - +ENDPROC(round_Xsig) @@ -138,4 +138,4 @@ L_n_exit: popl %ebx leave ret - +ENDPROC(norm_Xsig) diff --git a/arch/x86/math-emu/shr_Xsig.S b/arch/x86/math-emu/shr_Xsig.S index 31cdd118e918..c8552edeec75 100644 --- a/arch/x86/math-emu/shr_Xsig.S +++ b/arch/x86/math-emu/shr_Xsig.S @@ -85,3 +85,4 @@ L_more_than_95: popl %esi leave ret +ENDPROC(shr_Xsig) diff --git a/arch/x86/math-emu/wm_shrx.S b/arch/x86/math-emu/wm_shrx.S index 518428317985..340dd6897f85 100644 --- a/arch/x86/math-emu/wm_shrx.S +++ b/arch/x86/math-emu/wm_shrx.S @@ -92,6 +92,7 @@ L_more_than_95: popl %esi leave ret +ENDPROC(FPU_shrx) /*---------------------------------------------------------------------------+ @@ -202,3 +203,4 @@ Ls_more_than_95: popl %esi leave ret +ENDPROC(FPU_shrxs) diff --git a/arch/x86/math-emu/wm_sqrt.S b/arch/x86/math-emu/wm_sqrt.S index d258f59564e1..695afae38fdf 100644 --- a/arch/x86/math-emu/wm_sqrt.S +++ b/arch/x86/math-emu/wm_sqrt.S @@ -468,3 +468,4 @@ sqrt_more_prec_large: /* Our estimate is too large */ movl $0x7fffff00,%eax jmp sqrt_round_result +ENDPROC(wm_sqrt) diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 0ea8afcb929c..c076f710de4c 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -36,6 +36,48 @@ bool ex_handler_fault(const struct exception_table_entry *fixup, } EXPORT_SYMBOL_GPL(ex_handler_fault); +/* + * Handler for UD0 exception following a failed test against the + * result of a refcount inc/dec/add/sub. + */ +bool ex_handler_refcount(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) +{ + /* First unconditionally saturate the refcount. */ + *(int *)regs->cx = INT_MIN / 2; + + /* + * Strictly speaking, this reports the fixup destination, not + * the fault location, and not the actually overflowing + * instruction, which is the instruction before the "js", but + * since that instruction could be a variety of lengths, just + * report the location after the overflow, which should be close + * enough for finding the overflow, as it's at least back in + * the function, having returned from .text.unlikely. + */ + regs->ip = ex_fixup_addr(fixup); + + /* + * This function has been called because either a negative refcount + * value was seen by any of the refcount functions, or a zero + * refcount value was seen by refcount_dec(). + * + * If we crossed from INT_MAX to INT_MIN, OF (Overflow Flag: result + * wrapped around) will be set. Additionally, seeing the refcount + * reach 0 will set ZF (Zero Flag: result was zero). 
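[Editor's aside, not part of the patch: a simplified userspace model of the saturate-on-overflow policy this handler enforces. INT_MIN / 2 matches the saturation value written to the register above; the function name and the check itself are illustrative only, since on x86 the real detection is done via the CPU flags as described.]

#include <limits.h>
#include <stdio.h>

/* Simplified model: an increment that has already gone negative or would
 * wrap pins the counter at INT_MIN / 2, far from zero, so a leaked
 * reference can never count back down and free an object still in use. */
static int refcount_inc_saturating(int val)
{
        if (val < 0 || val == INT_MAX)
                return INT_MIN / 2;
        return val + 1;
}

int main(void)
{
        printf("%d\n", refcount_inc_saturating(41));      /* 42 */
        printf("%d\n", refcount_inc_saturating(INT_MAX)); /* saturated */
        return 0;
}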
In each of + * these cases we want a report, since it's a boundary condition. + * + */ + if (regs->flags & (X86_EFLAGS_OF | X86_EFLAGS_ZF)) { + bool zero = regs->flags & X86_EFLAGS_ZF; + + refcount_error_report(regs, zero ? "hit zero" : "overflow"); + } + + return true; +} +EXPORT_SYMBOL_GPL(ex_handler_refcount); + bool ex_handler_ext(const struct exception_table_entry *fixup, struct pt_regs *regs, int trapnr) { @@ -142,7 +184,7 @@ void __init early_fixup_exception(struct pt_regs *regs, int trapnr) * undefined. I'm not sure which CPUs do this, but at least * the 486 DX works this way. */ - if ((regs->cs & 0xFFFF) != __KERNEL_CS) + if (regs->cs != __KERNEL_CS) goto fail; /* diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 2a1fa10c6a98..b836a7274e12 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -396,14 +396,18 @@ static void dump_pagetable(unsigned long address) pte_t *pte; #ifdef CONFIG_X86_PAE - printk("*pdpt = %016Lx ", pgd_val(*pgd)); + pr_info("*pdpt = %016Lx ", pgd_val(*pgd)); if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd)) goto out; +#define pr_pde pr_cont +#else +#define pr_pde pr_info #endif p4d = p4d_offset(pgd, address); pud = pud_offset(p4d, address); pmd = pmd_offset(pud, address); - printk(KERN_CONT "*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd)); + pr_pde("*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd)); +#undef pr_pde /* * We must not directly access the pte in the highpte @@ -415,9 +419,9 @@ static void dump_pagetable(unsigned long address) goto out; pte = pte_offset_kernel(pmd, address); - printk("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte)); + pr_cont("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte)); out: - printk("\n"); + pr_cont("\n"); } #else /* CONFIG_X86_64: */ @@ -565,7 +569,7 @@ static void dump_pagetable(unsigned long address) if (bad_address(pgd)) goto bad; - printk("PGD %lx ", pgd_val(*pgd)); + pr_info("PGD %lx ", pgd_val(*pgd)); if (!pgd_present(*pgd)) goto out; @@ -574,7 +578,7 @@ static void dump_pagetable(unsigned long address) if (bad_address(p4d)) goto bad; - printk("P4D %lx ", p4d_val(*p4d)); + pr_cont("P4D %lx ", p4d_val(*p4d)); if (!p4d_present(*p4d) || p4d_large(*p4d)) goto out; @@ -582,7 +586,7 @@ static void dump_pagetable(unsigned long address) if (bad_address(pud)) goto bad; - printk("PUD %lx ", pud_val(*pud)); + pr_cont("PUD %lx ", pud_val(*pud)); if (!pud_present(*pud) || pud_large(*pud)) goto out; @@ -590,7 +594,7 @@ static void dump_pagetable(unsigned long address) if (bad_address(pmd)) goto bad; - printk("PMD %lx ", pmd_val(*pmd)); + pr_cont("PMD %lx ", pmd_val(*pmd)); if (!pmd_present(*pmd) || pmd_large(*pmd)) goto out; @@ -598,12 +602,12 @@ static void dump_pagetable(unsigned long address) if (bad_address(pte)) goto bad; - printk("PTE %lx", pte_val(*pte)); + pr_cont("PTE %lx", pte_val(*pte)); out: - printk("\n"); + pr_cont("\n"); return; bad: - printk("BAD\n"); + pr_info("BAD\n"); } #endif /* CONFIG_X86_64 */ @@ -1254,10 +1258,6 @@ static inline bool smap_violation(int error_code, struct pt_regs *regs) * This routine handles page faults. It determines the address, * and the problem, and then passes it off to one of the appropriate * routines. - * - * This function must have noinline because both callers - * {,trace_}do_page_fault() have notrace on. Having this an actual function - * guarantees there's a function trace entry. 
*/ static noinline void __do_page_fault(struct pt_regs *regs, unsigned long error_code, @@ -1490,27 +1490,6 @@ good_area: } NOKPROBE_SYMBOL(__do_page_fault); -dotraplinkage void notrace -do_page_fault(struct pt_regs *regs, unsigned long error_code) -{ - unsigned long address = read_cr2(); /* Get the faulting address */ - enum ctx_state prev_state; - - /* - * We must have this function tagged with __kprobes, notrace and call - * read_cr2() before calling anything else. To avoid calling any kind - * of tracing machinery before we've observed the CR2 value. - * - * exception_{enter,exit}() contain all sorts of tracepoints. - */ - - prev_state = exception_enter(); - __do_page_fault(regs, error_code, address); - exception_exit(prev_state); -} -NOKPROBE_SYMBOL(do_page_fault); - -#ifdef CONFIG_TRACING static nokprobe_inline void trace_page_fault_entries(unsigned long address, struct pt_regs *regs, unsigned long error_code) @@ -1521,22 +1500,24 @@ trace_page_fault_entries(unsigned long address, struct pt_regs *regs, trace_page_fault_kernel(address, regs, error_code); } +/* + * We must have this function blacklisted from kprobes, tagged with notrace + * and call read_cr2() before calling anything else. To avoid calling any + * kind of tracing machinery before we've observed the CR2 value. + * + * exception_{enter,exit}() contains all sorts of tracepoints. + */ dotraplinkage void notrace -trace_do_page_fault(struct pt_regs *regs, unsigned long error_code) +do_page_fault(struct pt_regs *regs, unsigned long error_code) { - /* - * The exception_enter and tracepoint processing could - * trigger another page faults (user space callchain - * reading) and destroy the original cr2 value, so read - * the faulting address now. - */ - unsigned long address = read_cr2(); + unsigned long address = read_cr2(); /* Get the faulting address */ enum ctx_state prev_state; prev_state = exception_enter(); - trace_page_fault_entries(address, regs, error_code); + if (trace_pagefault_enabled()) + trace_page_fault_entries(address, regs, error_code); + __do_page_fault(regs, error_code, address); exception_exit(prev_state); } -NOKPROBE_SYMBOL(trace_do_page_fault); -#endif /* CONFIG_TRACING */ +NOKPROBE_SYMBOL(do_page_fault); diff --git a/arch/x86/mm/mem_encrypt_boot.S b/arch/x86/mm/mem_encrypt_boot.S index b327e0472448..730e6d541df1 100644 --- a/arch/x86/mm/mem_encrypt_boot.S +++ b/arch/x86/mm/mem_encrypt_boot.S @@ -15,7 +15,6 @@ #include <asm/page.h> #include <asm/processor-flags.h> #include <asm/msr-index.h> -#include <asm/frame.h> .text .code64 @@ -33,7 +32,8 @@ ENTRY(sme_encrypt_execute) * R8 - physcial address of the pagetables to use for encryption */ - FRAME_BEGIN /* RBP now has original stack pointer */ + push %rbp + movq %rsp, %rbp /* RBP now has original stack pointer */ /* Set up a one page stack in the non-encrypted memory area */ movq %rcx, %rax /* Workarea stack page */ @@ -64,7 +64,7 @@ ENTRY(sme_encrypt_execute) pop %r12 movq %rbp, %rsp /* Restore original stack pointer */ - FRAME_END + pop %rbp ret ENDPROC(sme_encrypt_execute) diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c index a8f90ce3dedf..d805162e6045 100644 --- a/arch/x86/mm/numa_emulation.c +++ b/arch/x86/mm/numa_emulation.c @@ -75,13 +75,15 @@ static int __init emu_setup_memblk(struct numa_meminfo *ei, /* * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr - * to max_addr. The return value is the number of nodes allocated. + * to max_addr. + * + * Returns zero on success or negative on error. 
*/ static int __init split_nodes_interleave(struct numa_meminfo *ei, struct numa_meminfo *pi, u64 addr, u64 max_addr, int nr_nodes) { - nodemask_t physnode_mask = NODE_MASK_NONE; + nodemask_t physnode_mask = numa_nodes_parsed; u64 size; int big; int nid = 0; @@ -116,9 +118,6 @@ static int __init split_nodes_interleave(struct numa_meminfo *ei, return -1; } - for (i = 0; i < pi->nr_blks; i++) - node_set(pi->blk[i].nid, physnode_mask); - /* * Continue to fill physical nodes with fake nodes until there is no * memory left on any of them. @@ -200,13 +199,15 @@ static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size) /* * Sets up fake nodes of `size' interleaved over physical nodes ranging from - * `addr' to `max_addr'. The return value is the number of nodes allocated. + * `addr' to `max_addr'. + * + * Returns zero on success or negative on error. */ static int __init split_nodes_size_interleave(struct numa_meminfo *ei, struct numa_meminfo *pi, u64 addr, u64 max_addr, u64 size) { - nodemask_t physnode_mask = NODE_MASK_NONE; + nodemask_t physnode_mask = numa_nodes_parsed; u64 min_size; int nid = 0; int i, ret; @@ -231,9 +232,6 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei, } size &= FAKE_NODE_MIN_HASH_MASK; - for (i = 0; i < pi->nr_blks; i++) - node_set(pi->blk[i].nid, physnode_mask); - /* * Fill physical nodes with fake nodes of size until there is no memory * left on any of them. @@ -280,6 +278,22 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei, return 0; } +int __init setup_emu2phys_nid(int *dfl_phys_nid) +{ + int i, max_emu_nid = 0; + + *dfl_phys_nid = NUMA_NO_NODE; + for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) { + if (emu_nid_to_phys[i] != NUMA_NO_NODE) { + max_emu_nid = i; + if (*dfl_phys_nid == NUMA_NO_NODE) + *dfl_phys_nid = emu_nid_to_phys[i]; + } + } + + return max_emu_nid; +} + /** * numa_emulation - Emulate NUMA nodes * @numa_meminfo: NUMA configuration to massage @@ -376,23 +390,18 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) * Determine the max emulated nid and the default phys nid to use * for unmapped nodes. */ - max_emu_nid = 0; - dfl_phys_nid = NUMA_NO_NODE; - for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) { - if (emu_nid_to_phys[i] != NUMA_NO_NODE) { - max_emu_nid = i; - if (dfl_phys_nid == NUMA_NO_NODE) - dfl_phys_nid = emu_nid_to_phys[i]; - } - } - if (dfl_phys_nid == NUMA_NO_NODE) { - pr_warning("NUMA: Warning: can't determine default physical node, disabling emulation\n"); - goto no_emu; - } + max_emu_nid = setup_emu2phys_nid(&dfl_phys_nid); /* commit */ *numa_meminfo = ei; + /* Make sure numa_nodes_parsed only contains emulated nodes */ + nodes_clear(numa_nodes_parsed); + for (i = 0; i < ARRAY_SIZE(ei.blk); i++) + if (ei.blk[i].start != ei.blk[i].end && + ei.blk[i].nid != NUMA_NO_NODE) + node_set(ei.blk[i].nid, numa_nodes_parsed); + /* * Transform __apicid_to_node table to use emulated nids by * reverse-mapping phys_nid. 
The maps should always exist but fall diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 508a708eb9a6..218834a3e9ad 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -56,7 +56,7 @@ void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) { pgtable_page_dtor(pte); paravirt_release_pte(page_to_pfn(pte)); - tlb_remove_page(tlb, pte); + tlb_remove_table(tlb, pte); } #if CONFIG_PGTABLE_LEVELS > 2 @@ -72,21 +72,21 @@ void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) tlb->need_flush_all = 1; #endif pgtable_pmd_page_dtor(page); - tlb_remove_page(tlb, page); + tlb_remove_table(tlb, page); } #if CONFIG_PGTABLE_LEVELS > 3 void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) { paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); - tlb_remove_page(tlb, virt_to_page(pud)); + tlb_remove_table(tlb, virt_to_page(pud)); } #if CONFIG_PGTABLE_LEVELS > 4 void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d) { paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT); - tlb_remove_page(tlb, virt_to_page(p4d)); + tlb_remove_table(tlb, virt_to_page(p4d)); } #endif /* CONFIG_PGTABLE_LEVELS > 4 */ #endif /* CONFIG_PGTABLE_LEVELS > 3 */ diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index ce104b962a17..dbbcfd59726a 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -214,6 +214,50 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, } /* + * Call this when reinitializing a CPU. It fixes the following potential + * problems: + * + * - The ASID changed from what cpu_tlbstate thinks it is (most likely + * because the CPU was taken down and came back up with CR3's PCID + * bits clear. CPU hotplug can do this. + * + * - The TLB contains junk in slots corresponding to inactive ASIDs. + * + * - The CPU went so far out to lunch that it may have missed a TLB + * flush. + */ +void initialize_tlbstate_and_flush(void) +{ + int i; + struct mm_struct *mm = this_cpu_read(cpu_tlbstate.loaded_mm); + u64 tlb_gen = atomic64_read(&init_mm.context.tlb_gen); + unsigned long cr3 = __read_cr3(); + + /* Assert that CR3 already references the right mm. */ + WARN_ON((cr3 & CR3_ADDR_MASK) != __pa(mm->pgd)); + + /* + * Assert that CR4.PCIDE is set if needed. (CR4.PCIDE initialization + * doesn't work like other CR4 bits because it can only be set from + * long mode.) + */ + WARN_ON(boot_cpu_has(X86_CR4_PCIDE) && + !(cr4_read_shadow() & X86_CR4_PCIDE)); + + /* Force ASID 0 and force a TLB flush. */ + write_cr3(cr3 & ~CR3_PCID_MASK); + + /* Reinitialize tlbstate. */ + this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0); + this_cpu_write(cpu_tlbstate.next_asid, 1); + this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id); + this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, tlb_gen); + + for (i = 1; i < TLB_NR_DYN_ASIDS; i++) + this_cpu_write(cpu_tlbstate.ctxs[i].ctx_id, 0); +} + +/* * flush_tlb_func_common()'s memory ordering requirement is that any * TLB fills that happen after we flush the TLB are ordered after we * read active_mm's tlb_gen. 
We don't need any explicit barriers diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index e1324f280e06..8c9573660d51 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -94,7 +94,9 @@ static int bpf_size_to_x86_bytes(int bpf_size) #define X86_JNE 0x75 #define X86_JBE 0x76 #define X86_JA 0x77 +#define X86_JL 0x7C #define X86_JGE 0x7D +#define X86_JLE 0x7E #define X86_JG 0x7F static void bpf_flush_icache(void *start, void *end) @@ -285,7 +287,7 @@ static void emit_bpf_tail_call(u8 **pprog) EMIT4(0x48, 0x8B, 0x46, /* mov rax, qword ptr [rsi + 16] */ offsetof(struct bpf_array, map.max_entries)); EMIT3(0x48, 0x39, 0xD0); /* cmp rax, rdx */ -#define OFFSET1 47 /* number of bytes to jump */ +#define OFFSET1 43 /* number of bytes to jump */ EMIT2(X86_JBE, OFFSET1); /* jbe out */ label1 = cnt; @@ -294,21 +296,20 @@ static void emit_bpf_tail_call(u8 **pprog) */ EMIT2_off32(0x8B, 0x85, 36); /* mov eax, dword ptr [rbp + 36] */ EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT); /* cmp eax, MAX_TAIL_CALL_CNT */ -#define OFFSET2 36 +#define OFFSET2 32 EMIT2(X86_JA, OFFSET2); /* ja out */ label2 = cnt; EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */ EMIT2_off32(0x89, 0x85, 36); /* mov dword ptr [rbp + 36], eax */ /* prog = array->ptrs[index]; */ - EMIT4_off32(0x48, 0x8D, 0x84, 0xD6, /* lea rax, [rsi + rdx * 8 + offsetof(...)] */ + EMIT4_off32(0x48, 0x8B, 0x84, 0xD6, /* mov rax, [rsi + rdx * 8 + offsetof(...)] */ offsetof(struct bpf_array, ptrs)); - EMIT3(0x48, 0x8B, 0x00); /* mov rax, qword ptr [rax] */ /* if (prog == NULL) * goto out; */ - EMIT4(0x48, 0x83, 0xF8, 0x00); /* cmp rax, 0 */ + EMIT3(0x48, 0x85, 0xC0); /* test rax,rax */ #define OFFSET3 10 EMIT2(X86_JE, OFFSET3); /* je out */ label3 = cnt; @@ -888,9 +889,13 @@ xadd: if (is_imm8(insn->off)) case BPF_JMP | BPF_JEQ | BPF_X: case BPF_JMP | BPF_JNE | BPF_X: case BPF_JMP | BPF_JGT | BPF_X: + case BPF_JMP | BPF_JLT | BPF_X: case BPF_JMP | BPF_JGE | BPF_X: + case BPF_JMP | BPF_JLE | BPF_X: case BPF_JMP | BPF_JSGT | BPF_X: + case BPF_JMP | BPF_JSLT | BPF_X: case BPF_JMP | BPF_JSGE | BPF_X: + case BPF_JMP | BPF_JSLE | BPF_X: /* cmp dst_reg, src_reg */ EMIT3(add_2mod(0x48, dst_reg, src_reg), 0x39, add_2reg(0xC0, dst_reg, src_reg)); @@ -911,9 +916,13 @@ xadd: if (is_imm8(insn->off)) case BPF_JMP | BPF_JEQ | BPF_K: case BPF_JMP | BPF_JNE | BPF_K: case BPF_JMP | BPF_JGT | BPF_K: + case BPF_JMP | BPF_JLT | BPF_K: case BPF_JMP | BPF_JGE | BPF_K: + case BPF_JMP | BPF_JLE | BPF_K: case BPF_JMP | BPF_JSGT | BPF_K: + case BPF_JMP | BPF_JSLT | BPF_K: case BPF_JMP | BPF_JSGE | BPF_K: + case BPF_JMP | BPF_JSLE | BPF_K: /* cmp dst_reg, imm8/32 */ EMIT1(add_1mod(0x48, dst_reg)); @@ -935,18 +944,34 @@ emit_cond_jmp: /* convert BPF opcode to x86 */ /* GT is unsigned '>', JA in x86 */ jmp_cond = X86_JA; break; + case BPF_JLT: + /* LT is unsigned '<', JB in x86 */ + jmp_cond = X86_JB; + break; case BPF_JGE: /* GE is unsigned '>=', JAE in x86 */ jmp_cond = X86_JAE; break; + case BPF_JLE: + /* LE is unsigned '<=', JBE in x86 */ + jmp_cond = X86_JBE; + break; case BPF_JSGT: /* signed '>', GT in x86 */ jmp_cond = X86_JG; break; + case BPF_JSLT: + /* signed '<', LT in x86 */ + jmp_cond = X86_JL; + break; case BPF_JSGE: /* signed '>=', GE in x86 */ jmp_cond = X86_JGE; break; + case BPF_JSLE: + /* signed '<=', LE in x86 */ + jmp_cond = X86_JLE; + break; default: /* to silence gcc warning */ return -EFAULT; } diff --git a/arch/x86/pci/intel_mid_pci.c b/arch/x86/pci/intel_mid_pci.c index 5a18aedcb341..b901ece278dd 100644 --- 
a/arch/x86/pci/intel_mid_pci.c +++ b/arch/x86/pci/intel_mid_pci.c @@ -215,16 +215,23 @@ static int intel_mid_pci_irq_enable(struct pci_dev *dev) struct irq_alloc_info info; int polarity; int ret; + u8 gsi; if (dev->irq_managed && dev->irq > 0) return 0; + ret = pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &gsi); + if (ret < 0) { + dev_warn(&dev->dev, "Failed to read interrupt line: %d\n", ret); + return ret; + } + switch (intel_mid_identify_cpu()) { case INTEL_MID_CPU_CHIP_TANGIER: polarity = IOAPIC_POL_HIGH; /* Special treatment for IRQ0 */ - if (dev->irq == 0) { + if (gsi == 0) { /* * Skip HS UART common registers device since it has * IRQ0 assigned and not used by the kernel. @@ -253,10 +260,11 @@ static int intel_mid_pci_irq_enable(struct pci_dev *dev) * MRST only have IOAPIC, the PCI irq lines are 1:1 mapped to * IOAPIC RTE entries, so we just enable RTE for the device. */ - ret = mp_map_gsi_to_irq(dev->irq, IOAPIC_MAP_ALLOC, &info); + ret = mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC, &info); if (ret < 0) return ret; + dev->irq = ret; dev->irq_managed = 1; return 0; diff --git a/arch/x86/platform/intel-mid/device_libs/platform_bt.c b/arch/x86/platform/intel-mid/device_libs/platform_bt.c index 5a0483e7bf66..dc036e511f48 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_bt.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_bt.c @@ -60,7 +60,7 @@ static int __init tng_bt_sfi_setup(struct bt_sfi_data *ddata) return 0; } -static struct bt_sfi_data tng_bt_sfi_data __initdata = { +static const struct bt_sfi_data tng_bt_sfi_data __initdata = { .setup = tng_bt_sfi_setup, }; diff --git a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c index 9e304e2ea4f5..4f5fa65a1011 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c @@ -30,13 +30,13 @@ static int tangier_probe(struct platform_device *pdev) { struct irq_alloc_info info; struct intel_mid_wdt_pdata *pdata = pdev->dev.platform_data; - int gsi, irq; + int gsi = TANGIER_EXT_TIMER0_MSI; + int irq; if (!pdata) return -EINVAL; /* IOAPIC builds identity mapping between GSI and IRQ on MID */ - gsi = pdata->irq; ioapic_set_alloc_attr(&info, cpu_to_node(0), 1, 0); irq = mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC, &info); if (irq < 0) { @@ -44,11 +44,11 @@ static int tangier_probe(struct platform_device *pdev) return irq; } + pdata->irq = irq; return 0; } static struct intel_mid_wdt_pdata tangier_pdata = { - .irq = TANGIER_EXT_TIMER0_MSI, .probe = tangier_probe, }; diff --git a/arch/x86/platform/intel-mid/intel-mid.c b/arch/x86/platform/intel-mid/intel-mid.c index 12a272582cdc..86676cec99a1 100644 --- a/arch/x86/platform/intel-mid/intel-mid.c +++ b/arch/x86/platform/intel-mid/intel-mid.c @@ -183,6 +183,7 @@ void __init x86_intel_mid_early_setup(void) x86_init.timers.timer_init = intel_mid_time_init; x86_init.timers.setup_percpu_clockev = x86_init_noop; + x86_init.timers.wallclock_init = intel_mid_rtc_init; x86_init.irqs.pre_vector_init = x86_init_noop; @@ -191,7 +192,6 @@ void __init x86_intel_mid_early_setup(void) x86_cpuinit.setup_percpu_clockev = apbt_setup_secondary_clock; x86_platform.calibrate_tsc = intel_mid_calibrate_tsc; - x86_init.timers.wallclock_init = intel_mid_rtc_init; x86_platform.get_nmi_reason = intel_mid_get_nmi_reason; x86_init.pci.init = intel_mid_pci_init; diff --git a/arch/x86/platform/intel-mid/pwr.c b/arch/x86/platform/intel-mid/pwr.c index 
ef03852ea6e8..49ec5b94c71f 100644 --- a/arch/x86/platform/intel-mid/pwr.c +++ b/arch/x86/platform/intel-mid/pwr.c @@ -444,7 +444,7 @@ static int mid_set_initial_state(struct mid_pwr *pwr, const u32 *states) static int pnw_set_initial_state(struct mid_pwr *pwr) { /* On Penwell SRAM must stay powered on */ - const u32 states[] = { + static const u32 states[] = { 0xf00fffff, /* PM_SSC(0) */ 0xffffffff, /* PM_SSC(1) */ 0xffffffff, /* PM_SSC(2) */ @@ -455,7 +455,7 @@ static int pnw_set_initial_state(struct mid_pwr *pwr) static int tng_set_initial_state(struct mid_pwr *pwr) { - const u32 states[] = { + static const u32 states[] = { 0xffffffff, /* PM_SSC(0) */ 0xffffffff, /* PM_SSC(1) */ 0xffffffff, /* PM_SSC(2) */ diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 78459a6d455a..4d68d59f457d 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -181,6 +181,7 @@ static void fix_processor_context(void) #endif load_TR_desc(); /* This does ltr */ load_mm_ldt(current->active_mm); /* This does lldt */ + initialize_tlbstate_and_flush(); fpu__resume_cpu(); diff --git a/arch/x86/um/user-offsets.c b/arch/x86/um/user-offsets.c index ae4cd58c0c7a..02250b2633b8 100644 --- a/arch/x86/um/user-offsets.c +++ b/arch/x86/um/user-offsets.c @@ -50,7 +50,7 @@ void foo(void) DEFINE(HOST_GS, GS); DEFINE(HOST_ORIG_AX, ORIG_EAX); #else -#if defined(PTRACE_GETREGSET) && defined(PTRACE_SETREGSET) +#ifdef FP_XSTATE_MAGIC1 DEFINE(HOST_FP_SIZE, sizeof(struct _xstate) / sizeof(unsigned long)); #else DEFINE(HOST_FP_SIZE, sizeof(struct _fpstate) / sizeof(unsigned long)); diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index df1921751aa5..ae2a2e2d6362 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -501,7 +501,7 @@ static void __init xen_load_gdt_boot(const struct desc_ptr *dtr) static inline bool desc_equal(const struct desc_struct *d1, const struct desc_struct *d2) { - return d1->a == d2->a && d1->b == d2->b; + return !memcmp(d1, d2, sizeof(*d1)); } static void load_TLS_descriptor(struct thread_struct *t, @@ -586,59 +586,91 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, preempt_enable(); } +#ifdef CONFIG_X86_64 +struct trap_array_entry { + void (*orig)(void); + void (*xen)(void); + bool ist_okay; +}; + +static struct trap_array_entry trap_array[] = { + { debug, xen_xendebug, true }, + { int3, xen_xenint3, true }, + { double_fault, xen_double_fault, true }, +#ifdef CONFIG_X86_MCE + { machine_check, xen_machine_check, true }, +#endif + { nmi, xen_nmi, true }, + { overflow, xen_overflow, false }, +#ifdef CONFIG_IA32_EMULATION + { entry_INT80_compat, xen_entry_INT80_compat, false }, +#endif + { page_fault, xen_page_fault, false }, + { divide_error, xen_divide_error, false }, + { bounds, xen_bounds, false }, + { invalid_op, xen_invalid_op, false }, + { device_not_available, xen_device_not_available, false }, + { coprocessor_segment_overrun, xen_coprocessor_segment_overrun, false }, + { invalid_TSS, xen_invalid_TSS, false }, + { segment_not_present, xen_segment_not_present, false }, + { stack_segment, xen_stack_segment, false }, + { general_protection, xen_general_protection, false }, + { spurious_interrupt_bug, xen_spurious_interrupt_bug, false }, + { coprocessor_error, xen_coprocessor_error, false }, + { alignment_check, xen_alignment_check, false }, + { simd_coprocessor_error, xen_simd_coprocessor_error, false }, +}; + +static bool get_trap_addr(void **addr, unsigned int ist) +{ + unsigned int nr; + bool ist_okay = false; + + /* + * 
Replace trap handler addresses by Xen specific ones. + * Check for known traps using IST and whitelist them. + * The debugger ones are the only ones we care about. + * Xen will handle faults like double_fault, * so we should never see + * them. Warn if there's an unexpected IST-using fault handler. + */ + for (nr = 0; nr < ARRAY_SIZE(trap_array); nr++) { + struct trap_array_entry *entry = trap_array + nr; + + if (*addr == entry->orig) { + *addr = entry->xen; + ist_okay = entry->ist_okay; + break; + } + } + + if (WARN_ON(ist != 0 && !ist_okay)) + return false; + + return true; +} +#endif + static int cvt_gate_to_trap(int vector, const gate_desc *val, struct trap_info *info) { unsigned long addr; - if (val->type != GATE_TRAP && val->type != GATE_INTERRUPT) + if (val->bits.type != GATE_TRAP && val->bits.type != GATE_INTERRUPT) return 0; info->vector = vector; - addr = gate_offset(*val); + addr = gate_offset(val); #ifdef CONFIG_X86_64 - /* - * Look for known traps using IST, and substitute them - * appropriately. The debugger ones are the only ones we care - * about. Xen will handle faults like double_fault, - * so we should never see them. Warn if - * there's an unexpected IST-using fault handler. - */ - if (addr == (unsigned long)debug) - addr = (unsigned long)xen_debug; - else if (addr == (unsigned long)int3) - addr = (unsigned long)xen_int3; - else if (addr == (unsigned long)stack_segment) - addr = (unsigned long)xen_stack_segment; - else if (addr == (unsigned long)double_fault) { - /* Don't need to handle these */ + if (!get_trap_addr((void **)&addr, val->bits.ist)) return 0; -#ifdef CONFIG_X86_MCE - } else if (addr == (unsigned long)machine_check) { - /* - * when xen hypervisor inject vMCE to guest, - * use native mce handler to handle it - */ - ; -#endif - } else if (addr == (unsigned long)nmi) - /* - * Use the native version as well. - */ - ; - else { - /* Some other trap using IST? */ - if (WARN_ON(val->ist != 0)) - return 0; - } #endif /* CONFIG_X86_64 */ info->address = addr; - info->cs = gate_segment(*val); - info->flags = val->dpl; + info->cs = gate_segment(val); + info->flags = val->bits.dpl; /* interrupt gates clear IF */ - if (val->type == GATE_INTERRUPT) + if (val->bits.type == GATE_INTERRUPT) info->flags |= 1 << 2; return 1; @@ -988,59 +1020,6 @@ void __ref xen_setup_vcpu_info_placement(void) } } -static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, - unsigned long addr, unsigned len) -{ - char *start, *end, *reloc; - unsigned ret; - - start = end = reloc = NULL; - -#define SITE(op, x) \ - case PARAVIRT_PATCH(op.x): \ - if (xen_have_vcpu_info_placement) { \ - start = (char *)xen_##x##_direct; \ - end = xen_##x##_direct_end; \ - reloc = xen_##x##_direct_reloc; \ - } \ - goto patch_site - - switch (type) { - SITE(pv_irq_ops, irq_enable); - SITE(pv_irq_ops, irq_disable); - SITE(pv_irq_ops, save_fl); - SITE(pv_irq_ops, restore_fl); -#undef SITE - - patch_site: - if (start == NULL || (end-start) > len) - goto default_patch; - - ret = paravirt_patch_insns(insnbuf, len, start, end); - - /* Note: because reloc is assigned from something that - appears to be an array, gcc assumes it's non-null, - but doesn't know its relationship with start and - end. 
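The table-driven lookup added above can be restated as a tiny self-contained model, which may make the IST whitelist rule easier to see. Everything ending in _x below is an illustrative name, not a kernel identifier.

/* Standalone sketch of what get_trap_addr() does: map a native handler
 * address to its Xen wrapper and decide whether a non-zero IST slot is
 * acceptable for it. */
#include <stdbool.h>
#include <stddef.h>

struct trap_entry_x {
        void *orig;
        void *xen;
        bool ist_okay;
};

static bool remap_trap_x(const struct trap_entry_x *tbl, size_t n,
                         void **addr, unsigned int ist)
{
        bool ist_okay = false;
        size_t i;

        for (i = 0; i < n; i++) {
                if (*addr == tbl[i].orig) {
                        *addr = tbl[i].xen;     /* substitute the Xen wrapper */
                        ist_okay = tbl[i].ist_okay;
                        break;
                }
        }
        /* Reject any handler that uses IST without being whitelisted. */
        return !(ist != 0 && !ist_okay);
}

Adding a new trap then means adding one initializer to the table; the lookup itself never changes, which is the point of replacing the old if/else-if chain.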
*/ - if (reloc > start && reloc < end) { - int reloc_off = reloc - start; - long *relocp = (long *)(insnbuf + reloc_off); - long delta = start - (char *)addr; - - *relocp += delta; - } - break; - - default_patch: - default: - ret = paravirt_patch_default(type, clobbers, insnbuf, - addr, len); - break; - } - - return ret; -} - static const struct pv_info xen_info __initconst = { .shared_kernel_pmd = 0, @@ -1050,10 +1029,6 @@ static const struct pv_info xen_info __initconst = { .name = "Xen", }; -static const struct pv_init_ops xen_init_ops __initconst = { - .patch = xen_patch, -}; - static const struct pv_cpu_ops xen_cpu_ops __initconst = { .cpuid = xen_cpuid, @@ -1251,7 +1226,7 @@ asmlinkage __visible void __init xen_start_kernel(void) /* Install Xen paravirt ops */ pv_info = xen_info; - pv_init_ops = xen_init_ops; + pv_init_ops.patch = paravirt_patch_default; pv_cpu_ops = xen_cpu_ops; x86_platform.get_nmi_reason = xen_get_nmi_reason; diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c index 33e92955e09d..d4eff5676cfa 100644 --- a/arch/x86/xen/irq.c +++ b/arch/x86/xen/irq.c @@ -123,9 +123,6 @@ static const struct pv_irq_ops xen_irq_ops __initconst = { .safe_halt = xen_safe_halt, .halt = xen_halt, -#ifdef CONFIG_X86_64 - .adjust_exception_frame = xen_adjust_exception_frame, -#endif }; void __init xen_init_irq_ops(void) diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S index eff224df813f..dcd31fa39b5d 100644 --- a/arch/x86/xen/xen-asm.S +++ b/arch/x86/xen/xen-asm.S @@ -1,14 +1,8 @@ /* - * Asm versions of Xen pv-ops, suitable for either direct use or - * inlining. The inline versions are the same as the direct-use - * versions, with the pre- and post-amble chopped off. - * - * This code is encoded for size rather than absolute efficiency, with - * a view to being able to inline as much as possible. + * Asm versions of Xen pv-ops, suitable for direct use. * * We only bother with direct forms (ie, vcpu in percpu data) of the - * operations here; the indirect forms are better handled in C, since - * they're generally too large to inline anyway. + * operations here; the indirect forms are better handled in C. */ #include <asm/asm-offsets.h> @@ -16,7 +10,7 @@ #include <asm/processor-flags.h> #include <asm/frame.h> -#include "xen-asm.h" +#include <linux/linkage.h> /* * Enable events. This clears the event mask and tests the pending @@ -38,13 +32,11 @@ ENTRY(xen_irq_enable_direct) testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending jz 1f -2: call check_events + call check_events 1: -ENDPATCH(xen_irq_enable_direct) FRAME_END ret ENDPROC(xen_irq_enable_direct) - RELOC(xen_irq_enable_direct, 2b+1) /* @@ -53,10 +45,8 @@ ENDPATCH(xen_irq_enable_direct) */ ENTRY(xen_irq_disable_direct) movb $1, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask -ENDPATCH(xen_irq_disable_direct) ret - ENDPROC(xen_irq_disable_direct) - RELOC(xen_irq_disable_direct, 0) +ENDPROC(xen_irq_disable_direct) /* * (xen_)save_fl is used to get the current interrupt enable status. 
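The "*relocp += delta" step in the xen_patch() code removed above is plain pointer arithmetic: a template sequence that lived at start is copied to addr, so a PC-relative operand that was correct at start must shift by (start - addr). A worked example, with an illustrative helper and made-up addresses:

/* Illustrative only, not a kernel helper. */
static long adjust_pcrel_operand_x(long operand_at_template,
                                   unsigned long start, unsigned long addr)
{
        /* same adjustment as "*relocp += delta" with delta = start - addr */
        return operand_at_template + (long)(start - addr);
}

For instance, if the template at start = 0x1000 ends in a 5-byte call whose rel32 reaches check_events at 0x2000, that rel32 is 0x2000 - 0x1005 = 0xffb. Copied to addr = 0x3000 the call needs 0x2000 - 0x3005 = -0x1005, which is the old value plus (start - addr) = -0x2000.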
@@ -71,10 +61,8 @@ ENTRY(xen_save_fl_direct) testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask setz %ah addb %ah, %ah -ENDPATCH(xen_save_fl_direct) ret ENDPROC(xen_save_fl_direct) - RELOC(xen_save_fl_direct, 0) /* @@ -101,13 +89,11 @@ ENTRY(xen_restore_fl_direct) /* check for unmasked and pending */ cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending jnz 1f -2: call check_events + call check_events 1: -ENDPATCH(xen_restore_fl_direct) FRAME_END ret ENDPROC(xen_restore_fl_direct) - RELOC(xen_restore_fl_direct, 2b+1) /* diff --git a/arch/x86/xen/xen-asm.h b/arch/x86/xen/xen-asm.h deleted file mode 100644 index 465276467a47..000000000000 --- a/arch/x86/xen/xen-asm.h +++ /dev/null @@ -1,12 +0,0 @@ -#ifndef _XEN_XEN_ASM_H -#define _XEN_XEN_ASM_H - -#include <linux/linkage.h> - -#define RELOC(x, v) .globl x##_reloc; x##_reloc=v -#define ENDPATCH(x) .globl x##_end; x##_end=. - -/* Pseudo-flag used for virtual NMI, which we don't implement yet */ -#define XEN_EFLAGS_NMI 0x80000000 - -#endif diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S index feb6d40a0860..1200e262a116 100644 --- a/arch/x86/xen/xen-asm_32.S +++ b/arch/x86/xen/xen-asm_32.S @@ -1,14 +1,8 @@ /* - * Asm versions of Xen pv-ops, suitable for either direct use or - * inlining. The inline versions are the same as the direct-use - * versions, with the pre- and post-amble chopped off. - * - * This code is encoded for size rather than absolute efficiency, with - * a view to being able to inline as much as possible. + * Asm versions of Xen pv-ops, suitable for direct use. * * We only bother with direct forms (ie, vcpu in pda) of the - * operations here; the indirect forms are better handled in C, since - * they're generally too large to inline anyway. + * operations here; the indirect forms are better handled in C. */ #include <asm/thread_info.h> @@ -18,21 +12,10 @@ #include <xen/interface/xen.h> -#include "xen-asm.h" +#include <linux/linkage.h> -/* - * Force an event check by making a hypercall, but preserve regs - * before making the call. - */ -check_events: - push %eax - push %ecx - push %edx - call xen_force_evtchn_callback - pop %edx - pop %ecx - pop %eax - ret +/* Pseudo-flag used for virtual NMI, which we don't implement yet */ +#define XEN_EFLAGS_NMI 0x80000000 /* * This is run where a normal iret would be run, with the same stack setup: diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index c3df43141e70..dae2cc33afb5 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -1,14 +1,8 @@ /* - * Asm versions of Xen pv-ops, suitable for either direct use or - * inlining. The inline versions are the same as the direct-use - * versions, with the pre- and post-amble chopped off. - * - * This code is encoded for size rather than absolute efficiency, with - * a view to being able to inline as much as possible. + * Asm versions of Xen pv-ops, suitable for direct use. * * We only bother with direct forms (ie, vcpu in pda) of the - * operations here; the indirect forms are better handled in C, since - * they're generally too large to inline anyway. + * operations here; the indirect forms are better handled in C. 
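The xen_save_fl_direct asm above is terse; a C model of what it computes (a sketch, not the kernel implementation) is shown below. The setz/addb pair builds 0x200 in %ah when events are unmasked, so the returned flags word carries the IF bit (bit 9) and nothing else that callers rely on.

/* C model of xen_save_fl_direct's return value. */
static unsigned long xen_save_fl_model_x(unsigned char evtchn_upcall_mask)
{
        /* 0x200 == X86_EFLAGS_IF: "interrupts enabled" iff events unmasked */
        return evtchn_upcall_mask ? 0 : 0x200;
}

Callers only ever test that bit, which is why faking a full EFLAGS value is unnecessary.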
*/ #include <asm/errno.h> @@ -20,13 +14,44 @@ #include <xen/interface/xen.h> -#include "xen-asm.h" +#include <linux/linkage.h> + +.macro xen_pv_trap name +ENTRY(xen_\name) + pop %rcx + pop %r11 + jmp \name +END(xen_\name) +.endm -ENTRY(xen_adjust_exception_frame) - mov 8+0(%rsp), %rcx - mov 8+8(%rsp), %r11 - ret $16 -ENDPROC(xen_adjust_exception_frame) +xen_pv_trap divide_error +xen_pv_trap debug +xen_pv_trap xendebug +xen_pv_trap int3 +xen_pv_trap xenint3 +xen_pv_trap nmi +xen_pv_trap overflow +xen_pv_trap bounds +xen_pv_trap invalid_op +xen_pv_trap device_not_available +xen_pv_trap double_fault +xen_pv_trap coprocessor_segment_overrun +xen_pv_trap invalid_TSS +xen_pv_trap segment_not_present +xen_pv_trap stack_segment +xen_pv_trap general_protection +xen_pv_trap page_fault +xen_pv_trap spurious_interrupt_bug +xen_pv_trap coprocessor_error +xen_pv_trap alignment_check +#ifdef CONFIG_X86_MCE +xen_pv_trap machine_check +#endif /* CONFIG_X86_MCE */ +xen_pv_trap simd_coprocessor_error +#ifdef CONFIG_IA32_EMULATION +xen_pv_trap entry_INT80_compat +#endif +xen_pv_trap hypervisor_callback hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32 /* @@ -46,9 +71,7 @@ hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32 */ ENTRY(xen_iret) pushq $0 -1: jmp hypercall_iret -ENDPATCH(xen_iret) -RELOC(xen_iret, 1b+1) + jmp hypercall_iret ENTRY(xen_sysret64) /* @@ -65,9 +88,7 @@ ENTRY(xen_sysret64) pushq %rcx pushq $VGCF_in_syscall -1: jmp hypercall_iret -ENDPATCH(xen_sysret64) -RELOC(xen_sysret64, 1b+1) + jmp hypercall_iret /* * Xen handles syscall callbacks much like ordinary exceptions, which @@ -82,34 +103,47 @@ RELOC(xen_sysret64, 1b+1) * rip * r11 * rsp->rcx - * - * In all the entrypoints, we undo all that to make it look like a - * CPU-generated syscall/sysenter and jump to the normal entrypoint. */ -.macro undo_xen_syscall - mov 0*8(%rsp), %rcx - mov 1*8(%rsp), %r11 - mov 5*8(%rsp), %rsp -.endm - /* Normal 64-bit system call target */ ENTRY(xen_syscall_target) - undo_xen_syscall - jmp entry_SYSCALL_64_after_swapgs + popq %rcx + popq %r11 + + /* + * Neither Xen nor the kernel really knows what the old SS and + * CS were. The kernel expects __USER_DS and __USER_CS, so + * report those values even though Xen will guess its own values. + */ + movq $__USER_DS, 4*8(%rsp) + movq $__USER_CS, 1*8(%rsp) + + jmp entry_SYSCALL_64_after_hwframe ENDPROC(xen_syscall_target) #ifdef CONFIG_IA32_EMULATION /* 32-bit compat syscall target */ ENTRY(xen_syscall32_target) - undo_xen_syscall - jmp entry_SYSCALL_compat + popq %rcx + popq %r11 + + /* + * Neither Xen nor the kernel really knows what the old SS and + * CS were. The kernel expects __USER32_DS and __USER32_CS, so + * report those values even though Xen will guess its own values. + */ + movq $__USER32_DS, 4*8(%rsp) + movq $__USER32_CS, 1*8(%rsp) + + jmp entry_SYSCALL_compat_after_hwframe ENDPROC(xen_syscall32_target) /* 32-bit compat sysenter target */ ENTRY(xen_sysenter_target) - undo_xen_syscall + mov 0*8(%rsp), %rcx + mov 1*8(%rsp), %r11 + mov 5*8(%rsp), %rsp jmp entry_SYSENTER_compat ENDPROC(xen_sysenter_target) diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 0d5004477db6..c8a6d224f7ed 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -129,23 +129,15 @@ static inline void __init xen_efi_init(void) } #endif -/* Declare an asm function, along with symbols needed to make it - inlineable */ -#define DECL_ASM(ret, name, ...) 
\ - __visible ret name(__VA_ARGS__); \ - extern char name##_end[] __visible; \ - extern char name##_reloc[] __visible - -DECL_ASM(void, xen_irq_enable_direct, void); -DECL_ASM(void, xen_irq_disable_direct, void); -DECL_ASM(unsigned long, xen_save_fl_direct, void); -DECL_ASM(void, xen_restore_fl_direct, unsigned long); +__visible void xen_irq_enable_direct(void); +__visible void xen_irq_disable_direct(void); +__visible unsigned long xen_save_fl_direct(void); +__visible void xen_restore_fl_direct(unsigned long); /* These are not functions, and cannot be called normally */ __visible void xen_iret(void); __visible void xen_sysret32(void); __visible void xen_sysret64(void); -__visible void xen_adjust_exception_frame(void); extern int xen_panic_handler_init(void); diff --git a/arch/xtensa/include/asm/futex.h b/arch/xtensa/include/asm/futex.h index b39531babec0..eaaf1ebcc7a4 100644 --- a/arch/xtensa/include/asm/futex.h +++ b/arch/xtensa/include/asm/futex.h @@ -44,18 +44,10 @@ : "r" (uaddr), "I" (-EFAULT), "r" (oparg) \ : "memory") -static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) +static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, + u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; #if !XCHAL_HAVE_S32C1I return -ENOSYS; @@ -89,19 +81,10 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) pagefault_enable(); - if (ret) - return ret; + if (!ret) + *oval = oldval; - switch (cmp) { - case FUTEX_OP_CMP_EQ: return (oldval == cmparg); - case FUTEX_OP_CMP_NE: return (oldval != cmparg); - case FUTEX_OP_CMP_LT: return (oldval < cmparg); - case FUTEX_OP_CMP_GE: return (oldval >= cmparg); - case FUTEX_OP_CMP_LE: return (oldval <= cmparg); - case FUTEX_OP_CMP_GT: return (oldval > cmparg); - } - - return -ENOSYS; + return ret; } static inline int diff --git a/arch/xtensa/include/asm/spinlock.h b/arch/xtensa/include/asm/spinlock.h index a36221cf6363..3bb49681ee24 100644 --- a/arch/xtensa/include/asm/spinlock.h +++ b/arch/xtensa/include/asm/spinlock.h @@ -33,11 +33,6 @@ #define arch_spin_is_locked(x) ((x)->slock != 0) -static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - smp_cond_load_acquire(&lock->slock, !VAL); -} - #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) static inline void arch_spin_lock(arch_spinlock_t *lock) diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h index 24365b30aae9..b15b278aa314 100644 --- a/arch/xtensa/include/uapi/asm/mman.h +++ b/arch/xtensa/include/uapi/asm/mman.h @@ -103,20 +103,12 @@ overrides the coredump filter bits */ #define MADV_DODUMP 17 /* Clear the MADV_NODUMP flag */ +#define MADV_WIPEONFORK 18 /* Zero memory on fork, child only */ +#define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */ + /* compatibility flags */ #define MAP_FILE 0 -/* - * When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size. - * This gives us 6 bits, which is enough until someone invents 128 bit address - * spaces. - * - * Assume these are all power of twos. - * When 0 use the default page size. 
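The futex hunk above slims the arch hook down to the atomic user-space operation itself; the encoded_op decoding and the final comparison move to common code. A sketch of the generic caller this enables is below, built from the decode and compare logic deleted from the arch code above; the exact shape of the common-code function is an assumption.

/* Sketch of the common-code caller (assumes kernel headers such as
 * linux/futex.h and linux/uaccess.h). */
static int futex_atomic_op_inuser_x(int encoded_op, u32 __user *uaddr)
{
        int op = (encoded_op >> 28) & 7;
        int cmp = (encoded_op >> 24) & 15;
        int oparg = (encoded_op << 8) >> 20;
        int cmparg = (encoded_op << 20) >> 20;
        int oldval, ret;

        if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
                oparg = 1 << oparg;

        if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
                return -EFAULT;

        /* Arch does only the atomic op and reports the old value. */
        ret = arch_futex_atomic_op_inuser(op, oparg, &oldval, uaddr);
        if (ret)
                return ret;

        switch (cmp) {
        case FUTEX_OP_CMP_EQ: return oldval == cmparg;
        case FUTEX_OP_CMP_NE: return oldval != cmparg;
        case FUTEX_OP_CMP_LT: return oldval < cmparg;
        case FUTEX_OP_CMP_GE: return oldval >= cmparg;
        case FUTEX_OP_CMP_LE: return oldval <= cmparg;
        case FUTEX_OP_CMP_GT: return oldval > cmparg;
        default:              return -ENOSYS;
        }
}

With the duplication gone, every architecture implements only the part that genuinely differs.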
- */ -#define MAP_HUGE_SHIFT 26 -#define MAP_HUGE_MASK 0x3f - #define PKEY_DISABLE_ACCESS 0x1 #define PKEY_DISABLE_WRITE 0x2 #define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\ diff --git a/arch/xtensa/include/uapi/asm/socket.h b/arch/xtensa/include/uapi/asm/socket.h index 3eed2761c149..220059999e74 100644 --- a/arch/xtensa/include/uapi/asm/socket.h +++ b/arch/xtensa/include/uapi/asm/socket.h @@ -113,4 +113,6 @@ #define SO_PEERGROUPS 59 +#define SO_ZEROCOPY 60 + #endif /* _XTENSA_SOCKET_H */ diff --git a/arch/xtensa/kernel/setup.c b/arch/xtensa/kernel/setup.c index 33bfa5270d95..08175df7a69e 100644 --- a/arch/xtensa/kernel/setup.c +++ b/arch/xtensa/kernel/setup.c @@ -273,8 +273,8 @@ void __init init_arch(bp_tag_t *bp_start) * Initialize system. Setup memory and reserve regions. */ -extern char _end; -extern char _stext; +extern char _end[]; +extern char _stext[]; extern char _WindowVectors_text_start; extern char _WindowVectors_text_end; extern char _DebugInterruptVector_literal_start; @@ -333,7 +333,7 @@ void __init setup_arch(char **cmdline_p) } #endif - mem_reserve(__pa(&_stext), __pa(&_end)); + mem_reserve(__pa(_stext), __pa(_end)); #ifdef CONFIG_VECTORS_OFFSET mem_reserve(__pa(&_WindowVectors_text_start), |
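The xtensa setup.c hunk above switches the linker symbols from "extern char _end;" to "extern char _end[]". A brief illustrative fragment of why the array form is preferred: the symbol name itself decays to an address, so it can be handed to __pa()/mem_reserve() directly, and the compiler cannot assume it names a single one-byte object (the scalar form invites bounds warnings and over-eager optimization of address comparisons). The helper name below is made up for illustration.

extern char _stext[];
extern char _end[];

static unsigned long kernel_image_bytes_x(void)
{
        /* valid pointer arithmetic over the [_stext, _end) span */
        return (unsigned long)(_end - _stext);
}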