diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2023-04-28 09:43:49 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2023-04-28 09:43:49 -0700 |
commit | 22b8cc3e78f5448b4c5df00303817a9137cd663f (patch) | |
tree | 4c90f7ebbf4d439ae37f879f0e0f97c2eafd434b | |
parent | 7b664cc38ea7bdd5e3ce018bba98583741921bd4 (diff) | |
parent | 97740266de26e5dfe6e4fbecacb6995b66c2e378 (diff) |
Merge tag 'x86_mm_for_6.4' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 LAM (Linear Address Masking) support from Dave Hansen:
"Add support for the new Linear Address Masking CPU feature.
This is similar to ARM's Top Byte Ignore and allows userspace to store
metadata in some bits of pointers without masking it out before use"
* tag 'x86_mm_for_6.4' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
x86/mm/iommu/sva: Do not allow to set FORCE_TAGGED_SVA bit from outside
x86/mm/iommu/sva: Fix error code for LAM enabling failure due to SVA
selftests/x86/lam: Add test cases for LAM vs thread creation
selftests/x86/lam: Add ARCH_FORCE_TAGGED_SVA test cases for linear-address masking
selftests/x86/lam: Add inherit test cases for linear-address masking
selftests/x86/lam: Add io_uring test cases for linear-address masking
selftests/x86/lam: Add mmap and SYSCALL test cases for linear-address masking
selftests/x86/lam: Add malloc and tag-bits test cases for linear-address masking
x86/mm/iommu/sva: Make LAM and SVA mutually exclusive
iommu/sva: Replace pasid_valid() helper with mm_valid_pasid()
mm: Expose untagging mask in /proc/$PID/status
x86/mm: Provide arch_prctl() interface for LAM
x86/mm: Reduce untagged_addr() overhead for systems without LAM
x86/uaccess: Provide untagged_addr() and remove tags before address check
mm: Introduce untagged_addr_remote()
x86/mm: Handle LAM on context switch
x86: CPUID and CR3/CR4 flags for Linear Address Masking
x86: Allow atomic MM_CONTEXT flags setting
x86/mm: Rework address range check in get_user() and put_user()
35 files changed, 1701 insertions, 149 deletions
diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index 72dbd6400549..56911691bef0 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -288,6 +288,12 @@ void post_ttbr_update_workaround(void); unsigned long arm64_mm_context_get(struct mm_struct *mm); void arm64_mm_context_put(struct mm_struct *mm); +#define mm_untag_mask mm_untag_mask +static inline unsigned long mm_untag_mask(struct mm_struct *mm) +{ + return -1UL >> 8; +} + #include <asm-generic/mmu_context.h> #endif /* !__ASSEMBLY__ */ diff --git a/arch/sparc/include/asm/mmu_context_64.h b/arch/sparc/include/asm/mmu_context_64.h index 7a8380c63aab..799e797c5cdd 100644 --- a/arch/sparc/include/asm/mmu_context_64.h +++ b/arch/sparc/include/asm/mmu_context_64.h @@ -185,6 +185,12 @@ static inline void finish_arch_post_lock_switch(void) } } +#define mm_untag_mask mm_untag_mask +static inline unsigned long mm_untag_mask(struct mm_struct *mm) +{ + return -1UL >> adi_nbits(); +} + #include <asm-generic/mmu_context.h> #endif /* !(__ASSEMBLY__) */ diff --git a/arch/sparc/include/asm/uaccess_64.h b/arch/sparc/include/asm/uaccess_64.h index 94266a5c5b04..b825a5dd0210 100644 --- a/arch/sparc/include/asm/uaccess_64.h +++ b/arch/sparc/include/asm/uaccess_64.h @@ -8,8 +8,10 @@ #include <linux/compiler.h> #include <linux/string.h> +#include <linux/mm_types.h> #include <asm/asi.h> #include <asm/spitfire.h> +#include <asm/pgtable.h> #include <asm/processor.h> #include <asm-generic/access_ok.h> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index f87590c3c382..53bab123a8ee 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2290,6 +2290,17 @@ config RANDOMIZE_MEMORY_PHYSICAL_PADDING If unsure, leave at the default value. +config ADDRESS_MASKING + bool "Linear Address Masking support" + depends on X86_64 + help + Linear Address Masking (LAM) modifies the checking that is applied + to 64-bit linear addresses, allowing software to use of the + untranslated address bits for metadata. + + The capability can be used for efficient address sanitizers (ASAN) + implementation and for optimizations in JITs. + config HOTPLUG_CPU def_bool y depends on SMP diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c index d234ca797e4a..e0ca8120aea8 100644 --- a/arch/x86/entry/vsyscall/vsyscall_64.c +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -317,7 +317,7 @@ static struct vm_area_struct gate_vma __ro_after_init = { struct vm_area_struct *get_gate_vma(struct mm_struct *mm) { #ifdef CONFIG_COMPAT - if (!mm || !(mm->context.flags & MM_CONTEXT_HAS_VSYSCALL)) + if (!mm || !test_bit(MM_CONTEXT_HAS_VSYSCALL, &mm->context.flags)) return NULL; #endif if (vsyscall_mode == NONE) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 73c9672c123b..353b054812de 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -321,6 +321,7 @@ #define X86_FEATURE_LKGS (12*32+18) /* "" Load "kernel" (userspace) GS */ #define X86_FEATURE_AMX_FP16 (12*32+21) /* "" AMX fp16 Support */ #define X86_FEATURE_AVX_IFMA (12*32+23) /* "" Support for VPMADD52[H,L]UQ */ +#define X86_FEATURE_LAM (12*32+26) /* Linear Address Masking */ /* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */ #define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */ diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index 5dfa4fb76f4b..fafe9be7a6f4 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -75,6 +75,12 @@ # define DISABLE_CALL_DEPTH_TRACKING (1 << (X86_FEATURE_CALL_DEPTH & 31)) #endif +#ifdef CONFIG_ADDRESS_MASKING +# define DISABLE_LAM 0 +#else +# define DISABLE_LAM (1 << (X86_FEATURE_LAM & 31)) +#endif + #ifdef CONFIG_INTEL_IOMMU_SVM # define DISABLE_ENQCMD 0 #else @@ -115,7 +121,7 @@ #define DISABLED_MASK10 0 #define DISABLED_MASK11 (DISABLE_RETPOLINE|DISABLE_RETHUNK|DISABLE_UNRET| \ DISABLE_CALL_DEPTH_TRACKING) -#define DISABLED_MASK12 0 +#define DISABLED_MASK12 (DISABLE_LAM) #define DISABLED_MASK13 0 #define DISABLED_MASK14 0 #define DISABLED_MASK15 0 diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h index 5d7494631ea9..0da5c227f490 100644 --- a/arch/x86/include/asm/mmu.h +++ b/arch/x86/include/asm/mmu.h @@ -9,9 +9,13 @@ #include <linux/bits.h> /* Uprobes on this MM assume 32-bit code */ -#define MM_CONTEXT_UPROBE_IA32 BIT(0) +#define MM_CONTEXT_UPROBE_IA32 0 /* vsyscall page is accessible on this MM */ -#define MM_CONTEXT_HAS_VSYSCALL BIT(1) +#define MM_CONTEXT_HAS_VSYSCALL 1 +/* Do not allow changing LAM mode */ +#define MM_CONTEXT_LOCK_LAM 2 +/* Allow LAM and SVA coexisting */ +#define MM_CONTEXT_FORCE_TAGGED_SVA 3 /* * x86 has arch-specific MMU state beyond what lives in mm_struct. @@ -39,7 +43,15 @@ typedef struct { #endif #ifdef CONFIG_X86_64 - unsigned short flags; + unsigned long flags; +#endif + +#ifdef CONFIG_ADDRESS_MASKING + /* Active LAM mode: X86_CR3_LAM_U48 or X86_CR3_LAM_U57 or 0 (disabled) */ + unsigned long lam_cr3_mask; + + /* Significant bits of the virtual address. Excludes tag bits. */ + u64 untag_mask; #endif struct mutex lock; diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index c3ad8a526378..1d29dc791f5a 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -85,6 +85,51 @@ static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next) } #endif +#ifdef CONFIG_ADDRESS_MASKING +static inline unsigned long mm_lam_cr3_mask(struct mm_struct *mm) +{ + return mm->context.lam_cr3_mask; +} + +static inline void dup_lam(struct mm_struct *oldmm, struct mm_struct *mm) +{ + mm->context.lam_cr3_mask = oldmm->context.lam_cr3_mask; + mm->context.untag_mask = oldmm->context.untag_mask; +} + +#define mm_untag_mask mm_untag_mask +static inline unsigned long mm_untag_mask(struct mm_struct *mm) +{ + return mm->context.untag_mask; +} + +static inline void mm_reset_untag_mask(struct mm_struct *mm) +{ + mm->context.untag_mask = -1UL; +} + +#define arch_pgtable_dma_compat arch_pgtable_dma_compat +static inline bool arch_pgtable_dma_compat(struct mm_struct *mm) +{ + return !mm_lam_cr3_mask(mm) || + test_bit(MM_CONTEXT_FORCE_TAGGED_SVA, &mm->context.flags); +} +#else + +static inline unsigned long mm_lam_cr3_mask(struct mm_struct *mm) +{ + return 0; +} + +static inline void dup_lam(struct mm_struct *oldmm, struct mm_struct *mm) +{ +} + +static inline void mm_reset_untag_mask(struct mm_struct *mm) +{ +} +#endif + #define enter_lazy_tlb enter_lazy_tlb extern void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk); @@ -109,6 +154,7 @@ static inline int init_new_context(struct task_struct *tsk, mm->context.execute_only_pkey = -1; } #endif + mm_reset_untag_mask(mm); init_new_context_ldt(mm); return 0; } @@ -162,6 +208,7 @@ static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) { arch_dup_pkeys(oldmm, mm); paravirt_enter_mmap(mm); + dup_lam(oldmm, mm); return ldt_dup_context(oldmm, mm); } @@ -175,7 +222,7 @@ static inline void arch_exit_mmap(struct mm_struct *mm) static inline bool is_64bit_mm(struct mm_struct *mm) { return !IS_ENABLED(CONFIG_IA32_EMULATION) || - !(mm->context.flags & MM_CONTEXT_UPROBE_IA32); + !test_bit(MM_CONTEXT_UPROBE_IA32, &mm->context.flags); } #else static inline bool is_64bit_mm(struct mm_struct *mm) diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h index a7f3d9100adb..d8cccadc83a6 100644 --- a/arch/x86/include/asm/processor-flags.h +++ b/arch/x86/include/asm/processor-flags.h @@ -28,6 +28,8 @@ * On systems with SME, one bit (in a variable position!) is stolen to indicate * that the top-level paging structure is encrypted. * + * On systemms with LAM, bits 61 and 62 are used to indicate LAM mode. + * * All of the remaining bits indicate the physical address of the top-level * paging structure. * diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index cda3118f3b27..75bfaa421030 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -2,7 +2,7 @@ #ifndef _ASM_X86_TLBFLUSH_H #define _ASM_X86_TLBFLUSH_H -#include <linux/mm.h> +#include <linux/mm_types.h> #include <linux/sched.h> #include <asm/processor.h> @@ -12,6 +12,7 @@ #include <asm/invpcid.h> #include <asm/pti.h> #include <asm/processor-flags.h> +#include <asm/pgtable.h> void __flush_tlb_all(void); @@ -53,6 +54,15 @@ static inline void cr4_clear_bits(unsigned long mask) local_irq_restore(flags); } +#ifdef CONFIG_ADDRESS_MASKING +DECLARE_PER_CPU(u64, tlbstate_untag_mask); + +static inline u64 current_untag_mask(void) +{ + return this_cpu_read(tlbstate_untag_mask); +} +#endif + #ifndef MODULE /* * 6 because 6 should be plenty and struct tlb_state will fit in two cache @@ -101,6 +111,16 @@ struct tlb_state { */ bool invalidate_other; +#ifdef CONFIG_ADDRESS_MASKING + /* + * Active LAM mode. + * + * X86_CR3_LAM_U57/U48 shifted right by X86_CR3_LAM_U57_BIT or 0 if LAM + * disabled. + */ + u8 lam; +#endif + /* * Mask that contains TLB_NR_DYN_ASIDS+1 bits to indicate * the corresponding user PCID needs a flush next time we @@ -357,6 +377,32 @@ static inline bool huge_pmd_needs_flush(pmd_t oldpmd, pmd_t newpmd) } #define huge_pmd_needs_flush huge_pmd_needs_flush +#ifdef CONFIG_ADDRESS_MASKING +static inline u64 tlbstate_lam_cr3_mask(void) +{ + u64 lam = this_cpu_read(cpu_tlbstate.lam); + + return lam << X86_CR3_LAM_U57_BIT; +} + +static inline void set_tlbstate_lam_mode(struct mm_struct *mm) +{ + this_cpu_write(cpu_tlbstate.lam, + mm->context.lam_cr3_mask >> X86_CR3_LAM_U57_BIT); + this_cpu_write(tlbstate_untag_mask, mm->context.untag_mask); +} + +#else + +static inline u64 tlbstate_lam_cr3_mask(void) +{ + return 0; +} + +static inline void set_tlbstate_lam_mode(struct mm_struct *mm) +{ +} +#endif #endif /* !MODULE */ static inline void __native_tlb_flush_global(unsigned long cr4) diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 1cc756eafa44..457e814712af 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -7,11 +7,14 @@ #include <linux/compiler.h> #include <linux/instrumented.h> #include <linux/kasan-checks.h> +#include <linux/mm_types.h> #include <linux/string.h> +#include <linux/mmap_lock.h> #include <asm/asm.h> #include <asm/page.h> #include <asm/smap.h> #include <asm/extable.h> +#include <asm/tlbflush.h> #ifdef CONFIG_DEBUG_ATOMIC_SLEEP static inline bool pagefault_disabled(void); @@ -21,6 +24,57 @@ static inline bool pagefault_disabled(void); # define WARN_ON_IN_IRQ() #endif +#ifdef CONFIG_ADDRESS_MASKING +/* + * Mask out tag bits from the address. + * + * Magic with the 'sign' allows to untag userspace pointer without any branches + * while leaving kernel addresses intact. + */ +static inline unsigned long __untagged_addr(unsigned long addr) +{ + long sign; + + /* + * Refer tlbstate_untag_mask directly to avoid RIP-relative relocation + * in alternative instructions. The relocation gets wrong when gets + * copied to the target place. + */ + asm (ALTERNATIVE("", + "sar $63, %[sign]\n\t" /* user_ptr ? 0 : -1UL */ + "or %%gs:tlbstate_untag_mask, %[sign]\n\t" + "and %[sign], %[addr]\n\t", X86_FEATURE_LAM) + : [addr] "+r" (addr), [sign] "=r" (sign) + : "m" (tlbstate_untag_mask), "[sign]" (addr)); + + return addr; +} + +#define untagged_addr(addr) ({ \ + unsigned long __addr = (__force unsigned long)(addr); \ + (__force __typeof__(addr))__untagged_addr(__addr); \ +}) + +static inline unsigned long __untagged_addr_remote(struct mm_struct *mm, + unsigned long addr) +{ + long sign = addr >> 63; + + mmap_assert_locked(mm); + addr &= (mm)->context.untag_mask | sign; + + return addr; +} + +#define untagged_addr_remote(mm, addr) ({ \ + unsigned long __addr = (__force unsigned long)(addr); \ + (__force __typeof__(addr))__untagged_addr_remote(mm, __addr); \ +}) + +#else +#define untagged_addr(addr) (addr) +#endif + /** * access_ok - Checks if a user space pointer is valid * @addr: User space pointer to start of block to check @@ -38,10 +92,10 @@ static inline bool pagefault_disabled(void); * Return: true (nonzero) if the memory block may be valid, false (zero) * if it is definitely invalid. */ -#define access_ok(addr, size) \ +#define access_ok(addr, size) \ ({ \ WARN_ON_IN_IRQ(); \ - likely(__access_ok(addr, size)); \ + likely(__access_ok(untagged_addr(addr), size)); \ }) #include <asm-generic/access_ok.h> diff --git a/arch/x86/include/uapi/asm/prctl.h b/arch/x86/include/uapi/asm/prctl.h index f298c778f856..e8d7ebbca1a4 100644 --- a/arch/x86/include/uapi/asm/prctl.h +++ b/arch/x86/include/uapi/asm/prctl.h @@ -23,4 +23,9 @@ #define ARCH_MAP_VDSO_32 0x2002 #define ARCH_MAP_VDSO_64 0x2003 +#define ARCH_GET_UNTAG_MASK 0x4001 +#define ARCH_ENABLE_TAGGED_ADDR 0x4002 +#define ARCH_GET_MAX_TAG_BITS 0x4003 +#define ARCH_FORCE_TAGGED_SVA 0x4004 + #endif /* _ASM_X86_PRCTL_H */ diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h index c47cc7f2feeb..d898432947ff 100644 --- a/arch/x86/include/uapi/asm/processor-flags.h +++ b/arch/x86/include/uapi/asm/processor-flags.h @@ -82,6 +82,10 @@ #define X86_CR3_PCID_BITS 12 #define X86_CR3_PCID_MASK (_AC((1UL << X86_CR3_PCID_BITS) - 1, UL)) +#define X86_CR3_LAM_U57_BIT 61 /* Activate LAM for userspace, 62:57 bits masked */ +#define X86_CR3_LAM_U57 _BITULL(X86_CR3_LAM_U57_BIT) +#define X86_CR3_LAM_U48_BIT 62 /* Activate LAM for userspace, 62:48 bits masked */ +#define X86_CR3_LAM_U48 _BITULL(X86_CR3_LAM_U48_BIT) #define X86_CR3_PCID_NOFLUSH_BIT 63 /* Preserve old PCID */ #define X86_CR3_PCID_NOFLUSH _BITULL(X86_CR3_PCID_NOFLUSH_BIT) @@ -132,6 +136,8 @@ #define X86_CR4_PKE _BITUL(X86_CR4_PKE_BIT) #define X86_CR4_CET_BIT 23 /* enable Control-flow Enforcement Technology */ #define X86_CR4_CET _BITUL(X86_CR4_CET_BIT) +#define X86_CR4_LAM_SUP_BIT 28 /* LAM for supervisor pointers */ +#define X86_CR4_LAM_SUP _BITUL(X86_CR4_LAM_SUP_BIT) /* * x86-64 Task Priority Register, CR8 diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index b650cde3f64d..50d950771371 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -48,6 +48,7 @@ #include <asm/frame.h> #include <asm/unwind.h> #include <asm/tdx.h> +#include <asm/mmu_context.h> #include "process.h" @@ -162,6 +163,9 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) savesegment(es, p->thread.es); savesegment(ds, p->thread.ds); + + if (p->mm && (clone_flags & (CLONE_VM | CLONE_VFORK)) == CLONE_VM) + set_bit(MM_CONTEXT_LOCK_LAM, &p->mm->context.flags); #else p->thread.sp0 = (unsigned long) (childregs + 1); savesegment(gs, p->thread.gs); @@ -368,6 +372,8 @@ void arch_setup_new_exec(void) task_clear_spec_ssb_noexec(current); speculation_ctrl_update(read_thread_flags()); } + + mm_reset_untag_mask(current->mm); } #ifdef CONFIG_X86_IOPL_IOPERM diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index bb65a68b4b49..223b223f713f 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -671,7 +671,7 @@ void set_personality_64bit(void) task_pt_regs(current)->orig_ax = __NR_execve; current_thread_info()->status &= ~TS_COMPAT; if (current->mm) - current->mm->context.flags = MM_CONTEXT_HAS_VSYSCALL; + __set_bit(MM_CONTEXT_HAS_VSYSCALL, ¤t->mm->context.flags); /* TBD: overwrites user setup. Should have two bits. But 64bit processes have always behaved this way, @@ -708,7 +708,7 @@ static void __set_personality_ia32(void) * uprobes applied to this MM need to know this and * cannot use user_64bit_mode() at that time. */ - current->mm->context.flags = MM_CONTEXT_UPROBE_IA32; + __set_bit(MM_CONTEXT_UPROBE_IA32, ¤t->mm->context.flags); } current->personality |= force_personality32; @@ -743,6 +743,52 @@ static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr) } #endif +#ifdef CONFIG_ADDRESS_MASKING + +#define LAM_U57_BITS 6 + +static int prctl_enable_tagged_addr(struct mm_struct *mm, unsigned long nr_bits) +{ + if (!cpu_feature_enabled(X86_FEATURE_LAM)) + return -ENODEV; + + /* PTRACE_ARCH_PRCTL */ + if (current->mm != mm) + return -EINVAL; + + if (mm_valid_pasid(mm) && + !test_bit(MM_CONTEXT_FORCE_TAGGED_SVA, &mm->context.flags)) + return -EINVAL; + + if (mmap_write_lock_killable(mm)) + return -EINTR; + + if (test_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags)) { + mmap_write_unlock(mm); + return -EBUSY; + } + + if (!nr_bits) { + mmap_write_unlock(mm); + return -EINVAL; + } else if (nr_bits <= LAM_U57_BITS) { + mm->context.lam_cr3_mask = X86_CR3_LAM_U57; + mm->context.untag_mask = ~GENMASK(62, 57); + } else { + mmap_write_unlock(mm); + return -EINVAL; + } + + write_cr3(__read_cr3() | mm->context.lam_cr3_mask); + set_tlbstate_lam_mode(mm); + set_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags); + + mmap_write_unlock(mm); + + return 0; +} +#endif + long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2) { int ret = 0; @@ -830,7 +876,23 @@ long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2) case ARCH_MAP_VDSO_64: return prctl_map_vdso(&vdso_image_64, arg2); #endif - +#ifdef CONFIG_ADDRESS_MASKING + case ARCH_GET_UNTAG_MASK: + return put_user(task->mm->context.untag_mask, + (unsigned long __user *)arg2); + case ARCH_ENABLE_TAGGED_ADDR: + return prctl_enable_tagged_addr(task->mm, arg2); + case ARCH_FORCE_TAGGED_SVA: + if (current != task) + return -EINVAL; + set_bit(MM_CONTEXT_FORCE_TAGGED_SVA, &task->mm->context.flags); + return 0; + case ARCH_GET_MAX_TAG_BITS: + if (!cpu_feature_enabled(X86_FEATURE_LAM)) + return put_user(0, (unsigned long __user *)arg2); + else + return put_user(LAM_U57_BITS, (unsigned long __user *)arg2); +#endif default: ret = -EINVAL; break; diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index d317dc3d06a3..8b83d8fbce71 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -671,15 +671,15 @@ static bool try_fixup_enqcmd_gp(void) if (!cpu_feature_enabled(X86_FEATURE_ENQCMD)) return false; - pasid = current->mm->pasid; - /* * If the mm has not been allocated a * PASID, the #GP can not be fixed up. */ - if (!pasid_valid(pasid)) + if (!mm_valid_pasid(current->mm)) return false; + pasid = current->mm->pasid; + /* * Did this thread already have its PASID activated? * If so, the #GP must be from something else. diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S index b70d98d79a9d..b64a2bd1a1ef 100644 --- a/arch/x86/lib/getuser.S +++ b/arch/x86/lib/getuser.S @@ -37,22 +37,22 @@ #define ASM_BARRIER_NOSPEC ALTERNATIVE "", "lfence", X86_FEATURE_LFENCE_RDTSC -#ifdef CONFIG_X86_5LEVEL -#define LOAD_TASK_SIZE_MINUS_N(n) \ - ALTERNATIVE __stringify(mov $((1 << 47) - 4096 - (n)),%rdx), \ - __stringify(mov $((1 << 56) - 4096 - (n)),%rdx), X86_FEATURE_LA57 -#else -#define LOAD_TASK_SIZE_MINUS_N(n) \ - mov $(TASK_SIZE_MAX - (n)),%_ASM_DX -#endif +.macro check_range size:req +.if IS_ENABLED(CONFIG_X86_64) + mov %rax, %rdx + sar $63, %rdx + or %rdx, %rax +.else + cmp $TASK_SIZE_MAX-\size+1, %eax + jae .Lbad_get_user + sbb %edx, %edx /* array_index_mask_nospec() */ + and %edx, %eax +.endif +.endm .text SYM_FUNC_START(__get_user_1) - LOAD_TASK_SIZE_MINUS_N(0) - cmp %_ASM_DX,%_ASM_AX - jae bad_get_user - sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */ - and %_ASM_DX, %_ASM_AX + check_range size=1 ASM_STAC 1: movzbl (%_ASM_AX),%edx xor %eax,%eax @@ -62,11 +62,7 @@ SYM_FUNC_END(__get_user_1) EXPORT_SYMBOL(__get_user_1) SYM_FUNC_START(__get_user_2) - LOAD_TASK_SIZE_MINUS_N(1) - cmp %_ASM_DX,%_ASM_AX - jae bad_get_user - sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */ - and %_ASM_DX, %_ASM_AX + check_range size=2 ASM_STAC 2: movzwl (%_ASM_AX),%edx xor %eax,%eax @@ -76,11 +72,7 @@ SYM_FUNC_END(__get_user_2) EXPORT_SYMBOL(__get_user_2) SYM_FUNC_START(__get_user_4) - LOAD_TASK_SIZE_MINUS_N(3) - cmp %_ASM_DX,%_ASM_AX - jae bad_get_user - sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */ - and %_ASM_DX, %_ASM_AX + check_range size=4 ASM_STAC 3: movl (%_ASM_AX),%edx xor %eax,%eax @@ -90,30 +82,17 @@ SYM_FUNC_END(__get_user_4) EXPORT_SYMBOL(__get_user_4) SYM_FUNC_START(__get_user_8) -#ifdef CONFIG_X86_64 - LOAD_TASK_SIZE_MINUS_N(7) - cmp %_ASM_DX,%_ASM_AX - jae bad_get_user - sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */ - and %_ASM_DX, %_ASM_AX + check_range size=8 ASM_STAC +#ifdef CONFIG_X86_64 4: movq (%_ASM_AX),%rdx - xor %eax,%eax - ASM_CLAC - RET #else - LOAD_TASK_SIZE_MINUS_N(7) - cmp %_ASM_DX,%_ASM_AX - jae bad_get_user_8 - sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */ - and %_ASM_DX, %_ASM_AX - ASM_STAC 4: movl (%_ASM_AX),%edx 5: movl 4(%_ASM_AX),%ecx +#endif xor %eax,%eax ASM_CLAC RET -#endif SYM_FUNC_END(__get_user_8) EXPORT_SYMBOL(__get_user_8) @@ -166,7 +145,7 @@ EXPORT_SYMBOL(__get_user_nocheck_8) SYM_CODE_START_LOCAL(.Lbad_get_user_clac) ASM_CLAC -bad_get_user: +.Lbad_get_user: xor %edx,%edx mov $(-EFAULT),%_ASM_AX RET @@ -184,23 +163,23 @@ SYM_CODE_END(.Lbad_get_user_8_clac) #endif /* get_user */ - _ASM_EXTABLE_UA(1b, .Lbad_get_user_clac) - _ASM_EXTABLE_UA(2b, .Lbad_get_user_clac) - _ASM_EXTABLE_UA(3b, .Lbad_get_user_clac) + _ASM_EXTABLE(1b, .Lbad_get_user_clac) + _ASM_EXTABLE(2b, .Lbad_get_user_clac) + _ASM_EXTABLE(3b, .Lbad_get_user_clac) #ifdef CONFIG_X86_64 - _ASM_EXTABLE_UA(4b, .Lbad_get_user_clac) + _ASM_EXTABLE(4b, .Lbad_get_user_clac) #else - _ASM_EXTABLE_UA(4b, .Lbad_get_user_8_clac) - _ASM_EXTABLE_UA(5b, .Lbad_get_user_8_clac) + _ASM_EXTABLE(4b, .Lbad_get_user_8_clac) + _ASM_EXTABLE(5b, .Lbad_get_user_8_clac) #endif /* __get_user */ - _ASM_EXTABLE_UA(6b, .Lbad_get_user_clac) - _ASM_EXTABLE_UA(7b, .Lbad_get_user_clac) - _ASM_EXTABLE_UA(8b, .Lbad_get_user_clac) + _ASM_EXTABLE(6b, .Lbad_get_user_clac) + _ASM_EXTABLE(7b, .Lbad_get_user_clac) + _ASM_EXTABLE(8b, .Lbad_get_user_clac) #ifdef CONFIG_X86_64 - _ASM_EXTABLE_UA(9b, .Lbad_get_user_clac) + _ASM_EXTABLE(9b, .Lbad_get_user_clac) #else - _ASM_EXTABLE_UA(9b, .Lbad_get_user_8_clac) - _ASM_EXTABLE_UA(10b, .Lbad_get_user_8_clac) + _ASM_EXTABLE(9b, .Lbad_get_user_8_clac) + _ASM_EXTABLE(10b, .Lbad_get_user_8_clac) #endif diff --git a/arch/x86/lib/putuser.S b/arch/x86/lib/putuser.S index 32125224fcca..3062d09a776d 100644 --- a/arch/x86/lib/putuser.S +++ b/arch/x86/lib/putuser.S @@ -33,20 +33,20 @@ * as they get called from within inline assembly. */ -#ifdef CONFIG_X86_5LEVEL -#define LOAD_TASK_SIZE_MINUS_N(n) \ - ALTERNATIVE __stringify(mov $((1 << 47) - 4096 - (n)),%rbx), \ - __stringify(mov $((1 << 56) - 4096 - (n)),%rbx), X86_FEATURE_LA57 -#else -#define LOAD_TASK_SIZE_MINUS_N(n) \ - mov $(TASK_SIZE_MAX - (n)),%_ASM_BX -#endif +.macro check_range size:req +.if IS_ENABLED(CONFIG_X86_64) + mov %rcx, %rbx + sar $63, %rbx + or %rbx, %rcx +.else + cmp $TASK_SIZE_MAX-\size+1, %ecx + jae .Lbad_put_user +.endif +.endm .text SYM_FUNC_START(__put_user_1) - LOAD_TASK_SIZE_MINUS_N(0) - cmp %_ASM_BX,%_ASM_CX - jae .Lbad_put_user + check_range size=1 ASM_STAC 1: movb %al,(%_ASM_CX) xor %ecx,%ecx @@ -66,9 +66,7 @@ SYM_FUNC_END(__put_user_nocheck_1) EXPORT_SYMBOL(__put_user_nocheck_1) SYM_FUNC_START(__put_user_2) - LOAD_TASK_SIZE_MINUS_N(1) - cmp %_ASM_BX,%_ASM_CX - jae .Lbad_put_user + check_range size=2 ASM_STAC 3: movw %ax,(%_ASM_CX) xor %ecx,%ecx @@ -88,9 +86,7 @@ SYM_FUNC_END(__put_user_nocheck_2) EXPORT_SYMBOL(__put_user_nocheck_2) SYM_FUNC_START(__put_user_4) - LOAD_TASK_SIZE_MINUS_N(3) - cmp %_ASM_BX,%_ASM_CX - jae .Lbad_put_user + check_range size=4 ASM_STAC 5: movl %eax,(%_ASM_CX) xor %ecx,%ecx @@ -110,9 +106,7 @@ SYM_FUNC_END(__put_user_nocheck_4) EXPORT_SYMBOL(__put_user_nocheck_4) SYM_FUNC_START(__put_user_8) - LOAD_TASK_SIZE_MINUS_N(7) - cmp %_ASM_BX,%_ASM_CX - jae .Lbad_put_user + check_range size=8 ASM_STAC 7: mov %_ASM_AX,(%_ASM_CX) #ifdef CONFIG_X86_32 @@ -144,15 +138,15 @@ SYM_CODE_START_LOCAL(.Lbad_put_user_clac) RET SYM_CODE_END(.Lbad_put_user_clac) - _ASM_EXTABLE_UA(1b, .Lbad_put_user_clac) - _ASM_EXTABLE_UA(2b, .Lbad_put_user_clac) - _ASM_EXTABLE_UA(3b, .Lbad_put_user_clac) - _ASM_EXTABLE_UA(4b, .Lbad_put_user_clac) - _ASM_EXTABLE_UA(5b, .Lbad_put_user_clac) - _ASM_EXTABLE_UA(6b, .Lbad_put_user_clac) - _ASM_EXTABLE_UA(7b, .Lbad_put_user_clac) - _ASM_EXTABLE_UA(9b, .Lbad_put_user_clac) + _ASM_EXTABLE(1b, .Lbad_put_user_clac) + _ASM_EXTABLE(2b, .Lbad_put_user_clac) + _ASM_EXTABLE(3b, .Lbad_put_user_clac) + _ASM_EXTABLE(4b, .Lbad_put_user_clac) + _ASM_EXTABLE(5b, .Lbad_put_user_clac) + _ASM_EXTABLE(6b, .Lbad_put_user_clac) + _ASM_EXTABLE(7b, .Lbad_put_user_clac) + _ASM_EXTABLE(9b, .Lbad_put_user_clac) #ifdef CONFIG_X86_32 - _ASM_EXTABLE_UA(8b, .Lbad_put_user_clac) - _ASM_EXTABLE_UA(10b, .Lbad_put_user_clac) + _ASM_EXTABLE(8b, .Lbad_put_user_clac) + _ASM_EXTABLE(10b, .Lbad_put_user_clac) #endif diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index cbc53da4c1b4..3cdac0f0055d 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -1048,6 +1048,11 @@ __visible DEFINE_PER_CPU_ALIGNED(struct tlb_state, cpu_tlbstate) = { .cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */ }; +#ifdef CONFIG_ADDRESS_MASKING +DEFINE_PER_CPU(u64, tlbstate_untag_mask); +EXPORT_PER_CPU_SYMBOL(tlbstate_untag_mask); +#endif + void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache) { /* entry 0 MUST be WB (hardwired to speed up translations) */ diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 16c5292d227d..267acf27480a 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -154,26 +154,30 @@ static inline u16 user_pcid(u16 asid) return ret; } -static inline unsigned long build_cr3(pgd_t *pgd, u16 asid) +static inline unsigned long build_cr3(pgd_t *pgd, u16 asid, unsigned long lam) { + unsigned long cr3 = __sme_pa(pgd) | lam; + if (static_cpu_has(X86_FEATURE_PCID)) { - return __sme_pa(pgd) | kern_pcid(asid); + VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE); + cr3 |= kern_pcid(asid); } else { VM_WARN_ON_ONCE(asid != 0); - return __sme_pa(pgd); } + + return cr3; } -static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid) +static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid, + unsigned long lam) { - VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE); /* * Use boot_cpu_has() instead of this_cpu_has() as this function * might be called during early boot. This should work even after * boot because all CPU's the have same capabilities: */ VM_WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_PCID)); - return __sme_pa(pgd) | kern_pcid(asid) | CR3_NOFLUSH; + return build_cr3(pgd, asid, lam) | CR3_NOFLUSH; } /* @@ -274,15 +278,16 @@ static inline void invalidate_user_asid(u16 asid) (unsigned long *)this_cpu_ptr(&cpu_tlbstate.user_pcid_flush_mask)); } -static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush) +static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, unsigned long lam, + bool need_flush) { unsigned long new_mm_cr3; if (need_flush) { invalidate_user_asid(new_asid); - new_mm_cr3 = build_cr3(pgdir, new_asid); + new_mm_cr3 = build_cr3(pgdir, new_asid, lam); } else { - new_mm_cr3 = build_cr3_noflush(pgdir, new_asid); + new_mm_cr3 = build_cr3_noflush(pgdir, new_asid, lam); } /* @@ -491,6 +496,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, { struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm); u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); + unsigned long new_lam = mm_lam_cr3_mask(next); bool was_lazy = this_cpu_read(cpu_tlbstate_shared.is_lazy); unsigned cpu = smp_processor_id(); u64 next_tlb_gen; @@ -520,7 +526,8 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, * isn't free. */ #ifdef CONFIG_DEBUG_VM - if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev->pgd, prev_asid))) { + if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev->pgd, prev_asid, + tlbstate_lam_cr3_mask()))) { /* * If we were to BUG here, we'd be very likely to kill * the system so hard that we don't see the call trace. @@ -552,10 +559,16 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, * instruction. */ if (real_prev == next) { + /* Not actually switching mm's */ VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) != next->context.ctx_id); /* + * If this races with another thread that enables lam, 'new_lam' + * might not match tlbstate_lam_cr3_mask(). + */ + + /* * Even in lazy TLB mode, the CPU should stay set in the * mm_cpumask. The TLB shootdown code can figure out from * cpu_tlbstate_shared.is_lazy whether or not to send an IPI. @@ -622,15 +635,16 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, barrier(); } + set_tlbstate_lam_mode(next); if (need_flush) { this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); - load_new_mm_cr3(next->pgd, new_asid, true); + load_new_mm_cr3(next->pgd, new_asid, new_lam, true); trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); } else { /* The new ASID is already up to date. */ - load_new_mm_cr3(next->pgd, new_asid, false); + load_new_mm_cr3(next->pgd, new_asid, new_lam, false); trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0); } @@ -691,6 +705,10 @@ void initialize_tlbstate_and_flush(void) /* Assert that CR3 already references the right mm. */ WARN_ON((cr3 & CR3_ADDR_MASK) != __pa(mm->pgd)); + /* LAM expected to be disabled */ + WARN_ON(cr3 & (X86_CR3_LAM_U48 | X86_CR3_LAM_U57)); + WARN_ON(mm_lam_cr3_mask(mm)); + /* * Assert that CR4.PCIDE is set if needed. (CR4.PCIDE initialization * doesn't work like other CR4 bits because it can only be set from @@ -699,8 +717,8 @@ void initialize_tlbstate_and_flush(void) WARN_ON(boot_cpu_has(X86_FEATURE_PCID) && !(cr4_read_shadow() & X86_CR4_PCIDE)); - /* Force ASID 0 and force a TLB flush. */ - write_cr3(build_cr3(mm->pgd, 0)); + /* Disable LAM, force ASID 0 and force a TLB flush. */ + write_cr3(build_cr3(mm->pgd, 0, 0)); /* Reinitialize tlbstate. */ this_cpu_write(cpu_tlbstate.last_user_mm_spec, LAST_USER_MM_INIT); @@ -708,6 +726,7 @@ void initialize_tlbstate_and_flush(void) this_cpu_write(cpu_tlbstate.next_asid, 1); this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id); this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, tlb_gen); + set_tlbstate_lam_mode(mm); for (i = 1; i < TLB_NR_DYN_ASIDS; i++) this_cpu_write(cpu_tlbstate.ctxs[i].ctx_id, 0); @@ -1071,8 +1090,10 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end) */ unsigned long __get_current_cr3_fast(void) { - unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd, - this_cpu_read(cpu_tlbstate.loaded_mm_asid)); + unsigned long cr3 = + build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd, + this_cpu_read(cpu_tlbstate.loaded_mm_asid), + tlbstate_lam_cr3_mask()); /* For now, be very restrictive about when this can be called. */ VM_WARN_ON(in_nmi() || preemptible()); diff --git a/drivers/iommu/iommu-sva.c b/drivers/iommu/iommu-sva.c index 24bf9b2b58aa..dd76a1a09cf7 100644 --- a/drivers/iommu/iommu-sva.c +++ b/drivers/iommu/iommu-sva.c @@ -2,6 +2,7 @@ /* * Helpers for IOMMU drivers implementing SVA */ +#include <linux/mmu_context.h> #include <linux/mutex.h> #include <linux/sched/mm.h> #include <linux/iommu.h> @@ -32,16 +33,19 @@ int iommu_sva_alloc_pasid(struct mm_struct *mm, ioasid_t min, ioasid_t max) min == 0 || max < min) return -EINVAL; + if (!arch_pgtable_dma_compat(mm)) + return -EBUSY; + mutex_lock(&iommu_sva_lock); /* Is a PASID already associated with this mm? */ - if (pasid_valid(mm->pasid)) { + if (mm_valid_pasid(mm)) { if (mm->pasid < min || mm->pasid >= max) ret = -EOVERFLOW; goto out; } pasid = ioasid_alloc(&iommu_sva_pasid, min, max, mm); - if (!pasid_valid(pasid)) + if (pasid == INVALID_IOASID) ret = -ENOMEM; else mm_pasid_set(mm, pasid); diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 493c31de0edb..3d4dd9420c30 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -580,7 +580,7 @@ static int vaddr_get_pfns(struct mm_struct *mm, unsigned long vaddr, goto done; } - vaddr = untagged_addr(vaddr); + vaddr = untagged_addr_remote(mm, vaddr); retry: vma = vma_lookup(mm, vaddr); diff --git a/fs/proc/array.c b/fs/proc/array.c index 425824ad85e1..d35bbf35a874 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -91,6 +91,7 @@ #include <linux/user_namespace.h> #include <linux/fs_struct.h> #include <linux/kthread.h> +#include <linux/mmu_context.h> #include <asm/processor.h> #include "internal.h" @@ -425,6 +426,11 @@ static inline void task_thp_status(struct seq_file *m, struct mm_struct *mm) seq_printf(m, "THP_enabled:\t%d\n", thp_enabled); } +static inline void task_untag_mask(struct seq_file *m, struct mm_struct *mm) +{ + seq_printf(m, "untag_mask:\t%#lx\n", mm_untag_mask(mm)); +} + int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { @@ -440,6 +446,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, task_mem(m, mm); task_core_dumping(m, task); task_thp_status(m, mm); + task_untag_mask(m, mm); mmput(mm); } task_sig(m, task); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index cb49479acd2e..420510f6a545 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1688,8 +1688,13 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, /* watch out for wraparound */ start_vaddr = end_vaddr; - if (svpfn <= (ULONG_MAX >> PAGE_SHIFT)) - start_vaddr = untagged_addr(svpfn << PAGE_SHIFT); + if (svpfn <= (ULONG_MAX >> PAGE_SHIFT)) { + ret = mmap_read_lock_killable(mm); + if (ret) + goto out_free; + start_vaddr = untagged_addr_remote(mm, svpfn << PAGE_SHIFT); + mmap_read_unlock(mm); + } /* Ensure the address is inside the task */ if (start_vaddr > mm->task_size) diff --git a/include/linux/ioasid.h b/include/linux/ioasid.h index af1c9d62e642..836ae09e92c2 100644 --- a/include/linux/ioasid.h +++ b/include/linux/ioasid.h @@ -40,10 +40,6 @@ void *ioasid_find(struct ioasid_set *set, ioasid_t ioasid, int ioasid_register_allocator(struct ioasid_allocator_ops *allocator); void ioasid_unregister_allocator(struct ioasid_allocator_ops *allocator); int ioasid_set_data(ioasid_t ioasid, void *data); -static inline bool pasid_valid(ioasid_t ioasid) -{ - return ioasid != INVALID_IOASID; -} #else /* !CONFIG_IOASID */ static inline ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min, @@ -74,10 +70,5 @@ static inline int ioasid_set_data(ioasid_t ioasid, void *data) return -ENOTSUPP; } -static inline bool pasid_valid(ioasid_t ioasid) -{ - return false; -} - #endif /* CONFIG_IOASID */ #endif /* __LINUX_IOASID_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 3731999cd9f0..27ce77080c79 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -98,17 +98,6 @@ extern int mmap_rnd_compat_bits __read_mostly; #include <asm/page.h> #include <asm/processor.h> -/* - * Architectures that support memory tagging (assigning tags to memory regions, - * embedding these tags into addresses that point to these memory regions, and - * checking that the memory and the pointer tags match on memory accesses) - * redefine this macro to strip tags from pointers. - * It's defined as noop for architectures that don't support memory tagging. - */ -#ifndef untagged_addr -#define untagged_addr(addr) (addr) -#endif - #ifndef __pa_symbol #define __pa_symbol(x) __pa(RELOC_HIDE((unsigned long)(x), 0)) #endif diff --git a/include/linux/mmu_context.h b/include/linux/mmu_context.h index b9b970f7ab45..f2b7a3f04099 100644 --- a/include/linux/mmu_context.h +++ b/include/linux/mmu_context.h @@ -28,4 +28,18 @@ static inline void leave_mm(int cpu) { } # define task_cpu_possible(cpu, p) cpumask_test_cpu((cpu), task_cpu_possible_mask(p)) #endif +#ifndef mm_untag_mask +static inline unsigned long mm_untag_mask(struct mm_struct *mm) +{ + return -1UL; +} +#endif + +#ifndef arch_pgtable_dma_compat +static inline bool arch_pgtable_dma_compat(struct mm_struct *mm) +{ + return true; +} +#endif + #endif diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 689dbe812563..af12fcb11005 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -485,6 +485,11 @@ static inline void mm_pasid_init(struct mm_struct *mm) mm->pasid = INVALID_IOASID; } +static inline bool mm_valid_pasid(struct mm_struct *mm) +{ + return mm->pasid != INVALID_IOASID; +} + /* Associate a PASID with an mm_struct: */ static inline void mm_pasid_set(struct mm_struct *mm, u32 pasid) { @@ -493,13 +498,14 @@ static inline void mm_pasid_set(struct mm_struct *mm, u32 pasid) static inline void mm_pasid_drop(struct mm_struct *mm) { - if (pasid_valid(mm->pasid)) { + if (mm_valid_pasid(mm)) { ioasid_free(mm->pasid); mm->pasid = INVALID_IOASID; } } #else static inline void mm_pasid_init(struct mm_struct *mm) {} +static inline bool mm_valid_pasid(struct mm_struct *mm) { return false; } static inline void mm_pasid_set(struct mm_struct *mm, u32 pasid) {} static inline void mm_pasid_drop(struct mm_struct *mm) {} #endif diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index ab9728138ad6..3064314f4832 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -11,6 +11,28 @@ #include <asm/uaccess.h> /* + * Architectures that support memory tagging (assigning tags to memory regions, + * embedding these tags into addresses that point to these memory regions, and + * checking that the memory and the pointer tags match on memory accesses) + * redefine this macro to strip tags from pointers. + * + * Passing down mm_struct allows to define untagging rules on per-process + * basis. + * + * It's defined as noop for architectures that don't support memory tagging. + */ +#ifndef untagged_addr +#define untagged_addr(addr) (addr) +#endif + +#ifndef untagged_addr_remote +#define untagged_addr_remote(mm, addr) ({ \ + mmap_assert_locked(mm); \ + untagged_addr(addr); \ +}) +#endif + +/* * Architectures should provide two primitives (raw_copy_{to,from}_user()) * and get rid of their private instances of copy_{to,from}_user() and * __copy_{to,from}_user{,_inatomic}(). @@ -1085,7 +1085,7 @@ static long __get_user_pages(struct mm_struct *mm, if (!nr_pages) return 0; - start = untagged_addr(start); + start = untagged_addr_remote(mm, start); VM_BUG_ON(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN))); @@ -1259,7 +1259,7 @@ int fixup_user_fault(struct mm_struct *mm, struct vm_area_struct *vma; vm_fault_t ret; - address = untagged_addr(address); + address = untagged_addr_remote(mm, address); if (unlocked) fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; diff --git a/mm/madvise.c b/mm/madvise.c index 24c5cffe3e6c..b5ffbaf616f5 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -1390,8 +1390,6 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh size_t len; struct blk_plug plug; - start = untagged_addr(start); - if (!madvise_behavior_valid(behavior)) return -EINVAL; @@ -1423,6 +1421,9 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh mmap_read_lock(mm); } + start = untagged_addr_remote(mm, start); + end = start + len; + blk_start_plug(&plug); error = madvise_walk_vmas(mm, start, end, behavior, madvise_vma_behavior); diff --git a/mm/migrate.c b/mm/migrate.c index 02cace7955d4..01cac26a3127 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2099,15 +2099,18 @@ static int do_move_pages_to_node(struct mm_struct *mm, * target node * 1 - when it has been queued */ -static int add_page_for_migration(struct mm_struct *mm, unsigned long addr, +static int add_page_for_migration(struct mm_struct *mm, const void __user *p, int node, struct list_head *pagelist, bool migrate_all) { struct vm_area_struct *vma; + unsigned long addr; struct page *page; int err; bool isolated; mmap_read_lock(mm); + addr = (unsigned long)untagged_addr_remote(mm, p); + err = -EFAULT; vma = vma_lookup(mm, addr); if (!vma || !vma_migratable(vma)) @@ -2213,7 +2216,6 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes, for (i = start = 0; i < nr_pages; i++) { const void __user *p; - unsigned long addr; int node; err = -EFAULT; @@ -2221,7 +2223,6 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes, goto out_flush; if (get_user(node, nodes + i)) goto out_flush; - addr = (unsigned long)untagged_addr(p); err = -ENODEV; if (node < 0 || node >= MAX_NUMNODES) @@ -2249,8 +2250,8 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes, * Errors in the page lookup or isolation are not fatal and we simply * report them via status */ - err = add_page_for_migration(mm, addr, current_node, - &pagelist, flags & MPOL_MF_MOVE_ALL); + err = add_page_for_migration(mm, p, current_node, &pagelist, + flags & MPOL_MF_MOVE_ALL); if (err > 0) { /* The page is successfully queued for migration */ diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile index ca9374b56ead..598135d3162b 100644 --- a/tools/testing/selftests/x86/Makefile +++ b/tools/testing/selftests/x86/Makefile @@ -18,7 +18,7 @@ TARGETS_C_32BIT_ONLY := entry_from_vm86 test_syscall_vdso unwind_vdso \ test_FCMOV test_FCOMI test_FISTTP \ vdso_restorer TARGETS_C_64BIT_ONLY := fsgsbase sysret_rip syscall_numbering \ - corrupt_xstate_header amx + corrupt_xstate_header amx lam # Some selftests require 32bit support enabled also on 64bit systems TARGETS_C_32BIT_NEEDED := ldt_gdt ptrace_syscall diff --git a/tools/testing/selftests/x86/lam.c b/tools/testing/selftests/x86/lam.c new file mode 100644 index 000000000000..eb0e46905bf9 --- /dev/null +++ b/tools/testing/selftests/x86/lam.c @@ -0,0 +1,1241 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/syscall.h> +#include <time.h> +#include <signal.h> +#include <setjmp.h> +#include <sys/mman.h> +#include <sys/utsname.h> +#include <sys/wait.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <inttypes.h> +#include <sched.h> + +#include <sys/uio.h> +#include <linux/io_uring.h> +#include "../kselftest.h" + +#ifndef __x86_64__ +# error This test is 64-bit only +#endif + +/* LAM modes, these definitions were copied from kernel code */ +#define LAM_NONE 0 +#define LAM_U57_BITS 6 + +#define LAM_U57_MASK (0x3fULL << 57) +/* arch prctl for LAM */ +#define ARCH_GET_UNTAG_MASK 0x4001 +#define ARCH_ENABLE_TAGGED_ADDR 0x4002 +#define ARCH_GET_MAX_TAG_BITS 0x4003 +#define ARCH_FORCE_TAGGED_SVA 0x4004 + +/* Specified test function bits */ +#define FUNC_MALLOC 0x1 +#define FUNC_BITS 0x2 +#define FUNC_MMAP 0x4 +#define FUNC_SYSCALL 0x8 +#define FUNC_URING 0x10 +#define FUNC_INHERITE 0x20 +#define FUNC_PASID 0x40 + +#define TEST_MASK 0x7f + +#define LOW_ADDR (0x1UL << 30) +#define HIGH_ADDR (0x3UL << 48) + +#define MALLOC_LEN 32 + +#define PAGE_SIZE (4 << 10) + +#define STACK_SIZE 65536 + +#define barrier() ({ \ + __asm__ __volatile__("" : : : "memory"); \ +}) + +#define URING_QUEUE_SZ 1 +#define URING_BLOCK_SZ 2048 + +/* Pasid test define */ +#define LAM_CMD_BIT 0x1 +#define PAS_CMD_BIT 0x2 +#define SVA_CMD_BIT 0x4 + +#define PAS_CMD(cmd1, cmd2, cmd3) (((cmd3) << 8) | ((cmd2) << 4) | ((cmd1) << 0)) + +struct testcases { + unsigned int later; + int expected; /* 2: SIGSEGV Error; 1: other errors */ + unsigned long lam; + uint64_t addr; + uint64_t cmd; + int (*test_func)(struct testcases *test); + const char *msg; +}; + +/* Used by CQ of uring, source file handler and file's size */ +struct file_io { + int file_fd; + off_t file_sz; + struct iovec iovecs[]; +}; + +struct io_uring_queue { + unsigned int *head; + unsigned int *tail; + unsigned int *ring_mask; + unsigned int *ring_entries; + unsigned int *flags; + unsigned int *array; + union { + struct io_uring_cqe *cqes; + struct io_uring_sqe *sqes; + } queue; + size_t ring_sz; +}; + +struct io_ring { + int ring_fd; + struct io_uring_queue sq_ring; + struct io_uring_queue cq_ring; +}; + +int tests_cnt; +jmp_buf segv_env; + +static void segv_handler(int sig) +{ + ksft_print_msg("Get segmentation fault(%d).", sig); + + siglongjmp(segv_env, 1); +} + +static inline int cpu_has_lam(void) +{ + unsigned int cpuinfo[4]; + + __cpuid_count(0x7, 1, cpuinfo[0], cpuinfo[1], cpuinfo[2], cpuinfo[3]); + + return (cpuinfo[0] & (1 << 26)); +} + +/* Check 5-level page table feature in CPUID.(EAX=07H, ECX=00H):ECX.[bit 16] */ +static inline int cpu_has_la57(void) +{ + unsigned int cpuinfo[4]; + + __cpuid_count(0x7, 0, cpuinfo[0], cpuinfo[1], cpuinfo[2], cpuinfo[3]); + + return (cpuinfo[2] & (1 << 16)); +} + +/* + * Set tagged address and read back untag mask. + * check if the untagged mask is expected. + * + * @return: + * 0: Set LAM mode successfully + * others: failed to set LAM + */ +static int set_lam(unsigned long lam) +{ + int ret = 0; + uint64_t ptr = 0; + + if (lam != LAM_U57_BITS && lam != LAM_NONE) + return -1; + + /* Skip check return */ + syscall(SYS_arch_prctl, ARCH_ENABLE_TAGGED_ADDR, lam); + + /* Get untagged mask */ + syscall(SYS_arch_prctl, ARCH_GET_UNTAG_MASK, &ptr); + + /* Check mask returned is expected */ + if (lam == LAM_U57_BITS) + ret = (ptr != ~(LAM_U57_MASK)); + else if (lam == LAM_NONE) + ret = (ptr != -1ULL); + + return ret; +} + +static unsigned long get_default_tag_bits(void) +{ + pid_t pid; + int lam = LAM_NONE; + int ret = 0; + + pid = fork(); + if (pid < 0) { + perror("Fork failed."); + } else if (pid == 0) { + /* Set LAM mode in child process */ + if (set_lam(LAM_U57_BITS) == 0) + lam = LAM_U57_BITS; + else + lam = LAM_NONE; + exit(lam); + } else { + wait(&ret); + lam = WEXITSTATUS(ret); + } + + return lam; +} + +/* + * Set tagged address and read back untag mask. + * check if the untag mask is expected. + */ +static int get_lam(void) +{ + uint64_t ptr = 0; + int ret = -1; + /* Get untagged mask */ + if (syscall(SYS_arch_prctl, ARCH_GET_UNTAG_MASK, &ptr) == -1) + return -1; + + /* Check mask returned is expected */ + if (ptr == ~(LAM_U57_MASK)) + ret = LAM_U57_BITS; + else if (ptr == -1ULL) + ret = LAM_NONE; + + + return ret; +} + +/* According to LAM mode, set metadata in high bits */ +static uint64_t set_metadata(uint64_t src, unsigned long lam) +{ + uint64_t metadata; + + srand(time(NULL)); + + switch (lam) { + case LAM_U57_BITS: /* Set metadata in bits 62:57 */ + /* Get a random non-zero value as metadata */ + metadata = (rand() % ((1UL << LAM_U57_BITS) - 1) + 1) << 57; + metadata |= (src & ~(LAM_U57_MASK)); + break; + default: + metadata = src; + break; + } + + return metadata; +} + +/* + * Set metadata in user pointer, compare new pointer with original pointer. + * both pointers should point to the same address. + * + * @return: + * 0: value on the pointer with metadate and value on original are same + * 1: not same. + */ +static int handle_lam_test(void *src, unsigned int lam) +{ + char *ptr; + + strcpy((char *)src, "USER POINTER"); + + ptr = (char *)set_metadata((uint64_t)src, lam); + if (src == ptr) + return 0; + + /* Copy a string into the pointer with metadata */ + strcpy((char *)ptr, "METADATA POINTER"); + + return (!!strcmp((char *)src, (char *)ptr)); +} + + +int handle_max_bits(struct testcases *test) +{ + unsigned long exp_bits = get_default_tag_bits(); + unsigned long bits = 0; + + if (exp_bits != LAM_NONE) + exp_bits = LAM_U57_BITS; + + /* Get LAM max tag bits */ + if (syscall(SYS_arch_prctl, ARCH_GET_MAX_TAG_BITS, &bits) == -1) + return 1; + + return (exp_bits != bits); +} + +/* + * Test lam feature through dereference pointer get from malloc. + * @return 0: Pass test. 1: Get failure during test 2: Get SIGSEGV + */ +static int handle_malloc(struct testcases *test) +{ + char *ptr = NULL; + int ret = 0; + + if (test->later == 0 && test->lam != 0) + if (set_lam(test->lam) == -1) + return 1; + + ptr = (char *)malloc(MALLOC_LEN); + if (ptr == NULL) { + perror("malloc() failure\n"); + return 1; + } + + /* Set signal handler */ + if (sigsetjmp(segv_env, 1) == 0) { + signal(SIGSEGV, segv_handler); + ret = handle_lam_test(ptr, test->lam); + } else { + ret = 2; + } + + if (test->later != 0 && test->lam != 0) + if (set_lam(test->lam) == -1 && ret == 0) + ret = 1; + + free(ptr); + + return ret; +} + +static int handle_mmap(struct testcases *test) +{ + void *ptr; + unsigned int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED; + int ret = 0; + + if (test->later == 0 && test->lam != 0) + if (set_lam(test->lam) != 0) + return 1; + + ptr = mmap((void *)test->addr, PAGE_SIZE, PROT_READ | PROT_WRITE, + flags, -1, 0); + if (ptr == MAP_FAILED) { + if (test->addr == HIGH_ADDR) + if (!cpu_has_la57()) + return 3; /* unsupport LA57 */ + return 1; + } + + if (test->later != 0 && test->lam != 0) + if (set_lam(test->lam) != 0) + ret = 1; + + if (ret == 0) { + if (sigsetjmp(segv_env, 1) == 0) { + signal(SIGSEGV, segv_handler); + ret = handle_lam_test(ptr, test->lam); + } else { + ret = 2; + } + } + + munmap(ptr, PAGE_SIZE); + return ret; +} + +static int handle_syscall(struct testcases *test) +{ + struct utsname unme, *pu; + int ret = 0; + + if (test->later == 0 && test->lam != 0) + if (set_lam(test->lam) != 0) + return 1; + + if (sigsetjmp(segv_env, 1) == 0) { + signal(SIGSEGV, segv_handler); + pu = (struct utsname *)set_metadata((uint64_t)&unme, test->lam); + ret = uname(pu); + if (ret < 0) + ret = 1; + } else { + ret = 2; + } + + if (test->later != 0 && test->lam != 0) + if (set_lam(test->lam) != -1 && ret == 0) + ret = 1; + + return ret; +} + +int sys_uring_setup(unsigned int entries, struct io_uring_params *p) +{ + return (int)syscall(__NR_io_uring_setup, entries, p); +} + +int sys_uring_enter(int fd, unsigned int to, unsigned int min, unsigned int flags) +{ + return (int)syscall(__NR_io_uring_enter, fd, to, min, flags, NULL, 0); +} + +/* Init submission queue and completion queue */ +int mmap_io_uring(struct io_uring_params p, struct io_ring *s) +{ + struct io_uring_queue *sring = &s->sq_ring; + struct io_uring_queue *cring = &s->cq_ring; + + sring->ring_sz = p.sq_off.array + p.sq_entries * sizeof(unsigned int); + cring->ring_sz = p.cq_off.cqes + p.cq_entries * sizeof(struct io_uring_cqe); + + if (p.features & IORING_FEAT_SINGLE_MMAP) { + if (cring->ring_sz > sring->ring_sz) + sring->ring_sz = cring->ring_sz; + + cring->ring_sz = sring->ring_sz; + } + + void *sq_ptr = mmap(0, sring->ring_sz, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, s->ring_fd, + IORING_OFF_SQ_RING); + + if (sq_ptr == MAP_FAILED) { + perror("sub-queue!"); + return 1; + } + + void *cq_ptr = sq_ptr; + + if (!(p.features & IORING_FEAT_SINGLE_MMAP)) { + cq_ptr = mmap(0, cring->ring_sz, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, s->ring_fd, + IORING_OFF_CQ_RING); + if (cq_ptr == MAP_FAILED) { + perror("cpl-queue!"); + munmap(sq_ptr, sring->ring_sz); + return 1; + } + } + + sring->head = sq_ptr + p.sq_off.head; + sring->tail = sq_ptr + p.sq_off.tail; + sring->ring_mask = sq_ptr + p.sq_off.ring_mask; + sring->ring_entries = sq_ptr + p.sq_off.ring_entries; + sring->flags = sq_ptr + p.sq_off.flags; + sring->array = sq_ptr + p.sq_off.array; + + /* Map a queue as mem map */ + s->sq_ring.queue.sqes = mmap(0, p.sq_entries * sizeof(struct io_uring_sqe), + PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, + s->ring_fd, IORING_OFF_SQES); + if (s->sq_ring.queue.sqes == MAP_FAILED) { + munmap(sq_ptr, sring->ring_sz); + if (sq_ptr != cq_ptr) { + ksft_print_msg("failed to mmap uring queue!"); + munmap(cq_ptr, cring->ring_sz); + return 1; + } + } + + cring->head = cq_ptr + p.cq_off.head; + cring->tail = cq_ptr + p.cq_off.tail; + cring->ring_mask = cq_ptr + p.cq_off.ring_mask; + cring->ring_entries = cq_ptr + p.cq_off.ring_entries; + cring->queue.cqes = cq_ptr + p.cq_off.cqes; + + return 0; +} + +/* Init io_uring queues */ +int setup_io_uring(struct io_ring *s) +{ + struct io_uring_params para; + + memset(¶, 0, sizeof(para)); + s->ring_fd = sys_uring_setup(URING_QUEUE_SZ, ¶); + if (s->ring_fd < 0) + return 1; + + return mmap_io_uring(para, s); +} + +/* + * Get data from completion queue. the data buffer saved the file data + * return 0: success; others: error; + */ +int handle_uring_cq(struct io_ring *s) +{ + struct file_io *fi = NULL; + struct io_uring_queue *cring = &s->cq_ring; + struct io_uring_cqe *cqe; + unsigned int head; + off_t len = 0; + + head = *cring->head; + + do { + barrier(); + if (head == *cring->tail) + break; + /* Get the entry */ + cqe = &cring->queue.cqes[head & *s->cq_ring.ring_mask]; + fi = (struct file_io *)cqe->user_data; + if (cqe->res < 0) + break; + + int blocks = (int)(fi->file_sz + URING_BLOCK_SZ - 1) / URING_BLOCK_SZ; + + for (int i = 0; i < blocks; i++) + len += fi->iovecs[i].iov_len; + + head++; + } while (1); + + *cring->head = head; + barrier(); + + return (len != fi->file_sz); +} + +/* + * Submit squeue. specify via IORING_OP_READV. + * the buffer need to be set metadata according to LAM mode + */ +int handle_uring_sq(struct io_ring *ring, struct file_io *fi, unsigned long lam) +{ + int file_fd = fi->file_fd; + struct io_uring_queue *sring = &ring->sq_ring; + unsigned int index = 0, cur_block = 0, tail = 0, next_tail = 0; + struct io_uring_sqe *sqe; + + off_t remain = fi->file_sz; + int blocks = (int)(remain + URING_BLOCK_SZ - 1) / URING_BLOCK_SZ; + + while (remain) { + off_t bytes = remain; + void *buf; + + if (bytes > URING_BLOCK_SZ) + bytes = URING_BLOCK_SZ; + + fi->iovecs[cur_block].iov_len = bytes; + + if (posix_memalign(&buf, URING_BLOCK_SZ, URING_BLOCK_SZ)) + return 1; + + fi->iovecs[cur_block].iov_base = (void *)set_metadata((uint64_t)buf, lam); + remain -= bytes; + cur_block++; + } + + next_tail = *sring->tail; + tail = next_tail; + next_tail++; + + barrier(); + + index = tail & *ring->sq_ring.ring_mask; + + sqe = &ring->sq_ring.queue.sqes[index]; + sqe->fd = file_fd; + sqe->flags = 0; + sqe->opcode = IORING_OP_READV; + sqe->addr = (unsigned long)fi->iovecs; + sqe->len = blocks; + sqe->off = 0; + sqe->user_data = (uint64_t)fi; + + sring->array[index] = index; + tail = next_tail; + + if (*sring->tail != tail) { + *sring->tail = tail; + barrier(); + } + + if (sys_uring_enter(ring->ring_fd, 1, 1, IORING_ENTER_GETEVENTS) < 0) + return 1; + + return 0; +} + +/* + * Test LAM in async I/O and io_uring, read current binery through io_uring + * Set metadata in pointers to iovecs buffer. + */ +int do_uring(unsigned long lam) +{ + struct io_ring *ring; + struct file_io *fi; + struct stat st; + int ret = 1; + char path[PATH_MAX] = {0}; + + /* get current process path */ + if (readlink("/proc/self/exe", path, PATH_MAX) <= 0) + return 1; + + int file_fd = open(path, O_RDONLY); + + if (file_fd < 0) + return 1; + + if (fstat(file_fd, &st) < 0) + return 1; + + off_t file_sz = st.st_size; + + int blocks = (int)(file_sz + URING_BLOCK_SZ - 1) / URING_BLOCK_SZ; + + fi = malloc(sizeof(*fi) + sizeof(struct iovec) * blocks); + if (!fi) + return 1; + + fi->file_sz = file_sz; + fi->file_fd = file_fd; + + ring = malloc(sizeof(*ring)); + if (!ring) + return 1; + + memset(ring, 0, sizeof(struct io_ring)); + + if (setup_io_uring(ring)) + goto out; + + if (handle_uring_sq(ring, fi, lam)) + goto out; + + ret = handle_uring_cq(ring); + +out: + free(ring); + + for (int i = 0; i < blocks; i++) { + if (fi->iovecs[i].iov_base) { + uint64_t addr = ((uint64_t)fi->iovecs[i].iov_base); + + switch (lam) { + case LAM_U57_BITS: /* Clear bits 62:57 */ + addr = (addr & ~(LAM_U57_MASK)); + break; + } + free((void *)addr); + fi->iovecs[i].iov_base = NULL; + } + } + + free(fi); + + return ret; +} + +int handle_uring(struct testcases *test) +{ + int ret = 0; + + if (test->later == 0 && test->lam != 0) + if (set_lam(test->lam) != 0) + return 1; + + if (sigsetjmp(segv_env, 1) == 0) { + signal(SIGSEGV, segv_handler); + ret = do_uring(test->lam); + } else { + ret = 2; + } + + return ret; +} + +static int fork_test(struct testcases *test) +{ + int ret, child_ret; + pid_t pid; + + pid = fork(); + if (pid < 0) { + perror("Fork failed."); + ret = 1; + } else if (pid == 0) { + ret = test->test_func(test); + exit(ret); + } else { + wait(&child_ret); + ret = WEXITSTATUS(child_ret); + } + + return ret; +} + +static int handle_execve(struct testcases *test) +{ + int ret, child_ret; + int lam = test->lam; + pid_t pid; + + pid = fork(); + if (pid < 0) { + perror("Fork failed."); + ret = 1; + } else if (pid == 0) { + char path[PATH_MAX]; + + /* Set LAM mode in parent process */ + if (set_lam(lam) != 0) + return 1; + + /* Get current binary's path and the binary was run by execve */ + if (readlink("/proc/self/exe", path, PATH_MAX) <= 0) + exit(-1); + + /* run binary to get LAM mode and return to parent process */ + if (execlp(path, path, "-t 0x0", NULL) < 0) { + perror("error on exec"); + exit(-1); + } + } else { + wait(&child_ret); + ret = WEXITSTATUS(child_ret); + if (ret != LAM_NONE) + return 1; + } + + return 0; +} + +static int handle_inheritance(struct testcases *test) +{ + int ret, child_ret; + int lam = test->lam; + pid_t pid; + + /* Set LAM mode in parent process */ + if (set_lam(lam) != 0) + return 1; + + pid = fork(); + if (pid < 0) { + perror("Fork failed."); + return 1; + } else if (pid == 0) { + /* Set LAM mode in parent process */ + int child_lam = get_lam(); + + exit(child_lam); + } else { + wait(&child_ret); + ret = WEXITSTATUS(child_ret); + + if (lam != ret) + return 1; + } + + return 0; +} + +static int thread_fn_get_lam(void *arg) +{ + return get_lam(); +} + +static int thread_fn_set_lam(void *arg) +{ + struct testcases *test = arg; + + return set_lam(test->lam); +} + +static int handle_thread(struct testcases *test) +{ + char stack[STACK_SIZE]; + int ret, child_ret; + int lam = 0; + pid_t pid; + + /* Set LAM mode in parent process */ + if (!test->later) { + lam = test->lam; + if (set_lam(lam) != 0) + return 1; + } + + pid = clone(thread_fn_get_lam, stack + STACK_SIZE, + SIGCHLD | CLONE_FILES | CLONE_FS | CLONE_VM, NULL); + if (pid < 0) { + perror("Clone failed."); + return 1; + } + + waitpid(pid, &child_ret, 0); + ret = WEXITSTATUS(child_ret); + + if (lam != ret) + return 1; + + if (test->later) { + if (set_lam(test->lam) != 0) + return 1; + } + + return 0; +} + +static int handle_thread_enable(struct testcases *test) +{ + char stack[STACK_SIZE]; + int ret, child_ret; + int lam = test->lam; + pid_t pid; + + pid = clone(thread_fn_set_lam, stack + STACK_SIZE, + SIGCHLD | CLONE_FILES | CLONE_FS | CLONE_VM, test); + if (pid < 0) { + perror("Clone failed."); + return 1; + } + + waitpid(pid, &child_ret, 0); + ret = WEXITSTATUS(child_ret); + + if (lam != ret) + return 1; + + return 0; +} +static void run_test(struct testcases *test, int count) +{ + int i, ret = 0; + + for (i = 0; i < count; i++) { + struct testcases *t = test + i; + + /* fork a process to run test case */ + tests_cnt++; + ret = fork_test(t); + + /* return 3 is not support LA57, the case should be skipped */ + if (ret == 3) { + ksft_test_result_skip(t->msg); + continue; + } + + if (ret != 0) + ret = (t->expected == ret); + else + ret = !(t->expected); + + ksft_test_result(ret, t->msg); + } +} + +static struct testcases uring_cases[] = { + { + .later = 0, + .lam = LAM_U57_BITS, + .test_func = handle_uring, + .msg = "URING: LAM_U57. Dereferencing pointer with metadata\n", + }, + { + .later = 1, + .expected = 1, + .lam = LAM_U57_BITS, + .test_func = handle_uring, + .msg = "URING:[Negative] Disable LAM. Dereferencing pointer with metadata.\n", + }, +}; + +static struct testcases malloc_cases[] = { + { + .later = 0, + .lam = LAM_U57_BITS, + .test_func = handle_malloc, + .msg = "MALLOC: LAM_U57. Dereferencing pointer with metadata\n", + }, + { + .later = 1, + .expected = 2, + .lam = LAM_U57_BITS, + .test_func = handle_malloc, + .msg = "MALLOC:[Negative] Disable LAM. Dereferencing pointer with metadata.\n", + }, +}; + +static struct testcases bits_cases[] = { + { + .test_func = handle_max_bits, + .msg = "BITS: Check default tag bits\n", + }, +}; + +static struct testcases syscall_cases[] = { + { + .later = 0, + .lam = LAM_U57_BITS, + .test_func = handle_syscall, + .msg = "SYSCALL: LAM_U57. syscall with metadata\n", + }, + { + .later = 1, + .expected = 1, + .lam = LAM_U57_BITS, + .test_func = handle_syscall, + .msg = "SYSCALL:[Negative] Disable LAM. Dereferencing pointer with metadata.\n", + }, +}; + +static struct testcases mmap_cases[] = { + { + .later = 1, + .expected = 0, + .lam = LAM_U57_BITS, + .addr = HIGH_ADDR, + .test_func = handle_mmap, + .msg = "MMAP: First mmap high address, then set LAM_U57.\n", + }, + { + .later = 0, + .expected = 0, + .lam = LAM_U57_BITS, + .addr = HIGH_ADDR, + .test_func = handle_mmap, + .msg = "MMAP: First LAM_U57, then High address.\n", + }, + { + .later = 0, + .expected = 0, + .lam = LAM_U57_BITS, + .addr = LOW_ADDR, + .test_func = handle_mmap, + .msg = "MMAP: First LAM_U57, then Low address.\n", + }, +}; + +static struct testcases inheritance_cases[] = { + { + .expected = 0, + .lam = LAM_U57_BITS, + .test_func = handle_inheritance, + .msg = "FORK: LAM_U57, child process should get LAM mode same as parent\n", + }, + { + .expected = 0, + .lam = LAM_U57_BITS, + .test_func = handle_thread, + .msg = "THREAD: LAM_U57, child thread should get LAM mode same as parent\n", + }, + { + .expected = 1, + .lam = LAM_U57_BITS, + .test_func = handle_thread_enable, + .msg = "THREAD: [NEGATIVE] Enable LAM in child.\n", + }, + { + .expected = 1, + .later = 1, + .lam = LAM_U57_BITS, + .test_func = handle_thread, + .msg = "THREAD: [NEGATIVE] Enable LAM in parent after thread created.\n", + }, + { + .expected = 0, + .lam = LAM_U57_BITS, + .test_func = handle_execve, + .msg = "EXECVE: LAM_U57, child process should get disabled LAM mode\n", + }, +}; + +static void cmd_help(void) +{ + printf("usage: lam [-h] [-t test list]\n"); + printf("\t-t test list: run tests specified in the test list, default:0x%x\n", TEST_MASK); + printf("\t\t0x1:malloc; 0x2:max_bits; 0x4:mmap; 0x8:syscall; 0x10:io_uring; 0x20:inherit;\n"); + printf("\t-h: help\n"); +} + +/* Check for file existence */ +uint8_t file_Exists(const char *fileName) +{ + struct stat buffer; + + uint8_t ret = (stat(fileName, &buffer) == 0); + + return ret; +} + +/* Sysfs idxd files */ +const char *dsa_configs[] = { + "echo 1 > /sys/bus/dsa/devices/dsa0/wq0.1/group_id", + "echo shared > /sys/bus/dsa/devices/dsa0/wq0.1/mode", + "echo 10 > /sys/bus/dsa/devices/dsa0/wq0.1/priority", + "echo 16 > /sys/bus/dsa/devices/dsa0/wq0.1/size", + "echo 15 > /sys/bus/dsa/devices/dsa0/wq0.1/threshold", + "echo user > /sys/bus/dsa/devices/dsa0/wq0.1/type", + "echo MyApp1 > /sys/bus/dsa/devices/dsa0/wq0.1/name", + "echo 1 > /sys/bus/dsa/devices/dsa0/engine0.1/group_id", + "echo dsa0 > /sys/bus/dsa/drivers/idxd/bind", + /* bind files and devices, generated a device file in /dev */ + "echo wq0.1 > /sys/bus/dsa/drivers/user/bind", +}; + +/* DSA device file */ +const char *dsaDeviceFile = "/dev/dsa/wq0.1"; +/* file for io*/ +const char *dsaPasidEnable = "/sys/bus/dsa/devices/dsa0/pasid_enabled"; + +/* + * DSA depends on kernel cmdline "intel_iommu=on,sm_on" + * return pasid_enabled (0: disable 1:enable) + */ +int Check_DSA_Kernel_Setting(void) +{ + char command[256] = ""; + char buf[256] = ""; + char *ptr; + int rv = -1; + + snprintf(command, sizeof(command) - 1, "cat %s", dsaPasidEnable); + + FILE *cmd = popen(command, "r"); + + if (cmd) { + while (fgets(buf, sizeof(buf) - 1, cmd) != NULL); + + pclose(cmd); + rv = strtol(buf, &ptr, 16); + } + + return rv; +} + +/* + * Config DSA's sysfs files as shared DSA's WQ. + * Generated a device file /dev/dsa/wq0.1 + * Return: 0 OK; 1 Failed; 3 Skip(SVA disabled). + */ +int Dsa_Init_Sysfs(void) +{ + uint len = ARRAY_SIZE(dsa_configs); + const char **p = dsa_configs; + + if (file_Exists(dsaDeviceFile) == 1) + return 0; + + /* check the idxd driver */ + if (file_Exists(dsaPasidEnable) != 1) { + printf("Please make sure idxd driver was loaded\n"); + return 3; + } + + /* Check SVA feature */ + if (Check_DSA_Kernel_Setting() != 1) { + printf("Please enable SVA.(Add intel_iommu=on,sm_on in kernel cmdline)\n"); + return 3; + } + + /* Check the idxd device file on /dev/dsa/ */ + for (int i = 0; i < len; i++) { + if (system(p[i])) + return 1; + } + + /* After config, /dev/dsa/wq0.1 should be generated */ + return (file_Exists(dsaDeviceFile) != 1); +} + +/* + * Open DSA device file, triger API: iommu_sva_alloc_pasid + */ +void *allocate_dsa_pasid(void) +{ + int fd; + void *wq; + + fd = open(dsaDeviceFile, O_RDWR); + if (fd < 0) { + perror("open"); + return MAP_FAILED; + } + + wq = mmap(NULL, 0x1000, PROT_WRITE, + MAP_SHARED | MAP_POPULATE, fd, 0); + if (wq == MAP_FAILED) + perror("mmap"); + + return wq; +} + +int set_force_svm(void) +{ + int ret = 0; + + ret = syscall(SYS_arch_prctl, ARCH_FORCE_TAGGED_SVA); + + return ret; +} + +int handle_pasid(struct testcases *test) +{ + uint tmp = test->cmd; + uint runed = 0x0; + int ret = 0; + void *wq = NULL; + + ret = Dsa_Init_Sysfs(); + if (ret != 0) + return ret; + + for (int i = 0; i < 3; i++) { + int err = 0; + + if (tmp & 0x1) { + /* run set lam mode*/ + if ((runed & 0x1) == 0) { + err = set_lam(LAM_U57_BITS); + runed = runed | 0x1; + } else + err = 1; + } else if (tmp & 0x4) { + /* run force svm */ + if ((runed & 0x4) == 0) { + err = set_force_svm(); + runed = runed | 0x4; + } else + err = 1; + } else if (tmp & 0x2) { + /* run allocate pasid */ + if ((runed & 0x2) == 0) { + runed = runed | 0x2; + wq = allocate_dsa_pasid(); + if (wq == MAP_FAILED) + err = 1; + } else + err = 1; + } + + ret = ret + err; + if (ret > 0) + break; + + tmp = tmp >> 4; + } + + if (wq != MAP_FAILED && wq != NULL) + if (munmap(wq, 0x1000)) + printf("munmap failed %d\n", errno); + + if (runed != 0x7) + ret = 1; + + return (ret != 0); +} + +/* + * Pasid test depends on idxd and SVA, kernel should enable iommu and sm. + * command line(intel_iommu=on,sm_on) + */ +static struct testcases pasid_cases[] = { + { + .expected = 1, + .cmd = PAS_CMD(LAM_CMD_BIT, PAS_CMD_BIT, SVA_CMD_BIT), + .test_func = handle_pasid, + .msg = "PASID: [Negative] Execute LAM, PASID, SVA in sequence\n", + }, + { + .expected = 0, + .cmd = PAS_CMD(LAM_CMD_BIT, SVA_CMD_BIT, PAS_CMD_BIT), + .test_func = handle_pasid, + .msg = "PASID: Execute LAM, SVA, PASID in sequence\n", + }, + { + .expected = 1, + .cmd = PAS_CMD(PAS_CMD_BIT, LAM_CMD_BIT, SVA_CMD_BIT), + .test_func = handle_pasid, + .msg = "PASID: [Negative] Execute PASID, LAM, SVA in sequence\n", + }, + { + .expected = 0, + .cmd = PAS_CMD(PAS_CMD_BIT, SVA_CMD_BIT, LAM_CMD_BIT), + .test_func = handle_pasid, + .msg = "PASID: Execute PASID, SVA, LAM in sequence\n", + }, + { + .expected = 0, + .cmd = PAS_CMD(SVA_CMD_BIT, LAM_CMD_BIT, PAS_CMD_BIT), + .test_func = handle_pasid, + .msg = "PASID: Execute SVA, LAM, PASID in sequence\n", + }, + { + .expected = 0, + .cmd = PAS_CMD(SVA_CMD_BIT, PAS_CMD_BIT, LAM_CMD_BIT), + .test_func = handle_pasid, + .msg = "PASID: Execute SVA, PASID, LAM in sequence\n", + }, +}; + +int main(int argc, char **argv) +{ + int c = 0; + unsigned int tests = TEST_MASK; + + tests_cnt = 0; + + if (!cpu_has_lam()) { + ksft_print_msg("Unsupported LAM feature!\n"); + return -1; + } + + while ((c = getopt(argc, argv, "ht:")) != -1) { + switch (c) { + case 't': + tests = strtoul(optarg, NULL, 16); + if (tests && !(tests & TEST_MASK)) { + ksft_print_msg("Invalid argument!\n"); + return -1; + } + break; + case 'h': + cmd_help(); + return 0; + default: + ksft_print_msg("Invalid argument\n"); + return -1; + } + } + + /* + * When tests is 0, it is not a real test case; + * the option used by test case(execve) to check the lam mode in + * process generated by execve, the process read back lam mode and + * check with lam mode in parent process. + */ + if (!tests) + return (get_lam()); + + /* Run test cases */ + if (tests & FUNC_MALLOC) + run_test(malloc_cases, ARRAY_SIZE(malloc_cases)); + + if (tests & FUNC_BITS) + run_test(bits_cases, ARRAY_SIZE(bits_cases)); + + if (tests & FUNC_MMAP) + run_test(mmap_cases, ARRAY_SIZE(mmap_cases)); + + if (tests & FUNC_SYSCALL) + run_test(syscall_cases, ARRAY_SIZE(syscall_cases)); + + if (tests & FUNC_URING) + run_test(uring_cases, ARRAY_SIZE(uring_cases)); + + if (tests & FUNC_INHERITE) + run_test(inheritance_cases, ARRAY_SIZE(inheritance_cases)); + + if (tests & FUNC_PASID) + run_test(pasid_cases, ARRAY_SIZE(pasid_cases)); + + ksft_set_plan(tests_cnt); + + return ksft_exit_pass(); +} |