diff options
Diffstat (limited to 'kernel/bpf')
-rw-r--r-- | kernel/bpf/Makefile | 3 | ||||
-rw-r--r-- | kernel/bpf/arena.c | 558 | ||||
-rw-r--r-- | kernel/bpf/bpf_iter.c | 4 | ||||
-rw-r--r-- | kernel/bpf/bpf_struct_ops.c | 141 | ||||
-rw-r--r-- | kernel/bpf/btf.c | 35 | ||||
-rw-r--r-- | kernel/bpf/core.c | 33 | ||||
-rw-r--r-- | kernel/bpf/devmap.c | 11 | ||||
-rw-r--r-- | kernel/bpf/disasm.c | 14 | ||||
-rw-r--r-- | kernel/bpf/hashtab.c | 14 | ||||
-rw-r--r-- | kernel/bpf/log.c | 3 | ||||
-rw-r--r-- | kernel/bpf/stackmap.c | 9 | ||||
-rw-r--r-- | kernel/bpf/syscall.c | 64 | ||||
-rw-r--r-- | kernel/bpf/trampoline.c | 4 | ||||
-rw-r--r-- | kernel/bpf/verifier.c | 366 |
14 files changed, 1120 insertions, 139 deletions
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 4ce95acfcaa7..368c5d86b5b7 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -15,6 +15,9 @@ obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o mprog.o obj-$(CONFIG_BPF_JIT) += trampoline.o obj-$(CONFIG_BPF_SYSCALL) += btf.o memalloc.o +ifeq ($(CONFIG_MMU)$(CONFIG_64BIT),yy) +obj-$(CONFIG_BPF_SYSCALL) += arena.o +endif obj-$(CONFIG_BPF_JIT) += dispatcher.o ifeq ($(CONFIG_NET),y) obj-$(CONFIG_BPF_SYSCALL) += devmap.o diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c new file mode 100644 index 000000000000..86571e760dd6 --- /dev/null +++ b/kernel/bpf/arena.c @@ -0,0 +1,558 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ +#include <linux/bpf.h> +#include <linux/btf.h> +#include <linux/err.h> +#include <linux/btf_ids.h> +#include <linux/vmalloc.h> +#include <linux/pagemap.h> + +/* + * bpf_arena is a sparsely populated shared memory region between bpf program and + * user space process. + * + * For example on x86-64 the values could be: + * user_vm_start 7f7d26200000 // picked by mmap() + * kern_vm_start ffffc90001e69000 // picked by get_vm_area() + * For user space all pointers within the arena are normal 8-byte addresses. + * In this example 7f7d26200000 is the address of the first page (pgoff=0). + * The bpf program will access it as: kern_vm_start + lower_32bit_of_user_ptr + * (u32)7f7d26200000 -> 26200000 + * hence + * ffffc90001e69000 + 26200000 == ffffc90028069000 is "pgoff=0" within 4Gb + * kernel memory region. + * + * BPF JITs generate the following code to access arena: + * mov eax, eax // eax has lower 32-bit of user pointer + * mov word ptr [rax + r12 + off], bx + * where r12 == kern_vm_start and off is s16. + * Hence allocate 4Gb + GUARD_SZ/2 on each side. + * + * Initially kernel vm_area and user vma are not populated. + * User space can fault-in any address which will insert the page + * into kernel and user vma. + * bpf program can allocate a page via bpf_arena_alloc_pages() kfunc + * which will insert it into kernel vm_area. + * The later fault-in from user space will populate that page into user vma. + */ + +/* number of bytes addressable by LDX/STX insn with 16-bit 'off' field */ +#define GUARD_SZ (1ull << sizeof(((struct bpf_insn *)0)->off) * 8) +#define KERN_VM_SZ ((1ull << 32) + GUARD_SZ) + +struct bpf_arena { + struct bpf_map map; + u64 user_vm_start; + u64 user_vm_end; + struct vm_struct *kern_vm; + struct maple_tree mt; + struct list_head vma_list; + struct mutex lock; +}; + +u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena) +{ + return arena ? (u64) (long) arena->kern_vm->addr + GUARD_SZ / 2 : 0; +} + +u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena) +{ + return arena ? arena->user_vm_start : 0; +} + +static long arena_map_peek_elem(struct bpf_map *map, void *value) +{ + return -EOPNOTSUPP; +} + +static long arena_map_push_elem(struct bpf_map *map, void *value, u64 flags) +{ + return -EOPNOTSUPP; +} + +static long arena_map_pop_elem(struct bpf_map *map, void *value) +{ + return -EOPNOTSUPP; +} + +static long arena_map_delete_elem(struct bpf_map *map, void *value) +{ + return -EOPNOTSUPP; +} + +static int arena_map_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ + return -EOPNOTSUPP; +} + +static long compute_pgoff(struct bpf_arena *arena, long uaddr) +{ + return (u32)(uaddr - (u32)arena->user_vm_start) >> PAGE_SHIFT; +} + +static struct bpf_map *arena_map_alloc(union bpf_attr *attr) +{ + struct vm_struct *kern_vm; + int numa_node = bpf_map_attr_numa_node(attr); + struct bpf_arena *arena; + u64 vm_range; + int err = -ENOMEM; + + if (attr->key_size || attr->value_size || attr->max_entries == 0 || + /* BPF_F_MMAPABLE must be set */ + !(attr->map_flags & BPF_F_MMAPABLE) || + /* No unsupported flags present */ + (attr->map_flags & ~(BPF_F_SEGV_ON_FAULT | BPF_F_MMAPABLE | BPF_F_NO_USER_CONV))) + return ERR_PTR(-EINVAL); + + if (attr->map_extra & ~PAGE_MASK) + /* If non-zero the map_extra is an expected user VMA start address */ + return ERR_PTR(-EINVAL); + + vm_range = (u64)attr->max_entries * PAGE_SIZE; + if (vm_range > (1ull << 32)) + return ERR_PTR(-E2BIG); + + if ((attr->map_extra >> 32) != ((attr->map_extra + vm_range - 1) >> 32)) + /* user vma must not cross 32-bit boundary */ + return ERR_PTR(-ERANGE); + + kern_vm = get_vm_area(KERN_VM_SZ, VM_SPARSE | VM_USERMAP); + if (!kern_vm) + return ERR_PTR(-ENOMEM); + + arena = bpf_map_area_alloc(sizeof(*arena), numa_node); + if (!arena) + goto err; + + arena->kern_vm = kern_vm; + arena->user_vm_start = attr->map_extra; + if (arena->user_vm_start) + arena->user_vm_end = arena->user_vm_start + vm_range; + + INIT_LIST_HEAD(&arena->vma_list); + bpf_map_init_from_attr(&arena->map, attr); + mt_init_flags(&arena->mt, MT_FLAGS_ALLOC_RANGE); + mutex_init(&arena->lock); + + return &arena->map; +err: + free_vm_area(kern_vm); + return ERR_PTR(err); +} + +static int existing_page_cb(pte_t *ptep, unsigned long addr, void *data) +{ + struct page *page; + pte_t pte; + + pte = ptep_get(ptep); + if (!pte_present(pte)) /* sanity check */ + return 0; + page = pte_page(pte); + /* + * We do not update pte here: + * 1. Nobody should be accessing bpf_arena's range outside of a kernel bug + * 2. TLB flushing is batched or deferred. Even if we clear pte, + * the TLB entries can stick around and continue to permit access to + * the freed page. So it all relies on 1. + */ + __free_page(page); + return 0; +} + +static void arena_map_free(struct bpf_map *map) +{ + struct bpf_arena *arena = container_of(map, struct bpf_arena, map); + + /* + * Check that user vma-s are not around when bpf map is freed. + * mmap() holds vm_file which holds bpf_map refcnt. + * munmap() must have happened on vma followed by arena_vm_close() + * which would clear arena->vma_list. + */ + if (WARN_ON_ONCE(!list_empty(&arena->vma_list))) + return; + + /* + * free_vm_area() calls remove_vm_area() that calls free_unmap_vmap_area(). + * It unmaps everything from vmalloc area and clears pgtables. + * Call apply_to_existing_page_range() first to find populated ptes and + * free those pages. + */ + apply_to_existing_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena), + KERN_VM_SZ - GUARD_SZ, existing_page_cb, NULL); + free_vm_area(arena->kern_vm); + mtree_destroy(&arena->mt); + bpf_map_area_free(arena); +} + +static void *arena_map_lookup_elem(struct bpf_map *map, void *key) +{ + return ERR_PTR(-EINVAL); +} + +static long arena_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 flags) +{ + return -EOPNOTSUPP; +} + +static int arena_map_check_btf(const struct bpf_map *map, const struct btf *btf, + const struct btf_type *key_type, const struct btf_type *value_type) +{ + return 0; +} + +static u64 arena_map_mem_usage(const struct bpf_map *map) +{ + return 0; +} + +struct vma_list { + struct vm_area_struct *vma; + struct list_head head; +}; + +static int remember_vma(struct bpf_arena *arena, struct vm_area_struct *vma) +{ + struct vma_list *vml; + + vml = kmalloc(sizeof(*vml), GFP_KERNEL); + if (!vml) + return -ENOMEM; + vma->vm_private_data = vml; + vml->vma = vma; + list_add(&vml->head, &arena->vma_list); + return 0; +} + +static void arena_vm_close(struct vm_area_struct *vma) +{ + struct bpf_map *map = vma->vm_file->private_data; + struct bpf_arena *arena = container_of(map, struct bpf_arena, map); + struct vma_list *vml; + + guard(mutex)(&arena->lock); + vml = vma->vm_private_data; + list_del(&vml->head); + vma->vm_private_data = NULL; + kfree(vml); +} + +#define MT_ENTRY ((void *)&arena_map_ops) /* unused. has to be valid pointer */ + +static vm_fault_t arena_vm_fault(struct vm_fault *vmf) +{ + struct bpf_map *map = vmf->vma->vm_file->private_data; + struct bpf_arena *arena = container_of(map, struct bpf_arena, map); + struct page *page; + long kbase, kaddr; + int ret; + + kbase = bpf_arena_get_kern_vm_start(arena); + kaddr = kbase + (u32)(vmf->address & PAGE_MASK); + + guard(mutex)(&arena->lock); + page = vmalloc_to_page((void *)kaddr); + if (page) + /* already have a page vmap-ed */ + goto out; + + if (arena->map.map_flags & BPF_F_SEGV_ON_FAULT) + /* User space requested to segfault when page is not allocated by bpf prog */ + return VM_FAULT_SIGSEGV; + + ret = mtree_insert(&arena->mt, vmf->pgoff, MT_ENTRY, GFP_KERNEL); + if (ret) + return VM_FAULT_SIGSEGV; + + /* Account into memcg of the process that created bpf_arena */ + ret = bpf_map_alloc_pages(map, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE, 1, &page); + if (ret) { + mtree_erase(&arena->mt, vmf->pgoff); + return VM_FAULT_SIGSEGV; + } + + ret = vm_area_map_pages(arena->kern_vm, kaddr, kaddr + PAGE_SIZE, &page); + if (ret) { + mtree_erase(&arena->mt, vmf->pgoff); + __free_page(page); + return VM_FAULT_SIGSEGV; + } +out: + page_ref_add(page, 1); + vmf->page = page; + return 0; +} + +static const struct vm_operations_struct arena_vm_ops = { + .close = arena_vm_close, + .fault = arena_vm_fault, +}; + +static unsigned long arena_get_unmapped_area(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) +{ + struct bpf_map *map = filp->private_data; + struct bpf_arena *arena = container_of(map, struct bpf_arena, map); + long ret; + + if (pgoff) + return -EINVAL; + if (len > (1ull << 32)) + return -E2BIG; + + /* if user_vm_start was specified at arena creation time */ + if (arena->user_vm_start) { + if (len > arena->user_vm_end - arena->user_vm_start) + return -E2BIG; + if (len != arena->user_vm_end - arena->user_vm_start) + return -EINVAL; + if (addr != arena->user_vm_start) + return -EINVAL; + } + + ret = current->mm->get_unmapped_area(filp, addr, len * 2, 0, flags); + if (IS_ERR_VALUE(ret)) + return ret; + if ((ret >> 32) == ((ret + len - 1) >> 32)) + return ret; + if (WARN_ON_ONCE(arena->user_vm_start)) + /* checks at map creation time should prevent this */ + return -EFAULT; + return round_up(ret, 1ull << 32); +} + +static int arena_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) +{ + struct bpf_arena *arena = container_of(map, struct bpf_arena, map); + + guard(mutex)(&arena->lock); + if (arena->user_vm_start && arena->user_vm_start != vma->vm_start) + /* + * If map_extra was not specified at arena creation time then + * 1st user process can do mmap(NULL, ...) to pick user_vm_start + * 2nd user process must pass the same addr to mmap(addr, MAP_FIXED..); + * or + * specify addr in map_extra and + * use the same addr later with mmap(addr, MAP_FIXED..); + */ + return -EBUSY; + + if (arena->user_vm_end && arena->user_vm_end != vma->vm_end) + /* all user processes must have the same size of mmap-ed region */ + return -EBUSY; + + /* Earlier checks should prevent this */ + if (WARN_ON_ONCE(vma->vm_end - vma->vm_start > (1ull << 32) || vma->vm_pgoff)) + return -EFAULT; + + if (remember_vma(arena, vma)) + return -ENOMEM; + + arena->user_vm_start = vma->vm_start; + arena->user_vm_end = vma->vm_end; + /* + * bpf_map_mmap() checks that it's being mmaped as VM_SHARED and + * clears VM_MAYEXEC. Set VM_DONTEXPAND as well to avoid + * potential change of user_vm_start. + */ + vm_flags_set(vma, VM_DONTEXPAND); + vma->vm_ops = &arena_vm_ops; + return 0; +} + +static int arena_map_direct_value_addr(const struct bpf_map *map, u64 *imm, u32 off) +{ + struct bpf_arena *arena = container_of(map, struct bpf_arena, map); + + if ((u64)off > arena->user_vm_end - arena->user_vm_start) + return -ERANGE; + *imm = (unsigned long)arena->user_vm_start; + return 0; +} + +BTF_ID_LIST_SINGLE(bpf_arena_map_btf_ids, struct, bpf_arena) +const struct bpf_map_ops arena_map_ops = { + .map_meta_equal = bpf_map_meta_equal, + .map_alloc = arena_map_alloc, + .map_free = arena_map_free, + .map_direct_value_addr = arena_map_direct_value_addr, + .map_mmap = arena_map_mmap, + .map_get_unmapped_area = arena_get_unmapped_area, + .map_get_next_key = arena_map_get_next_key, + .map_push_elem = arena_map_push_elem, + .map_peek_elem = arena_map_peek_elem, + .map_pop_elem = arena_map_pop_elem, + .map_lookup_elem = arena_map_lookup_elem, + .map_update_elem = arena_map_update_elem, + .map_delete_elem = arena_map_delete_elem, + .map_check_btf = arena_map_check_btf, + .map_mem_usage = arena_map_mem_usage, + .map_btf_id = &bpf_arena_map_btf_ids[0], +}; + +static u64 clear_lo32(u64 val) +{ + return val & ~(u64)~0U; +} + +/* + * Allocate pages and vmap them into kernel vmalloc area. + * Later the pages will be mmaped into user space vma. + */ +static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt, int node_id) +{ + /* user_vm_end/start are fixed before bpf prog runs */ + long page_cnt_max = (arena->user_vm_end - arena->user_vm_start) >> PAGE_SHIFT; + u64 kern_vm_start = bpf_arena_get_kern_vm_start(arena); + struct page **pages; + long pgoff = 0; + u32 uaddr32; + int ret, i; + + if (page_cnt > page_cnt_max) + return 0; + + if (uaddr) { + if (uaddr & ~PAGE_MASK) + return 0; + pgoff = compute_pgoff(arena, uaddr); + if (pgoff + page_cnt > page_cnt_max) + /* requested address will be outside of user VMA */ + return 0; + } + + /* zeroing is needed, since alloc_pages_bulk_array() only fills in non-zero entries */ + pages = kvcalloc(page_cnt, sizeof(struct page *), GFP_KERNEL); + if (!pages) + return 0; + + guard(mutex)(&arena->lock); + + if (uaddr) + ret = mtree_insert_range(&arena->mt, pgoff, pgoff + page_cnt - 1, + MT_ENTRY, GFP_KERNEL); + else + ret = mtree_alloc_range(&arena->mt, &pgoff, MT_ENTRY, + page_cnt, 0, page_cnt_max - 1, GFP_KERNEL); + if (ret) + goto out_free_pages; + + ret = bpf_map_alloc_pages(&arena->map, GFP_KERNEL | __GFP_ZERO, + node_id, page_cnt, pages); + if (ret) + goto out; + + uaddr32 = (u32)(arena->user_vm_start + pgoff * PAGE_SIZE); + /* Earlier checks make sure that uaddr32 + page_cnt * PAGE_SIZE will not overflow 32-bit */ + ret = vm_area_map_pages(arena->kern_vm, kern_vm_start + uaddr32, + kern_vm_start + uaddr32 + page_cnt * PAGE_SIZE, pages); + if (ret) { + for (i = 0; i < page_cnt; i++) + __free_page(pages[i]); + goto out; + } + kvfree(pages); + return clear_lo32(arena->user_vm_start) + uaddr32; +out: + mtree_erase(&arena->mt, pgoff); +out_free_pages: + kvfree(pages); + return 0; +} + +/* + * If page is present in vmalloc area, unmap it from vmalloc area, + * unmap it from all user space vma-s, + * and free it. + */ +static void zap_pages(struct bpf_arena *arena, long uaddr, long page_cnt) +{ + struct vma_list *vml; + + list_for_each_entry(vml, &arena->vma_list, head) + zap_page_range_single(vml->vma, uaddr, + PAGE_SIZE * page_cnt, NULL); +} + +static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt) +{ + u64 full_uaddr, uaddr_end; + long kaddr, pgoff, i; + struct page *page; + + /* only aligned lower 32-bit are relevant */ + uaddr = (u32)uaddr; + uaddr &= PAGE_MASK; + full_uaddr = clear_lo32(arena->user_vm_start) + uaddr; + uaddr_end = min(arena->user_vm_end, full_uaddr + (page_cnt << PAGE_SHIFT)); + if (full_uaddr >= uaddr_end) + return; + + page_cnt = (uaddr_end - full_uaddr) >> PAGE_SHIFT; + + guard(mutex)(&arena->lock); + + pgoff = compute_pgoff(arena, uaddr); + /* clear range */ + mtree_store_range(&arena->mt, pgoff, pgoff + page_cnt - 1, NULL, GFP_KERNEL); + + if (page_cnt > 1) + /* bulk zap if multiple pages being freed */ + zap_pages(arena, full_uaddr, page_cnt); + + kaddr = bpf_arena_get_kern_vm_start(arena) + uaddr; + for (i = 0; i < page_cnt; i++, kaddr += PAGE_SIZE, full_uaddr += PAGE_SIZE) { + page = vmalloc_to_page((void *)kaddr); + if (!page) + continue; + if (page_cnt == 1 && page_mapped(page)) /* mapped by some user process */ + zap_pages(arena, full_uaddr, 1); + vm_area_unmap_pages(arena->kern_vm, kaddr, kaddr + PAGE_SIZE); + __free_page(page); + } +} + +__bpf_kfunc_start_defs(); + +__bpf_kfunc void *bpf_arena_alloc_pages(void *p__map, void *addr__ign, u32 page_cnt, + int node_id, u64 flags) +{ + struct bpf_map *map = p__map; + struct bpf_arena *arena = container_of(map, struct bpf_arena, map); + + if (map->map_type != BPF_MAP_TYPE_ARENA || flags || !page_cnt) + return NULL; + + return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id); +} + +__bpf_kfunc void bpf_arena_free_pages(void *p__map, void *ptr__ign, u32 page_cnt) +{ + struct bpf_map *map = p__map; + struct bpf_arena *arena = container_of(map, struct bpf_arena, map); + + if (map->map_type != BPF_MAP_TYPE_ARENA || !page_cnt || !ptr__ign) + return; + arena_free_pages(arena, (long)ptr__ign, page_cnt); +} +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(arena_kfuncs) +BTF_ID_FLAGS(func, bpf_arena_alloc_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_arena_free_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE) +BTF_KFUNCS_END(arena_kfuncs) + +static const struct btf_kfunc_id_set common_kfunc_set = { + .owner = THIS_MODULE, + .set = &arena_kfuncs, +}; + +static int __init kfunc_init(void) +{ + return register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &common_kfunc_set); +} +late_initcall(kfunc_init); diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 0fae79164187..112581cf97e7 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -548,7 +548,7 @@ int bpf_iter_link_attach(const union bpf_attr *attr, bpfptr_t uattr, return -ENOENT; /* Only allow sleepable program for resched-able iterator */ - if (prog->aux->sleepable && !bpf_iter_target_support_resched(tinfo)) + if (prog->sleepable && !bpf_iter_target_support_resched(tinfo)) return -EINVAL; link = kzalloc(sizeof(*link), GFP_USER | __GFP_NOWARN); @@ -697,7 +697,7 @@ int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx) struct bpf_run_ctx run_ctx, *old_run_ctx; int ret; - if (prog->aux->sleepable) { + if (prog->sleepable) { rcu_read_lock_trace(); migrate_disable(); might_fault(); diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index a6019087b467..43356faaa057 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -18,6 +18,8 @@ struct bpf_struct_ops_value { char data[] ____cacheline_aligned_in_smp; }; +#define MAX_TRAMP_IMAGE_PAGES 8 + struct bpf_struct_ops_map { struct bpf_map map; struct rcu_head rcu; @@ -30,12 +32,11 @@ struct bpf_struct_ops_map { */ struct bpf_link **links; u32 links_cnt; - /* image is a page that has all the trampolines + u32 image_pages_cnt; + /* image_pages is an array of pages that has all the trampolines * that stores the func args before calling the bpf_prog. - * A PAGE_SIZE "image" is enough to store all trampoline for - * "links[]". */ - void *image; + void *image_pages[MAX_TRAMP_IMAGE_PAGES]; /* The owner moduler's btf. */ struct btf *btf; /* uvalue->data stores the kernel struct @@ -116,6 +117,31 @@ static bool is_valid_value_type(struct btf *btf, s32 value_id, return true; } +static void *bpf_struct_ops_image_alloc(void) +{ + void *image; + int err; + + err = bpf_jit_charge_modmem(PAGE_SIZE); + if (err) + return ERR_PTR(err); + image = arch_alloc_bpf_trampoline(PAGE_SIZE); + if (!image) { + bpf_jit_uncharge_modmem(PAGE_SIZE); + return ERR_PTR(-ENOMEM); + } + + return image; +} + +void bpf_struct_ops_image_free(void *image) +{ + if (image) { + arch_free_bpf_trampoline(image, PAGE_SIZE); + bpf_jit_uncharge_modmem(PAGE_SIZE); + } +} + #define MAYBE_NULL_SUFFIX "__nullable" #define MAX_STUB_NAME 128 @@ -461,6 +487,15 @@ static void bpf_struct_ops_map_put_progs(struct bpf_struct_ops_map *st_map) } } +static void bpf_struct_ops_map_free_image(struct bpf_struct_ops_map *st_map) +{ + int i; + + for (i = 0; i < st_map->image_pages_cnt; i++) + bpf_struct_ops_image_free(st_map->image_pages[i]); + st_map->image_pages_cnt = 0; +} + static int check_zero_holes(const struct btf *btf, const struct btf_type *t, void *data) { const struct btf_member *member; @@ -506,9 +541,12 @@ const struct bpf_link_ops bpf_struct_ops_link_lops = { int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, struct bpf_tramp_link *link, const struct btf_func_model *model, - void *stub_func, void *image, void *image_end) + void *stub_func, + void **_image, u32 *_image_off, + bool allow_alloc) { - u32 flags = BPF_TRAMP_F_INDIRECT; + u32 image_off = *_image_off, flags = BPF_TRAMP_F_INDIRECT; + void *image = *_image; int size; tlinks[BPF_TRAMP_FENTRY].links[0] = link; @@ -518,12 +556,32 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, flags |= BPF_TRAMP_F_RET_FENTRY_RET; size = arch_bpf_trampoline_size(model, flags, tlinks, NULL); - if (size < 0) - return size; - if (size > (unsigned long)image_end - (unsigned long)image) - return -E2BIG; - return arch_prepare_bpf_trampoline(NULL, image, image_end, + if (size <= 0) + return size ? : -EFAULT; + + /* Allocate image buffer if necessary */ + if (!image || size > PAGE_SIZE - image_off) { + if (!allow_alloc) + return -E2BIG; + + image = bpf_struct_ops_image_alloc(); + if (IS_ERR(image)) + return PTR_ERR(image); + image_off = 0; + } + + size = arch_prepare_bpf_trampoline(NULL, image + image_off, + image + PAGE_SIZE, model, flags, tlinks, stub_func); + if (size <= 0) { + if (image != *_image) + bpf_struct_ops_image_free(image); + return size ? : -EFAULT; + } + + *_image = image; + *_image_off = image_off + size; + return 0; } static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, @@ -539,8 +597,8 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, struct bpf_tramp_links *tlinks; void *udata, *kdata; int prog_fd, err; - void *image, *image_end; - u32 i; + u32 i, trampoline_start, image_off = 0; + void *cur_image = NULL, *image = NULL; if (flags) return -EINVAL; @@ -578,8 +636,6 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, udata = &uvalue->data; kdata = &kvalue->data; - image = st_map->image; - image_end = st_map->image + PAGE_SIZE; module_type = btf_type_by_id(btf_vmlinux, st_ops_ids[IDX_MODULE_ID]); for_each_member(i, t, member) { @@ -658,28 +714,39 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, &bpf_struct_ops_link_lops, prog); st_map->links[i] = &link->link; + trampoline_start = image_off; err = bpf_struct_ops_prepare_trampoline(tlinks, link, - &st_ops->func_models[i], - *(void **)(st_ops->cfi_stubs + moff), - image, image_end); + &st_ops->func_models[i], + *(void **)(st_ops->cfi_stubs + moff), + &image, &image_off, + st_map->image_pages_cnt < MAX_TRAMP_IMAGE_PAGES); + if (err) + goto reset_unlock; + + if (cur_image != image) { + st_map->image_pages[st_map->image_pages_cnt++] = image; + cur_image = image; + trampoline_start = 0; + } if (err < 0) goto reset_unlock; - *(void **)(kdata + moff) = image + cfi_get_offset(); - image += err; + *(void **)(kdata + moff) = image + trampoline_start + cfi_get_offset(); /* put prog_id to udata */ *(unsigned long *)(udata + moff) = prog->aux->id; } + if (st_ops->validate) { + err = st_ops->validate(kdata); + if (err) + goto reset_unlock; + } + for (i = 0; i < st_map->image_pages_cnt; i++) + arch_protect_bpf_trampoline(st_map->image_pages[i], PAGE_SIZE); + if (st_map->map.map_flags & BPF_F_LINK) { err = 0; - if (st_ops->validate) { - err = st_ops->validate(kdata); - if (err) - goto reset_unlock; - } - arch_protect_bpf_trampoline(st_map->image, PAGE_SIZE); /* Let bpf_link handle registration & unregistration. * * Pair with smp_load_acquire() during lookup_elem(). @@ -688,7 +755,6 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, goto unlock; } - arch_protect_bpf_trampoline(st_map->image, PAGE_SIZE); err = st_ops->reg(kdata); if (likely(!err)) { /* This refcnt increment on the map here after @@ -711,9 +777,9 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, * there was a race in registering the struct_ops (under the same name) to * a sub-system through different struct_ops's maps. */ - arch_unprotect_bpf_trampoline(st_map->image, PAGE_SIZE); reset_unlock: + bpf_struct_ops_map_free_image(st_map); bpf_struct_ops_map_put_progs(st_map); memset(uvalue, 0, map->value_size); memset(kvalue, 0, map->value_size); @@ -780,10 +846,7 @@ static void __bpf_struct_ops_map_free(struct bpf_map *map) if (st_map->links) bpf_struct_ops_map_put_progs(st_map); bpf_map_area_free(st_map->links); - if (st_map->image) { - arch_free_bpf_trampoline(st_map->image, PAGE_SIZE); - bpf_jit_uncharge_modmem(PAGE_SIZE); - } + bpf_struct_ops_map_free_image(st_map); bpf_map_area_free(st_map->uvalue); bpf_map_area_free(st_map); } @@ -893,20 +956,6 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) st_map->st_ops_desc = st_ops_desc; map = &st_map->map; - ret = bpf_jit_charge_modmem(PAGE_SIZE); - if (ret) - goto errout_free; - - st_map->image = arch_alloc_bpf_trampoline(PAGE_SIZE); - if (!st_map->image) { - /* __bpf_struct_ops_map_free() uses st_map->image as flag - * for "charged or not". In this case, we need to unchange - * here. - */ - bpf_jit_uncharge_modmem(PAGE_SIZE); - ret = -ENOMEM; - goto errout_free; - } st_map->uvalue = bpf_map_area_alloc(vt->size, NUMA_NO_NODE); st_map->links_cnt = btf_type_vlen(t); st_map->links = diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 6ff0bd1a91d5..90c4a32d89ff 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -809,9 +809,23 @@ static bool btf_name_valid_identifier(const struct btf *btf, u32 offset) return __btf_name_valid(btf, offset); } +/* Allow any printable character in DATASEC names */ static bool btf_name_valid_section(const struct btf *btf, u32 offset) { - return __btf_name_valid(btf, offset); + /* offset must be valid */ + const char *src = btf_str_by_offset(btf, offset); + const char *src_limit; + + /* set a limit on identifier length */ + src_limit = src + KSYM_NAME_LEN; + src++; + while (*src && src < src_limit) { + if (!isprint(*src)) + return false; + src++; + } + + return !*src; } static const char *__btf_name_by_offset(const struct btf *btf, u32 offset) @@ -7097,10 +7111,11 @@ cand_cache_unlock: } enum btf_arg_tag { - ARG_TAG_CTX = 0x1, - ARG_TAG_NONNULL = 0x2, - ARG_TAG_TRUSTED = 0x4, - ARG_TAG_NULLABLE = 0x8, + ARG_TAG_CTX = BIT_ULL(0), + ARG_TAG_NONNULL = BIT_ULL(1), + ARG_TAG_TRUSTED = BIT_ULL(2), + ARG_TAG_NULLABLE = BIT_ULL(3), + ARG_TAG_ARENA = BIT_ULL(4), }; /* Process BTF of a function to produce high-level expectation of function @@ -7212,6 +7227,8 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) tags |= ARG_TAG_NONNULL; } else if (strcmp(tag, "nullable") == 0) { tags |= ARG_TAG_NULLABLE; + } else if (strcmp(tag, "arena") == 0) { + tags |= ARG_TAG_ARENA; } else { bpf_log(log, "arg#%d has unsupported set of tags\n", i); return -EOPNOTSUPP; @@ -7266,6 +7283,14 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) sub->args[i].btf_id = kern_type_id; continue; } + if (tags & ARG_TAG_ARENA) { + if (tags & ~ARG_TAG_ARENA) { + bpf_log(log, "arg#%d arena cannot be combined with any other tags\n", i); + return -EINVAL; + } + sub->args[i].arg_type = ARG_PTR_TO_ARENA; + continue; + } if (is_global) { /* generic user data pointer */ u32 mem_size; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 71c459a51d9e..696bc55de8e8 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -88,13 +88,18 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns return NULL; } +/* tell bpf programs that include vmlinux.h kernel's PAGE_SIZE */ +enum page_size_enum { + __PAGE_SIZE = PAGE_SIZE +}; + struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags) { gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags); struct bpf_prog_aux *aux; struct bpf_prog *fp; - size = round_up(size, PAGE_SIZE); + size = round_up(size, __PAGE_SIZE); fp = __vmalloc(size, gfp_flags); if (fp == NULL) return NULL; @@ -888,7 +893,12 @@ static LIST_HEAD(pack_list); * CONFIG_MMU=n. Use PAGE_SIZE in these cases. */ #ifdef PMD_SIZE -#define BPF_PROG_PACK_SIZE (PMD_SIZE * num_possible_nodes()) +/* PMD_SIZE is really big for some archs. It doesn't make sense to + * reserve too much memory in one allocation. Hardcode BPF_PROG_PACK_SIZE to + * 2MiB * num_possible_nodes(). On most architectures PMD_SIZE will be + * greater than or equal to 2MB. + */ +#define BPF_PROG_PACK_SIZE (SZ_2M * num_possible_nodes()) #else #define BPF_PROG_PACK_SIZE PAGE_SIZE #endif @@ -1675,6 +1685,7 @@ bool bpf_opcode_in_insntable(u8 code) [BPF_LD | BPF_IND | BPF_B] = true, [BPF_LD | BPF_IND | BPF_H] = true, [BPF_LD | BPF_IND | BPF_W] = true, + [BPF_JMP | BPF_JCOND] = true, }; #undef BPF_INSN_3_TBL #undef BPF_INSN_2_TBL @@ -2695,7 +2706,7 @@ void __bpf_free_used_maps(struct bpf_prog_aux *aux, bool sleepable; u32 i; - sleepable = aux->sleepable; + sleepable = aux->prog->sleepable; for (i = 0; i < len; i++) { map = used_maps[i]; if (map->ops->map_poke_untrack) @@ -2926,6 +2937,11 @@ bool __weak bpf_jit_supports_far_kfunc_call(void) return false; } +bool __weak bpf_jit_supports_arena(void) +{ + return false; +} + /* Return TRUE if the JIT backend satisfies the following two conditions: * 1) JIT backend supports atomic_xchg() on pointer-sized words. * 2) Under the specific arch, the implementation of xchg() is the same @@ -2970,6 +2986,17 @@ void __weak arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, { } +/* for configs without MMU or 32-bit */ +__weak const struct bpf_map_ops arena_map_ops; +__weak u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena) +{ + return 0; +} +__weak u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena) +{ + return 0; +} + #ifdef CONFIG_BPF_SYSCALL static int __init bpf_global_ma_init(void) { diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index a936c704d4e7..4e2cdbb5629f 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -130,13 +130,14 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) bpf_map_init_from_attr(&dtab->map, attr); if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { - dtab->n_buckets = roundup_pow_of_two(dtab->map.max_entries); - - if (!dtab->n_buckets) /* Overflow check */ + /* hash table size must be power of 2; roundup_pow_of_two() can + * overflow into UB on 32-bit arches, so check that first + */ + if (dtab->map.max_entries > 1UL << 31) return -EINVAL; - } - if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { + dtab->n_buckets = roundup_pow_of_two(dtab->map.max_entries); + dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets, dtab->map.numa_node); if (!dtab->dev_index_head) diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index 49940c26a227..bd2e2dd04740 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -166,6 +166,12 @@ static bool is_movsx(const struct bpf_insn *insn) (insn->off == 8 || insn->off == 16 || insn->off == 32); } +static bool is_addr_space_cast(const struct bpf_insn *insn) +{ + return insn->code == (BPF_ALU64 | BPF_MOV | BPF_X) && + insn->off == BPF_ADDR_SPACE_CAST; +} + void print_bpf_insn(const struct bpf_insn_cbs *cbs, const struct bpf_insn *insn, bool allow_ptr_leaks) @@ -184,6 +190,10 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, insn->code, class == BPF_ALU ? 'w' : 'r', insn->dst_reg, class == BPF_ALU ? 'w' : 'r', insn->dst_reg); + } else if (is_addr_space_cast(insn)) { + verbose(cbs->private_data, "(%02x) r%d = addr_space_cast(r%d, %d, %d)\n", + insn->code, insn->dst_reg, + insn->src_reg, ((u32)insn->imm) >> 16, (u16)insn->imm); } else if (BPF_SRC(insn->code) == BPF_X) { verbose(cbs->private_data, "(%02x) %c%d %s %s%c%d\n", insn->code, class == BPF_ALU ? 'w' : 'r', @@ -322,6 +332,10 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, } else if (insn->code == (BPF_JMP | BPF_JA)) { verbose(cbs->private_data, "(%02x) goto pc%+d\n", insn->code, insn->off); + } else if (insn->code == (BPF_JMP | BPF_JCOND) && + insn->src_reg == BPF_MAY_GOTO) { + verbose(cbs->private_data, "(%02x) may_goto pc%+d\n", + insn->code, insn->off); } else if (insn->code == (BPF_JMP32 | BPF_JA)) { verbose(cbs->private_data, "(%02x) gotol pc%+d\n", insn->code, insn->imm); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 03a6a2500b6a..3a088a5349bc 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -499,7 +499,13 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) num_possible_cpus()); } - /* hash table size must be power of 2 */ + /* hash table size must be power of 2; roundup_pow_of_two() can overflow + * into UB on 32-bit arches, so check that first + */ + err = -E2BIG; + if (htab->map.max_entries > 1UL << 31) + goto free_htab; + htab->n_buckets = roundup_pow_of_two(htab->map.max_entries); htab->elem_size = sizeof(struct htab_elem) + @@ -509,10 +515,8 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) else htab->elem_size += round_up(htab->map.value_size, 8); - err = -E2BIG; - /* prevent zero size kmalloc and check for u32 overflow */ - if (htab->n_buckets == 0 || - htab->n_buckets > U32_MAX / sizeof(struct bucket)) + /* check for u32 overflow */ + if (htab->n_buckets > U32_MAX / sizeof(struct bucket)) goto free_htab; err = bpf_map_init_elem_count(&htab->map); diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c index 63c34e7b0715..2a243cf37c60 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -458,6 +458,7 @@ const char *reg_type_str(struct bpf_verifier_env *env, enum bpf_reg_type type) [PTR_TO_XDP_SOCK] = "xdp_sock", [PTR_TO_BTF_ID] = "ptr_", [PTR_TO_MEM] = "mem", + [PTR_TO_ARENA] = "arena", [PTR_TO_BUF] = "buf", [PTR_TO_FUNC] = "func", [PTR_TO_MAP_KEY] = "map_key", @@ -693,6 +694,8 @@ static void print_reg_state(struct bpf_verifier_env *env, } verbose(env, "%s", reg_type_str(env, t)); + if (t == PTR_TO_ARENA) + return; if (t == PTR_TO_STACK) { if (state->frameno != reg->frameno) verbose(env, "[%d]", reg->frameno); diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index dff7ba539701..c99f8e5234ac 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -91,11 +91,14 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) } else if (value_size / 8 > sysctl_perf_event_max_stack) return ERR_PTR(-EINVAL); - /* hash table size must be power of 2 */ - n_buckets = roundup_pow_of_two(attr->max_entries); - if (!n_buckets) + /* hash table size must be power of 2; roundup_pow_of_two() can overflow + * into UB on 32-bit arches, so check that first + */ + if (attr->max_entries > 1UL << 31) return ERR_PTR(-E2BIG); + n_buckets = roundup_pow_of_two(attr->max_entries); + cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap); smap = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr)); if (!smap) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index b2750b79ac80..ae2ff73bde7e 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -164,6 +164,7 @@ static int bpf_map_update_value(struct bpf_map *map, struct file *map_file, if (bpf_map_is_offloaded(map)) { return bpf_map_offload_update_elem(map, key, value, flags); } else if (map->map_type == BPF_MAP_TYPE_CPUMAP || + map->map_type == BPF_MAP_TYPE_ARENA || map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { return map->ops->map_update_elem(map, key, value, flags); } else if (map->map_type == BPF_MAP_TYPE_SOCKHASH || @@ -479,6 +480,39 @@ static void bpf_map_release_memcg(struct bpf_map *map) } #endif +int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid, + unsigned long nr_pages, struct page **pages) +{ + unsigned long i, j; + struct page *pg; + int ret = 0; +#ifdef CONFIG_MEMCG_KMEM + struct mem_cgroup *memcg, *old_memcg; + + memcg = bpf_map_get_memcg(map); + old_memcg = set_active_memcg(memcg); +#endif + for (i = 0; i < nr_pages; i++) { + pg = alloc_pages_node(nid, gfp | __GFP_ACCOUNT, 0); + + if (pg) { + pages[i] = pg; + continue; + } + for (j = 0; j < i; j++) + __free_page(pages[j]); + ret = -ENOMEM; + break; + } + +#ifdef CONFIG_MEMCG_KMEM + set_active_memcg(old_memcg); + mem_cgroup_put(memcg); +#endif + return ret; +} + + static int btf_field_cmp(const void *a, const void *b) { const struct btf_field *f1 = a, *f2 = b; @@ -937,6 +971,21 @@ static __poll_t bpf_map_poll(struct file *filp, struct poll_table_struct *pts) return EPOLLERR; } +static unsigned long bpf_get_unmapped_area(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) +{ + struct bpf_map *map = filp->private_data; + + if (map->ops->map_get_unmapped_area) + return map->ops->map_get_unmapped_area(filp, addr, len, pgoff, flags); +#ifdef CONFIG_MMU + return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags); +#else + return addr; +#endif +} + const struct file_operations bpf_map_fops = { #ifdef CONFIG_PROC_FS .show_fdinfo = bpf_map_show_fdinfo, @@ -946,6 +995,7 @@ const struct file_operations bpf_map_fops = { .write = bpf_dummy_write, .mmap = bpf_map_mmap, .poll = bpf_map_poll, + .get_unmapped_area = bpf_get_unmapped_area, }; int bpf_map_new_fd(struct bpf_map *map, int flags) @@ -1160,6 +1210,7 @@ static int map_create(union bpf_attr *attr) } if (attr->map_type != BPF_MAP_TYPE_BLOOM_FILTER && + attr->map_type != BPF_MAP_TYPE_ARENA && attr->map_extra != 0) return -EINVAL; @@ -1249,6 +1300,7 @@ static int map_create(union bpf_attr *attr) case BPF_MAP_TYPE_LRU_PERCPU_HASH: case BPF_MAP_TYPE_STRUCT_OPS: case BPF_MAP_TYPE_CPUMAP: + case BPF_MAP_TYPE_ARENA: if (!bpf_token_capable(token, CAP_BPF)) goto put_token; break; @@ -2196,7 +2248,7 @@ static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred) btf_put(prog->aux->attach_btf); if (deferred) { - if (prog->aux->sleepable) + if (prog->sleepable) call_rcu_tasks_trace(&prog->aux->rcu, __bpf_prog_put_rcu); else call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); @@ -2761,11 +2813,11 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) } prog->expected_attach_type = attr->expected_attach_type; + prog->sleepable = !!(attr->prog_flags & BPF_F_SLEEPABLE); prog->aux->attach_btf = attach_btf; prog->aux->attach_btf_id = attr->attach_btf_id; prog->aux->dst_prog = dst_prog; prog->aux->dev_bound = !!attr->prog_ifindex; - prog->aux->sleepable = attr->prog_flags & BPF_F_SLEEPABLE; prog->aux->xdp_has_frags = attr->prog_flags & BPF_F_XDP_HAS_FRAGS; /* move token into prog->aux, reuse taken refcnt */ @@ -4401,6 +4453,12 @@ static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog, continue; } + if ((BPF_CLASS(code) == BPF_LDX || BPF_CLASS(code) == BPF_STX || + BPF_CLASS(code) == BPF_ST) && BPF_MODE(code) == BPF_PROBE_MEM32) { + insns[i].code = BPF_CLASS(code) | BPF_SIZE(code) | BPF_MEM; + continue; + } + if (code != (BPF_LD | BPF_IMM | BPF_DW)) continue; @@ -5496,7 +5554,7 @@ static int bpf_prog_bind_map(union bpf_attr *attr) /* The bpf program will not access the bpf map, but for the sake of * simplicity, increase sleepable_refcnt for sleepable program as well. */ - if (prog->aux->sleepable) + if (prog->sleepable) atomic64_inc(&map->sleepable_refcnt); memcpy(used_maps_new, used_maps_old, sizeof(used_maps_old[0]) * prog->aux->used_map_cnt); diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index d382f5ebe06c..db7599c59c78 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -1014,7 +1014,7 @@ void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr) bpf_trampoline_enter_t bpf_trampoline_enter(const struct bpf_prog *prog) { - bool sleepable = prog->aux->sleepable; + bool sleepable = prog->sleepable; if (bpf_prog_check_recur(prog)) return sleepable ? __bpf_prog_enter_sleepable_recur : @@ -1029,7 +1029,7 @@ bpf_trampoline_enter_t bpf_trampoline_enter(const struct bpf_prog *prog) bpf_trampoline_exit_t bpf_trampoline_exit(const struct bpf_prog *prog) { - bool sleepable = prog->aux->sleepable; + bool sleepable = prog->sleepable; if (bpf_prog_check_recur(prog)) return sleepable ? __bpf_prog_exit_sleepable_recur : diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 21f80383f8b2..63749ad5ac6b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -533,6 +533,16 @@ static bool is_async_callback_calling_insn(struct bpf_insn *insn) return bpf_helper_call(insn) && is_async_callback_calling_function(insn->imm); } +static bool is_may_goto_insn(struct bpf_insn *insn) +{ + return insn->code == (BPF_JMP | BPF_JCOND) && insn->src_reg == BPF_MAY_GOTO; +} + +static bool is_may_goto_insn_at(struct bpf_verifier_env *env, int insn_idx) +{ + return is_may_goto_insn(&env->prog->insnsi[insn_idx]); +} + static bool is_storage_get_function(enum bpf_func_id func_id) { return func_id == BPF_FUNC_sk_storage_get || @@ -1429,6 +1439,7 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, dst_state->dfs_depth = src->dfs_depth; dst_state->callback_unroll_depth = src->callback_unroll_depth; dst_state->used_as_loop_entry = src->used_as_loop_entry; + dst_state->may_goto_depth = src->may_goto_depth; for (i = 0; i <= src->curframe; i++) { dst = dst_state->frame[i]; if (!dst) { @@ -4375,6 +4386,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_MEM: case PTR_TO_FUNC: case PTR_TO_MAP_KEY: + case PTR_TO_ARENA: return true; default: return false; @@ -5262,7 +5274,7 @@ bad_type: static bool in_sleepable(struct bpf_verifier_env *env) { - return env->prog->aux->sleepable; + return env->prog->sleepable; } /* The non-sleepable programs and sleepable programs with explicit bpf_rcu_read_lock() @@ -5817,6 +5829,8 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, case PTR_TO_XDP_SOCK: pointer_desc = "xdp_sock "; break; + case PTR_TO_ARENA: + return 0; default: break; } @@ -6926,6 +6940,9 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (!err && value_regno >= 0 && (rdonly_mem || t == BPF_READ)) mark_reg_unknown(env, regs, value_regno); + } else if (reg->type == PTR_TO_ARENA) { + if (t == BPF_READ && value_regno >= 0) + mark_reg_unknown(env, regs, value_regno); } else { verbose(env, "R%d invalid mem access '%s'\n", regno, reg_type_str(env, reg->type)); @@ -8397,6 +8414,7 @@ static int check_func_arg_reg_off(struct bpf_verifier_env *env, case PTR_TO_MEM | MEM_RINGBUF: case PTR_TO_BUF: case PTR_TO_BUF | MEM_RDONLY: + case PTR_TO_ARENA: case SCALAR_VALUE: return 0; /* All the rest must be rejected, except PTR_TO_BTF_ID which allows @@ -9361,6 +9379,18 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, bpf_log(log, "arg#%d is expected to be non-NULL\n", i); return -EINVAL; } + } else if (base_type(arg->arg_type) == ARG_PTR_TO_ARENA) { + /* + * Can pass any value and the kernel won't crash, but + * only PTR_TO_ARENA or SCALAR make sense. Everything + * else is a bug in the bpf program. Point it out to + * the user at the verification time instead of + * run-time debug nightmare. + */ + if (reg->type != PTR_TO_ARENA && reg->type != SCALAR_VALUE) { + bpf_log(log, "R%d is not a pointer to arena or scalar.\n", regno); + return -EINVAL; + } } else if (arg->arg_type == (ARG_PTR_TO_DYNPTR | MEM_RDONLY)) { ret = process_dynptr_func(env, regno, -1, arg->arg_type, 0); if (ret) @@ -10741,6 +10771,11 @@ static bool is_kfunc_arg_ignore(const struct btf *btf, const struct btf_param *a return btf_param_match_suffix(btf, arg, "__ign"); } +static bool is_kfunc_arg_map(const struct btf *btf, const struct btf_param *arg) +{ + return btf_param_match_suffix(btf, arg, "__map"); +} + static bool is_kfunc_arg_alloc_obj(const struct btf *btf, const struct btf_param *arg) { return btf_param_match_suffix(btf, arg, "__alloc"); @@ -10910,6 +10945,7 @@ enum kfunc_ptr_arg_type { KF_ARG_PTR_TO_RB_NODE, KF_ARG_PTR_TO_NULL, KF_ARG_PTR_TO_CONST_STR, + KF_ARG_PTR_TO_MAP, }; enum special_kfunc_type { @@ -11063,6 +11099,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, if (is_kfunc_arg_const_str(meta->btf, &args[argno])) return KF_ARG_PTR_TO_CONST_STR; + if (is_kfunc_arg_map(meta->btf, &args[argno])) + return KF_ARG_PTR_TO_MAP; + if ((base_type(reg->type) == PTR_TO_BTF_ID || reg2btf_ids[base_type(reg->type)])) { if (!btf_type_is_struct(ref_t)) { verbose(env, "kernel function %s args#%d pointer type %s %s is not supported\n", @@ -11663,6 +11702,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ switch (kf_arg_type) { case KF_ARG_PTR_TO_NULL: continue; + case KF_ARG_PTR_TO_MAP: case KF_ARG_PTR_TO_ALLOC_BTF_ID: case KF_ARG_PTR_TO_BTF_ID: if (!is_kfunc_trusted_args(meta) && !is_kfunc_rcu(meta)) @@ -11879,6 +11919,12 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if (ret < 0) return ret; break; + case KF_ARG_PTR_TO_MAP: + /* If argument has '__map' suffix expect 'struct bpf_map *' */ + ref_id = *reg2btf_ids[CONST_PTR_TO_MAP]; + ref_t = btf_type_by_id(btf_vmlinux, ref_id); + ref_tname = btf_name_by_offset(btf, ref_t->name_off); + fallthrough; case KF_ARG_PTR_TO_BTF_ID: /* Only base_type is checked, further checks are done here */ if ((base_type(reg->type) != PTR_TO_BTF_ID || @@ -12353,6 +12399,9 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, meta.func_name); return -EFAULT; } + } else if (btf_type_is_void(ptr_type)) { + /* kfunc returning 'void *' is equivalent to returning scalar */ + mark_reg_unknown(env, regs, BPF_REG_0); } else if (!__btf_type_is_struct(ptr_type)) { if (!meta.r0_size) { __u32 sz; @@ -13822,6 +13871,21 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, dst_reg = ®s[insn->dst_reg]; src_reg = NULL; + + if (dst_reg->type == PTR_TO_ARENA) { + struct bpf_insn_aux_data *aux = cur_aux(env); + + if (BPF_CLASS(insn->code) == BPF_ALU64) + /* + * 32-bit operations zero upper bits automatically. + * 64-bit operations need to be converted to 32. + */ + aux->needs_zext = true; + + /* Any arithmetic operations are allowed on arena pointers */ + return 0; + } + if (dst_reg->type != SCALAR_VALUE) ptr_reg = dst_reg; else @@ -13939,19 +14003,20 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } else if (opcode == BPF_MOV) { if (BPF_SRC(insn->code) == BPF_X) { - if (insn->imm != 0) { - verbose(env, "BPF_MOV uses reserved fields\n"); - return -EINVAL; - } - if (BPF_CLASS(insn->code) == BPF_ALU) { - if (insn->off != 0 && insn->off != 8 && insn->off != 16) { + if ((insn->off != 0 && insn->off != 8 && insn->off != 16) || + insn->imm) { verbose(env, "BPF_MOV uses reserved fields\n"); return -EINVAL; } + } else if (insn->off == BPF_ADDR_SPACE_CAST) { + if (insn->imm != 1 && insn->imm != 1u << 16) { + verbose(env, "addr_space_cast insn can only convert between address space 1 and 0\n"); + return -EINVAL; + } } else { - if (insn->off != 0 && insn->off != 8 && insn->off != 16 && - insn->off != 32) { + if ((insn->off != 0 && insn->off != 8 && insn->off != 16 && + insn->off != 32) || insn->imm) { verbose(env, "BPF_MOV uses reserved fields\n"); return -EINVAL; } @@ -13978,7 +14043,12 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) struct bpf_reg_state *dst_reg = regs + insn->dst_reg; if (BPF_CLASS(insn->code) == BPF_ALU64) { - if (insn->off == 0) { + if (insn->imm) { + /* off == BPF_ADDR_SPACE_CAST */ + mark_reg_unknown(env, regs, insn->dst_reg); + if (insn->imm == 1) /* cast from as(1) to as(0) */ + dst_reg->type = PTR_TO_ARENA; + } else if (insn->off == 0) { /* case: R1 = R2 * copy register state to dest reg */ @@ -14871,11 +14941,36 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, int err; /* Only conditional jumps are expected to reach here. */ - if (opcode == BPF_JA || opcode > BPF_JSLE) { + if (opcode == BPF_JA || opcode > BPF_JCOND) { verbose(env, "invalid BPF_JMP/JMP32 opcode %x\n", opcode); return -EINVAL; } + if (opcode == BPF_JCOND) { + struct bpf_verifier_state *cur_st = env->cur_state, *queued_st, *prev_st; + int idx = *insn_idx; + + if (insn->code != (BPF_JMP | BPF_JCOND) || + insn->src_reg != BPF_MAY_GOTO || + insn->dst_reg || insn->imm || insn->off == 0) { + verbose(env, "invalid may_goto off %d imm %d\n", + insn->off, insn->imm); + return -EINVAL; + } + prev_st = find_prev_entry(env, cur_st->parent, idx); + + /* branch out 'fallthrough' insn as a new state to explore */ + queued_st = push_stack(env, idx + 1, idx, false); + if (!queued_st) + return -ENOMEM; + + queued_st->may_goto_depth++; + if (prev_st) + widen_imprecise_scalars(env, prev_st, queued_st); + *insn_idx += insn->off; + return 0; + } + /* check src2 operand */ err = check_reg_arg(env, insn->dst_reg, SRC_OP); if (err) @@ -15127,6 +15222,10 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) if (insn->src_reg == BPF_PSEUDO_MAP_VALUE || insn->src_reg == BPF_PSEUDO_MAP_IDX_VALUE) { + if (map->map_type == BPF_MAP_TYPE_ARENA) { + __mark_reg_unknown(env, dst_reg); + return 0; + } dst_reg->type = PTR_TO_MAP_VALUE; dst_reg->off = aux->map_off; WARN_ON_ONCE(map->max_entries != 1); @@ -15659,6 +15758,8 @@ static int visit_insn(int t, struct bpf_verifier_env *env) default: /* conditional jump with two edges */ mark_prune_point(env, t); + if (is_may_goto_insn(insn)) + mark_force_checkpoint(env, t); ret = push_insn(t, t + 1, FALLTHROUGH, env); if (ret) @@ -16222,8 +16323,8 @@ static int check_btf_info(struct bpf_verifier_env *env, } /* check %cur's range satisfies %old's */ -static bool range_within(struct bpf_reg_state *old, - struct bpf_reg_state *cur) +static bool range_within(const struct bpf_reg_state *old, + const struct bpf_reg_state *cur) { return old->umin_value <= cur->umin_value && old->umax_value >= cur->umax_value && @@ -16387,21 +16488,28 @@ static bool regs_exact(const struct bpf_reg_state *rold, check_ids(rold->ref_obj_id, rcur->ref_obj_id, idmap); } +enum exact_level { + NOT_EXACT, + EXACT, + RANGE_WITHIN +}; + /* Returns true if (rold safe implies rcur safe) */ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, - struct bpf_reg_state *rcur, struct bpf_idmap *idmap, bool exact) + struct bpf_reg_state *rcur, struct bpf_idmap *idmap, + enum exact_level exact) { - if (exact) + if (exact == EXACT) return regs_exact(rold, rcur, idmap); - if (!(rold->live & REG_LIVE_READ)) + if (!(rold->live & REG_LIVE_READ) && exact == NOT_EXACT) /* explored state didn't use this */ return true; - if (rold->type == NOT_INIT) - /* explored state can't have used this */ - return true; - if (rcur->type == NOT_INIT) - return false; + if (rold->type == NOT_INIT) { + if (exact == NOT_EXACT || rcur->type == NOT_INIT) + /* explored state can't have used this */ + return true; + } /* Enforce that register types have to match exactly, including their * modifiers (like PTR_MAYBE_NULL, MEM_RDONLY, etc), as a general @@ -16436,7 +16544,7 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 && check_scalar_ids(rold->id, rcur->id, idmap); } - if (!rold->precise) + if (!rold->precise && exact == NOT_EXACT) return true; /* Why check_ids() for scalar registers? * @@ -16504,6 +16612,8 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, * the same stack frame, since fp-8 in foo != fp-8 in bar */ return regs_exact(rold, rcur, idmap) && rold->frameno == rcur->frameno; + case PTR_TO_ARENA: + return true; default: return regs_exact(rold, rcur, idmap); } @@ -16547,7 +16657,8 @@ static struct bpf_reg_state *scalar_reg_for_stack(struct bpf_verifier_env *env, } static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, - struct bpf_func_state *cur, struct bpf_idmap *idmap, bool exact) + struct bpf_func_state *cur, struct bpf_idmap *idmap, + enum exact_level exact) { int i, spi; @@ -16560,12 +16671,13 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, spi = i / BPF_REG_SIZE; - if (exact && + if (exact != NOT_EXACT && old->stack[spi].slot_type[i % BPF_REG_SIZE] != cur->stack[spi].slot_type[i % BPF_REG_SIZE]) return false; - if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ) && !exact) { + if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ) + && exact == NOT_EXACT) { i += BPF_REG_SIZE - 1; /* explored state didn't use this */ continue; @@ -16711,7 +16823,7 @@ static bool refsafe(struct bpf_func_state *old, struct bpf_func_state *cur, * the current state will reach 'bpf_exit' instruction safely */ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_state *old, - struct bpf_func_state *cur, bool exact) + struct bpf_func_state *cur, enum exact_level exact) { int i; @@ -16741,7 +16853,7 @@ static void reset_idmap_scratch(struct bpf_verifier_env *env) static bool states_equal(struct bpf_verifier_env *env, struct bpf_verifier_state *old, struct bpf_verifier_state *cur, - bool exact) + enum exact_level exact) { int i; @@ -17115,7 +17227,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) * => unsafe memory access at 11 would not be caught. */ if (is_iter_next_insn(env, insn_idx)) { - if (states_equal(env, &sl->state, cur, true)) { + if (states_equal(env, &sl->state, cur, RANGE_WITHIN)) { struct bpf_func_state *cur_frame; struct bpf_reg_state *iter_state, *iter_reg; int spi; @@ -17138,15 +17250,23 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) } goto skip_inf_loop_check; } + if (is_may_goto_insn_at(env, insn_idx)) { + if (states_equal(env, &sl->state, cur, RANGE_WITHIN)) { + update_loop_entry(cur, &sl->state); + goto hit; + } + goto skip_inf_loop_check; + } if (calls_callback(env, insn_idx)) { - if (states_equal(env, &sl->state, cur, true)) + if (states_equal(env, &sl->state, cur, RANGE_WITHIN)) goto hit; goto skip_inf_loop_check; } /* attempt to detect infinite loop to avoid unnecessary doomed work */ if (states_maybe_looping(&sl->state, cur) && - states_equal(env, &sl->state, cur, true) && + states_equal(env, &sl->state, cur, EXACT) && !iter_active_depths_differ(&sl->state, cur) && + sl->state.may_goto_depth == cur->may_goto_depth && sl->state.callback_unroll_depth == cur->callback_unroll_depth) { verbose_linfo(env, insn_idx, "; "); verbose(env, "infinite loop detected at insn %d\n", insn_idx); @@ -17202,7 +17322,7 @@ skip_inf_loop_check: */ loop_entry = get_loop_entry(&sl->state); force_exact = loop_entry && loop_entry->branches > 0; - if (states_equal(env, &sl->state, cur, force_exact)) { + if (states_equal(env, &sl->state, cur, force_exact ? RANGE_WITHIN : NOT_EXACT)) { if (force_exact) update_loop_entry(cur, loop_entry); hit: @@ -17372,6 +17492,7 @@ static bool reg_type_mismatch_ok(enum bpf_reg_type type) case PTR_TO_TCP_SOCK: case PTR_TO_XDP_SOCK: case PTR_TO_BTF_ID: + case PTR_TO_ARENA: return false; default: return true; @@ -18019,7 +18140,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, return -EINVAL; } - if (prog->aux->sleepable) + if (prog->sleepable) switch (map->map_type) { case BPF_MAP_TYPE_HASH: case BPF_MAP_TYPE_LRU_HASH: @@ -18037,6 +18158,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, case BPF_MAP_TYPE_CGRP_STORAGE: case BPF_MAP_TYPE_QUEUE: case BPF_MAP_TYPE_STACK: + case BPF_MAP_TYPE_ARENA: break; default: verbose(env, @@ -18206,7 +18328,7 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env) return -E2BIG; } - if (env->prog->aux->sleepable) + if (env->prog->sleepable) atomic64_inc(&map->sleepable_refcnt); /* hold the map. If the program is rejected by verifier, * the map will be released by release_maps() or it @@ -18224,6 +18346,31 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env) fdput(f); return -EBUSY; } + if (map->map_type == BPF_MAP_TYPE_ARENA) { + if (env->prog->aux->arena) { + verbose(env, "Only one arena per program\n"); + fdput(f); + return -EBUSY; + } + if (!env->allow_ptr_leaks || !env->bpf_capable) { + verbose(env, "CAP_BPF and CAP_PERFMON are required to use arena\n"); + fdput(f); + return -EPERM; + } + if (!env->prog->jit_requested) { + verbose(env, "JIT is required to use arena\n"); + return -EOPNOTSUPP; + } + if (!bpf_jit_supports_arena()) { + verbose(env, "JIT doesn't support arena\n"); + return -EOPNOTSUPP; + } + env->prog->aux->arena = (void *)map; + if (!bpf_arena_get_user_vm_start(env->prog->aux->arena)) { + verbose(env, "arena's user address must be set via map_extra or mmap()\n"); + return -EINVAL; + } + } fdput(f); next_insn: @@ -18845,6 +18992,14 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) env->prog->aux->num_exentries++; } continue; + case PTR_TO_ARENA: + if (BPF_MODE(insn->code) == BPF_MEMSX) { + verbose(env, "sign extending loads from arena are not supported yet\n"); + return -EOPNOTSUPP; + } + insn->code = BPF_CLASS(insn->code) | BPF_PROBE_MEM32 | BPF_SIZE(insn->code); + env->prog->aux->num_exentries++; + continue; default: continue; } @@ -19030,13 +19185,19 @@ static int jit_subprogs(struct bpf_verifier_env *env) func[i]->aux->nr_linfo = prog->aux->nr_linfo; func[i]->aux->jited_linfo = prog->aux->jited_linfo; func[i]->aux->linfo_idx = env->subprog_info[i].linfo_idx; + func[i]->aux->arena = prog->aux->arena; num_exentries = 0; insn = func[i]->insnsi; for (j = 0; j < func[i]->len; j++, insn++) { if (BPF_CLASS(insn->code) == BPF_LDX && (BPF_MODE(insn->code) == BPF_PROBE_MEM || + BPF_MODE(insn->code) == BPF_PROBE_MEM32 || BPF_MODE(insn->code) == BPF_PROBE_MEMSX)) num_exentries++; + if ((BPF_CLASS(insn->code) == BPF_STX || + BPF_CLASS(insn->code) == BPF_ST) && + BPF_MODE(insn->code) == BPF_PROBE_MEM32) + num_exentries++; } func[i]->aux->num_exentries = num_exentries; func[i]->aux->tail_call_reachable = env->subprog_info[i].tail_call_reachable; @@ -19411,7 +19572,10 @@ static int do_misc_fixups(struct bpf_verifier_env *env) struct bpf_insn insn_buf[16]; struct bpf_prog *new_prog; struct bpf_map *map_ptr; - int i, ret, cnt, delta = 0; + int i, ret, cnt, delta = 0, cur_subprog = 0; + struct bpf_subprog_info *subprogs = env->subprog_info; + u16 stack_depth = subprogs[cur_subprog].stack_depth; + u16 stack_depth_extra = 0; if (env->seen_exception && !env->exception_callback_subprog) { struct bpf_insn patch[] = { @@ -19431,7 +19595,22 @@ static int do_misc_fixups(struct bpf_verifier_env *env) mark_subprog_exc_cb(env, env->exception_callback_subprog); } - for (i = 0; i < insn_cnt; i++, insn++) { + for (i = 0; i < insn_cnt;) { + if (insn->code == (BPF_ALU64 | BPF_MOV | BPF_X) && insn->imm) { + if ((insn->off == BPF_ADDR_SPACE_CAST && insn->imm == 1) || + (((struct bpf_map *)env->prog->aux->arena)->map_flags & BPF_F_NO_USER_CONV)) { + /* convert to 32-bit mov that clears upper 32-bit */ + insn->code = BPF_ALU | BPF_MOV | BPF_X; + /* clear off, so it's a normal 'wX = wY' from JIT pov */ + insn->off = 0; + } /* cast from as(0) to as(1) should be handled by JIT */ + goto next_insn; + } + + if (env->insn_aux_data[i + delta].needs_zext) + /* Convert BPF_CLASS(insn->code) == BPF_ALU64 to 32-bit ALU */ + insn->code = BPF_ALU | BPF_OP(insn->code) | BPF_SRC(insn->code); + /* Make divide-by-zero exceptions impossible. */ if (insn->code == (BPF_ALU64 | BPF_MOD | BPF_X) || insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) || @@ -19470,7 +19649,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) delta += cnt - 1; env->prog = prog = new_prog; insn = new_prog->insnsi + i + delta; - continue; + goto next_insn; } /* Implement LD_ABS and LD_IND with a rewrite, if supported by the program type. */ @@ -19490,7 +19669,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) delta += cnt - 1; env->prog = prog = new_prog; insn = new_prog->insnsi + i + delta; - continue; + goto next_insn; } /* Rewrite pointer arithmetic to mitigate speculation attacks. */ @@ -19505,7 +19684,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) aux = &env->insn_aux_data[i + delta]; if (!aux->alu_state || aux->alu_state == BPF_ALU_NON_POINTER) - continue; + goto next_insn; isneg = aux->alu_state & BPF_ALU_NEG_VALUE; issrc = (aux->alu_state & BPF_ALU_SANITIZE) == @@ -19543,19 +19722,39 @@ static int do_misc_fixups(struct bpf_verifier_env *env) delta += cnt - 1; env->prog = prog = new_prog; insn = new_prog->insnsi + i + delta; - continue; + goto next_insn; + } + + if (is_may_goto_insn(insn)) { + int stack_off = -stack_depth - 8; + + stack_depth_extra = 8; + insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_AX, BPF_REG_10, stack_off); + insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off + 2); + insn_buf[2] = BPF_ALU64_IMM(BPF_SUB, BPF_REG_AX, 1); + insn_buf[3] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_AX, stack_off); + cnt = 4; + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + goto next_insn; } if (insn->code != (BPF_JMP | BPF_CALL)) - continue; + goto next_insn; if (insn->src_reg == BPF_PSEUDO_CALL) - continue; + goto next_insn; if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) { ret = fixup_kfunc_call(env, insn, insn_buf, i + delta, &cnt); if (ret) return ret; if (cnt == 0) - continue; + goto next_insn; new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); if (!new_prog) @@ -19564,7 +19763,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) delta += cnt - 1; env->prog = prog = new_prog; insn = new_prog->insnsi + i + delta; - continue; + goto next_insn; } if (insn->imm == BPF_FUNC_get_route_realm) @@ -19612,11 +19811,11 @@ static int do_misc_fixups(struct bpf_verifier_env *env) } insn->imm = ret + 1; - continue; + goto next_insn; } if (!bpf_map_ptr_unpriv(aux)) - continue; + goto next_insn; /* instead of changing every JIT dealing with tail_call * emit two extra insns: @@ -19645,7 +19844,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) delta += cnt - 1; env->prog = prog = new_prog; insn = new_prog->insnsi + i + delta; - continue; + goto next_insn; } if (insn->imm == BPF_FUNC_timer_set_callback) { @@ -19757,7 +19956,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) delta += cnt - 1; env->prog = prog = new_prog; insn = new_prog->insnsi + i + delta; - continue; + goto next_insn; } BUILD_BUG_ON(!__same_type(ops->map_lookup_elem, @@ -19788,31 +19987,31 @@ patch_map_ops_generic: switch (insn->imm) { case BPF_FUNC_map_lookup_elem: insn->imm = BPF_CALL_IMM(ops->map_lookup_elem); - continue; + goto next_insn; case BPF_FUNC_map_update_elem: insn->imm = BPF_CALL_IMM(ops->map_update_elem); - continue; + goto next_insn; case BPF_FUNC_map_delete_elem: insn->imm = BPF_CALL_IMM(ops->map_delete_elem); - continue; + goto next_insn; case BPF_FUNC_map_push_elem: insn->imm = BPF_CALL_IMM(ops->map_push_elem); - continue; + goto next_insn; case BPF_FUNC_map_pop_elem: insn->imm = BPF_CALL_IMM(ops->map_pop_elem); - continue; + goto next_insn; case BPF_FUNC_map_peek_elem: insn->imm = BPF_CALL_IMM(ops->map_peek_elem); - continue; + goto next_insn; case BPF_FUNC_redirect_map: insn->imm = BPF_CALL_IMM(ops->map_redirect); - continue; + goto next_insn; case BPF_FUNC_for_each_map_elem: insn->imm = BPF_CALL_IMM(ops->map_for_each_callback); - continue; + goto next_insn; case BPF_FUNC_map_lookup_percpu_elem: insn->imm = BPF_CALL_IMM(ops->map_lookup_percpu_elem); - continue; + goto next_insn; } goto patch_call_imm; @@ -19840,7 +20039,7 @@ patch_map_ops_generic: delta += cnt - 1; env->prog = prog = new_prog; insn = new_prog->insnsi + i + delta; - continue; + goto next_insn; } /* Implement bpf_get_func_arg inline. */ @@ -19865,7 +20064,7 @@ patch_map_ops_generic: delta += cnt - 1; env->prog = prog = new_prog; insn = new_prog->insnsi + i + delta; - continue; + goto next_insn; } /* Implement bpf_get_func_ret inline. */ @@ -19893,7 +20092,7 @@ patch_map_ops_generic: delta += cnt - 1; env->prog = prog = new_prog; insn = new_prog->insnsi + i + delta; - continue; + goto next_insn; } /* Implement get_func_arg_cnt inline. */ @@ -19908,7 +20107,7 @@ patch_map_ops_generic: env->prog = prog = new_prog; insn = new_prog->insnsi + i + delta; - continue; + goto next_insn; } /* Implement bpf_get_func_ip inline. */ @@ -19923,7 +20122,7 @@ patch_map_ops_generic: env->prog = prog = new_prog; insn = new_prog->insnsi + i + delta; - continue; + goto next_insn; } /* Implement bpf_kptr_xchg inline */ @@ -19941,7 +20140,7 @@ patch_map_ops_generic: delta += cnt - 1; env->prog = prog = new_prog; insn = new_prog->insnsi + i + delta; - continue; + goto next_insn; } patch_call_imm: fn = env->ops->get_func_proto(insn->imm, env->prog); @@ -19955,6 +20154,40 @@ patch_call_imm: return -EFAULT; } insn->imm = fn->func - __bpf_call_base; +next_insn: + if (subprogs[cur_subprog + 1].start == i + delta + 1) { + subprogs[cur_subprog].stack_depth += stack_depth_extra; + subprogs[cur_subprog].stack_extra = stack_depth_extra; + cur_subprog++; + stack_depth = subprogs[cur_subprog].stack_depth; + stack_depth_extra = 0; + } + i++; + insn++; + } + + env->prog->aux->stack_depth = subprogs[0].stack_depth; + for (i = 0; i < env->subprog_cnt; i++) { + int subprog_start = subprogs[i].start; + int stack_slots = subprogs[i].stack_extra / 8; + + if (!stack_slots) + continue; + if (stack_slots > 1) { + verbose(env, "verifier bug: stack_slots supports may_goto only\n"); + return -EFAULT; + } + + /* Add ST insn to subprog prologue to init extra stack */ + insn_buf[0] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, + -subprogs[i].stack_depth, BPF_MAX_LOOPS); + /* Copy first actual insn to preserve it */ + insn_buf[1] = env->prog->insnsi[subprog_start]; + + new_prog = bpf_patch_insn_data(env, subprog_start, insn_buf, 2); + if (!new_prog) + return -ENOMEM; + env->prog = prog = new_prog; } /* Since poke tab is now finalized, publish aux to tracker. */ @@ -20230,6 +20463,9 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) reg->btf = bpf_get_btf_vmlinux(); /* can't fail at this point */ reg->btf_id = arg->btf_id; reg->id = ++env->id_gen; + } else if (base_type(arg->arg_type) == ARG_PTR_TO_ARENA) { + /* caller can pass either PTR_TO_ARENA or SCALAR */ + mark_reg_unknown(env, regs, i); } else { WARN_ONCE(1, "BUG: unhandled arg#%d type %d\n", i - BPF_REG_1, arg->arg_type); @@ -20705,7 +20941,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, } } - if (prog->aux->sleepable) { + if (prog->sleepable) { ret = -EINVAL; switch (prog->type) { case BPF_PROG_TYPE_TRACING: @@ -20816,14 +21052,14 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) u64 key; if (prog->type == BPF_PROG_TYPE_SYSCALL) { - if (prog->aux->sleepable) + if (prog->sleepable) /* attach_btf_id checked to be zero already */ return 0; verbose(env, "Syscall programs can only be sleepable\n"); return -EINVAL; } - if (prog->aux->sleepable && !can_be_sleepable(prog)) { + if (prog->sleepable && !can_be_sleepable(prog)) { verbose(env, "Only fentry/fexit/fmod_ret, lsm, iter, uprobe, and struct_ops programs can be sleepable\n"); return -EINVAL; } |