diff options
Diffstat (limited to 'kernel/bpf')
-rw-r--r-- | kernel/bpf/Makefile | 3 | ||||
-rw-r--r-- | kernel/bpf/bpf_lru_list.c | 695 | ||||
-rw-r--r-- | kernel/bpf/bpf_lru_list.h | 84 | ||||
-rw-r--r-- | kernel/bpf/cgroup.c | 200 | ||||
-rw-r--r-- | kernel/bpf/core.c | 1 | ||||
-rw-r--r-- | kernel/bpf/hashtab.c | 438 | ||||
-rw-r--r-- | kernel/bpf/helpers.c | 12 | ||||
-rw-r--r-- | kernel/bpf/inode.c | 99 | ||||
-rw-r--r-- | kernel/bpf/syscall.c | 152 | ||||
-rw-r--r-- | kernel/bpf/verifier.c | 112 |
10 files changed, 1694 insertions, 102 deletions
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index eed911d091da..1276474ac3cd 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1,7 +1,8 @@ obj-y := core.o obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o -obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o +obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_BPF_SYSCALL) += stackmap.o endif +obj-$(CONFIG_CGROUP_BPF) += cgroup.o diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c new file mode 100644 index 000000000000..89b7ef41c86b --- /dev/null +++ b/kernel/bpf/bpf_lru_list.c @@ -0,0 +1,695 @@ +/* Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include <linux/cpumask.h> +#include <linux/spinlock.h> +#include <linux/percpu.h> + +#include "bpf_lru_list.h" + +#define LOCAL_FREE_TARGET (128) +#define LOCAL_NR_SCANS LOCAL_FREE_TARGET + +#define PERCPU_FREE_TARGET (16) +#define PERCPU_NR_SCANS PERCPU_FREE_TARGET + +/* Helpers to get the local list index */ +#define LOCAL_LIST_IDX(t) ((t) - BPF_LOCAL_LIST_T_OFFSET) +#define LOCAL_FREE_LIST_IDX LOCAL_LIST_IDX(BPF_LRU_LOCAL_LIST_T_FREE) +#define LOCAL_PENDING_LIST_IDX LOCAL_LIST_IDX(BPF_LRU_LOCAL_LIST_T_PENDING) +#define IS_LOCAL_LIST_TYPE(t) ((t) >= BPF_LOCAL_LIST_T_OFFSET) + +static int get_next_cpu(int cpu) +{ + cpu = cpumask_next(cpu, cpu_possible_mask); + if (cpu >= nr_cpu_ids) + cpu = cpumask_first(cpu_possible_mask); + return cpu; +} + +/* Local list helpers */ +static struct list_head *local_free_list(struct bpf_lru_locallist *loc_l) +{ + return &loc_l->lists[LOCAL_FREE_LIST_IDX]; +} + +static struct list_head *local_pending_list(struct bpf_lru_locallist *loc_l) +{ + return &loc_l->lists[LOCAL_PENDING_LIST_IDX]; +} + +/* bpf_lru_node helpers */ +static bool bpf_lru_node_is_ref(const struct bpf_lru_node *node) +{ + return node->ref; +} + +static void bpf_lru_list_count_inc(struct bpf_lru_list *l, + enum bpf_lru_list_type type) +{ + if (type < NR_BPF_LRU_LIST_COUNT) + l->counts[type]++; +} + +static void bpf_lru_list_count_dec(struct bpf_lru_list *l, + enum bpf_lru_list_type type) +{ + if (type < NR_BPF_LRU_LIST_COUNT) + l->counts[type]--; +} + +static void __bpf_lru_node_move_to_free(struct bpf_lru_list *l, + struct bpf_lru_node *node, + struct list_head *free_list, + enum bpf_lru_list_type tgt_free_type) +{ + if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type))) + return; + + /* If the removing node is the next_inactive_rotation candidate, + * move the next_inactive_rotation pointer also. + */ + if (&node->list == l->next_inactive_rotation) + l->next_inactive_rotation = l->next_inactive_rotation->prev; + + bpf_lru_list_count_dec(l, node->type); + + node->type = tgt_free_type; + list_move(&node->list, free_list); +} + +/* Move nodes from local list to the LRU list */ +static void __bpf_lru_node_move_in(struct bpf_lru_list *l, + struct bpf_lru_node *node, + enum bpf_lru_list_type tgt_type) +{ + if (WARN_ON_ONCE(!IS_LOCAL_LIST_TYPE(node->type)) || + WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(tgt_type))) + return; + + bpf_lru_list_count_inc(l, tgt_type); + node->type = tgt_type; + node->ref = 0; + list_move(&node->list, &l->lists[tgt_type]); +} + +/* Move nodes between or within active and inactive list (like + * active to inactive, inactive to active or tail of active back to + * the head of active). + */ +static void __bpf_lru_node_move(struct bpf_lru_list *l, + struct bpf_lru_node *node, + enum bpf_lru_list_type tgt_type) +{ + if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type)) || + WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(tgt_type))) + return; + + if (node->type != tgt_type) { + bpf_lru_list_count_dec(l, node->type); + bpf_lru_list_count_inc(l, tgt_type); + node->type = tgt_type; + } + node->ref = 0; + + /* If the moving node is the next_inactive_rotation candidate, + * move the next_inactive_rotation pointer also. + */ + if (&node->list == l->next_inactive_rotation) + l->next_inactive_rotation = l->next_inactive_rotation->prev; + + list_move(&node->list, &l->lists[tgt_type]); +} + +static bool bpf_lru_list_inactive_low(const struct bpf_lru_list *l) +{ + return l->counts[BPF_LRU_LIST_T_INACTIVE] < + l->counts[BPF_LRU_LIST_T_ACTIVE]; +} + +/* Rotate the active list: + * 1. Start from tail + * 2. If the node has the ref bit set, it will be rotated + * back to the head of active list with the ref bit cleared. + * Give this node one more chance to survive in the active list. + * 3. If the ref bit is not set, move it to the head of the + * inactive list. + * 4. It will at most scan nr_scans nodes + */ +static void __bpf_lru_list_rotate_active(struct bpf_lru *lru, + struct bpf_lru_list *l) +{ + struct list_head *active = &l->lists[BPF_LRU_LIST_T_ACTIVE]; + struct bpf_lru_node *node, *tmp_node, *first_node; + unsigned int i = 0; + + first_node = list_first_entry(active, struct bpf_lru_node, list); + list_for_each_entry_safe_reverse(node, tmp_node, active, list) { + if (bpf_lru_node_is_ref(node)) + __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE); + else + __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_INACTIVE); + + if (++i == lru->nr_scans || node == first_node) + break; + } +} + +/* Rotate the inactive list. It starts from the next_inactive_rotation + * 1. If the node has ref bit set, it will be moved to the head + * of active list with the ref bit cleared. + * 2. If the node does not have ref bit set, it will leave it + * at its current location (i.e. do nothing) so that it can + * be considered during the next inactive_shrink. + * 3. It will at most scan nr_scans nodes + */ +static void __bpf_lru_list_rotate_inactive(struct bpf_lru *lru, + struct bpf_lru_list *l) +{ + struct list_head *inactive = &l->lists[BPF_LRU_LIST_T_INACTIVE]; + struct list_head *cur, *last, *next = inactive; + struct bpf_lru_node *node; + unsigned int i = 0; + + if (list_empty(inactive)) + return; + + last = l->next_inactive_rotation->next; + if (last == inactive) + last = last->next; + + cur = l->next_inactive_rotation; + while (i < lru->nr_scans) { + if (cur == inactive) { + cur = cur->prev; + continue; + } + + node = list_entry(cur, struct bpf_lru_node, list); + next = cur->prev; + if (bpf_lru_node_is_ref(node)) + __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE); + if (cur == last) + break; + cur = next; + i++; + } + + l->next_inactive_rotation = next; +} + +/* Shrink the inactive list. It starts from the tail of the + * inactive list and only move the nodes without the ref bit + * set to the designated free list. + */ +static unsigned int +__bpf_lru_list_shrink_inactive(struct bpf_lru *lru, + struct bpf_lru_list *l, + unsigned int tgt_nshrink, + struct list_head *free_list, + enum bpf_lru_list_type tgt_free_type) +{ + struct list_head *inactive = &l->lists[BPF_LRU_LIST_T_INACTIVE]; + struct bpf_lru_node *node, *tmp_node, *first_node; + unsigned int nshrinked = 0; + unsigned int i = 0; + + first_node = list_first_entry(inactive, struct bpf_lru_node, list); + list_for_each_entry_safe_reverse(node, tmp_node, inactive, list) { + if (bpf_lru_node_is_ref(node)) { + __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE); + } else if (lru->del_from_htab(lru->del_arg, node)) { + __bpf_lru_node_move_to_free(l, node, free_list, + tgt_free_type); + if (++nshrinked == tgt_nshrink) + break; + } + + if (++i == lru->nr_scans) + break; + } + + return nshrinked; +} + +/* 1. Rotate the active list (if needed) + * 2. Always rotate the inactive list + */ +static void __bpf_lru_list_rotate(struct bpf_lru *lru, struct bpf_lru_list *l) +{ + if (bpf_lru_list_inactive_low(l)) + __bpf_lru_list_rotate_active(lru, l); + + __bpf_lru_list_rotate_inactive(lru, l); +} + +/* Calls __bpf_lru_list_shrink_inactive() to shrink some + * ref-bit-cleared nodes and move them to the designated + * free list. + * + * If it cannot get a free node after calling + * __bpf_lru_list_shrink_inactive(). It will just remove + * one node from either inactive or active list without + * honoring the ref-bit. It prefers inactive list to active + * list in this situation. + */ +static unsigned int __bpf_lru_list_shrink(struct bpf_lru *lru, + struct bpf_lru_list *l, + unsigned int tgt_nshrink, + struct list_head *free_list, + enum bpf_lru_list_type tgt_free_type) + +{ + struct bpf_lru_node *node, *tmp_node; + struct list_head *force_shrink_list; + unsigned int nshrinked; + + nshrinked = __bpf_lru_list_shrink_inactive(lru, l, tgt_nshrink, + free_list, tgt_free_type); + if (nshrinked) + return nshrinked; + + /* Do a force shrink by ignoring the reference bit */ + if (!list_empty(&l->lists[BPF_LRU_LIST_T_INACTIVE])) + force_shrink_list = &l->lists[BPF_LRU_LIST_T_INACTIVE]; + else + force_shrink_list = &l->lists[BPF_LRU_LIST_T_ACTIVE]; + + list_for_each_entry_safe_reverse(node, tmp_node, force_shrink_list, + list) { + if (lru->del_from_htab(lru->del_arg, node)) { + __bpf_lru_node_move_to_free(l, node, free_list, + tgt_free_type); + return 1; + } + } + + return 0; +} + +/* Flush the nodes from the local pending list to the LRU list */ +static void __local_list_flush(struct bpf_lru_list *l, + struct bpf_lru_locallist *loc_l) +{ + struct bpf_lru_node *node, *tmp_node; + + list_for_each_entry_safe_reverse(node, tmp_node, + local_pending_list(loc_l), list) { + if (bpf_lru_node_is_ref(node)) + __bpf_lru_node_move_in(l, node, BPF_LRU_LIST_T_ACTIVE); + else + __bpf_lru_node_move_in(l, node, + BPF_LRU_LIST_T_INACTIVE); + } +} + +static void bpf_lru_list_push_free(struct bpf_lru_list *l, + struct bpf_lru_node *node) +{ + unsigned long flags; + + if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type))) + return; + + raw_spin_lock_irqsave(&l->lock, flags); + __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE); + raw_spin_unlock_irqrestore(&l->lock, flags); +} + +static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru, + struct bpf_lru_locallist *loc_l) +{ + struct bpf_lru_list *l = &lru->common_lru.lru_list; + struct bpf_lru_node *node, *tmp_node; + unsigned int nfree = 0; + + raw_spin_lock(&l->lock); + + __local_list_flush(l, loc_l); + + __bpf_lru_list_rotate(lru, l); + + list_for_each_entry_safe(node, tmp_node, &l->lists[BPF_LRU_LIST_T_FREE], + list) { + __bpf_lru_node_move_to_free(l, node, local_free_list(loc_l), + BPF_LRU_LOCAL_LIST_T_FREE); + if (++nfree == LOCAL_FREE_TARGET) + break; + } + + if (nfree < LOCAL_FREE_TARGET) + __bpf_lru_list_shrink(lru, l, LOCAL_FREE_TARGET - nfree, + local_free_list(loc_l), + BPF_LRU_LOCAL_LIST_T_FREE); + + raw_spin_unlock(&l->lock); +} + +static void __local_list_add_pending(struct bpf_lru *lru, + struct bpf_lru_locallist *loc_l, + int cpu, + struct bpf_lru_node *node, + u32 hash) +{ + *(u32 *)((void *)node + lru->hash_offset) = hash; + node->cpu = cpu; + node->type = BPF_LRU_LOCAL_LIST_T_PENDING; + node->ref = 0; + list_add(&node->list, local_pending_list(loc_l)); +} + +struct bpf_lru_node *__local_list_pop_free(struct bpf_lru_locallist *loc_l) +{ + struct bpf_lru_node *node; + + node = list_first_entry_or_null(local_free_list(loc_l), + struct bpf_lru_node, + list); + if (node) + list_del(&node->list); + + return node; +} + +struct bpf_lru_node *__local_list_pop_pending(struct bpf_lru *lru, + struct bpf_lru_locallist *loc_l) +{ + struct bpf_lru_node *node; + bool force = false; + +ignore_ref: + /* Get from the tail (i.e. older element) of the pending list. */ + list_for_each_entry_reverse(node, local_pending_list(loc_l), + list) { + if ((!bpf_lru_node_is_ref(node) || force) && + lru->del_from_htab(lru->del_arg, node)) { + list_del(&node->list); + return node; + } + } + + if (!force) { + force = true; + goto ignore_ref; + } + + return NULL; +} + +static struct bpf_lru_node *bpf_percpu_lru_pop_free(struct bpf_lru *lru, + u32 hash) +{ + struct list_head *free_list; + struct bpf_lru_node *node = NULL; + struct bpf_lru_list *l; + unsigned long flags; + int cpu = raw_smp_processor_id(); + + l = per_cpu_ptr(lru->percpu_lru, cpu); + + raw_spin_lock_irqsave(&l->lock, flags); + + __bpf_lru_list_rotate(lru, l); + + free_list = &l->lists[BPF_LRU_LIST_T_FREE]; + if (list_empty(free_list)) + __bpf_lru_list_shrink(lru, l, PERCPU_FREE_TARGET, free_list, + BPF_LRU_LIST_T_FREE); + + if (!list_empty(free_list)) { + node = list_first_entry(free_list, struct bpf_lru_node, list); + *(u32 *)((void *)node + lru->hash_offset) = hash; + node->ref = 0; + __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_INACTIVE); + } + + raw_spin_unlock_irqrestore(&l->lock, flags); + + return node; +} + +static struct bpf_lru_node *bpf_common_lru_pop_free(struct bpf_lru *lru, + u32 hash) +{ + struct bpf_lru_locallist *loc_l, *steal_loc_l; + struct bpf_common_lru *clru = &lru->common_lru; + struct bpf_lru_node *node; + int steal, first_steal; + unsigned long flags; + int cpu = raw_smp_processor_id(); + + loc_l = per_cpu_ptr(clru->local_list, cpu); + + raw_spin_lock_irqsave(&loc_l->lock, flags); + + node = __local_list_pop_free(loc_l); + if (!node) { + bpf_lru_list_pop_free_to_local(lru, loc_l); + node = __local_list_pop_free(loc_l); + } + + if (node) + __local_list_add_pending(lru, loc_l, cpu, node, hash); + + raw_spin_unlock_irqrestore(&loc_l->lock, flags); + + if (node) + return node; + + /* No free nodes found from the local free list and + * the global LRU list. + * + * Steal from the local free/pending list of the + * current CPU and remote CPU in RR. It starts + * with the loc_l->next_steal CPU. + */ + + first_steal = loc_l->next_steal; + steal = first_steal; + do { + steal_loc_l = per_cpu_ptr(clru->local_list, steal); + + raw_spin_lock_irqsave(&steal_loc_l->lock, flags); + + node = __local_list_pop_free(steal_loc_l); + if (!node) + node = __local_list_pop_pending(lru, steal_loc_l); + + raw_spin_unlock_irqrestore(&steal_loc_l->lock, flags); + + steal = get_next_cpu(steal); + } while (!node && steal != first_steal); + + loc_l->next_steal = steal; + + if (node) { + raw_spin_lock_irqsave(&loc_l->lock, flags); + __local_list_add_pending(lru, loc_l, cpu, node, hash); + raw_spin_unlock_irqrestore(&loc_l->lock, flags); + } + + return node; +} + +struct bpf_lru_node *bpf_lru_pop_free(struct bpf_lru *lru, u32 hash) +{ + if (lru->percpu) + return bpf_percpu_lru_pop_free(lru, hash); + else + return bpf_common_lru_pop_free(lru, hash); +} + +static void bpf_common_lru_push_free(struct bpf_lru *lru, + struct bpf_lru_node *node) +{ + unsigned long flags; + + if (WARN_ON_ONCE(node->type == BPF_LRU_LIST_T_FREE) || + WARN_ON_ONCE(node->type == BPF_LRU_LOCAL_LIST_T_FREE)) + return; + + if (node->type == BPF_LRU_LOCAL_LIST_T_PENDING) { + struct bpf_lru_locallist *loc_l; + + loc_l = per_cpu_ptr(lru->common_lru.local_list, node->cpu); + + raw_spin_lock_irqsave(&loc_l->lock, flags); + + if (unlikely(node->type != BPF_LRU_LOCAL_LIST_T_PENDING)) { + raw_spin_unlock_irqrestore(&loc_l->lock, flags); + goto check_lru_list; + } + + node->type = BPF_LRU_LOCAL_LIST_T_FREE; + node->ref = 0; + list_move(&node->list, local_free_list(loc_l)); + + raw_spin_unlock_irqrestore(&loc_l->lock, flags); + return; + } + +check_lru_list: + bpf_lru_list_push_free(&lru->common_lru.lru_list, node); +} + +static void bpf_percpu_lru_push_free(struct bpf_lru *lru, + struct bpf_lru_node *node) +{ + struct bpf_lru_list *l; + unsigned long flags; + + l = per_cpu_ptr(lru->percpu_lru, node->cpu); + + raw_spin_lock_irqsave(&l->lock, flags); + + __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE); + + raw_spin_unlock_irqrestore(&l->lock, flags); +} + +void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node) +{ + if (lru->percpu) + bpf_percpu_lru_push_free(lru, node); + else + bpf_common_lru_push_free(lru, node); +} + +void bpf_common_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, + u32 elem_size, u32 nr_elems) +{ + struct bpf_lru_list *l = &lru->common_lru.lru_list; + u32 i; + + for (i = 0; i < nr_elems; i++) { + struct bpf_lru_node *node; + + node = (struct bpf_lru_node *)(buf + node_offset); + node->type = BPF_LRU_LIST_T_FREE; + node->ref = 0; + list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]); + buf += elem_size; + } +} + +void bpf_percpu_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, + u32 elem_size, u32 nr_elems) +{ + u32 i, pcpu_entries; + int cpu; + struct bpf_lru_list *l; + + pcpu_entries = nr_elems / num_possible_cpus(); + + i = 0; + + for_each_possible_cpu(cpu) { + struct bpf_lru_node *node; + + l = per_cpu_ptr(lru->percpu_lru, cpu); +again: + node = (struct bpf_lru_node *)(buf + node_offset); + node->cpu = cpu; + node->type = BPF_LRU_LIST_T_FREE; + node->ref = 0; + list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]); + i++; + buf += elem_size; + if (i == nr_elems) + break; + if (i % pcpu_entries) + goto again; + } +} + +void bpf_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, + u32 elem_size, u32 nr_elems) +{ + if (lru->percpu) + bpf_percpu_lru_populate(lru, buf, node_offset, elem_size, + nr_elems); + else + bpf_common_lru_populate(lru, buf, node_offset, elem_size, + nr_elems); +} + +static void bpf_lru_locallist_init(struct bpf_lru_locallist *loc_l, int cpu) +{ + int i; + + for (i = 0; i < NR_BPF_LRU_LOCAL_LIST_T; i++) + INIT_LIST_HEAD(&loc_l->lists[i]); + + loc_l->next_steal = cpu; + + raw_spin_lock_init(&loc_l->lock); +} + +static void bpf_lru_list_init(struct bpf_lru_list *l) +{ + int i; + + for (i = 0; i < NR_BPF_LRU_LIST_T; i++) + INIT_LIST_HEAD(&l->lists[i]); + + for (i = 0; i < NR_BPF_LRU_LIST_COUNT; i++) + l->counts[i] = 0; + + l->next_inactive_rotation = &l->lists[BPF_LRU_LIST_T_INACTIVE]; + + raw_spin_lock_init(&l->lock); +} + +int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset, + del_from_htab_func del_from_htab, void *del_arg) +{ + int cpu; + + if (percpu) { + lru->percpu_lru = alloc_percpu(struct bpf_lru_list); + if (!lru->percpu_lru) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + struct bpf_lru_list *l; + + l = per_cpu_ptr(lru->percpu_lru, cpu); + bpf_lru_list_init(l); + } + lru->nr_scans = PERCPU_NR_SCANS; + } else { + struct bpf_common_lru *clru = &lru->common_lru; + + clru->local_list = alloc_percpu(struct bpf_lru_locallist); + if (!clru->local_list) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + struct bpf_lru_locallist *loc_l; + + loc_l = per_cpu_ptr(clru->local_list, cpu); + bpf_lru_locallist_init(loc_l, cpu); + } + + bpf_lru_list_init(&clru->lru_list); + lru->nr_scans = LOCAL_NR_SCANS; + } + + lru->percpu = percpu; + lru->del_from_htab = del_from_htab; + lru->del_arg = del_arg; + lru->hash_offset = hash_offset; + + return 0; +} + +void bpf_lru_destroy(struct bpf_lru *lru) +{ + if (lru->percpu) + free_percpu(lru->percpu_lru); + else + free_percpu(lru->common_lru.local_list); +} diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h new file mode 100644 index 000000000000..5c35a98d02bf --- /dev/null +++ b/kernel/bpf/bpf_lru_list.h @@ -0,0 +1,84 @@ +/* Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#ifndef __BPF_LRU_LIST_H_ +#define __BPF_LRU_LIST_H_ + +#include <linux/list.h> +#include <linux/spinlock_types.h> + +#define NR_BPF_LRU_LIST_T (3) +#define NR_BPF_LRU_LIST_COUNT (2) +#define NR_BPF_LRU_LOCAL_LIST_T (2) +#define BPF_LOCAL_LIST_T_OFFSET NR_BPF_LRU_LIST_T + +enum bpf_lru_list_type { + BPF_LRU_LIST_T_ACTIVE, + BPF_LRU_LIST_T_INACTIVE, + BPF_LRU_LIST_T_FREE, + BPF_LRU_LOCAL_LIST_T_FREE, + BPF_LRU_LOCAL_LIST_T_PENDING, +}; + +struct bpf_lru_node { + struct list_head list; + u16 cpu; + u8 type; + u8 ref; +}; + +struct bpf_lru_list { + struct list_head lists[NR_BPF_LRU_LIST_T]; + unsigned int counts[NR_BPF_LRU_LIST_COUNT]; + /* The next inacitve list rotation starts from here */ + struct list_head *next_inactive_rotation; + + raw_spinlock_t lock ____cacheline_aligned_in_smp; +}; + +struct bpf_lru_locallist { + struct list_head lists[NR_BPF_LRU_LOCAL_LIST_T]; + u16 next_steal; + raw_spinlock_t lock; +}; + +struct bpf_common_lru { + struct bpf_lru_list lru_list; + struct bpf_lru_locallist __percpu *local_list; +}; + +typedef bool (*del_from_htab_func)(void *arg, struct bpf_lru_node *node); + +struct bpf_lru { + union { + struct bpf_common_lru common_lru; + struct bpf_lru_list __percpu *percpu_lru; + }; + del_from_htab_func del_from_htab; + void *del_arg; + unsigned int hash_offset; + unsigned int nr_scans; + bool percpu; +}; + +static inline void bpf_lru_node_set_ref(struct bpf_lru_node *node) +{ + /* ref is an approximation on access frequency. It does not + * have to be very accurate. Hence, no protection is used. + */ + node->ref = 1; +} + +int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset, + del_from_htab_func del_from_htab, void *delete_arg); +void bpf_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, + u32 elem_size, u32 nr_elems); +void bpf_lru_destroy(struct bpf_lru *lru); +struct bpf_lru_node *bpf_lru_pop_free(struct bpf_lru *lru, u32 hash); +void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node); +void bpf_lru_promote(struct bpf_lru *lru, struct bpf_lru_node *node); + +#endif diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c new file mode 100644 index 000000000000..a515f7b007c6 --- /dev/null +++ b/kernel/bpf/cgroup.c @@ -0,0 +1,200 @@ +/* + * Functions to manage eBPF programs attached to cgroups + * + * Copyright (c) 2016 Daniel Mack + * + * This file is subject to the terms and conditions of version 2 of the GNU + * General Public License. See the file COPYING in the main directory of the + * Linux distribution for more details. + */ + +#include <linux/kernel.h> +#include <linux/atomic.h> +#include <linux/cgroup.h> +#include <linux/slab.h> +#include <linux/bpf.h> +#include <linux/bpf-cgroup.h> +#include <net/sock.h> + +DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key); +EXPORT_SYMBOL(cgroup_bpf_enabled_key); + +/** + * cgroup_bpf_put() - put references of all bpf programs + * @cgrp: the cgroup to modify + */ +void cgroup_bpf_put(struct cgroup *cgrp) +{ + unsigned int type; + + for (type = 0; type < ARRAY_SIZE(cgrp->bpf.prog); type++) { + struct bpf_prog *prog = cgrp->bpf.prog[type]; + + if (prog) { + bpf_prog_put(prog); + static_branch_dec(&cgroup_bpf_enabled_key); + } + } +} + +/** + * cgroup_bpf_inherit() - inherit effective programs from parent + * @cgrp: the cgroup to modify + * @parent: the parent to inherit from + */ +void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent) +{ + unsigned int type; + + for (type = 0; type < ARRAY_SIZE(cgrp->bpf.effective); type++) { + struct bpf_prog *e; + + e = rcu_dereference_protected(parent->bpf.effective[type], + lockdep_is_held(&cgroup_mutex)); + rcu_assign_pointer(cgrp->bpf.effective[type], e); + } +} + +/** + * __cgroup_bpf_update() - Update the pinned program of a cgroup, and + * propagate the change to descendants + * @cgrp: The cgroup which descendants to traverse + * @parent: The parent of @cgrp, or %NULL if @cgrp is the root + * @prog: A new program to pin + * @type: Type of pinning operation (ingress/egress) + * + * Each cgroup has a set of two pointers for bpf programs; one for eBPF + * programs it owns, and which is effective for execution. + * + * If @prog is not %NULL, this function attaches a new program to the cgroup + * and releases the one that is currently attached, if any. @prog is then made + * the effective program of type @type in that cgroup. + * + * If @prog is %NULL, the currently attached program of type @type is released, + * and the effective program of the parent cgroup (if any) is inherited to + * @cgrp. + * + * Then, the descendants of @cgrp are walked and the effective program for + * each of them is set to the effective program of @cgrp unless the + * descendant has its own program attached, in which case the subbranch is + * skipped. This ensures that delegated subcgroups with own programs are left + * untouched. + * + * Must be called with cgroup_mutex held. + */ +void __cgroup_bpf_update(struct cgroup *cgrp, + struct cgroup *parent, + struct bpf_prog *prog, + enum bpf_attach_type type) +{ + struct bpf_prog *old_prog, *effective; + struct cgroup_subsys_state *pos; + + old_prog = xchg(cgrp->bpf.prog + type, prog); + + effective = (!prog && parent) ? + rcu_dereference_protected(parent->bpf.effective[type], + lockdep_is_held(&cgroup_mutex)) : + prog; + + css_for_each_descendant_pre(pos, &cgrp->self) { + struct cgroup *desc = container_of(pos, struct cgroup, self); + + /* skip the subtree if the descendant has its own program */ + if (desc->bpf.prog[type] && desc != cgrp) + pos = css_rightmost_descendant(pos); + else + rcu_assign_pointer(desc->bpf.effective[type], + effective); + } + + if (prog) + static_branch_inc(&cgroup_bpf_enabled_key); + + if (old_prog) { + bpf_prog_put(old_prog); + static_branch_dec(&cgroup_bpf_enabled_key); + } +} + +/** + * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering + * @sk: The socken sending or receiving traffic + * @skb: The skb that is being sent or received + * @type: The type of program to be exectuted + * + * If no socket is passed, or the socket is not of type INET or INET6, + * this function does nothing and returns 0. + * + * The program type passed in via @type must be suitable for network + * filtering. No further check is performed to assert that. + * + * This function will return %-EPERM if any if an attached program was found + * and if it returned != 1 during execution. In all other cases, 0 is returned. + */ +int __cgroup_bpf_run_filter_skb(struct sock *sk, + struct sk_buff *skb, + enum bpf_attach_type type) +{ + struct bpf_prog *prog; + struct cgroup *cgrp; + int ret = 0; + + if (!sk || !sk_fullsock(sk)) + return 0; + + if (sk->sk_family != AF_INET && + sk->sk_family != AF_INET6) + return 0; + + cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + + rcu_read_lock(); + + prog = rcu_dereference(cgrp->bpf.effective[type]); + if (prog) { + unsigned int offset = skb->data - skb_network_header(skb); + + __skb_push(skb, offset); + ret = bpf_prog_run_save_cb(prog, skb) == 1 ? 0 : -EPERM; + __skb_pull(skb, offset); + } + + rcu_read_unlock(); + + return ret; +} +EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb); + +/** + * __cgroup_bpf_run_filter_sk() - Run a program on a sock + * @sk: sock structure to manipulate + * @type: The type of program to be exectuted + * + * socket is passed is expected to be of type INET or INET6. + * + * The program type passed in via @type must be suitable for sock + * filtering. No further check is performed to assert that. + * + * This function will return %-EPERM if any if an attached program was found + * and if it returned != 1 during execution. In all other cases, 0 is returned. + */ +int __cgroup_bpf_run_filter_sk(struct sock *sk, + enum bpf_attach_type type) +{ + struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + struct bpf_prog *prog; + int ret = 0; + + + rcu_read_lock(); + + prog = rcu_dereference(cgrp->bpf.effective[type]); + if (prog) + ret = BPF_PROG_RUN(prog, sk) == 1 ? 0 : -EPERM; + + rcu_read_unlock(); + + return ret; +} +EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index aa6d98154106..82a04143368e 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1043,6 +1043,7 @@ const struct bpf_func_proto bpf_map_delete_elem_proto __weak; const struct bpf_func_proto bpf_get_prandom_u32_proto __weak; const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak; +const struct bpf_func_proto bpf_get_numa_node_id_proto __weak; const struct bpf_func_proto bpf_ktime_get_ns_proto __weak; const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak; diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index ad1bc67aff1b..34debc1a9641 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -15,6 +15,7 @@ #include <linux/filter.h> #include <linux/vmalloc.h> #include "percpu_freelist.h" +#include "bpf_lru_list.h" struct bucket { struct hlist_head head; @@ -25,7 +26,10 @@ struct bpf_htab { struct bpf_map map; struct bucket *buckets; void *elems; - struct pcpu_freelist freelist; + union { + struct pcpu_freelist freelist; + struct bpf_lru lru; + }; void __percpu *extra_elems; atomic_t count; /* number of elements in this hashtable */ u32 n_buckets; /* number of hash buckets */ @@ -48,11 +52,26 @@ struct htab_elem { union { struct rcu_head rcu; enum extra_elem_state state; + struct bpf_lru_node lru_node; }; u32 hash; char key[0] __aligned(8); }; +static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node); + +static bool htab_is_lru(const struct bpf_htab *htab) +{ + return htab->map.map_type == BPF_MAP_TYPE_LRU_HASH || + htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH; +} + +static bool htab_is_percpu(const struct bpf_htab *htab) +{ + return htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH || + htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH; +} + static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size, void __percpu *pptr) { @@ -73,7 +92,7 @@ static void htab_free_elems(struct bpf_htab *htab) { int i; - if (htab->map.map_type != BPF_MAP_TYPE_PERCPU_HASH) + if (!htab_is_percpu(htab)) goto free_elems; for (i = 0; i < htab->map.max_entries; i++) { @@ -87,7 +106,22 @@ free_elems: vfree(htab->elems); } -static int prealloc_elems_and_freelist(struct bpf_htab *htab) +static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key, + u32 hash) +{ + struct bpf_lru_node *node = bpf_lru_pop_free(&htab->lru, hash); + struct htab_elem *l; + + if (node) { + l = container_of(node, struct htab_elem, lru_node); + memcpy(l->key, key, htab->map.key_size); + return l; + } + + return NULL; +} + +static int prealloc_init(struct bpf_htab *htab) { int err = -ENOMEM, i; @@ -95,7 +129,7 @@ static int prealloc_elems_and_freelist(struct bpf_htab *htab) if (!htab->elems) return -ENOMEM; - if (htab->map.map_type != BPF_MAP_TYPE_PERCPU_HASH) + if (!htab_is_percpu(htab)) goto skip_percpu_elems; for (i = 0; i < htab->map.max_entries; i++) { @@ -110,12 +144,27 @@ static int prealloc_elems_and_freelist(struct bpf_htab *htab) } skip_percpu_elems: - err = pcpu_freelist_init(&htab->freelist); + if (htab_is_lru(htab)) + err = bpf_lru_init(&htab->lru, + htab->map.map_flags & BPF_F_NO_COMMON_LRU, + offsetof(struct htab_elem, hash) - + offsetof(struct htab_elem, lru_node), + htab_lru_map_delete_node, + htab); + else + err = pcpu_freelist_init(&htab->freelist); + if (err) goto free_elems; - pcpu_freelist_populate(&htab->freelist, htab->elems, htab->elem_size, - htab->map.max_entries); + if (htab_is_lru(htab)) + bpf_lru_populate(&htab->lru, htab->elems, + offsetof(struct htab_elem, lru_node), + htab->elem_size, htab->map.max_entries); + else + pcpu_freelist_populate(&htab->freelist, htab->elems, + htab->elem_size, htab->map.max_entries); + return 0; free_elems: @@ -123,6 +172,16 @@ free_elems: return err; } +static void prealloc_destroy(struct bpf_htab *htab) +{ + htab_free_elems(htab); + + if (htab_is_lru(htab)) + bpf_lru_destroy(&htab->lru); + else + pcpu_freelist_destroy(&htab->freelist); +} + static int alloc_extra_elems(struct bpf_htab *htab) { void __percpu *pptr; @@ -143,15 +202,37 @@ static int alloc_extra_elems(struct bpf_htab *htab) /* Called from syscall */ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) { - bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_HASH; + bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH || + attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); + bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH || + attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); + /* percpu_lru means each cpu has its own LRU list. + * it is different from BPF_MAP_TYPE_PERCPU_HASH where + * the map's value itself is percpu. percpu_lru has + * nothing to do with the map's value. + */ + bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU); + bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC); struct bpf_htab *htab; int err, i; u64 cost; - if (attr->map_flags & ~BPF_F_NO_PREALLOC) + if (lru && !capable(CAP_SYS_ADMIN)) + /* LRU implementation is much complicated than other + * maps. Hence, limit to CAP_SYS_ADMIN for now. + */ + return ERR_PTR(-EPERM); + + if (attr->map_flags & ~(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU)) /* reserved bits should not be used */ return ERR_PTR(-EINVAL); + if (!lru && percpu_lru) + return ERR_PTR(-EINVAL); + + if (lru && !prealloc) + return ERR_PTR(-ENOTSUPP); + htab = kzalloc(sizeof(*htab), GFP_USER); if (!htab) return ERR_PTR(-ENOMEM); @@ -171,6 +252,18 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) htab->map.value_size == 0) goto free_htab; + if (percpu_lru) { + /* ensure each CPU's lru list has >=1 elements. + * since we are at it, make each lru list has the same + * number of elements. + */ + htab->map.max_entries = roundup(attr->max_entries, + num_possible_cpus()); + if (htab->map.max_entries < attr->max_entries) + htab->map.max_entries = rounddown(attr->max_entries, + num_possible_cpus()); + } + /* hash table size must be power of 2 */ htab->n_buckets = roundup_pow_of_two(htab->map.max_entries); @@ -241,14 +334,17 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) raw_spin_lock_init(&htab->buckets[i].lock); } - if (!percpu) { + if (!percpu && !lru) { + /* lru itself can remove the least used element, so + * there is no need for an extra elem during map_update. + */ err = alloc_extra_elems(htab); if (err) goto free_buckets; } - if (!(attr->map_flags & BPF_F_NO_PREALLOC)) { - err = prealloc_elems_and_freelist(htab); + if (prealloc) { + err = prealloc_init(htab); if (err) goto free_extra_elems; } @@ -323,6 +419,46 @@ static void *htab_map_lookup_elem(struct bpf_map *map, void *key) return NULL; } +static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct htab_elem *l = __htab_map_lookup_elem(map, key); + + if (l) { + bpf_lru_node_set_ref(&l->lru_node); + return l->key + round_up(map->key_size, 8); + } + + return NULL; +} + +/* It is called from the bpf_lru_list when the LRU needs to delete + * older elements from the htab. + */ +static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node) +{ + struct bpf_htab *htab = (struct bpf_htab *)arg; + struct htab_elem *l, *tgt_l; + struct hlist_head *head; + unsigned long flags; + struct bucket *b; + + tgt_l = container_of(node, struct htab_elem, lru_node); + b = __select_bucket(htab, tgt_l->hash); + head = &b->head; + + raw_spin_lock_irqsave(&b->lock, flags); + + hlist_for_each_entry_rcu(l, head, hash_node) + if (l == tgt_l) { + hlist_del_rcu(&l->hash_node); + break; + } + + raw_spin_unlock_irqrestore(&b->lock, flags); + + return l == tgt_l; +} + /* Called from syscall */ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key) { @@ -420,6 +556,24 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) } } +static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr, + void *value, bool onallcpus) +{ + if (!onallcpus) { + /* copy true value_size bytes */ + memcpy(this_cpu_ptr(pptr), value, htab->map.value_size); + } else { + u32 size = round_up(htab->map.value_size, 8); + int off = 0, cpu; + + for_each_possible_cpu(cpu) { + bpf_long_memcpy(per_cpu_ptr(pptr, cpu), + value + off, size); + off += size; + } + } +} + static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, void *value, u32 key_size, u32 hash, bool percpu, bool onallcpus, @@ -479,18 +633,8 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, } } - if (!onallcpus) { - /* copy true value_size bytes */ - memcpy(this_cpu_ptr(pptr), value, htab->map.value_size); - } else { - int off = 0, cpu; + pcpu_copy_value(htab, pptr, value, onallcpus); - for_each_possible_cpu(cpu) { - bpf_long_memcpy(per_cpu_ptr(pptr, cpu), - value + off, size); - off += size; - } - } if (!prealloc) htab_elem_set_ptr(l_new, key_size, pptr); } else { @@ -571,6 +715,70 @@ err: return ret; } +static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct htab_elem *l_new, *l_old = NULL; + struct hlist_head *head; + unsigned long flags; + struct bucket *b; + u32 key_size, hash; + int ret; + + if (unlikely(map_flags > BPF_EXIST)) + /* unknown flags */ + return -EINVAL; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + key_size = map->key_size; + + hash = htab_map_hash(key, key_size); + + b = __select_bucket(htab, hash); + head = &b->head; + + /* For LRU, we need to alloc before taking bucket's + * spinlock because getting free nodes from LRU may need + * to remove older elements from htab and this removal + * operation will need a bucket lock. + */ + l_new = prealloc_lru_pop(htab, key, hash); + if (!l_new) + return -ENOMEM; + memcpy(l_new->key + round_up(map->key_size, 8), value, map->value_size); + + /* bpf_map_update_elem() can be called in_irq() */ + raw_spin_lock_irqsave(&b->lock, flags); + + l_old = lookup_elem_raw(head, hash, key, key_size); + + ret = check_flags(htab, l_old, map_flags); + if (ret) + goto err; + + /* add new element to the head of the list, so that + * concurrent search will find it before old elem + */ + hlist_add_head_rcu(&l_new->hash_node, head); + if (l_old) { + bpf_lru_node_set_ref(&l_new->lru_node); + hlist_del_rcu(&l_old->hash_node); + } + ret = 0; + +err: + raw_spin_unlock_irqrestore(&b->lock, flags); + + if (ret) + bpf_lru_push_free(&htab->lru, &l_new->lru_node); + else if (l_old) + bpf_lru_push_free(&htab->lru, &l_old->lru_node); + + return ret; +} + static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags, bool onallcpus) @@ -606,22 +814,9 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, goto err; if (l_old) { - void __percpu *pptr = htab_elem_get_ptr(l_old, key_size); - u32 size = htab->map.value_size; - /* per-cpu hash map can update value in-place */ - if (!onallcpus) { - memcpy(this_cpu_ptr(pptr), value, size); - } else { - int off = 0, cpu; - - size = round_up(size, 8); - for_each_possible_cpu(cpu) { - bpf_long_memcpy(per_cpu_ptr(pptr, cpu), - value + off, size); - off += size; - } - } + pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size), + value, onallcpus); } else { l_new = alloc_htab_elem(htab, key, value, key_size, hash, true, onallcpus, false); @@ -637,12 +832,84 @@ err: return ret; } +static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags, + bool onallcpus) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct htab_elem *l_new = NULL, *l_old; + struct hlist_head *head; + unsigned long flags; + struct bucket *b; + u32 key_size, hash; + int ret; + + if (unlikely(map_flags > BPF_EXIST)) + /* unknown flags */ + return -EINVAL; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + key_size = map->key_size; + + hash = htab_map_hash(key, key_size); + + b = __select_bucket(htab, hash); + head = &b->head; + + /* For LRU, we need to alloc before taking bucket's + * spinlock because LRU's elem alloc may need + * to remove older elem from htab and this removal + * operation will need a bucket lock. + */ + if (map_flags != BPF_EXIST) { + l_new = prealloc_lru_pop(htab, key, hash); + if (!l_new) + return -ENOMEM; + } + + /* bpf_map_update_elem() can be called in_irq() */ + raw_spin_lock_irqsave(&b->lock, flags); + + l_old = lookup_elem_raw(head, hash, key, key_size); + + ret = check_flags(htab, l_old, map_flags); + if (ret) + goto err; + + if (l_old) { + bpf_lru_node_set_ref(&l_old->lru_node); + + /* per-cpu hash map can update value in-place */ + pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size), + value, onallcpus); + } else { + pcpu_copy_value(htab, htab_elem_get_ptr(l_new, key_size), + value, onallcpus); + hlist_add_head_rcu(&l_new->hash_node, head); + l_new = NULL; + } + ret = 0; +err: + raw_spin_unlock_irqrestore(&b->lock, flags); + if (l_new) + bpf_lru_push_free(&htab->lru, &l_new->lru_node); + return ret; +} + static int htab_percpu_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { return __htab_percpu_map_update_elem(map, key, value, map_flags, false); } +static int htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags) +{ + return __htab_lru_percpu_map_update_elem(map, key, value, map_flags, + false); +} + /* Called from syscall or from eBPF program */ static int htab_map_delete_elem(struct bpf_map *map, void *key) { @@ -676,6 +943,39 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) return ret; } +static int htab_lru_map_delete_elem(struct bpf_map *map, void *key) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct hlist_head *head; + struct bucket *b; + struct htab_elem *l; + unsigned long flags; + u32 hash, key_size; + int ret = -ENOENT; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + key_size = map->key_size; + + hash = htab_map_hash(key, key_size); + b = __select_bucket(htab, hash); + head = &b->head; + + raw_spin_lock_irqsave(&b->lock, flags); + + l = lookup_elem_raw(head, hash, key, key_size); + + if (l) { + hlist_del_rcu(&l->hash_node); + ret = 0; + } + + raw_spin_unlock_irqrestore(&b->lock, flags); + if (l) + bpf_lru_push_free(&htab->lru, &l->lru_node); + return ret; +} + static void delete_all_elements(struct bpf_htab *htab) { int i; @@ -708,12 +1008,11 @@ static void htab_map_free(struct bpf_map *map) * not have executed. Wait for them. */ rcu_barrier(); - if (htab->map.map_flags & BPF_F_NO_PREALLOC) { + if (htab->map.map_flags & BPF_F_NO_PREALLOC) delete_all_elements(htab); - } else { - htab_free_elems(htab); - pcpu_freelist_destroy(&htab->freelist); - } + else + prealloc_destroy(htab); + free_percpu(htab->extra_elems); kvfree(htab->buckets); kfree(htab); @@ -733,6 +1032,20 @@ static struct bpf_map_type_list htab_type __read_mostly = { .type = BPF_MAP_TYPE_HASH, }; +static const struct bpf_map_ops htab_lru_ops = { + .map_alloc = htab_map_alloc, + .map_free = htab_map_free, + .map_get_next_key = htab_map_get_next_key, + .map_lookup_elem = htab_lru_map_lookup_elem, + .map_update_elem = htab_lru_map_update_elem, + .map_delete_elem = htab_lru_map_delete_elem, +}; + +static struct bpf_map_type_list htab_lru_type __read_mostly = { + .ops = &htab_lru_ops, + .type = BPF_MAP_TYPE_LRU_HASH, +}; + /* Called from eBPF program */ static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key) { @@ -744,8 +1057,21 @@ static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key) return NULL; } +static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct htab_elem *l = __htab_map_lookup_elem(map, key); + + if (l) { + bpf_lru_node_set_ref(&l->lru_node); + return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size)); + } + + return NULL; +} + int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value) { + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct htab_elem *l; void __percpu *pptr; int ret = -ENOENT; @@ -761,6 +1087,8 @@ int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value) l = __htab_map_lookup_elem(map, key); if (!l) goto out; + if (htab_is_lru(htab)) + bpf_lru_node_set_ref(&l->lru_node); pptr = htab_elem_get_ptr(l, map->key_size); for_each_possible_cpu(cpu) { bpf_long_memcpy(value + off, @@ -776,10 +1104,16 @@ out: int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, u64 map_flags) { + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); int ret; rcu_read_lock(); - ret = __htab_percpu_map_update_elem(map, key, value, map_flags, true); + if (htab_is_lru(htab)) + ret = __htab_lru_percpu_map_update_elem(map, key, value, + map_flags, true); + else + ret = __htab_percpu_map_update_elem(map, key, value, map_flags, + true); rcu_read_unlock(); return ret; @@ -799,10 +1133,26 @@ static struct bpf_map_type_list htab_percpu_type __read_mostly = { .type = BPF_MAP_TYPE_PERCPU_HASH, }; +static const struct bpf_map_ops htab_lru_percpu_ops = { + .map_alloc = htab_map_alloc, + .map_free = htab_map_free, + .map_get_next_key = htab_map_get_next_key, + .map_lookup_elem = htab_lru_percpu_map_lookup_elem, + .map_update_elem = htab_lru_percpu_map_update_elem, + .map_delete_elem = htab_lru_map_delete_elem, +}; + +static struct bpf_map_type_list htab_lru_percpu_type __read_mostly = { + .ops = &htab_lru_percpu_ops, + .type = BPF_MAP_TYPE_LRU_PERCPU_HASH, +}; + static int __init register_htab_map(void) { bpf_register_map_type(&htab_type); bpf_register_map_type(&htab_percpu_type); + bpf_register_map_type(&htab_lru_type); + bpf_register_map_type(&htab_lru_percpu_type); return 0; } late_initcall(register_htab_map); diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 39918402e6e9..045cbe673356 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -13,6 +13,7 @@ #include <linux/rcupdate.h> #include <linux/random.h> #include <linux/smp.h> +#include <linux/topology.h> #include <linux/ktime.h> #include <linux/sched.h> #include <linux/uidgid.h> @@ -92,6 +93,17 @@ const struct bpf_func_proto bpf_get_smp_processor_id_proto = { .ret_type = RET_INTEGER, }; +BPF_CALL_0(bpf_get_numa_node_id) +{ + return numa_node_id(); +} + +const struct bpf_func_proto bpf_get_numa_node_id_proto = { + .func = bpf_get_numa_node_id, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; + BPF_CALL_0(bpf_ktime_get_ns) { /* NMI safe access to clock monotonic */ diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 1ed8473ec537..0b030c9126d3 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -18,6 +18,7 @@ #include <linux/namei.h> #include <linux/fs.h> #include <linux/kdev_t.h> +#include <linux/parser.h> #include <linux/filter.h> #include <linux/bpf.h> @@ -87,6 +88,7 @@ static struct inode *bpf_get_inode(struct super_block *sb, switch (mode & S_IFMT) { case S_IFDIR: case S_IFREG: + case S_IFLNK: break; default: return ERR_PTR(-EINVAL); @@ -119,6 +121,16 @@ static int bpf_inode_type(const struct inode *inode, enum bpf_type *type) return 0; } +static void bpf_dentry_finalize(struct dentry *dentry, struct inode *inode, + struct inode *dir) +{ + d_instantiate(dentry, inode); + dget(dentry); + + dir->i_mtime = current_time(dir); + dir->i_ctime = dir->i_mtime; +} + static int bpf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode *inode; @@ -133,9 +145,7 @@ static int bpf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) inc_nlink(inode); inc_nlink(dir); - d_instantiate(dentry, inode); - dget(dentry); - + bpf_dentry_finalize(dentry, inode, dir); return 0; } @@ -151,9 +161,7 @@ static int bpf_mkobj_ops(struct inode *dir, struct dentry *dentry, inode->i_op = iops; inode->i_private = dentry->d_fsdata; - d_instantiate(dentry, inode); - dget(dentry); - + bpf_dentry_finalize(dentry, inode, dir); return 0; } @@ -181,13 +189,37 @@ bpf_lookup(struct inode *dir, struct dentry *dentry, unsigned flags) { if (strchr(dentry->d_name.name, '.')) return ERR_PTR(-EPERM); + return simple_lookup(dir, dentry, flags); } +static int bpf_symlink(struct inode *dir, struct dentry *dentry, + const char *target) +{ + char *link = kstrdup(target, GFP_USER | __GFP_NOWARN); + struct inode *inode; + + if (!link) + return -ENOMEM; + + inode = bpf_get_inode(dir->i_sb, dir, S_IRWXUGO | S_IFLNK); + if (IS_ERR(inode)) { + kfree(link); + return PTR_ERR(inode); + } + + inode->i_op = &simple_symlink_inode_operations; + inode->i_link = link; + + bpf_dentry_finalize(dentry, inode, dir); + return 0; +} + static const struct inode_operations bpf_dir_iops = { .lookup = bpf_lookup, .mknod = bpf_mkobj, .mkdir = bpf_mkdir, + .symlink = bpf_symlink, .rmdir = simple_rmdir, .rename = simple_rename, .link = simple_link, @@ -324,6 +356,8 @@ static void bpf_evict_inode(struct inode *inode) truncate_inode_pages_final(&inode->i_data); clear_inode(inode); + if (S_ISLNK(inode->i_mode)) + kfree(inode->i_link); if (!bpf_inode_type(inode, &type)) bpf_any_put(inode->i_private, type); } @@ -331,15 +365,66 @@ static void bpf_evict_inode(struct inode *inode) static const struct super_operations bpf_super_ops = { .statfs = simple_statfs, .drop_inode = generic_delete_inode, + .show_options = generic_show_options, .evict_inode = bpf_evict_inode, }; +enum { + OPT_MODE, + OPT_ERR, +}; + +static const match_table_t bpf_mount_tokens = { + { OPT_MODE, "mode=%o" }, + { OPT_ERR, NULL }, +}; + +struct bpf_mount_opts { + umode_t mode; +}; + +static int bpf_parse_options(char *data, struct bpf_mount_opts *opts) +{ + substring_t args[MAX_OPT_ARGS]; + int option, token; + char *ptr; + + opts->mode = S_IRWXUGO; + + while ((ptr = strsep(&data, ",")) != NULL) { + if (!*ptr) + continue; + + token = match_token(ptr, bpf_mount_tokens, args); + switch (token) { + case OPT_MODE: + if (match_octal(&args[0], &option)) + return -EINVAL; + opts->mode = option & S_IALLUGO; + break; + /* We might like to report bad mount options here, but + * traditionally we've ignored all mount options, so we'd + * better continue to ignore non-existing options for bpf. + */ + } + } + + return 0; +} + static int bpf_fill_super(struct super_block *sb, void *data, int silent) { static struct tree_descr bpf_rfiles[] = { { "" } }; + struct bpf_mount_opts opts; struct inode *inode; int ret; + save_mount_options(sb, data); + + ret = bpf_parse_options(data, &opts); + if (ret) + return ret; + ret = simple_fill_super(sb, BPF_FS_MAGIC, bpf_rfiles); if (ret) return ret; @@ -349,7 +434,7 @@ static int bpf_fill_super(struct super_block *sb, void *data, int silent) inode = sb->s_root->d_inode; inode->i_op = &bpf_dir_iops; inode->i_mode &= ~S_IALLUGO; - inode->i_mode |= S_ISVTX | S_IRWXUGO; + inode->i_mode |= S_ISVTX | opts.mode; return 0; } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 237f3d6a7ddc..85af86c496cd 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -17,6 +17,7 @@ #include <linux/license.h> #include <linux/filter.h> #include <linux/version.h> +#include <linux/kernel.h> DEFINE_PER_CPU(int, bpf_prog_active); @@ -137,18 +138,31 @@ static int bpf_map_release(struct inode *inode, struct file *filp) static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) { const struct bpf_map *map = filp->private_data; + const struct bpf_array *array; + u32 owner_prog_type = 0; + + if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) { + array = container_of(map, struct bpf_array, map); + owner_prog_type = array->owner_prog_type; + } seq_printf(m, "map_type:\t%u\n" "key_size:\t%u\n" "value_size:\t%u\n" "max_entries:\t%u\n" - "map_flags:\t%#x\n", + "map_flags:\t%#x\n" + "memlock:\t%llu\n", map->map_type, map->key_size, map->value_size, map->max_entries, - map->map_flags); + map->map_flags, + map->pages * 1ULL << PAGE_SHIFT); + + if (owner_prog_type) + seq_printf(m, "owner_prog_type:\t%u\n", + owner_prog_type); } #endif @@ -254,12 +268,6 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd) return map; } -/* helper to convert user pointers passed inside __aligned_u64 fields */ -static void __user *u64_to_ptr(__u64 val) -{ - return (void __user *) (unsigned long) val; -} - int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) { return -ENOTSUPP; @@ -270,8 +278,8 @@ int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) static int map_lookup_elem(union bpf_attr *attr) { - void __user *ukey = u64_to_ptr(attr->key); - void __user *uvalue = u64_to_ptr(attr->value); + void __user *ukey = u64_to_user_ptr(attr->key); + void __user *uvalue = u64_to_user_ptr(attr->value); int ufd = attr->map_fd; struct bpf_map *map; void *key, *value, *ptr; @@ -297,6 +305,7 @@ static int map_lookup_elem(union bpf_attr *attr) goto free_key; if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) value_size = round_up(map->value_size, 8) * num_possible_cpus(); else @@ -307,7 +316,8 @@ static int map_lookup_elem(union bpf_attr *attr) if (!value) goto free_key; - if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH) { + if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { err = bpf_percpu_hash_copy(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { err = bpf_percpu_array_copy(map, key, value); @@ -344,8 +354,8 @@ err_put: static int map_update_elem(union bpf_attr *attr) { - void __user *ukey = u64_to_ptr(attr->key); - void __user *uvalue = u64_to_ptr(attr->value); + void __user *ukey = u64_to_user_ptr(attr->key); + void __user *uvalue = u64_to_user_ptr(attr->value); int ufd = attr->map_fd; struct bpf_map *map; void *key, *value; @@ -371,6 +381,7 @@ static int map_update_elem(union bpf_attr *attr) goto free_key; if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) value_size = round_up(map->value_size, 8) * num_possible_cpus(); else @@ -390,7 +401,8 @@ static int map_update_elem(union bpf_attr *attr) */ preempt_disable(); __this_cpu_inc(bpf_prog_active); - if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH) { + if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { err = bpf_percpu_hash_update(map, key, value, attr->flags); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { err = bpf_percpu_array_update(map, key, value, attr->flags); @@ -422,7 +434,7 @@ err_put: static int map_delete_elem(union bpf_attr *attr) { - void __user *ukey = u64_to_ptr(attr->key); + void __user *ukey = u64_to_user_ptr(attr->key); int ufd = attr->map_fd; struct bpf_map *map; struct fd f; @@ -466,8 +478,8 @@ err_put: static int map_get_next_key(union bpf_attr *attr) { - void __user *ukey = u64_to_ptr(attr->key); - void __user *unext_key = u64_to_ptr(attr->next_key); + void __user *ukey = u64_to_user_ptr(attr->key); + void __user *unext_key = u64_to_user_ptr(attr->next_key); int ufd = attr->map_fd; struct bpf_map *map; void *key, *next_key; @@ -682,10 +694,22 @@ struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i) } EXPORT_SYMBOL_GPL(bpf_prog_add); +void bpf_prog_sub(struct bpf_prog *prog, int i) +{ + /* Only to be used for undoing previous bpf_prog_add() in some + * error path. We still know that another entity in our call + * path holds a reference to the program, thus atomic_sub() can + * be safely used in such cases! + */ + WARN_ON(atomic_sub_return(i, &prog->aux->refcnt) == 0); +} +EXPORT_SYMBOL_GPL(bpf_prog_sub); + struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog) { return bpf_prog_add(prog, 1); } +EXPORT_SYMBOL_GPL(bpf_prog_inc); static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type) { @@ -732,7 +756,7 @@ static int bpf_prog_load(union bpf_attr *attr) return -EINVAL; /* copy eBPF program license from user space */ - if (strncpy_from_user(license, u64_to_ptr(attr->license), + if (strncpy_from_user(license, u64_to_user_ptr(attr->license), sizeof(license) - 1) < 0) return -EFAULT; license[sizeof(license) - 1] = 0; @@ -762,7 +786,7 @@ static int bpf_prog_load(union bpf_attr *attr) prog->len = attr->insn_cnt; err = -EFAULT; - if (copy_from_user(prog->insns, u64_to_ptr(attr->insns), + if (copy_from_user(prog->insns, u64_to_user_ptr(attr->insns), prog->len * sizeof(struct bpf_insn)) != 0) goto free_prog; @@ -813,7 +837,7 @@ static int bpf_obj_pin(const union bpf_attr *attr) if (CHECK_ATTR(BPF_OBJ)) return -EINVAL; - return bpf_obj_pin_user(attr->bpf_fd, u64_to_ptr(attr->pathname)); + return bpf_obj_pin_user(attr->bpf_fd, u64_to_user_ptr(attr->pathname)); } static int bpf_obj_get(const union bpf_attr *attr) @@ -821,9 +845,85 @@ static int bpf_obj_get(const union bpf_attr *attr) if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0) return -EINVAL; - return bpf_obj_get_user(u64_to_ptr(attr->pathname)); + return bpf_obj_get_user(u64_to_user_ptr(attr->pathname)); } +#ifdef CONFIG_CGROUP_BPF + +#define BPF_PROG_ATTACH_LAST_FIELD attach_type + +static int bpf_prog_attach(const union bpf_attr *attr) +{ + struct bpf_prog *prog; + struct cgroup *cgrp; + enum bpf_prog_type ptype; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (CHECK_ATTR(BPF_PROG_ATTACH)) + return -EINVAL; + + switch (attr->attach_type) { + case BPF_CGROUP_INET_INGRESS: + case BPF_CGROUP_INET_EGRESS: + ptype = BPF_PROG_TYPE_CGROUP_SKB; + break; + case BPF_CGROUP_INET_SOCK_CREATE: + ptype = BPF_PROG_TYPE_CGROUP_SOCK; + break; + default: + return -EINVAL; + } + + prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + cgrp = cgroup_get_from_fd(attr->target_fd); + if (IS_ERR(cgrp)) { + bpf_prog_put(prog); + return PTR_ERR(cgrp); + } + + cgroup_bpf_update(cgrp, prog, attr->attach_type); + cgroup_put(cgrp); + + return 0; +} + +#define BPF_PROG_DETACH_LAST_FIELD attach_type + +static int bpf_prog_detach(const union bpf_attr *attr) +{ + struct cgroup *cgrp; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (CHECK_ATTR(BPF_PROG_DETACH)) + return -EINVAL; + + switch (attr->attach_type) { + case BPF_CGROUP_INET_INGRESS: + case BPF_CGROUP_INET_EGRESS: + case BPF_CGROUP_INET_SOCK_CREATE: + cgrp = cgroup_get_from_fd(attr->target_fd); + if (IS_ERR(cgrp)) + return PTR_ERR(cgrp); + + cgroup_bpf_update(cgrp, NULL, attr->attach_type); + cgroup_put(cgrp); + break; + + default: + return -EINVAL; + } + + return 0; +} +#endif /* CONFIG_CGROUP_BPF */ + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr = {}; @@ -890,6 +990,16 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_OBJ_GET: err = bpf_obj_get(&attr); break; + +#ifdef CONFIG_CGROUP_BPF + case BPF_PROG_ATTACH: + err = bpf_prog_attach(&attr); + break; + case BPF_PROG_DETACH: + err = bpf_prog_detach(&attr); + break; +#endif + default: err = -EINVAL; break; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8199821f54cf..0e742210750e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -19,6 +19,7 @@ #include <net/netlink.h> #include <linux/file.h> #include <linux/vmalloc.h> +#include <linux/stringify.h> /* bpf_check() is a static code analyzer that walks eBPF program * instruction by instruction and updates register/stack state. @@ -190,6 +191,22 @@ static const char * const reg_type_str[] = { [PTR_TO_PACKET_END] = "pkt_end", }; +#define __BPF_FUNC_STR_FN(x) [BPF_FUNC_ ## x] = __stringify(bpf_ ## x) +static const char * const func_id_str[] = { + __BPF_FUNC_MAPPER(__BPF_FUNC_STR_FN) +}; +#undef __BPF_FUNC_STR_FN + +static const char *func_id_name(int id) +{ + BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID); + + if (id >= 0 && id < __BPF_FUNC_MAX_ID && func_id_str[id]) + return func_id_str[id]; + else + return "unknown"; +} + static void print_verifier_state(struct bpf_verifier_state *state) { struct bpf_reg_state *reg; @@ -212,9 +229,10 @@ static void print_verifier_state(struct bpf_verifier_state *state) else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE || t == PTR_TO_MAP_VALUE_OR_NULL || t == PTR_TO_MAP_VALUE_ADJ) - verbose("(ks=%d,vs=%d)", + verbose("(ks=%d,vs=%d,id=%u)", reg->map_ptr->key_size, - reg->map_ptr->value_size); + reg->map_ptr->value_size, + reg->id); if (reg->min_value != BPF_REGISTER_MIN_RANGE) verbose(",min_value=%lld", (long long)reg->min_value); @@ -353,7 +371,8 @@ static void print_bpf_insn(struct bpf_insn *insn) u8 opcode = BPF_OP(insn->code); if (opcode == BPF_CALL) { - verbose("(%02x) call %d\n", insn->code, insn->imm); + verbose("(%02x) call %s#%d\n", insn->code, + func_id_name(insn->imm), insn->imm); } else if (insn->code == (BPF_JMP | BPF_JA)) { verbose("(%02x) goto pc%+d\n", insn->code, insn->off); @@ -447,6 +466,7 @@ static void mark_reg_unknown_value(struct bpf_reg_state *regs, u32 regno) { BUG_ON(regno >= MAX_BPF_REG); regs[regno].type = UNKNOWN_VALUE; + regs[regno].id = 0; regs[regno].imm = 0; } @@ -613,12 +633,19 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, int off, #define MAX_PACKET_OFF 0xffff static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, - const struct bpf_call_arg_meta *meta) + const struct bpf_call_arg_meta *meta, + enum bpf_access_type t) { switch (env->prog->type) { + case BPF_PROG_TYPE_LWT_IN: + case BPF_PROG_TYPE_LWT_OUT: + /* dst_input() and dst_output() can't write for now */ + if (t == BPF_WRITE) + return false; case BPF_PROG_TYPE_SCHED_CLS: case BPF_PROG_TYPE_SCHED_ACT: case BPF_PROG_TYPE_XDP: + case BPF_PROG_TYPE_LWT_XMIT: if (meta) return meta->pkt_access; @@ -817,7 +844,7 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, err = check_stack_read(state, off, size, value_regno); } } else if (state->regs[regno].type == PTR_TO_PACKET) { - if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL)) { + if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) { verbose("cannot write into packet\n"); return -EACCES; } @@ -950,7 +977,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, return 0; } - if (type == PTR_TO_PACKET && !may_access_direct_pkt_data(env, meta)) { + if (type == PTR_TO_PACKET && + !may_access_direct_pkt_data(env, meta, BPF_READ)) { verbose("helper access to the packet is not allowed\n"); return -EACCES; } @@ -1112,8 +1140,8 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) return 0; error: - verbose("cannot pass map_type %d into func %d\n", - map->map_type, func_id); + verbose("cannot pass map_type %d into func %s#%d\n", + map->map_type, func_id_name(func_id), func_id); return -EINVAL; } @@ -1170,7 +1198,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id) /* find function prototype */ if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) { - verbose("invalid func %d\n", func_id); + verbose("invalid func %s#%d\n", func_id_name(func_id), func_id); return -EINVAL; } @@ -1178,7 +1206,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id) fn = env->prog->aux->ops->get_func_proto(func_id); if (!fn) { - verbose("unknown func %d\n", func_id); + verbose("unknown func %s#%d\n", func_id_name(func_id), func_id); return -EINVAL; } @@ -1198,7 +1226,8 @@ static int check_call(struct bpf_verifier_env *env, int func_id) */ err = check_raw_mode(fn); if (err) { - verbose("kernel subsystem misconfigured func %d\n", func_id); + verbose("kernel subsystem misconfigured func %s#%d\n", + func_id_name(func_id), func_id); return err; } @@ -1252,9 +1281,10 @@ static int check_call(struct bpf_verifier_env *env, int func_id) return -EINVAL; } regs[BPF_REG_0].map_ptr = meta.map_ptr; + regs[BPF_REG_0].id = ++env->id_gen; } else { - verbose("unknown return type %d of func %d\n", - fn->ret_type, func_id); + verbose("unknown return type %d of func %s#%d\n", + fn->ret_type, func_id_name(func_id), func_id); return -EINVAL; } @@ -1479,7 +1509,6 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg; s64 min_val = BPF_REGISTER_MIN_RANGE; u64 max_val = BPF_REGISTER_MAX_RANGE; - bool min_set = false, max_set = false; u8 opcode = BPF_OP(insn->code); dst_reg = ®s[insn->dst_reg]; @@ -1502,7 +1531,6 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, } else if (insn->imm < BPF_REGISTER_MAX_RANGE && (s64)insn->imm > BPF_REGISTER_MIN_RANGE) { min_val = max_val = insn->imm; - min_set = max_set = true; } /* We don't know anything about what was done to this register, mark it @@ -1668,8 +1696,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) insn->src_reg); return -EACCES; } - regs[insn->dst_reg].type = UNKNOWN_VALUE; - regs[insn->dst_reg].map_ptr = NULL; + mark_reg_unknown_value(regs, insn->dst_reg); } } else { /* case: R = imm @@ -1931,6 +1958,38 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, check_reg_overflow(true_reg); } +static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id, + enum bpf_reg_type type) +{ + struct bpf_reg_state *reg = ®s[regno]; + + if (reg->type == PTR_TO_MAP_VALUE_OR_NULL && reg->id == id) { + reg->type = type; + if (type == UNKNOWN_VALUE) + mark_reg_unknown_value(regs, regno); + } +} + +/* The logic is similar to find_good_pkt_pointers(), both could eventually + * be folded together at some point. + */ +static void mark_map_regs(struct bpf_verifier_state *state, u32 regno, + enum bpf_reg_type type) +{ + struct bpf_reg_state *regs = state->regs; + int i; + + for (i = 0; i < MAX_BPF_REG; i++) + mark_map_reg(regs, i, regs[regno].id, type); + + for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { + if (state->stack_slot_type[i] != STACK_SPILL) + continue; + mark_map_reg(state->spilled_regs, i / BPF_REG_SIZE, + regs[regno].id, type); + } +} + static int check_cond_jmp_op(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx) { @@ -2018,18 +2077,13 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, if (BPF_SRC(insn->code) == BPF_K && insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) && dst_reg->type == PTR_TO_MAP_VALUE_OR_NULL) { - if (opcode == BPF_JEQ) { - /* next fallthrough insn can access memory via - * this register - */ - regs[insn->dst_reg].type = PTR_TO_MAP_VALUE; - /* branch targer cannot access it, since reg == 0 */ - mark_reg_unknown_value(other_branch->regs, - insn->dst_reg); - } else { - other_branch->regs[insn->dst_reg].type = PTR_TO_MAP_VALUE; - mark_reg_unknown_value(regs, insn->dst_reg); - } + /* Mark all identical map registers in each branch as either + * safe or unknown depending R == 0 or R != 0 conditional. + */ + mark_map_regs(this_branch, insn->dst_reg, + opcode == BPF_JEQ ? PTR_TO_MAP_VALUE : UNKNOWN_VALUE); + mark_map_regs(other_branch, insn->dst_reg, + opcode == BPF_JEQ ? UNKNOWN_VALUE : PTR_TO_MAP_VALUE); } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT && dst_reg->type == PTR_TO_PACKET && regs[insn->src_reg].type == PTR_TO_PACKET_END) { |