From 9c3997601d51069ec08d7d06cf31a17884056cc2 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 28 Oct 2014 15:11:41 -0700 Subject: bpf: reduce verifier memory consumption verifier keeps track of register state spilled to stack. registers are 8-byte wide and always aligned, so instead of tracking them in every byte-sized stack slot, use MAX_BPF_STACK / 8 array to track spilled register state. Though verifier runs in user context and its state freed immediately after verification, it makes sense to reduce its memory usage. This optimization reduces sizeof(struct verifier_state) from 12464 to 1712 on 64-bit and from 6232 to 1112 on 32-bit. Note, this patch doesn't change existing limits, which are there to bound time and memory during verification: 4k total number of insns in a program, 1k number of jumps (states to visit) and 32k number of processed insn (since an insn may be visited multiple times). Theoretical worst case memory during verification is 1712 * 1k = 17Mbyte. Out-of-memory situation triggers cleanup and rejects the program. Suggested-by: Andy Lutomirski Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 101 ++++++++++++++++++++++++++++---------------------- 1 file changed, 57 insertions(+), 44 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9f81818f2941..b6a1f7c14a67 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -153,22 +153,19 @@ struct reg_state { enum bpf_stack_slot_type { STACK_INVALID, /* nothing was stored in this stack slot */ - STACK_SPILL, /* 1st byte of register spilled into stack */ - STACK_SPILL_PART, /* other 7 bytes of register spill */ + STACK_SPILL, /* register spilled into stack */ STACK_MISC /* BPF program wrote some data into this slot */ }; -struct bpf_stack_slot { - enum bpf_stack_slot_type stype; - struct reg_state reg_st; -}; +#define BPF_REG_SIZE 8 /* size of eBPF register in bytes */ /* state of the program: * type of all registers and stack info */ struct verifier_state { struct reg_state regs[MAX_BPF_REG]; - struct bpf_stack_slot stack[MAX_BPF_STACK]; + u8 stack_slot_type[MAX_BPF_STACK]; + struct reg_state spilled_regs[MAX_BPF_STACK / BPF_REG_SIZE]; }; /* linked list of verifier states used to prune search */ @@ -259,10 +256,10 @@ static void print_verifier_state(struct verifier_env *env) env->cur_state.regs[i].map_ptr->key_size, env->cur_state.regs[i].map_ptr->value_size); } - for (i = 0; i < MAX_BPF_STACK; i++) { - if (env->cur_state.stack[i].stype == STACK_SPILL) + for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { + if (env->cur_state.stack_slot_type[i] == STACK_SPILL) verbose(" fp%d=%s", -MAX_BPF_STACK + i, - reg_type_str[env->cur_state.stack[i].reg_st.type]); + reg_type_str[env->cur_state.spilled_regs[i / BPF_REG_SIZE].type]); } verbose("\n"); } @@ -539,8 +536,10 @@ static int bpf_size_to_bytes(int bpf_size) static int check_stack_write(struct verifier_state *state, int off, int size, int value_regno) { - struct bpf_stack_slot *slot; int i; + /* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0, + * so it's aligned access and [off, off + size) are within stack limits + */ if (value_regno >= 0 && (state->regs[value_regno].type == PTR_TO_MAP_VALUE || @@ -548,30 +547,24 @@ static int check_stack_write(struct verifier_state *state, int off, int size, state->regs[value_regno].type == PTR_TO_CTX)) { /* register containing pointer is being spilled into stack */ - if (size != 8) { + if (size != BPF_REG_SIZE) { verbose("invalid size of register spill\n"); return -EACCES; } - slot = &state->stack[MAX_BPF_STACK + off]; - slot->stype = STACK_SPILL; /* save register state */ - slot->reg_st = state->regs[value_regno]; - for (i = 1; i < 8; i++) { - slot = &state->stack[MAX_BPF_STACK + off + i]; - slot->stype = STACK_SPILL_PART; - slot->reg_st.type = UNKNOWN_VALUE; - slot->reg_st.map_ptr = NULL; - } - } else { + state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE] = + state->regs[value_regno]; + for (i = 0; i < BPF_REG_SIZE; i++) + state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_SPILL; + } else { /* regular write of data into stack */ - for (i = 0; i < size; i++) { - slot = &state->stack[MAX_BPF_STACK + off + i]; - slot->stype = STACK_MISC; - slot->reg_st.type = UNKNOWN_VALUE; - slot->reg_st.map_ptr = NULL; - } + state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE] = + (struct reg_state) {}; + + for (i = 0; i < size; i++) + state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_MISC; } return 0; } @@ -579,19 +572,18 @@ static int check_stack_write(struct verifier_state *state, int off, int size, static int check_stack_read(struct verifier_state *state, int off, int size, int value_regno) { + u8 *slot_type; int i; - struct bpf_stack_slot *slot; - slot = &state->stack[MAX_BPF_STACK + off]; + slot_type = &state->stack_slot_type[MAX_BPF_STACK + off]; - if (slot->stype == STACK_SPILL) { - if (size != 8) { + if (slot_type[0] == STACK_SPILL) { + if (size != BPF_REG_SIZE) { verbose("invalid size of register spill\n"); return -EACCES; } - for (i = 1; i < 8; i++) { - if (state->stack[MAX_BPF_STACK + off + i].stype != - STACK_SPILL_PART) { + for (i = 1; i < BPF_REG_SIZE; i++) { + if (slot_type[i] != STACK_SPILL) { verbose("corrupted spill memory\n"); return -EACCES; } @@ -599,12 +591,12 @@ static int check_stack_read(struct verifier_state *state, int off, int size, if (value_regno >= 0) /* restore register state from stack */ - state->regs[value_regno] = slot->reg_st; + state->regs[value_regno] = + state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE]; return 0; } else { for (i = 0; i < size; i++) { - if (state->stack[MAX_BPF_STACK + off + i].stype != - STACK_MISC) { + if (slot_type[i] != STACK_MISC) { verbose("invalid read from stack off %d+%d size %d\n", off, i, size); return -EACCES; @@ -747,7 +739,7 @@ static int check_stack_boundary(struct verifier_env *env, } for (i = 0; i < access_size; i++) { - if (state->stack[MAX_BPF_STACK + off + i].stype != STACK_MISC) { + if (state->stack_slot_type[MAX_BPF_STACK + off + i] != STACK_MISC) { verbose("invalid indirect read from stack off %d+%d size %d\n", off, i, access_size); return -EACCES; @@ -1417,12 +1409,33 @@ static bool states_equal(struct verifier_state *old, struct verifier_state *cur) } for (i = 0; i < MAX_BPF_STACK; i++) { - if (memcmp(&old->stack[i], &cur->stack[i], - sizeof(old->stack[0])) != 0) { - if (old->stack[i].stype == STACK_INVALID) - continue; + if (old->stack_slot_type[i] == STACK_INVALID) + continue; + if (old->stack_slot_type[i] != cur->stack_slot_type[i]) + /* Ex: old explored (safe) state has STACK_SPILL in + * this stack slot, but current has has STACK_MISC -> + * this verifier states are not equivalent, + * return false to continue verification of this path + */ return false; - } + if (i % BPF_REG_SIZE) + continue; + if (memcmp(&old->spilled_regs[i / BPF_REG_SIZE], + &cur->spilled_regs[i / BPF_REG_SIZE], + sizeof(old->spilled_regs[0]))) + /* when explored and current stack slot types are + * the same, check that stored pointers types + * are the same as well. + * Ex: explored safe path could have stored + * (struct reg_state) {.type = PTR_TO_STACK, .imm = -8} + * but current path has stored: + * (struct reg_state) {.type = PTR_TO_STACK, .imm = -16} + * such verifier states are not equivalent. + * return false to continue verification of this path + */ + return false; + else + continue; } return true; } -- cgit v1.2.3-70-g09d2 From 3274f52073d88b62f3c5ace82ae9d48546232e72 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 13 Nov 2014 17:36:44 -0800 Subject: bpf: add 'flags' attribute to BPF_MAP_UPDATE_ELEM command the current meaning of BPF_MAP_UPDATE_ELEM syscall command is: either update existing map element or create a new one. Initially the plan was to add a new command to handle the case of 'create new element if it didn't exist', but 'flags' style looks cleaner and overall diff is much smaller (more code reused), so add 'flags' attribute to BPF_MAP_UPDATE_ELEM command with the following meaning: #define BPF_ANY 0 /* create new element or update existing */ #define BPF_NOEXIST 1 /* create new element if it didn't exist */ #define BPF_EXIST 2 /* update existing element */ bpf_update_elem(fd, key, value, BPF_NOEXIST) call can fail with EEXIST if element already exists. bpf_update_elem(fd, key, value, BPF_EXIST) can fail with ENOENT if element doesn't exist. Userspace will call it as: int bpf_update_elem(int fd, void *key, void *value, __u64 flags) { union bpf_attr attr = { .map_fd = fd, .key = ptr_to_u64(key), .value = ptr_to_u64(value), .flags = flags; }; return bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr)); } First two bits of 'flags' are used to encode style of bpf_update_elem() command. Bits 2-63 are reserved for future use. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 2 +- include/uapi/linux/bpf.h | 8 +++++++- kernel/bpf/syscall.c | 4 ++-- 3 files changed, 10 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 3cf91754a957..51e9242e4803 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -22,7 +22,7 @@ struct bpf_map_ops { /* funcs callable from userspace and from eBPF programs */ void *(*map_lookup_elem)(struct bpf_map *map, void *key); - int (*map_update_elem)(struct bpf_map *map, void *key, void *value); + int (*map_update_elem)(struct bpf_map *map, void *key, void *value, u64 flags); int (*map_delete_elem)(struct bpf_map *map, void *key); }; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d18316f9e9c4..3e9e1b77f29d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -82,7 +82,7 @@ enum bpf_cmd { /* create or update key/value pair in a given map * err = bpf(BPF_MAP_UPDATE_ELEM, union bpf_attr *attr, u32 size) - * Using attr->map_fd, attr->key, attr->value + * Using attr->map_fd, attr->key, attr->value, attr->flags * returns zero or negative error */ BPF_MAP_UPDATE_ELEM, @@ -117,6 +117,11 @@ enum bpf_prog_type { BPF_PROG_TYPE_UNSPEC, }; +/* flags for BPF_MAP_UPDATE_ELEM command */ +#define BPF_ANY 0 /* create new element or update existing */ +#define BPF_NOEXIST 1 /* create new element if it didn't exist */ +#define BPF_EXIST 2 /* update existing element */ + union bpf_attr { struct { /* anonymous struct used by BPF_MAP_CREATE command */ __u32 map_type; /* one of enum bpf_map_type */ @@ -132,6 +137,7 @@ union bpf_attr { __aligned_u64 value; __aligned_u64 next_key; }; + __u64 flags; }; struct { /* anonymous struct used by BPF_PROG_LOAD command */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index ba61c8c16032..c0d03bf317a2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -190,7 +190,7 @@ err_put: return err; } -#define BPF_MAP_UPDATE_ELEM_LAST_FIELD value +#define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags static int map_update_elem(union bpf_attr *attr) { @@ -231,7 +231,7 @@ static int map_update_elem(union bpf_attr *attr) * therefore all map accessors rely on this fact, so do the same here */ rcu_read_lock(); - err = map->ops->map_update_elem(map, key, value); + err = map->ops->map_update_elem(map, key, value, attr->flags); rcu_read_unlock(); free_value: -- cgit v1.2.3-70-g09d2 From 0f8e4bd8a1fc8c4185f1630061d0a1f2d197a475 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 13 Nov 2014 17:36:45 -0800 Subject: bpf: add hashtable type of eBPF maps add new map type BPF_MAP_TYPE_HASH and its implementation - maps are created/destroyed by userspace. Both userspace and eBPF programs can lookup/update/delete elements from the map - eBPF programs can be called in_irq(), so use spin_lock_irqsave() mechanism for concurrent updates - key/value are opaque range of bytes (aligned to 8 bytes) - user space provides 3 configuration attributes via BPF syscall: key_size, value_size, max_entries - map takes care of allocating/freeing key/value pairs - map_update_elem() must fail to insert new element when max_entries limit is reached to make sure that eBPF programs cannot exhaust memory - map_update_elem() replaces elements in an atomic way - optimized for speed of lookup() which can be called multiple times from eBPF program which itself is triggered by high volume of events . in the future JIT compiler may recognize lookup() call and optimize it further, since key_size is constant for life of eBPF program Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 1 + kernel/bpf/Makefile | 2 +- kernel/bpf/hashtab.c | 362 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 364 insertions(+), 1 deletion(-) create mode 100644 kernel/bpf/hashtab.c (limited to 'kernel') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 3e9e1b77f29d..03a01fd609aa 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -111,6 +111,7 @@ enum bpf_cmd { enum bpf_map_type { BPF_MAP_TYPE_UNSPEC, + BPF_MAP_TYPE_HASH, }; enum bpf_prog_type { diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 0daf7f6ae7df..2c0ec7f9da78 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1,5 +1,5 @@ obj-y := core.o -obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o hashtab.o ifdef CONFIG_TEST_BPF obj-$(CONFIG_BPF_SYSCALL) += test_stub.o endif diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c new file mode 100644 index 000000000000..d234a012f046 --- /dev/null +++ b/kernel/bpf/hashtab.c @@ -0,0 +1,362 @@ +/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include +#include +#include +#include + +struct bpf_htab { + struct bpf_map map; + struct hlist_head *buckets; + spinlock_t lock; + u32 count; /* number of elements in this hashtable */ + u32 n_buckets; /* number of hash buckets */ + u32 elem_size; /* size of each element in bytes */ +}; + +/* each htab element is struct htab_elem + key + value */ +struct htab_elem { + struct hlist_node hash_node; + struct rcu_head rcu; + u32 hash; + char key[0] __aligned(8); +}; + +/* Called from syscall */ +static struct bpf_map *htab_map_alloc(union bpf_attr *attr) +{ + struct bpf_htab *htab; + int err, i; + + htab = kzalloc(sizeof(*htab), GFP_USER); + if (!htab) + return ERR_PTR(-ENOMEM); + + /* mandatory map attributes */ + htab->map.key_size = attr->key_size; + htab->map.value_size = attr->value_size; + htab->map.max_entries = attr->max_entries; + + /* check sanity of attributes. + * value_size == 0 may be allowed in the future to use map as a set + */ + err = -EINVAL; + if (htab->map.max_entries == 0 || htab->map.key_size == 0 || + htab->map.value_size == 0) + goto free_htab; + + /* hash table size must be power of 2 */ + htab->n_buckets = roundup_pow_of_two(htab->map.max_entries); + + err = -E2BIG; + if (htab->map.key_size > MAX_BPF_STACK) + /* eBPF programs initialize keys on stack, so they cannot be + * larger than max stack size + */ + goto free_htab; + + err = -ENOMEM; + htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct hlist_head), + GFP_USER | __GFP_NOWARN); + + if (!htab->buckets) { + htab->buckets = vmalloc(htab->n_buckets * sizeof(struct hlist_head)); + if (!htab->buckets) + goto free_htab; + } + + for (i = 0; i < htab->n_buckets; i++) + INIT_HLIST_HEAD(&htab->buckets[i]); + + spin_lock_init(&htab->lock); + htab->count = 0; + + htab->elem_size = sizeof(struct htab_elem) + + round_up(htab->map.key_size, 8) + + htab->map.value_size; + return &htab->map; + +free_htab: + kfree(htab); + return ERR_PTR(err); +} + +static inline u32 htab_map_hash(const void *key, u32 key_len) +{ + return jhash(key, key_len, 0); +} + +static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash) +{ + return &htab->buckets[hash & (htab->n_buckets - 1)]; +} + +static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash, + void *key, u32 key_size) +{ + struct htab_elem *l; + + hlist_for_each_entry_rcu(l, head, hash_node) + if (l->hash == hash && !memcmp(&l->key, key, key_size)) + return l; + + return NULL; +} + +/* Called from syscall or from eBPF program */ +static void *htab_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct hlist_head *head; + struct htab_elem *l; + u32 hash, key_size; + + /* Must be called with rcu_read_lock. */ + WARN_ON_ONCE(!rcu_read_lock_held()); + + key_size = map->key_size; + + hash = htab_map_hash(key, key_size); + + head = select_bucket(htab, hash); + + l = lookup_elem_raw(head, hash, key, key_size); + + if (l) + return l->key + round_up(map->key_size, 8); + + return NULL; +} + +/* Called from syscall */ +static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct hlist_head *head; + struct htab_elem *l, *next_l; + u32 hash, key_size; + int i; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + key_size = map->key_size; + + hash = htab_map_hash(key, key_size); + + head = select_bucket(htab, hash); + + /* lookup the key */ + l = lookup_elem_raw(head, hash, key, key_size); + + if (!l) { + i = 0; + goto find_first_elem; + } + + /* key was found, get next key in the same bucket */ + next_l = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&l->hash_node)), + struct htab_elem, hash_node); + + if (next_l) { + /* if next elem in this hash list is non-zero, just return it */ + memcpy(next_key, next_l->key, key_size); + return 0; + } + + /* no more elements in this hash list, go to the next bucket */ + i = hash & (htab->n_buckets - 1); + i++; + +find_first_elem: + /* iterate over buckets */ + for (; i < htab->n_buckets; i++) { + head = select_bucket(htab, i); + + /* pick first element in the bucket */ + next_l = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)), + struct htab_elem, hash_node); + if (next_l) { + /* if it's not empty, just return it */ + memcpy(next_key, next_l->key, key_size); + return 0; + } + } + + /* itereated over all buckets and all elements */ + return -ENOENT; +} + +/* Called from syscall or from eBPF program */ +static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct htab_elem *l_new, *l_old; + struct hlist_head *head; + unsigned long flags; + u32 key_size; + int ret; + + if (map_flags > BPF_EXIST) + /* unknown flags */ + return -EINVAL; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + /* allocate new element outside of lock */ + l_new = kmalloc(htab->elem_size, GFP_ATOMIC); + if (!l_new) + return -ENOMEM; + + key_size = map->key_size; + + memcpy(l_new->key, key, key_size); + memcpy(l_new->key + round_up(key_size, 8), value, map->value_size); + + l_new->hash = htab_map_hash(l_new->key, key_size); + + /* bpf_map_update_elem() can be called in_irq() */ + spin_lock_irqsave(&htab->lock, flags); + + head = select_bucket(htab, l_new->hash); + + l_old = lookup_elem_raw(head, l_new->hash, key, key_size); + + if (!l_old && unlikely(htab->count >= map->max_entries)) { + /* if elem with this 'key' doesn't exist and we've reached + * max_entries limit, fail insertion of new elem + */ + ret = -E2BIG; + goto err; + } + + if (l_old && map_flags == BPF_NOEXIST) { + /* elem already exists */ + ret = -EEXIST; + goto err; + } + + if (!l_old && map_flags == BPF_EXIST) { + /* elem doesn't exist, cannot update it */ + ret = -ENOENT; + goto err; + } + + /* add new element to the head of the list, so that concurrent + * search will find it before old elem + */ + hlist_add_head_rcu(&l_new->hash_node, head); + if (l_old) { + hlist_del_rcu(&l_old->hash_node); + kfree_rcu(l_old, rcu); + } else { + htab->count++; + } + spin_unlock_irqrestore(&htab->lock, flags); + + return 0; +err: + spin_unlock_irqrestore(&htab->lock, flags); + kfree(l_new); + return ret; +} + +/* Called from syscall or from eBPF program */ +static int htab_map_delete_elem(struct bpf_map *map, void *key) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct hlist_head *head; + struct htab_elem *l; + unsigned long flags; + u32 hash, key_size; + int ret = -ENOENT; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + key_size = map->key_size; + + hash = htab_map_hash(key, key_size); + + spin_lock_irqsave(&htab->lock, flags); + + head = select_bucket(htab, hash); + + l = lookup_elem_raw(head, hash, key, key_size); + + if (l) { + hlist_del_rcu(&l->hash_node); + htab->count--; + kfree_rcu(l, rcu); + ret = 0; + } + + spin_unlock_irqrestore(&htab->lock, flags); + return ret; +} + +static void delete_all_elements(struct bpf_htab *htab) +{ + int i; + + for (i = 0; i < htab->n_buckets; i++) { + struct hlist_head *head = select_bucket(htab, i); + struct hlist_node *n; + struct htab_elem *l; + + hlist_for_each_entry_safe(l, n, head, hash_node) { + hlist_del_rcu(&l->hash_node); + htab->count--; + kfree(l); + } + } +} + +/* Called when map->refcnt goes to zero, either from workqueue or from syscall */ +static void htab_map_free(struct bpf_map *map) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + + /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, + * so the programs (can be more than one that used this map) were + * disconnected from events. Wait for outstanding critical sections in + * these programs to complete + */ + synchronize_rcu(); + + /* some of kfree_rcu() callbacks for elements of this map may not have + * executed. It's ok. Proceed to free residual elements and map itself + */ + delete_all_elements(htab); + kvfree(htab->buckets); + kfree(htab); +} + +static struct bpf_map_ops htab_ops = { + .map_alloc = htab_map_alloc, + .map_free = htab_map_free, + .map_get_next_key = htab_map_get_next_key, + .map_lookup_elem = htab_map_lookup_elem, + .map_update_elem = htab_map_update_elem, + .map_delete_elem = htab_map_delete_elem, +}; + +static struct bpf_map_type_list tl = { + .ops = &htab_ops, + .type = BPF_MAP_TYPE_HASH, +}; + +static int __init register_htab_map(void) +{ + bpf_register_map_type(&tl); + return 0; +} +late_initcall(register_htab_map); -- cgit v1.2.3-70-g09d2 From 28fbcfa08d8ed7c5a50d41a0433aad222835e8e3 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 13 Nov 2014 17:36:46 -0800 Subject: bpf: add array type of eBPF maps add new map type BPF_MAP_TYPE_ARRAY and its implementation - optimized for fastest possible lookup() . in the future verifier/JIT may recognize lookup() with constant key and optimize it into constant pointer. Can optimize non-constant key into direct pointer arithmetic as well, since pointers and value_size are constant for the life of the eBPF program. In other words array_map_lookup_elem() may be 'inlined' by verifier/JIT while preserving concurrent access to this map from user space - two main use cases for array type: . 'global' eBPF variables: array of 1 element with key=0 and value is a collection of 'global' variables which programs can use to keep the state between events . aggregation of tracing events into fixed set of buckets - all array elements pre-allocated and zero initialized at init time - key as an index in array and can only be 4 byte - map_delete_elem() returns EINVAL, since elements cannot be deleted - map_update_elem() replaces elements in an non-atomic way (for atomic updates hashtable type should be used instead) Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 1 + kernel/bpf/Makefile | 2 +- kernel/bpf/arraymap.c | 151 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 153 insertions(+), 1 deletion(-) create mode 100644 kernel/bpf/arraymap.c (limited to 'kernel') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 03a01fd609aa..0d662fe75df5 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -112,6 +112,7 @@ enum bpf_cmd { enum bpf_map_type { BPF_MAP_TYPE_UNSPEC, BPF_MAP_TYPE_HASH, + BPF_MAP_TYPE_ARRAY, }; enum bpf_prog_type { diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 2c0ec7f9da78..72ec98ba2d42 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1,5 +1,5 @@ obj-y := core.o -obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o hashtab.o +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o hashtab.o arraymap.o ifdef CONFIG_TEST_BPF obj-$(CONFIG_BPF_SYSCALL) += test_stub.o endif diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c new file mode 100644 index 000000000000..58b80c137afd --- /dev/null +++ b/kernel/bpf/arraymap.c @@ -0,0 +1,151 @@ +/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include +#include +#include +#include +#include + +struct bpf_array { + struct bpf_map map; + u32 elem_size; + char value[0] __aligned(8); +}; + +/* Called from syscall */ +static struct bpf_map *array_map_alloc(union bpf_attr *attr) +{ + struct bpf_array *array; + u32 elem_size; + + /* check sanity of attributes */ + if (attr->max_entries == 0 || attr->key_size != 4 || + attr->value_size == 0) + return ERR_PTR(-EINVAL); + + elem_size = round_up(attr->value_size, 8); + + /* allocate all map elements and zero-initialize them */ + array = kzalloc(sizeof(*array) + attr->max_entries * elem_size, + GFP_USER | __GFP_NOWARN); + if (!array) { + array = vzalloc(array->map.max_entries * array->elem_size); + if (!array) + return ERR_PTR(-ENOMEM); + } + + /* copy mandatory map attributes */ + array->map.key_size = attr->key_size; + array->map.value_size = attr->value_size; + array->map.max_entries = attr->max_entries; + + array->elem_size = elem_size; + + return &array->map; + +} + +/* Called from syscall or from eBPF program */ +static void *array_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + u32 index = *(u32 *)key; + + if (index >= array->map.max_entries) + return NULL; + + return array->value + array->elem_size * index; +} + +/* Called from syscall */ +static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + u32 index = *(u32 *)key; + u32 *next = (u32 *)next_key; + + if (index >= array->map.max_entries) { + *next = 0; + return 0; + } + + if (index == array->map.max_entries - 1) + return -ENOENT; + + *next = index + 1; + return 0; +} + +/* Called from syscall or from eBPF program */ +static int array_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + u32 index = *(u32 *)key; + + if (map_flags > BPF_EXIST) + /* unknown flags */ + return -EINVAL; + + if (index >= array->map.max_entries) + /* all elements were pre-allocated, cannot insert a new one */ + return -E2BIG; + + if (map_flags == BPF_NOEXIST) + /* all elemenets already exist */ + return -EEXIST; + + memcpy(array->value + array->elem_size * index, value, array->elem_size); + return 0; +} + +/* Called from syscall or from eBPF program */ +static int array_map_delete_elem(struct bpf_map *map, void *key) +{ + return -EINVAL; +} + +/* Called when map->refcnt goes to zero, either from workqueue or from syscall */ +static void array_map_free(struct bpf_map *map) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + + /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, + * so the programs (can be more than one that used this map) were + * disconnected from events. Wait for outstanding programs to complete + * and free the array + */ + synchronize_rcu(); + + kvfree(array); +} + +static struct bpf_map_ops array_ops = { + .map_alloc = array_map_alloc, + .map_free = array_map_free, + .map_get_next_key = array_map_get_next_key, + .map_lookup_elem = array_map_lookup_elem, + .map_update_elem = array_map_update_elem, + .map_delete_elem = array_map_delete_elem, +}; + +static struct bpf_map_type_list tl = { + .ops = &array_ops, + .type = BPF_MAP_TYPE_ARRAY, +}; + +static int __init register_array_map(void) +{ + bpf_register_map_type(&tl); + return 0; +} +late_initcall(register_array_map); -- cgit v1.2.3-70-g09d2 From a1854d6ac0008518bfc45e791172ad250999c2a2 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 13 Nov 2014 17:36:47 -0800 Subject: bpf: fix BPF_MAP_LOOKUP_ELEM command return code fix errno of BPF_MAP_LOOKUP_ELEM command as bpf manpage described it in commit b4fc1a460f30("Merge branch 'bpf-next'"): ----- BPF_MAP_LOOKUP_ELEM int bpf_lookup_elem(int fd, void *key, void *value) { union bpf_attr attr = { .map_fd = fd, .key = ptr_to_u64(key), .value = ptr_to_u64(value), }; return bpf(BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr)); } bpf() syscall looks up an element with given key in a map fd. If element is found it returns zero and stores element's value into value. If element is not found it returns -1 and sets errno to ENOENT. and further down in manpage: ENOENT For BPF_MAP_LOOKUP_ELEM or BPF_MAP_DELETE_ELEM, indicates that element with given key was not found. ----- In general all BPF commands return ENOENT when map element is not found (including BPF_MAP_GET_NEXT_KEY and BPF_MAP_UPDATE_ELEM with flags == BPF_MAP_UPDATE_ONLY) Subsequent patch adds a testsuite to check return values for all of these combinations. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/syscall.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c0d03bf317a2..088ac0b1b106 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -169,7 +169,7 @@ static int map_lookup_elem(union bpf_attr *attr) if (copy_from_user(key, ukey, map->key_size) != 0) goto free_key; - err = -ESRCH; + err = -ENOENT; rcu_read_lock(); value = map->ops->map_lookup_elem(map, key); if (!value) -- cgit v1.2.3-70-g09d2 From d0003ec01c667b731c139e23de3306a8b328ccf5 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 13 Nov 2014 17:36:49 -0800 Subject: bpf: allow eBPF programs to use maps expose bpf_map_lookup_elem(), bpf_map_update_elem(), bpf_map_delete_elem() map accessors to eBPF programs Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 5 +++ include/uapi/linux/bpf.h | 3 ++ kernel/bpf/Makefile | 2 +- kernel/bpf/helpers.c | 89 ++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 98 insertions(+), 1 deletion(-) create mode 100644 kernel/bpf/helpers.c (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 51e9242e4803..75e94eaa228b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -133,4 +133,9 @@ struct bpf_prog *bpf_prog_get(u32 ufd); /* verify correctness of eBPF program */ int bpf_check(struct bpf_prog *fp, union bpf_attr *attr); +/* verifier prototypes for helper functions called from eBPF programs */ +extern struct bpf_func_proto bpf_map_lookup_elem_proto; +extern struct bpf_func_proto bpf_map_update_elem_proto; +extern struct bpf_func_proto bpf_map_delete_elem_proto; + #endif /* _LINUX_BPF_H */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0d662fe75df5..4a3d0f84f178 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -158,6 +158,9 @@ union bpf_attr { */ enum bpf_func_id { BPF_FUNC_unspec, + BPF_FUNC_map_lookup_elem, /* void *map_lookup_elem(&map, &key) */ + BPF_FUNC_map_update_elem, /* int map_update_elem(&map, &key, &value, flags) */ + BPF_FUNC_map_delete_elem, /* int map_delete_elem(&map, &key) */ __BPF_FUNC_MAX_ID, }; diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 72ec98ba2d42..a5ae60f0b0a2 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1,5 +1,5 @@ obj-y := core.o -obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o hashtab.o arraymap.o +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o hashtab.o arraymap.o helpers.o ifdef CONFIG_TEST_BPF obj-$(CONFIG_BPF_SYSCALL) += test_stub.o endif diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c new file mode 100644 index 000000000000..9e3414d85459 --- /dev/null +++ b/kernel/bpf/helpers.c @@ -0,0 +1,89 @@ +/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include +#include + +/* If kernel subsystem is allowing eBPF programs to call this function, + * inside its own verifier_ops->get_func_proto() callback it should return + * bpf_map_lookup_elem_proto, so that verifier can properly check the arguments + * + * Different map implementations will rely on rcu in map methods + * lookup/update/delete, therefore eBPF programs must run under rcu lock + * if program is allowed to access maps, so check rcu_read_lock_held in + * all three functions. + */ +static u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + /* verifier checked that R1 contains a valid pointer to bpf_map + * and R2 points to a program stack and map->key_size bytes were + * initialized + */ + struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; + void *key = (void *) (unsigned long) r2; + void *value; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + value = map->ops->map_lookup_elem(map, key); + + /* lookup() returns either pointer to element value or NULL + * which is the meaning of PTR_TO_MAP_VALUE_OR_NULL type + */ + return (unsigned long) value; +} + +struct bpf_func_proto bpf_map_lookup_elem_proto = { + .func = bpf_map_lookup_elem, + .gpl_only = false, + .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_MAP_KEY, +}; + +static u64 bpf_map_update_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; + void *key = (void *) (unsigned long) r2; + void *value = (void *) (unsigned long) r3; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + return map->ops->map_update_elem(map, key, value, r4); +} + +struct bpf_func_proto bpf_map_update_elem_proto = { + .func = bpf_map_update_elem, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_MAP_KEY, + .arg3_type = ARG_PTR_TO_MAP_VALUE, + .arg4_type = ARG_ANYTHING, +}; + +static u64 bpf_map_delete_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; + void *key = (void *) (unsigned long) r2; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + return map->ops->map_delete_elem(map, key); +} + +struct bpf_func_proto bpf_map_delete_elem_proto = { + .func = bpf_map_delete_elem, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_MAP_KEY, +}; -- cgit v1.2.3-70-g09d2 From 7943c0f329d33f531607d66f5781f2210e1e278c Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 13 Nov 2014 17:36:50 -0800 Subject: bpf: remove test map scaffolding and user proper types proper types and function helpers are ready. Use them in verifier testsuite. Remove temporary stubs Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/test_stub.c | 56 ++++++++------------------------------------- samples/bpf/test_verifier.c | 14 ++++++------ 2 files changed, 16 insertions(+), 54 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/test_stub.c b/kernel/bpf/test_stub.c index fcaddff4003e..0ceae1e6e8b5 100644 --- a/kernel/bpf/test_stub.c +++ b/kernel/bpf/test_stub.c @@ -18,26 +18,18 @@ struct bpf_context { u64 arg2; }; -static u64 test_func(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) -{ - return 0; -} - -static struct bpf_func_proto test_funcs[] = { - [BPF_FUNC_unspec] = { - .func = test_func, - .gpl_only = true, - .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, - .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_PTR_TO_MAP_KEY, - }, -}; - static const struct bpf_func_proto *test_func_proto(enum bpf_func_id func_id) { - if (func_id < 0 || func_id >= ARRAY_SIZE(test_funcs)) + switch (func_id) { + case BPF_FUNC_map_lookup_elem: + return &bpf_map_lookup_elem_proto; + case BPF_FUNC_map_update_elem: + return &bpf_map_update_elem_proto; + case BPF_FUNC_map_delete_elem: + return &bpf_map_delete_elem_proto; + default: return NULL; - return &test_funcs[func_id]; + } } static const struct bpf_context_access { @@ -78,38 +70,8 @@ static struct bpf_prog_type_list tl_prog = { .type = BPF_PROG_TYPE_UNSPEC, }; -static struct bpf_map *test_map_alloc(union bpf_attr *attr) -{ - struct bpf_map *map; - - map = kzalloc(sizeof(*map), GFP_USER); - if (!map) - return ERR_PTR(-ENOMEM); - - map->key_size = attr->key_size; - map->value_size = attr->value_size; - map->max_entries = attr->max_entries; - return map; -} - -static void test_map_free(struct bpf_map *map) -{ - kfree(map); -} - -static struct bpf_map_ops test_map_ops = { - .map_alloc = test_map_alloc, - .map_free = test_map_free, -}; - -static struct bpf_map_type_list tl_map = { - .ops = &test_map_ops, - .type = BPF_MAP_TYPE_UNSPEC, -}; - static int __init register_test_ops(void) { - bpf_register_map_type(&tl_map); bpf_register_prog_type(&tl_prog); return 0; } diff --git a/samples/bpf/test_verifier.c b/samples/bpf/test_verifier.c index 63402742345e..b96175e90363 100644 --- a/samples/bpf/test_verifier.c +++ b/samples/bpf/test_verifier.c @@ -261,7 +261,7 @@ static struct bpf_test tests[] = { BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), BPF_LD_MAP_FD(BPF_REG_1, 0), - BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_unspec), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), BPF_EXIT_INSN(), }, .fixup = {2}, @@ -417,7 +417,7 @@ static struct bpf_test tests[] = { BPF_ALU64_REG(BPF_MOV, BPF_REG_2, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), BPF_LD_MAP_FD(BPF_REG_1, 0), - BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_unspec), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_delete_elem), BPF_EXIT_INSN(), }, .errstr = "fd 0 is not pointing to valid bpf_map", @@ -430,7 +430,7 @@ static struct bpf_test tests[] = { BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), BPF_LD_MAP_FD(BPF_REG_1, 0), - BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_unspec), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0), BPF_EXIT_INSN(), }, @@ -445,7 +445,7 @@ static struct bpf_test tests[] = { BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), BPF_LD_MAP_FD(BPF_REG_1, 0), - BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_unspec), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), BPF_ST_MEM(BPF_DW, BPF_REG_0, 4, 0), BPF_EXIT_INSN(), @@ -461,7 +461,7 @@ static struct bpf_test tests[] = { BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), BPF_LD_MAP_FD(BPF_REG_1, 0), - BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_unspec), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0), BPF_EXIT_INSN(), @@ -548,7 +548,7 @@ static struct bpf_test tests[] = { BPF_ST_MEM(BPF_DW, BPF_REG_2, -56, 0), BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -56), BPF_LD_MAP_FD(BPF_REG_1, 0), - BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_unspec), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_delete_elem), BPF_EXIT_INSN(), }, .fixup = {24}, @@ -659,7 +659,7 @@ static int create_map(void) long long key, value = 0; int map_fd; - map_fd = bpf_create_map(BPF_MAP_TYPE_UNSPEC, sizeof(key), sizeof(value), 1024); + map_fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(key), sizeof(value), 1024); if (map_fd < 0) { printf("failed to create map '%s'\n", strerror(errno)); } -- cgit v1.2.3-70-g09d2 From daaf427c6ab392bedcd018e326b2ffa1e1110cd6 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 18 Nov 2014 17:32:16 -0800 Subject: bpf: fix arraymap NULL deref and missing overflow and zero size checks - fix NULL pointer dereference: kernel/bpf/arraymap.c:41 array_map_alloc() error: potential null dereference 'array'. (kzalloc returns null) kernel/bpf/arraymap.c:41 array_map_alloc() error: we previously assumed 'array' could be null (see line 40) - integer overflow check was missing in arraymap (hashmap checks for overflow via kmalloc_array()) - arraymap can round_up(value_size, 8) to zero. check was missing. - hashmap was missing zero size check as well, since roundup_pow_of_two() can truncate into zero - found a typo in the arraymap comment and unnecessary empty line Fix all of these issues and make both overflow checks explicit U32 in size. Reported-by: kbuild test robot Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/arraymap.c | 17 +++++++++++------ kernel/bpf/hashtab.c | 5 +++++ 2 files changed, 16 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 58b80c137afd..9eb4d8a7cd87 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -25,7 +25,7 @@ struct bpf_array { static struct bpf_map *array_map_alloc(union bpf_attr *attr) { struct bpf_array *array; - u32 elem_size; + u32 elem_size, array_size; /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || @@ -34,11 +34,17 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) elem_size = round_up(attr->value_size, 8); + /* check round_up into zero and u32 overflow */ + if (elem_size == 0 || + attr->max_entries > (U32_MAX - sizeof(*array)) / elem_size) + return ERR_PTR(-ENOMEM); + + array_size = sizeof(*array) + attr->max_entries * elem_size; + /* allocate all map elements and zero-initialize them */ - array = kzalloc(sizeof(*array) + attr->max_entries * elem_size, - GFP_USER | __GFP_NOWARN); + array = kzalloc(array_size, GFP_USER | __GFP_NOWARN); if (!array) { - array = vzalloc(array->map.max_entries * array->elem_size); + array = vzalloc(array_size); if (!array) return ERR_PTR(-ENOMEM); } @@ -51,7 +57,6 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) array->elem_size = elem_size; return &array->map; - } /* Called from syscall or from eBPF program */ @@ -101,7 +106,7 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value, return -E2BIG; if (map_flags == BPF_NOEXIST) - /* all elemenets already exist */ + /* all elements already exist */ return -EEXIST; memcpy(array->value + array->elem_size * index, value, array->elem_size); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index d234a012f046..b3ba43674310 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -65,6 +65,11 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) goto free_htab; err = -ENOMEM; + /* prevent zero size kmalloc and check for u32 overflow */ + if (htab->n_buckets == 0 || + htab->n_buckets > U32_MAX / sizeof(struct hlist_head)) + goto free_htab; + htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct hlist_head), GFP_USER | __GFP_NOWARN); -- cgit v1.2.3-70-g09d2 From ddd872bc3098f9d9abe1680a6b2013e59e3337f7 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 1 Dec 2014 15:06:34 -0800 Subject: bpf: verifier: add checks for BPF_ABS | BPF_IND instructions introduce program type BPF_PROG_TYPE_SOCKET_FILTER that is used for attaching programs to sockets where ctx == skb. add verifier checks for ABS/IND instructions which can only be seen in socket filters, therefore the check: if (env->prog->aux->prog_type != BPF_PROG_TYPE_SOCKET_FILTER) verbose("BPF_LD_ABS|IND instructions are only allowed in socket filters\n"); Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 1 + kernel/bpf/verifier.c | 70 ++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 69 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4a3d0f84f178..45da7ec7d274 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -117,6 +117,7 @@ enum bpf_map_type { enum bpf_prog_type { BPF_PROG_TYPE_UNSPEC, + BPF_PROG_TYPE_SOCKET_FILTER, }; /* flags for BPF_MAP_UPDATE_ELEM command */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b6a1f7c14a67..a28e09c7825d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1172,6 +1172,70 @@ static int check_ld_imm(struct verifier_env *env, struct bpf_insn *insn) return 0; } +/* verify safety of LD_ABS|LD_IND instructions: + * - they can only appear in the programs where ctx == skb + * - since they are wrappers of function calls, they scratch R1-R5 registers, + * preserve R6-R9, and store return value into R0 + * + * Implicit input: + * ctx == skb == R6 == CTX + * + * Explicit input: + * SRC == any register + * IMM == 32-bit immediate + * + * Output: + * R0 - 8/16/32-bit skb data converted to cpu endianness + */ +static int check_ld_abs(struct verifier_env *env, struct bpf_insn *insn) +{ + struct reg_state *regs = env->cur_state.regs; + u8 mode = BPF_MODE(insn->code); + struct reg_state *reg; + int i, err; + + if (env->prog->aux->prog_type != BPF_PROG_TYPE_SOCKET_FILTER) { + verbose("BPF_LD_ABS|IND instructions are only allowed in socket filters\n"); + return -EINVAL; + } + + if (insn->dst_reg != BPF_REG_0 || insn->off != 0 || + (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) { + verbose("BPF_LD_ABS uses reserved fields\n"); + return -EINVAL; + } + + /* check whether implicit source operand (register R6) is readable */ + err = check_reg_arg(regs, BPF_REG_6, SRC_OP); + if (err) + return err; + + if (regs[BPF_REG_6].type != PTR_TO_CTX) { + verbose("at the time of BPF_LD_ABS|IND R6 != pointer to skb\n"); + return -EINVAL; + } + + if (mode == BPF_IND) { + /* check explicit source operand */ + err = check_reg_arg(regs, insn->src_reg, SRC_OP); + if (err) + return err; + } + + /* reset caller saved regs to unreadable */ + for (i = 0; i < CALLER_SAVED_REGS; i++) { + reg = regs + caller_saved[i]; + reg->type = NOT_INIT; + reg->imm = 0; + } + + /* mark destination R0 register as readable, since it contains + * the value fetched from the packet + */ + regs[BPF_REG_0].type = UNKNOWN_VALUE; + return 0; +} + /* non-recursive DFS pseudo code * 1 procedure DFS-iterative(G,v): * 2 label v as discovered @@ -1677,8 +1741,10 @@ process_bpf_exit: u8 mode = BPF_MODE(insn->code); if (mode == BPF_ABS || mode == BPF_IND) { - verbose("LD_ABS is not supported yet\n"); - return -EINVAL; + err = check_ld_abs(env, insn); + if (err) + return err; + } else if (mode == BPF_IMM) { err = check_ld_imm(env, insn); if (err) -- cgit v1.2.3-70-g09d2