From d8de01b763e0d8b3b418d3606d26f203983b6637 Mon Sep 17 00:00:00 2001 From: "Robert P. J. Day" Date: Wed, 26 Dec 2018 06:35:23 -0600 Subject: phy.h: fix obvious errors in doc and kerneldoc content 1) note that gianfar_phy.c was removed years ago 2) fix obvious copy and paste error in regular doc 3) change regular doc into kerneldoc for phy_modes() Signed-off-by: Robert P. J. Day Signed-off-by: David S. Miller --- include/linux/phy.h | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index da039f211c22..3b051f761450 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1,6 +1,6 @@ /* * Framework and drivers for configuring and reading different PHYs - * Based on code in sungem_phy.c and gianfar_phy.c + * Based on code in sungem_phy.c and (long-removed) gianfar_phy.c * * Author: Andy Fleming * @@ -110,9 +110,9 @@ typedef enum { * @speeds: buffer to store supported speeds in. * @size: size of speeds buffer. * - * Description: Returns the number of supported speeds, and - * fills the speeds * buffer with the supported speeds. If speeds buffer is - * too small to contain * all currently supported speeds, will return as + * Description: Returns the number of supported speeds, and fills + * the speeds buffer with the supported speeds. If speeds buffer is + * too small to contain all currently supported speeds, will return as * many speeds as can fit. */ unsigned int phy_supported_speeds(struct phy_device *phy, @@ -120,7 +120,10 @@ unsigned int phy_supported_speeds(struct phy_device *phy, unsigned int size); /** - * It maps 'enum phy_interface_t' found in include/linux/phy.h + * phy_modes - map phy_interface_t enum to device tree binding of phy-mode + * @interface: enum phy_interface_t value + * + * Description: maps 'enum phy_interface_t' defined in this file * into the device tree binding of 'phy-mode', so that Ethernet * device driver can get phy interface from device tree. */ -- cgit v1.3.1 From a3c9311f62b4943228ae90f769775dd3bcbfa7c0 Mon Sep 17 00:00:00 2001 From: "Robert P. J. Day" Date: Thu, 27 Dec 2018 16:10:59 -0500 Subject: include/linux/phy/phy.h: fix minor kerneldoc errors Correct two minor kerneldoc errors: 1) missing reference to @mode in struct phy_ops 2) obsolete reference to @init_data in struct_phy_attrs, removed in dbc98635e0d42f0e62ea92813df1e0e4c90f8375 Signed-off-by: Robert P. J. Day Signed-off-by: David S. Miller --- include/linux/phy/phy.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/phy/phy.h b/include/linux/phy/phy.h index 03b319f89a34..66d1560f1a26 100644 --- a/include/linux/phy/phy.h +++ b/include/linux/phy/phy.h @@ -69,6 +69,7 @@ struct phy_ops { /** * struct phy_attrs - represents phy attributes * @bus_width: Data path width implemented by PHY + * @mode: PHY mode */ struct phy_attrs { u32 bus_width; @@ -80,7 +81,6 @@ struct phy_attrs { * @dev: phy device * @id: id of the phy device * @ops: function pointers for performing phy operations - * @init_data: list of PHY consumers (non-dt only) * @mutex: mutex to protect phy_ops * @init_count: used to protect when the PHY is used by multiple consumers * @power_count: used to protect when the PHY is used by multiple consumers -- cgit v1.3.1 From df4a902509766897f7371fdfa4c3bf8bc321b55d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 28 Dec 2018 01:24:46 +0100 Subject: netfilter: nf_conncount: merge lookup and add functions 'lookup' is always followed by 'add'. Merge both and make the list-walk part of nf_conncount_add(). This also avoids one unneeded unlock/re-lock pair. Extra care needs to be taken in count_tree, as we only hold rcu read lock, i.e. we can only insert to an existing tree node after acquiring its lock and making sure it has a nonzero count. As a zero count should be rare, just fall back to insert_tree() (which acquires tree lock). This issue and its solution were pointed out by Shawn Bohrer during patch review. Reviewed-by: Shawn Bohrer Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_count.h | 18 +--- net/netfilter/nf_conncount.c | 146 +++++++++++++---------------- net/netfilter/nft_connlimit.c | 14 +-- 3 files changed, 72 insertions(+), 106 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack_count.h b/include/net/netfilter/nf_conntrack_count.h index 4b2b2baf8ab4..aa66775c15f4 100644 --- a/include/net/netfilter/nf_conntrack_count.h +++ b/include/net/netfilter/nf_conntrack_count.h @@ -5,12 +5,6 @@ struct nf_conncount_data; -enum nf_conncount_list_add { - NF_CONNCOUNT_ADDED, /* list add was ok */ - NF_CONNCOUNT_ERR, /* -ENOMEM, must drop skb */ - NF_CONNCOUNT_SKIP, /* list is already reclaimed by gc */ -}; - struct nf_conncount_list { spinlock_t list_lock; struct list_head head; /* connections with the same filtering key */ @@ -29,18 +23,12 @@ unsigned int nf_conncount_count(struct net *net, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_zone *zone); -void nf_conncount_lookup(struct net *net, struct nf_conncount_list *list, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone, - bool *addit); +int nf_conncount_add(struct net *net, struct nf_conncount_list *list, + const struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_zone *zone); void nf_conncount_list_init(struct nf_conncount_list *list); -enum nf_conncount_list_add -nf_conncount_add(struct nf_conncount_list *list, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone); - bool nf_conncount_gc_list(struct net *net, struct nf_conncount_list *list); diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c index 0a83c694a8f1..ce7f7d1212a6 100644 --- a/net/netfilter/nf_conncount.c +++ b/net/netfilter/nf_conncount.c @@ -83,38 +83,6 @@ static int key_diff(const u32 *a, const u32 *b, unsigned int klen) return memcmp(a, b, klen * sizeof(u32)); } -enum nf_conncount_list_add -nf_conncount_add(struct nf_conncount_list *list, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone) -{ - struct nf_conncount_tuple *conn; - - if (WARN_ON_ONCE(list->count > INT_MAX)) - return NF_CONNCOUNT_ERR; - - conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC); - if (conn == NULL) - return NF_CONNCOUNT_ERR; - - conn->tuple = *tuple; - conn->zone = *zone; - conn->cpu = raw_smp_processor_id(); - conn->jiffies32 = (u32)jiffies; - conn->dead = false; - spin_lock_bh(&list->list_lock); - if (list->dead == true) { - kmem_cache_free(conncount_conn_cachep, conn); - spin_unlock_bh(&list->list_lock); - return NF_CONNCOUNT_SKIP; - } - list_add_tail(&conn->node, &list->head); - list->count++; - spin_unlock_bh(&list->list_lock); - return NF_CONNCOUNT_ADDED; -} -EXPORT_SYMBOL_GPL(nf_conncount_add); - static void __conn_free(struct rcu_head *h) { struct nf_conncount_tuple *conn; @@ -177,11 +145,10 @@ find_or_evict(struct net *net, struct nf_conncount_list *list, return ERR_PTR(-EAGAIN); } -void nf_conncount_lookup(struct net *net, - struct nf_conncount_list *list, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone, - bool *addit) +static int __nf_conncount_add(struct net *net, + struct nf_conncount_list *list, + const struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_zone *zone) { const struct nf_conntrack_tuple_hash *found; struct nf_conncount_tuple *conn, *conn_n; @@ -189,9 +156,6 @@ void nf_conncount_lookup(struct net *net, unsigned int collect = 0; bool free_entry = false; - /* best effort only */ - *addit = tuple ? true : false; - /* check the saved connections */ list_for_each_entry_safe(conn, conn_n, &list->head, node) { if (collect > CONNCOUNT_GC_MAX_NODES) @@ -201,21 +165,19 @@ void nf_conncount_lookup(struct net *net, if (IS_ERR(found)) { /* Not found, but might be about to be confirmed */ if (PTR_ERR(found) == -EAGAIN) { - if (!tuple) - continue; - if (nf_ct_tuple_equal(&conn->tuple, tuple) && nf_ct_zone_id(&conn->zone, conn->zone.dir) == nf_ct_zone_id(zone, zone->dir)) - *addit = false; - } else if (PTR_ERR(found) == -ENOENT) + return 0; /* already exists */ + } else { collect++; + } continue; } found_ct = nf_ct_tuplehash_to_ctrack(found); - if (tuple && nf_ct_tuple_equal(&conn->tuple, tuple) && + if (nf_ct_tuple_equal(&conn->tuple, tuple) && nf_ct_zone_equal(found_ct, zone, zone->dir)) { /* * We should not see tuples twice unless someone hooks @@ -223,7 +185,8 @@ void nf_conncount_lookup(struct net *net, * * Attempt to avoid a re-add in this case. */ - *addit = false; + nf_ct_put(found_ct); + return 0; } else if (already_closed(found_ct)) { /* * we do not care about connections which are @@ -237,8 +200,38 @@ void nf_conncount_lookup(struct net *net, nf_ct_put(found_ct); } + + if (WARN_ON_ONCE(list->count > INT_MAX)) + return -EOVERFLOW; + + conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC); + if (conn == NULL) + return -ENOMEM; + + conn->tuple = *tuple; + conn->zone = *zone; + conn->cpu = raw_smp_processor_id(); + conn->jiffies32 = (u32)jiffies; + list_add_tail(&conn->node, &list->head); + list->count++; + return 0; } -EXPORT_SYMBOL_GPL(nf_conncount_lookup); + +int nf_conncount_add(struct net *net, + struct nf_conncount_list *list, + const struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_zone *zone) +{ + int ret; + + /* check the saved connections */ + spin_lock_bh(&list->list_lock); + ret = __nf_conncount_add(net, list, tuple, zone); + spin_unlock_bh(&list->list_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(nf_conncount_add); void nf_conncount_list_init(struct nf_conncount_list *list) { @@ -339,13 +332,11 @@ insert_tree(struct net *net, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_zone *zone) { - enum nf_conncount_list_add ret; struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES]; struct rb_node **rbnode, *parent; struct nf_conncount_rb *rbconn; struct nf_conncount_tuple *conn; unsigned int count = 0, gc_count = 0; - bool node_found = false; bool do_gc = true; spin_lock_bh(&nf_conncount_locks[hash]); @@ -363,20 +354,15 @@ restart: } else if (diff > 0) { rbnode = &((*rbnode)->rb_right); } else { - /* unlikely: other cpu added node already */ - node_found = true; - ret = nf_conncount_add(&rbconn->list, tuple, zone); - if (ret == NF_CONNCOUNT_ERR) { + int ret; + + ret = nf_conncount_add(net, &rbconn->list, tuple, zone); + if (ret) count = 0; /* hotdrop */ - } else if (ret == NF_CONNCOUNT_ADDED) { + else count = rbconn->list.count; - } else { - /* NF_CONNCOUNT_SKIP, rbconn is already - * reclaimed by gc, insert a new tree node - */ - node_found = false; - } - break; + tree_nodes_free(root, gc_nodes, gc_count); + goto out_unlock; } if (gc_count >= ARRAY_SIZE(gc_nodes)) @@ -394,9 +380,6 @@ restart: goto restart; } - if (node_found) - goto out_unlock; - /* expected case: match, insert new node */ rbconn = kmem_cache_alloc(conncount_rb_cachep, GFP_ATOMIC); if (rbconn == NULL) @@ -431,7 +414,6 @@ count_tree(struct net *net, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_zone *zone) { - enum nf_conncount_list_add ret; struct rb_root *root; struct rb_node *parent; struct nf_conncount_rb *rbconn; @@ -444,7 +426,6 @@ count_tree(struct net *net, parent = rcu_dereference_raw(root->rb_node); while (parent) { int diff; - bool addit; rbconn = rb_entry(parent, struct nf_conncount_rb, node); @@ -454,24 +435,29 @@ count_tree(struct net *net, } else if (diff > 0) { parent = rcu_dereference_raw(parent->rb_right); } else { - /* same source network -> be counted! */ - nf_conncount_lookup(net, &rbconn->list, tuple, zone, - &addit); + int ret; - if (!addit) + if (!tuple) { + nf_conncount_gc_list(net, &rbconn->list); return rbconn->list.count; + } - ret = nf_conncount_add(&rbconn->list, tuple, zone); - if (ret == NF_CONNCOUNT_ERR) { - return 0; /* hotdrop */ - } else if (ret == NF_CONNCOUNT_ADDED) { - return rbconn->list.count; - } else { - /* NF_CONNCOUNT_SKIP, rbconn is already - * reclaimed by gc, insert a new tree node - */ + spin_lock_bh(&rbconn->list.list_lock); + /* Node might be about to be free'd. + * We need to defer to insert_tree() in this case. + */ + if (rbconn->list.count == 0) { + spin_unlock_bh(&rbconn->list.list_lock); break; } + + /* same source network -> be counted! */ + ret = __nf_conncount_add(net, &rbconn->list, tuple, zone); + spin_unlock_bh(&rbconn->list.list_lock); + if (ret) + return 0; /* hotdrop */ + else + return rbconn->list.count; } } diff --git a/net/netfilter/nft_connlimit.c b/net/netfilter/nft_connlimit.c index b90d96ba4a12..af1497ab9464 100644 --- a/net/netfilter/nft_connlimit.c +++ b/net/netfilter/nft_connlimit.c @@ -30,7 +30,6 @@ static inline void nft_connlimit_do_eval(struct nft_connlimit *priv, enum ip_conntrack_info ctinfo; const struct nf_conn *ct; unsigned int count; - bool addit; tuple_ptr = &tuple; @@ -44,19 +43,12 @@ static inline void nft_connlimit_do_eval(struct nft_connlimit *priv, return; } - nf_conncount_lookup(nft_net(pkt), &priv->list, tuple_ptr, zone, - &addit); - count = priv->list.count; - - if (!addit) - goto out; - - if (nf_conncount_add(&priv->list, tuple_ptr, zone) == NF_CONNCOUNT_ERR) { + if (nf_conncount_add(nft_net(pkt), &priv->list, tuple_ptr, zone)) { regs->verdict.code = NF_DROP; return; } - count++; -out: + + count = priv->list.count; if ((count > priv->limit) ^ priv->invert) { regs->verdict.code = NFT_BREAK; -- cgit v1.3.1 From c80f10bc973af2ace6b1414724eeff61eaa71837 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 28 Dec 2018 01:24:48 +0100 Subject: netfilter: nf_conncount: speculative garbage collection on empty lists Instead of removing a empty list node that might be reintroduced soon thereafter, tentatively place the empty list node on the list passed to tree_nodes_free(), then re-check if the list is empty again before erasing it from the tree. [ Florian: rebase on top of pending nf_conncount fixes ] Fixes: 5c789e131cbb9 ("netfilter: nf_conncount: Add list lock and gc worker, and RCU for init tree search") Reviewed-by: Shawn Bohrer Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_count.h | 1 - net/netfilter/nf_conncount.c | 47 ++++++++++-------------------- 2 files changed, 15 insertions(+), 33 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack_count.h b/include/net/netfilter/nf_conntrack_count.h index aa66775c15f4..f32fc8289473 100644 --- a/include/net/netfilter/nf_conntrack_count.h +++ b/include/net/netfilter/nf_conntrack_count.h @@ -9,7 +9,6 @@ struct nf_conncount_list { spinlock_t list_lock; struct list_head head; /* connections with the same filtering key */ unsigned int count; /* length of list */ - bool dead; }; struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int family, diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c index d0fd195b19a8..f0b05dfebc6e 100644 --- a/net/netfilter/nf_conncount.c +++ b/net/netfilter/nf_conncount.c @@ -81,27 +81,20 @@ static int key_diff(const u32 *a, const u32 *b, unsigned int klen) return memcmp(a, b, klen * sizeof(u32)); } -static bool conn_free(struct nf_conncount_list *list, +static void conn_free(struct nf_conncount_list *list, struct nf_conncount_tuple *conn) { - bool free_entry = false; - lockdep_assert_held(&list->list_lock); list->count--; list_del(&conn->node); - if (list->count == 0) { - list->dead = true; - free_entry = true; - } kmem_cache_free(conncount_conn_cachep, conn); - return free_entry; } static const struct nf_conntrack_tuple_hash * find_or_evict(struct net *net, struct nf_conncount_list *list, - struct nf_conncount_tuple *conn, bool *free_entry) + struct nf_conncount_tuple *conn) { const struct nf_conntrack_tuple_hash *found; unsigned long a, b; @@ -121,7 +114,7 @@ find_or_evict(struct net *net, struct nf_conncount_list *list, */ age = a - b; if (conn->cpu == cpu || age >= 2) { - *free_entry = conn_free(list, conn); + conn_free(list, conn); return ERR_PTR(-ENOENT); } @@ -137,14 +130,13 @@ static int __nf_conncount_add(struct net *net, struct nf_conncount_tuple *conn, *conn_n; struct nf_conn *found_ct; unsigned int collect = 0; - bool free_entry = false; /* check the saved connections */ list_for_each_entry_safe(conn, conn_n, &list->head, node) { if (collect > CONNCOUNT_GC_MAX_NODES) break; - found = find_or_evict(net, list, conn, &free_entry); + found = find_or_evict(net, list, conn); if (IS_ERR(found)) { /* Not found, but might be about to be confirmed */ if (PTR_ERR(found) == -EAGAIN) { @@ -221,7 +213,6 @@ void nf_conncount_list_init(struct nf_conncount_list *list) spin_lock_init(&list->list_lock); INIT_LIST_HEAD(&list->head); list->count = 0; - list->dead = false; } EXPORT_SYMBOL_GPL(nf_conncount_list_init); @@ -233,7 +224,6 @@ bool nf_conncount_gc_list(struct net *net, struct nf_conncount_tuple *conn, *conn_n; struct nf_conn *found_ct; unsigned int collected = 0; - bool free_entry = false; bool ret = false; /* don't bother if other cpu is already doing GC */ @@ -241,15 +231,10 @@ bool nf_conncount_gc_list(struct net *net, return false; list_for_each_entry_safe(conn, conn_n, &list->head, node) { - found = find_or_evict(net, list, conn, &free_entry); + found = find_or_evict(net, list, conn); if (IS_ERR(found)) { - if (PTR_ERR(found) == -ENOENT) { - if (free_entry) { - spin_unlock(&list->list_lock); - return true; - } + if (PTR_ERR(found) == -ENOENT) collected++; - } continue; } @@ -260,10 +245,7 @@ bool nf_conncount_gc_list(struct net *net, * closed already -> ditch it */ nf_ct_put(found_ct); - if (conn_free(list, conn)) { - spin_unlock(&list->list_lock); - return true; - } + conn_free(list, conn); collected++; continue; } @@ -273,10 +255,8 @@ bool nf_conncount_gc_list(struct net *net, break; } - if (!list->count) { - list->dead = true; + if (!list->count) ret = true; - } spin_unlock(&list->list_lock); return ret; @@ -291,6 +271,7 @@ static void __tree_nodes_free(struct rcu_head *h) kmem_cache_free(conncount_rb_cachep, rbconn); } +/* caller must hold tree nf_conncount_locks[] lock */ static void tree_nodes_free(struct rb_root *root, struct nf_conncount_rb *gc_nodes[], unsigned int gc_count) @@ -300,8 +281,10 @@ static void tree_nodes_free(struct rb_root *root, while (gc_count) { rbconn = gc_nodes[--gc_count]; spin_lock(&rbconn->list.list_lock); - rb_erase(&rbconn->node, root); - call_rcu(&rbconn->rcu_head, __tree_nodes_free); + if (!rbconn->list.count) { + rb_erase(&rbconn->node, root); + call_rcu(&rbconn->rcu_head, __tree_nodes_free); + } spin_unlock(&rbconn->list.list_lock); } } @@ -318,7 +301,6 @@ insert_tree(struct net *net, struct rb_root *root, unsigned int hash, const u32 *key, - u8 keylen, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_zone *zone) { @@ -327,6 +309,7 @@ insert_tree(struct net *net, struct nf_conncount_rb *rbconn; struct nf_conncount_tuple *conn; unsigned int count = 0, gc_count = 0; + u8 keylen = data->keylen; bool do_gc = true; spin_lock_bh(&nf_conncount_locks[hash]); @@ -454,7 +437,7 @@ count_tree(struct net *net, if (!tuple) return 0; - return insert_tree(net, data, root, hash, key, keylen, tuple, zone); + return insert_tree(net, data, root, hash, key, tuple, zone); } static void tree_gc_worker(struct work_struct *work) -- cgit v1.3.1 From 3a0ed3e9619738067214871e9cb826fa23b2ddb9 Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Thu, 27 Dec 2018 18:55:09 -0800 Subject: sock: Make sock->sk_stamp thread-safe Al Viro mentioned (Message-ID <20170626041334.GZ10672@ZenIV.linux.org.uk>) that there is probably a race condition lurking in accesses of sk_stamp on 32-bit machines. sock->sk_stamp is of type ktime_t which is always an s64. On a 32 bit architecture, we might run into situations of unsafe access as the access to the field becomes non atomic. Use seqlocks for synchronization. This allows us to avoid using spinlocks for readers as readers do not need mutual exclusion. Another approach to solve this is to require sk_lock for all modifications of the timestamps. The current approach allows for timestamps to have their own lock: sk_stamp_lock. This allows for the patch to not compete with already existing critical sections, and side effects are limited to the paths in the patch. The addition of the new field maintains the data locality optimizations from commit 9115e8cd2a0c ("net: reorganize struct sock for better data locality") Note that all the instances of the sk_stamp accesses are either through the ioctl or the syscall recvmsg. Signed-off-by: Deepa Dinamani Signed-off-by: David S. Miller --- include/net/sock.h | 38 +++++++++++++++++++++++++++++++++++--- net/compat.c | 15 +++++++++------ net/core/sock.c | 15 ++++++++++----- net/sunrpc/svcsock.c | 2 +- 4 files changed, 55 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index a6235c286ef9..2b229f7be8eb 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -298,6 +298,7 @@ struct sock_common { * @sk_filter: socket filtering instructions * @sk_timer: sock cleanup timer * @sk_stamp: time stamp of last packet received + * @sk_stamp_seq: lock for accessing sk_stamp on 32 bit architectures only * @sk_tsflags: SO_TIMESTAMPING socket options * @sk_tskey: counter to disambiguate concurrent tstamp requests * @sk_zckey: counter to order MSG_ZEROCOPY notifications @@ -474,6 +475,9 @@ struct sock { const struct cred *sk_peer_cred; long sk_rcvtimeo; ktime_t sk_stamp; +#if BITS_PER_LONG==32 + seqlock_t sk_stamp_seq; +#endif u16 sk_tsflags; u8 sk_shutdown; u32 sk_tskey; @@ -2297,6 +2301,34 @@ static inline void sk_drops_add(struct sock *sk, const struct sk_buff *skb) atomic_add(segs, &sk->sk_drops); } +static inline ktime_t sock_read_timestamp(struct sock *sk) +{ +#if BITS_PER_LONG==32 + unsigned int seq; + ktime_t kt; + + do { + seq = read_seqbegin(&sk->sk_stamp_seq); + kt = sk->sk_stamp; + } while (read_seqretry(&sk->sk_stamp_seq, seq)); + + return kt; +#else + return sk->sk_stamp; +#endif +} + +static inline void sock_write_timestamp(struct sock *sk, ktime_t kt) +{ +#if BITS_PER_LONG==32 + write_seqlock(&sk->sk_stamp_seq); + sk->sk_stamp = kt; + write_sequnlock(&sk->sk_stamp_seq); +#else + sk->sk_stamp = kt; +#endif +} + void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb); void __sock_recv_wifi_status(struct msghdr *msg, struct sock *sk, @@ -2321,7 +2353,7 @@ sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE))) __sock_recv_timestamp(msg, sk, skb); else - sk->sk_stamp = kt; + sock_write_timestamp(sk, kt); if (sock_flag(sk, SOCK_WIFI_STATUS) && skb->wifi_acked_valid) __sock_recv_wifi_status(msg, sk, skb); @@ -2342,9 +2374,9 @@ static inline void sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk, if (sk->sk_flags & FLAGS_TS_OR_DROPS || sk->sk_tsflags & TSFLAGS_ANY) __sock_recv_ts_and_drops(msg, sk, skb); else if (unlikely(sock_flag(sk, SOCK_TIMESTAMP))) - sk->sk_stamp = skb->tstamp; + sock_write_timestamp(sk, skb->tstamp); else if (unlikely(sk->sk_stamp == SK_DEFAULT_STAMP)) - sk->sk_stamp = 0; + sock_write_timestamp(sk, 0); } void __sock_tx_timestamp(__u16 tsflags, __u8 *tx_flags); diff --git a/net/compat.c b/net/compat.c index 47a614b370cd..d1f3a8a0b3ef 100644 --- a/net/compat.c +++ b/net/compat.c @@ -467,12 +467,14 @@ int compat_sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp) ctv = (struct compat_timeval __user *) userstamp; err = -ENOENT; sock_enable_timestamp(sk, SOCK_TIMESTAMP); - tv = ktime_to_timeval(sk->sk_stamp); + tv = ktime_to_timeval(sock_read_timestamp(sk)); + if (tv.tv_sec == -1) return err; if (tv.tv_sec == 0) { - sk->sk_stamp = ktime_get_real(); - tv = ktime_to_timeval(sk->sk_stamp); + ktime_t kt = ktime_get_real(); + sock_write_timestamp(sk, kt); + tv = ktime_to_timeval(kt); } err = 0; if (put_user(tv.tv_sec, &ctv->tv_sec) || @@ -494,12 +496,13 @@ int compat_sock_get_timestampns(struct sock *sk, struct timespec __user *usersta ctv = (struct compat_timespec __user *) userstamp; err = -ENOENT; sock_enable_timestamp(sk, SOCK_TIMESTAMP); - ts = ktime_to_timespec(sk->sk_stamp); + ts = ktime_to_timespec(sock_read_timestamp(sk)); if (ts.tv_sec == -1) return err; if (ts.tv_sec == 0) { - sk->sk_stamp = ktime_get_real(); - ts = ktime_to_timespec(sk->sk_stamp); + ktime_t kt = ktime_get_real(); + sock_write_timestamp(sk, kt); + ts = ktime_to_timespec(kt); } err = 0; if (put_user(ts.tv_sec, &ctv->tv_sec) || diff --git a/net/core/sock.c b/net/core/sock.c index f00902c532cc..6aa2e7e0b4fb 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2751,6 +2751,9 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; sk->sk_stamp = SK_DEFAULT_STAMP; +#if BITS_PER_LONG==32 + seqlock_init(&sk->sk_stamp_seq); +#endif atomic_set(&sk->sk_zckey, 0); #ifdef CONFIG_NET_RX_BUSY_POLL @@ -2850,12 +2853,13 @@ int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp) struct timeval tv; sock_enable_timestamp(sk, SOCK_TIMESTAMP); - tv = ktime_to_timeval(sk->sk_stamp); + tv = ktime_to_timeval(sock_read_timestamp(sk)); if (tv.tv_sec == -1) return -ENOENT; if (tv.tv_sec == 0) { - sk->sk_stamp = ktime_get_real(); - tv = ktime_to_timeval(sk->sk_stamp); + ktime_t kt = ktime_get_real(); + sock_write_timestamp(sk, kt); + tv = ktime_to_timeval(kt); } return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0; } @@ -2866,11 +2870,12 @@ int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp) struct timespec ts; sock_enable_timestamp(sk, SOCK_TIMESTAMP); - ts = ktime_to_timespec(sk->sk_stamp); + ts = ktime_to_timespec(sock_read_timestamp(sk)); if (ts.tv_sec == -1) return -ENOENT; if (ts.tv_sec == 0) { - sk->sk_stamp = ktime_get_real(); + ktime_t kt = ktime_get_real(); + sock_write_timestamp(sk, kt); ts = ktime_to_timespec(sk->sk_stamp); } return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0; diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 986f3ed7d1a2..b7e67310ec37 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -549,7 +549,7 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp) /* Don't enable netstamp, sunrpc doesn't need that much accuracy */ } - svsk->sk_sk->sk_stamp = skb->tstamp; + sock_write_timestamp(svsk->sk_sk, skb->tstamp); set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); /* there may be more data... */ len = skb->len; -- cgit v1.3.1 From aff6db454599d62191aabc208930e891748e4322 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Sun, 30 Dec 2018 12:43:42 -0800 Subject: ptr_ring: wrap back ->producer in __ptr_ring_swap_queue() __ptr_ring_swap_queue() tries to move pointers from the old ring to the new one, but it forgets to check if ->producer is beyond the new size at the end of the operation. This leads to an out-of-bound access in __ptr_ring_produce() as reported by syzbot. Reported-by: syzbot+8993c0fa96d57c399735@syzkaller.appspotmail.com Fixes: 5d49de532002 ("ptr_ring: resize support") Cc: "Michael S. Tsirkin" Cc: John Fastabend Cc: Jason Wang Signed-off-by: Cong Wang Acked-by: Michael S. Tsirkin Signed-off-by: David S. Miller --- include/linux/ptr_ring.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h index 6894976b54e3..186cd8e970c7 100644 --- a/include/linux/ptr_ring.h +++ b/include/linux/ptr_ring.h @@ -573,6 +573,8 @@ static inline void **__ptr_ring_swap_queue(struct ptr_ring *r, void **queue, else if (destroy) destroy(ptr); + if (producer >= size) + producer = 0; __ptr_ring_set_size(r, size); r->producer = producer; r->consumer_head = 0; -- cgit v1.3.1 From cb9f1b783850b14cbd7f87d061d784a666dfba1f Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Sun, 30 Dec 2018 17:24:36 -0500 Subject: ip: validate header length on virtual device xmit KMSAN detected read beyond end of buffer in vti and sit devices when passing truncated packets with PF_PACKET. The issue affects additional ip tunnel devices. Extend commit 76c0ddd8c3a6 ("ip6_tunnel: be careful when accessing the inner header") and commit ccfec9e5cb2d ("ip_tunnel: be careful when accessing the inner header"). Move the check to a separate helper and call at the start of each ndo_start_xmit function in net/ipv4 and net/ipv6. Minor changes: - convert dev_kfree_skb to kfree_skb on error path, as dev_kfree_skb calls consume_skb which is not for error paths. - use pskb_network_may_pull even though that is pedantic here, as the same as pskb_may_pull for devices without llheaders. - do not cache ipv6 hdrs if used only once (unsafe across pskb_may_pull, was more relevant to earlier patch) Reported-by: syzbot Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/net/ip_tunnels.h | 20 ++++++++++++++++++++ net/ipv4/ip_gre.c | 9 +++++++++ net/ipv4/ip_tunnel.c | 9 --------- net/ipv4/ip_vti.c | 12 +++++++++--- net/ipv6/ip6_gre.c | 10 +++++++--- net/ipv6/ip6_tunnel.c | 10 +++------- net/ipv6/ip6_vti.c | 8 ++++---- net/ipv6/ip6mr.c | 17 +++++++++++------ net/ipv6/sit.c | 3 +++ 9 files changed, 66 insertions(+), 32 deletions(-) (limited to 'include') diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index cbcf35ce1b14..34f019650941 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -308,6 +308,26 @@ int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *op, int ip_tunnel_encap_setup(struct ip_tunnel *t, struct ip_tunnel_encap *ipencap); +static inline bool pskb_inet_may_pull(struct sk_buff *skb) +{ + int nhlen; + + switch (skb->protocol) { +#if IS_ENABLED(CONFIG_IPV6) + case htons(ETH_P_IPV6): + nhlen = sizeof(struct ipv6hdr); + break; +#endif + case htons(ETH_P_IP): + nhlen = sizeof(struct iphdr); + break; + default: + nhlen = 0; + } + + return pskb_network_may_pull(skb, nhlen); +} + static inline int ip_encap_hlen(struct ip_tunnel_encap *e) { const struct ip_tunnel_encap_ops *ops; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index c7a7bd58a23c..d1d09f3e5f9e 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -676,6 +676,9 @@ static netdev_tx_t ipgre_xmit(struct sk_buff *skb, struct ip_tunnel *tunnel = netdev_priv(dev); const struct iphdr *tnl_params; + if (!pskb_inet_may_pull(skb)) + goto free_skb; + if (tunnel->collect_md) { gre_fb_xmit(skb, dev, skb->protocol); return NETDEV_TX_OK; @@ -719,6 +722,9 @@ static netdev_tx_t erspan_xmit(struct sk_buff *skb, struct ip_tunnel *tunnel = netdev_priv(dev); bool truncate = false; + if (!pskb_inet_may_pull(skb)) + goto free_skb; + if (tunnel->collect_md) { erspan_fb_xmit(skb, dev, skb->protocol); return NETDEV_TX_OK; @@ -762,6 +768,9 @@ static netdev_tx_t gre_tap_xmit(struct sk_buff *skb, { struct ip_tunnel *tunnel = netdev_priv(dev); + if (!pskb_inet_may_pull(skb)) + goto free_skb; + if (tunnel->collect_md) { gre_fb_xmit(skb, dev, htons(ETH_P_TEB)); return NETDEV_TX_OK; diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 284a22154b4e..c4f5602308ed 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -627,7 +627,6 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, const struct iphdr *tnl_params, u8 protocol) { struct ip_tunnel *tunnel = netdev_priv(dev); - unsigned int inner_nhdr_len = 0; const struct iphdr *inner_iph; struct flowi4 fl4; u8 tos, ttl; @@ -637,14 +636,6 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, __be32 dst; bool connected; - /* ensure we can access the inner net header, for several users below */ - if (skb->protocol == htons(ETH_P_IP)) - inner_nhdr_len = sizeof(struct iphdr); - else if (skb->protocol == htons(ETH_P_IPV6)) - inner_nhdr_len = sizeof(struct ipv6hdr); - if (unlikely(!pskb_may_pull(skb, inner_nhdr_len))) - goto tx_error; - inner_iph = (const struct iphdr *)skb_inner_network_header(skb); connected = (tunnel->parms.iph.daddr != 0); diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index de31b302d69c..d7b43e700023 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -241,6 +241,9 @@ static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) struct ip_tunnel *tunnel = netdev_priv(dev); struct flowi fl; + if (!pskb_inet_may_pull(skb)) + goto tx_err; + memset(&fl, 0, sizeof(fl)); switch (skb->protocol) { @@ -253,15 +256,18 @@ static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); break; default: - dev->stats.tx_errors++; - dev_kfree_skb(skb); - return NETDEV_TX_OK; + goto tx_err; } /* override mark with tunnel output key */ fl.flowi_mark = be32_to_cpu(tunnel->parms.o_key); return vti_xmit(skb, dev, &fl); + +tx_err: + dev->stats.tx_errors++; + kfree_skb(skb); + return NETDEV_TX_OK; } static int vti4_err(struct sk_buff *skb, u32 info) diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 229e55c99021..09d0826742f8 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -881,6 +881,9 @@ static netdev_tx_t ip6gre_tunnel_xmit(struct sk_buff *skb, struct net_device_stats *stats = &t->dev->stats; int ret; + if (!pskb_inet_may_pull(skb)) + goto tx_err; + if (!ip6_tnl_xmit_ctl(t, &t->parms.laddr, &t->parms.raddr)) goto tx_err; @@ -923,6 +926,9 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, int nhoff; int thoff; + if (!pskb_inet_may_pull(skb)) + goto tx_err; + if (!ip6_tnl_xmit_ctl(t, &t->parms.laddr, &t->parms.raddr)) goto tx_err; @@ -995,8 +1001,6 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, goto tx_err; } } else { - struct ipv6hdr *ipv6h = ipv6_hdr(skb); - switch (skb->protocol) { case htons(ETH_P_IP): memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); @@ -1004,7 +1008,7 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, &dsfield, &encap_limit); break; case htons(ETH_P_IPV6): - if (ipv6_addr_equal(&t->parms.raddr, &ipv6h->saddr)) + if (ipv6_addr_equal(&t->parms.raddr, &ipv6_hdr(skb)->saddr)) goto tx_err; if (prepare_ip6gre_xmit_ipv6(skb, dev, &fl6, &dsfield, &encap_limit)) diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 99179b9c8384..0c6403cf8b52 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1243,10 +1243,6 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) u8 tproto; int err; - /* ensure we can access the full inner ip header */ - if (!pskb_may_pull(skb, sizeof(struct iphdr))) - return -1; - iph = ip_hdr(skb); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); @@ -1321,9 +1317,6 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) u8 tproto; int err; - if (unlikely(!pskb_may_pull(skb, sizeof(*ipv6h)))) - return -1; - ipv6h = ipv6_hdr(skb); tproto = READ_ONCE(t->parms.proto); if ((tproto != IPPROTO_IPV6 && tproto != 0) || @@ -1405,6 +1398,9 @@ ip6_tnl_start_xmit(struct sk_buff *skb, struct net_device *dev) struct net_device_stats *stats = &t->dev->stats; int ret; + if (!pskb_inet_may_pull(skb)) + goto tx_err; + switch (skb->protocol) { case htons(ETH_P_IP): ret = ip4ip6_tnl_xmit(skb, dev); diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 706fe42e4928..8b6eefff2f7e 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -522,18 +522,18 @@ vti6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); struct net_device_stats *stats = &t->dev->stats; - struct ipv6hdr *ipv6h; struct flowi fl; int ret; + if (!pskb_inet_may_pull(skb)) + goto tx_err; + memset(&fl, 0, sizeof(fl)); switch (skb->protocol) { case htons(ETH_P_IPV6): - ipv6h = ipv6_hdr(skb); - if ((t->parms.proto != IPPROTO_IPV6 && t->parms.proto != 0) || - vti6_addr_conflict(t, ipv6h)) + vti6_addr_conflict(t, ipv6_hdr(skb))) goto tx_err; xfrm_decode_session(skb, &fl, AF_INET6); diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 8276f1224f16..30337b38274b 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -51,6 +51,7 @@ #include #include #include +#include #include @@ -599,13 +600,12 @@ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, .flowi6_iif = skb->skb_iif ? : LOOPBACK_IFINDEX, .flowi6_mark = skb->mark, }; - int err; - err = ip6mr_fib_lookup(net, &fl6, &mrt); - if (err < 0) { - kfree_skb(skb); - return err; - } + if (!pskb_inet_may_pull(skb)) + goto tx_err; + + if (ip6mr_fib_lookup(net, &fl6, &mrt) < 0) + goto tx_err; read_lock(&mrt_lock); dev->stats.tx_bytes += skb->len; @@ -614,6 +614,11 @@ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, read_unlock(&mrt_lock); kfree_skb(skb); return NETDEV_TX_OK; + +tx_err: + dev->stats.tx_errors++; + kfree_skb(skb); + return NETDEV_TX_OK; } static int reg_vif_get_iflink(const struct net_device *dev) diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 51c9f75f34b9..1e03305c0549 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -1021,6 +1021,9 @@ tx_error: static netdev_tx_t sit_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) { + if (!pskb_inet_may_pull(skb)) + goto tx_err; + switch (skb->protocol) { case htons(ETH_P_IP): sit_tunnel_xmit__(skb, dev, IPPROTO_IPIP); -- cgit v1.3.1 From c08435ec7f2bc8f4109401f696fd55159b4b40cb Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 3 Jan 2019 00:58:27 +0100 Subject: bpf: move {prev_,}insn_idx into verifier env Move prev_insn_idx and insn_idx from the do_check() function into the verifier environment, so they can be read inside the various helper functions for handling the instructions. It's easier to put this into the environment rather than changing all call-sites only to pass it along. insn_idx is useful in particular since this later on allows to hold state in env->insn_aux_data[env->insn_idx]. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 2 ++ kernel/bpf/verifier.c | 76 ++++++++++++++++++++++---------------------- 2 files changed, 40 insertions(+), 38 deletions(-) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index c233efc106c6..3f84f3e87704 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -212,6 +212,8 @@ struct bpf_subprog_info { * one verifier_env per bpf_check() call */ struct bpf_verifier_env { + u32 insn_idx; + u32 prev_insn_idx; struct bpf_prog *prog; /* eBPF program being verified */ const struct bpf_verifier_ops *ops; struct bpf_verifier_stack_elem *head; /* stack of verifier states to be processed */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 71d86e3024ae..afa8515bbb34 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5650,7 +5650,6 @@ static int do_check(struct bpf_verifier_env *env) struct bpf_insn *insns = env->prog->insnsi; struct bpf_reg_state *regs; int insn_cnt = env->prog->len, i; - int insn_idx, prev_insn_idx = 0; int insn_processed = 0; bool do_print_state = false; @@ -5670,19 +5669,19 @@ static int do_check(struct bpf_verifier_env *env) BPF_MAIN_FUNC /* callsite */, 0 /* frameno */, 0 /* subprogno, zero == main subprog */); - insn_idx = 0; + for (;;) { struct bpf_insn *insn; u8 class; int err; - if (insn_idx >= insn_cnt) { + if (env->insn_idx >= insn_cnt) { verbose(env, "invalid insn idx %d insn_cnt %d\n", - insn_idx, insn_cnt); + env->insn_idx, insn_cnt); return -EFAULT; } - insn = &insns[insn_idx]; + insn = &insns[env->insn_idx]; class = BPF_CLASS(insn->code); if (++insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) { @@ -5692,7 +5691,7 @@ static int do_check(struct bpf_verifier_env *env) return -E2BIG; } - err = is_state_visited(env, insn_idx); + err = is_state_visited(env, env->insn_idx); if (err < 0) return err; if (err == 1) { @@ -5700,9 +5699,9 @@ static int do_check(struct bpf_verifier_env *env) if (env->log.level) { if (do_print_state) verbose(env, "\nfrom %d to %d: safe\n", - prev_insn_idx, insn_idx); + env->prev_insn_idx, env->insn_idx); else - verbose(env, "%d: safe\n", insn_idx); + verbose(env, "%d: safe\n", env->insn_idx); } goto process_bpf_exit; } @@ -5715,10 +5714,10 @@ static int do_check(struct bpf_verifier_env *env) if (env->log.level > 1 || (env->log.level && do_print_state)) { if (env->log.level > 1) - verbose(env, "%d:", insn_idx); + verbose(env, "%d:", env->insn_idx); else verbose(env, "\nfrom %d to %d:", - prev_insn_idx, insn_idx); + env->prev_insn_idx, env->insn_idx); print_verifier_state(env, state->frame[state->curframe]); do_print_state = false; } @@ -5729,20 +5728,20 @@ static int do_check(struct bpf_verifier_env *env) .private_data = env, }; - verbose_linfo(env, insn_idx, "; "); - verbose(env, "%d: ", insn_idx); + verbose_linfo(env, env->insn_idx, "; "); + verbose(env, "%d: ", env->insn_idx); print_bpf_insn(&cbs, insn, env->allow_ptr_leaks); } if (bpf_prog_is_dev_bound(env->prog->aux)) { - err = bpf_prog_offload_verify_insn(env, insn_idx, - prev_insn_idx); + err = bpf_prog_offload_verify_insn(env, env->insn_idx, + env->prev_insn_idx); if (err) return err; } regs = cur_regs(env); - env->insn_aux_data[insn_idx].seen = true; + env->insn_aux_data[env->insn_idx].seen = true; if (class == BPF_ALU || class == BPF_ALU64) { err = check_alu_op(env, insn); @@ -5768,13 +5767,13 @@ static int do_check(struct bpf_verifier_env *env) /* check that memory (src_reg + off) is readable, * the state of dst_reg will be updated by this func */ - err = check_mem_access(env, insn_idx, insn->src_reg, insn->off, - BPF_SIZE(insn->code), BPF_READ, - insn->dst_reg, false); + err = check_mem_access(env, env->insn_idx, insn->src_reg, + insn->off, BPF_SIZE(insn->code), + BPF_READ, insn->dst_reg, false); if (err) return err; - prev_src_type = &env->insn_aux_data[insn_idx].ptr_type; + prev_src_type = &env->insn_aux_data[env->insn_idx].ptr_type; if (*prev_src_type == NOT_INIT) { /* saw a valid insn @@ -5799,10 +5798,10 @@ static int do_check(struct bpf_verifier_env *env) enum bpf_reg_type *prev_dst_type, dst_reg_type; if (BPF_MODE(insn->code) == BPF_XADD) { - err = check_xadd(env, insn_idx, insn); + err = check_xadd(env, env->insn_idx, insn); if (err) return err; - insn_idx++; + env->insn_idx++; continue; } @@ -5818,13 +5817,13 @@ static int do_check(struct bpf_verifier_env *env) dst_reg_type = regs[insn->dst_reg].type; /* check that memory (dst_reg + off) is writeable */ - err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, - BPF_SIZE(insn->code), BPF_WRITE, - insn->src_reg, false); + err = check_mem_access(env, env->insn_idx, insn->dst_reg, + insn->off, BPF_SIZE(insn->code), + BPF_WRITE, insn->src_reg, false); if (err) return err; - prev_dst_type = &env->insn_aux_data[insn_idx].ptr_type; + prev_dst_type = &env->insn_aux_data[env->insn_idx].ptr_type; if (*prev_dst_type == NOT_INIT) { *prev_dst_type = dst_reg_type; @@ -5852,9 +5851,9 @@ static int do_check(struct bpf_verifier_env *env) } /* check that memory (dst_reg + off) is writeable */ - err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, - BPF_SIZE(insn->code), BPF_WRITE, - -1, false); + err = check_mem_access(env, env->insn_idx, insn->dst_reg, + insn->off, BPF_SIZE(insn->code), + BPF_WRITE, -1, false); if (err) return err; @@ -5872,9 +5871,9 @@ static int do_check(struct bpf_verifier_env *env) } if (insn->src_reg == BPF_PSEUDO_CALL) - err = check_func_call(env, insn, &insn_idx); + err = check_func_call(env, insn, &env->insn_idx); else - err = check_helper_call(env, insn->imm, insn_idx); + err = check_helper_call(env, insn->imm, env->insn_idx); if (err) return err; @@ -5887,7 +5886,7 @@ static int do_check(struct bpf_verifier_env *env) return -EINVAL; } - insn_idx += insn->off + 1; + env->insn_idx += insn->off + 1; continue; } else if (opcode == BPF_EXIT) { @@ -5901,8 +5900,8 @@ static int do_check(struct bpf_verifier_env *env) if (state->curframe) { /* exit from nested function */ - prev_insn_idx = insn_idx; - err = prepare_func_exit(env, &insn_idx); + env->prev_insn_idx = env->insn_idx; + err = prepare_func_exit(env, &env->insn_idx); if (err) return err; do_print_state = true; @@ -5932,7 +5931,8 @@ static int do_check(struct bpf_verifier_env *env) if (err) return err; process_bpf_exit: - err = pop_stack(env, &prev_insn_idx, &insn_idx); + err = pop_stack(env, &env->prev_insn_idx, + &env->insn_idx); if (err < 0) { if (err != -ENOENT) return err; @@ -5942,7 +5942,7 @@ process_bpf_exit: continue; } } else { - err = check_cond_jmp_op(env, insn, &insn_idx); + err = check_cond_jmp_op(env, insn, &env->insn_idx); if (err) return err; } @@ -5959,8 +5959,8 @@ process_bpf_exit: if (err) return err; - insn_idx++; - env->insn_aux_data[insn_idx].seen = true; + env->insn_idx++; + env->insn_aux_data[env->insn_idx].seen = true; } else { verbose(env, "invalid BPF_LD mode\n"); return -EINVAL; @@ -5970,7 +5970,7 @@ process_bpf_exit: return -EINVAL; } - insn_idx++; + env->insn_idx++; } verbose(env, "processed %d insns (limit %d), stack depth ", -- cgit v1.3.1 From 144cd91c4c2bced6eb8a7e25e590f6618a11e854 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 3 Jan 2019 00:58:28 +0100 Subject: bpf: move tmp variable into ax register in interpreter This change moves the on-stack 64 bit tmp variable in ___bpf_prog_run() into the hidden ax register. The latter is currently only used in JITs for constant blinding as a temporary scratch register, meaning the BPF interpreter will never see the use of ax. Therefore it is safe to use it for the cases where tmp has been used earlier. This is needed to later on allow restricted hidden use of ax in both interpreter and JITs. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 3 ++- kernel/bpf/core.c | 34 +++++++++++++++++----------------- 2 files changed, 19 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index 8c8544b375eb..84a6a98f8328 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -60,7 +60,8 @@ struct sock_reuseport; * constants. See JIT pre-step in bpf_jit_blind_constants(). */ #define BPF_REG_AX MAX_BPF_REG -#define MAX_BPF_JIT_REG (MAX_BPF_REG + 1) +#define MAX_BPF_EXT_REG (MAX_BPF_REG + 1) +#define MAX_BPF_JIT_REG MAX_BPF_EXT_REG /* unused opcode to mark special call to bpf_tail_call() helper */ #define BPF_TAIL_CALL 0xf0 diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 38de580abcc2..a34312a5eea2 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -54,6 +54,7 @@ #define DST regs[insn->dst_reg] #define SRC regs[insn->src_reg] #define FP regs[BPF_REG_FP] +#define AX regs[BPF_REG_AX] #define ARG1 regs[BPF_REG_ARG1] #define CTX regs[BPF_REG_CTX] #define IMM insn->imm @@ -1188,7 +1189,6 @@ bool bpf_opcode_in_insntable(u8 code) */ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack) { - u64 tmp; #define BPF_INSN_2_LBL(x, y) [BPF_##x | BPF_##y] = &&x##_##y #define BPF_INSN_3_LBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = &&x##_##y##_##z static const void *jumptable[256] = { @@ -1268,36 +1268,36 @@ select_insn: (*(s64 *) &DST) >>= IMM; CONT; ALU64_MOD_X: - div64_u64_rem(DST, SRC, &tmp); - DST = tmp; + div64_u64_rem(DST, SRC, &AX); + DST = AX; CONT; ALU_MOD_X: - tmp = (u32) DST; - DST = do_div(tmp, (u32) SRC); + AX = (u32) DST; + DST = do_div(AX, (u32) SRC); CONT; ALU64_MOD_K: - div64_u64_rem(DST, IMM, &tmp); - DST = tmp; + div64_u64_rem(DST, IMM, &AX); + DST = AX; CONT; ALU_MOD_K: - tmp = (u32) DST; - DST = do_div(tmp, (u32) IMM); + AX = (u32) DST; + DST = do_div(AX, (u32) IMM); CONT; ALU64_DIV_X: DST = div64_u64(DST, SRC); CONT; ALU_DIV_X: - tmp = (u32) DST; - do_div(tmp, (u32) SRC); - DST = (u32) tmp; + AX = (u32) DST; + do_div(AX, (u32) SRC); + DST = (u32) AX; CONT; ALU64_DIV_K: DST = div64_u64(DST, IMM); CONT; ALU_DIV_K: - tmp = (u32) DST; - do_div(tmp, (u32) IMM); - DST = (u32) tmp; + AX = (u32) DST; + do_div(AX, (u32) IMM); + DST = (u32) AX; CONT; ALU_END_TO_BE: switch (IMM) { @@ -1553,7 +1553,7 @@ STACK_FRAME_NON_STANDARD(___bpf_prog_run); /* jump table */ static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn *insn) \ { \ u64 stack[stack_size / sizeof(u64)]; \ - u64 regs[MAX_BPF_REG]; \ + u64 regs[MAX_BPF_EXT_REG]; \ \ FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \ ARG1 = (u64) (unsigned long) ctx; \ @@ -1566,7 +1566,7 @@ static u64 PROG_NAME_ARGS(stack_size)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5, \ const struct bpf_insn *insn) \ { \ u64 stack[stack_size / sizeof(u64)]; \ - u64 regs[MAX_BPF_REG]; \ + u64 regs[MAX_BPF_EXT_REG]; \ \ FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \ BPF_R1 = r1; \ -- cgit v1.3.1 From 9b73bfdd08e73231d6a90ae6db4b46b3fbf56c30 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 3 Jan 2019 00:58:29 +0100 Subject: bpf: enable access to ax register also from verifier rewrite Right now we are using BPF ax register in JIT for constant blinding as well as in interpreter as temporary variable. Verifier will not be able to use it simply because its use will get overridden from the former in bpf_jit_blind_insn(). However, it can be made to work in that blinding will be skipped if there is prior use in either source or destination register on the instruction. Taking constraints of ax into account, the verifier is then open to use it in rewrites under some constraints. Note, ax register already has mappings in every eBPF JIT. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 7 +------ kernel/bpf/core.c | 20 ++++++++++++++++++++ 2 files changed, 21 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index 84a6a98f8328..ad106d845b22 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -53,12 +53,7 @@ struct sock_reuseport; #define BPF_REG_D BPF_REG_8 /* data, callee-saved */ #define BPF_REG_H BPF_REG_9 /* hlen, callee-saved */ -/* Kernel hidden auxiliary/helper register for hardening step. - * Only used by eBPF JITs. It's nothing more than a temporary - * register that JITs use internally, only that here it's part - * of eBPF instructions that have been rewritten for blinding - * constants. See JIT pre-step in bpf_jit_blind_constants(). - */ +/* Kernel hidden auxiliary/helper register. */ #define BPF_REG_AX MAX_BPF_REG #define MAX_BPF_EXT_REG (MAX_BPF_REG + 1) #define MAX_BPF_JIT_REG MAX_BPF_EXT_REG diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index a34312a5eea2..f908b9356025 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -858,6 +858,26 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from, BUILD_BUG_ON(BPF_REG_AX + 1 != MAX_BPF_JIT_REG); BUILD_BUG_ON(MAX_BPF_REG + 1 != MAX_BPF_JIT_REG); + /* Constraints on AX register: + * + * AX register is inaccessible from user space. It is mapped in + * all JITs, and used here for constant blinding rewrites. It is + * typically "stateless" meaning its contents are only valid within + * the executed instruction, but not across several instructions. + * There are a few exceptions however which are further detailed + * below. + * + * Constant blinding is only used by JITs, not in the interpreter. + * The interpreter uses AX in some occasions as a local temporary + * register e.g. in DIV or MOD instructions. + * + * In restricted circumstances, the verifier can also use the AX + * register for rewrites as long as they do not interfere with + * the above cases! + */ + if (from->dst_reg == BPF_REG_AX || from->src_reg == BPF_REG_AX) + goto out; + if (from->imm == 0 && (from->code == (BPF_ALU | BPF_MOV | BPF_K) || from->code == (BPF_ALU64 | BPF_MOV | BPF_K))) { -- cgit v1.3.1 From 979d63d50c0c0f7bc537bf821e056cc9fe5abd38 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 3 Jan 2019 00:58:34 +0100 Subject: bpf: prevent out of bounds speculation on pointer arithmetic Jann reported that the original commit back in b2157399cc98 ("bpf: prevent out-of-bounds speculation") was not sufficient to stop CPU from speculating out of bounds memory access: While b2157399cc98 only focussed on masking array map access for unprivileged users for tail calls and data access such that the user provided index gets sanitized from BPF program and syscall side, there is still a more generic form affected from BPF programs that applies to most maps that hold user data in relation to dynamic map access when dealing with unknown scalars or "slow" known scalars as access offset, for example: - Load a map value pointer into R6 - Load an index into R7 - Do a slow computation (e.g. with a memory dependency) that loads a limit into R8 (e.g. load the limit from a map for high latency, then mask it to make the verifier happy) - Exit if R7 >= R8 (mispredicted branch) - Load R0 = R6[R7] - Load R0 = R6[R0] For unknown scalars there are two options in the BPF verifier where we could derive knowledge from in order to guarantee safe access to the memory: i) While /<=/>= variants won't allow to derive any lower or upper bounds from the unknown scalar where it would be safe to add it to the map value pointer, it is possible through ==/!= test however. ii) another option is to transform the unknown scalar into a known scalar, for example, through ALU ops combination such as R &= followed by R |= or any similar combination where the original information from the unknown scalar would be destroyed entirely leaving R with a constant. The initial slow load still precedes the latter ALU ops on that register, so the CPU executes speculatively from that point. Once we have the known scalar, any compare operation would work then. A third option only involving registers with known scalars could be crafted as described in [0] where a CPU port (e.g. Slow Int unit) would be filled with many dependent computations such that the subsequent condition depending on its outcome has to wait for evaluation on its execution port and thereby executing speculatively if the speculated code can be scheduled on a different execution port, or any other form of mistraining as described in [1], for example. Given this is not limited to only unknown scalars, not only map but also stack access is affected since both is accessible for unprivileged users and could potentially be used for out of bounds access under speculation. In order to prevent any of these cases, the verifier is now sanitizing pointer arithmetic on the offset such that any out of bounds speculation would be masked in a way where the pointer arithmetic result in the destination register will stay unchanged, meaning offset masked into zero similar as in array_index_nospec() case. With regards to implementation, there are three options that were considered: i) new insn for sanitation, ii) push/pop insn and sanitation as inlined BPF, iii) reuse of ax register and sanitation as inlined BPF. Option i) has the downside that we end up using from reserved bits in the opcode space, but also that we would require each JIT to emit masking as native arch opcodes meaning mitigation would have slow adoption till everyone implements it eventually which is counter-productive. Option ii) and iii) have both in common that a temporary register is needed in order to implement the sanitation as inlined BPF since we are not allowed to modify the source register. While a push / pop insn in ii) would be useful to have in any case, it requires once again that every JIT needs to implement it first. While possible, amount of changes needed would also be unsuitable for a -stable patch. Therefore, the path which has fewer changes, less BPF instructions for the mitigation and does not require anything to be changed in the JITs is option iii) which this work is pursuing. The ax register is already mapped to a register in all JITs (modulo arm32 where it's mapped to stack as various other BPF registers there) and used in constant blinding for JITs-only so far. It can be reused for verifier rewrites under certain constraints. The interpreter's tmp "register" has therefore been remapped into extending the register set with hidden ax register and reusing that for a number of instructions that needed the prior temporary variable internally (e.g. div, mod). This allows for zero increase in stack space usage in the interpreter, and enables (restricted) generic use in rewrites otherwise as long as such a patchlet does not make use of these instructions. The sanitation mask is dynamic and relative to the offset the map value or stack pointer currently holds. There are various cases that need to be taken under consideration for the masking, e.g. such operation could look as follows: ptr += val or val += ptr or ptr -= val. Thus, the value to be sanitized could reside either in source or in destination register, and the limit is different depending on whether the ALU op is addition or subtraction and depending on the current known and bounded offset. The limit is derived as follows: limit := max_value_size - (smin_value + off). For subtraction: limit := umax_value + off. This holds because we do not allow any pointer arithmetic that would temporarily go out of bounds or would have an unknown value with mixed signed bounds where it is unclear at verification time whether the actual runtime value would be either negative or positive. For example, we have a derived map pointer value with constant offset and bounded one, so limit based on smin_value works because the verifier requires that statically analyzed arithmetic on the pointer must be in bounds, and thus it checks if resulting smin_value + off and umax_value + off is still within map value bounds at time of arithmetic in addition to time of access. Similarly, for the case of stack access we derive the limit as follows: MAX_BPF_STACK + off for subtraction and -off for the case of addition where off := ptr_reg->off + ptr_reg->var_off.value. Subtraction is a special case for the masking which can be in form of ptr += -val, ptr -= -val, or ptr -= val. In the first two cases where we know that the value is negative, we need to temporarily negate the value in order to do the sanitation on a positive value where we later swap the ALU op, and restore original source register if the value was in source. The sanitation of pointer arithmetic alone is still not fully sufficient as is, since a scenario like the following could happen ... PTR += 0x1000 (e.g. K-based imm) PTR -= BIG_NUMBER_WITH_SLOW_COMPARISON PTR += 0x1000 PTR -= BIG_NUMBER_WITH_SLOW_COMPARISON [...] ... which under speculation could end up as ... PTR += 0x1000 PTR -= 0 [ truncated by mitigation ] PTR += 0x1000 PTR -= 0 [ truncated by mitigation ] [...] ... and therefore still access out of bounds. To prevent such case, the verifier is also analyzing safety for potential out of bounds access under speculative execution. Meaning, it is also simulating pointer access under truncation. We therefore "branch off" and push the current verification state after the ALU operation with known 0 to the verification stack for later analysis. Given the current path analysis succeeded it is likely that the one under speculation can be pruned. In any case, it is also subject to existing complexity limits and therefore anything beyond this point will be rejected. In terms of pruning, it needs to be ensured that the verification state from speculative execution simulation must never prune a non-speculative execution path, therefore, we mark verifier state accordingly at the time of push_stack(). If verifier detects out of bounds access under speculative execution from one of the possible paths that includes a truncation, it will reject such program. Given we mask every reg-based pointer arithmetic for unprivileged programs, we've been looking into how it could affect real-world programs in terms of size increase. As the majority of programs are targeted for privileged-only use case, we've unconditionally enabled masking (with its alu restrictions on top of it) for privileged programs for the sake of testing in order to check i) whether they get rejected in its current form, and ii) by how much the number of instructions and size will increase. We've tested this by using Katran, Cilium and test_l4lb from the kernel selftests. For Katran we've evaluated balancer_kern.o, Cilium bpf_lxc.o and an older test object bpf_lxc_opt_-DUNKNOWN.o and l4lb we've used test_l4lb.o as well as test_l4lb_noinline.o. We found that none of the programs got rejected by the verifier with this change, and that impact is rather minimal to none. balancer_kern.o had 13,904 bytes (1,738 insns) xlated and 7,797 bytes JITed before and after the change. Most complex program in bpf_lxc.o had 30,544 bytes (3,817 insns) xlated and 18,538 bytes JITed before and after and none of the other tail call programs in bpf_lxc.o had any changes either. For the older bpf_lxc_opt_-DUNKNOWN.o object we found a small increase from 20,616 bytes (2,576 insns) and 12,536 bytes JITed before to 20,664 bytes (2,582 insns) and 12,558 bytes JITed after the change. Other programs from that object file had similar small increase. Both test_l4lb.o had no change and remained at 6,544 bytes (817 insns) xlated and 3,401 bytes JITed and for test_l4lb_noinline.o constant at 5,080 bytes (634 insns) xlated and 3,313 bytes JITed. This can be explained in that LLVM typically optimizes stack based pointer arithmetic by using K-based operations and that use of dynamic map access is not overly frequent. However, in future we may decide to optimize the algorithm further under known guarantees from branch and value speculation. Latter seems also unclear in terms of prediction heuristics that today's CPUs apply as well as whether there could be collisions in e.g. the predictor's Value History/Pattern Table for triggering out of bounds access, thus masking is performed unconditionally at this point but could be subject to relaxation later on. We were generally also brainstorming various other approaches for mitigation, but the blocker was always lack of available registers at runtime and/or overhead for runtime tracking of limits belonging to a specific pointer. Thus, we found this to be minimally intrusive under given constraints. With that in place, a simple example with sanitized access on unprivileged load at post-verification time looks as follows: # bpftool prog dump xlated id 282 [...] 28: (79) r1 = *(u64 *)(r7 +0) 29: (79) r2 = *(u64 *)(r7 +8) 30: (57) r1 &= 15 31: (79) r3 = *(u64 *)(r0 +4608) 32: (57) r3 &= 1 33: (47) r3 |= 1 34: (2d) if r2 > r3 goto pc+19 35: (b4) (u32) r11 = (u32) 20479 | 36: (1f) r11 -= r2 | Dynamic sanitation for pointer 37: (4f) r11 |= r2 | arithmetic with registers 38: (87) r11 = -r11 | containing bounded or known 39: (c7) r11 s>>= 63 | scalars in order to prevent 40: (5f) r11 &= r2 | out of bounds speculation. 41: (0f) r4 += r11 | 42: (71) r4 = *(u8 *)(r4 +0) 43: (6f) r4 <<= r1 [...] For the case where the scalar sits in the destination register as opposed to the source register, the following code is emitted for the above example: [...] 16: (b4) (u32) r11 = (u32) 20479 17: (1f) r11 -= r2 18: (4f) r11 |= r2 19: (87) r11 = -r11 20: (c7) r11 s>>= 63 21: (5f) r2 &= r11 22: (0f) r2 += r0 23: (61) r0 = *(u32 *)(r2 +0) [...] JIT blinding example with non-conflicting use of r10: [...] d5: je 0x0000000000000106 _ d7: mov 0x0(%rax),%edi | da: mov $0xf153246,%r10d | Index load from map value and e0: xor $0xf153259,%r10 | (const blinded) mask with 0x1f. e7: and %r10,%rdi |_ ea: mov $0x2f,%r10d | f0: sub %rdi,%r10 | Sanitized addition. Both use r10 f3: or %rdi,%r10 | but do not interfere with each f6: neg %r10 | other. (Neither do these instructions f9: sar $0x3f,%r10 | interfere with the use of ax as temp fd: and %r10,%rdi | in interpreter.) 100: add %rax,%rdi |_ 103: mov 0x0(%rdi),%eax [...] Tested that it fixes Jann's reproducer, and also checked that test_verifier and test_progs suite with interpreter, JIT and JIT with hardening enabled on x86-64 and arm64 runs successfully. [0] Speculose: Analyzing the Security Implications of Speculative Execution in CPUs, Giorgi Maisuradze and Christian Rossow, https://arxiv.org/pdf/1801.04084.pdf [1] A Systematic Evaluation of Transient Execution Attacks and Defenses, Claudio Canella, Jo Van Bulck, Michael Schwarz, Moritz Lipp, Benjamin von Berg, Philipp Ortner, Frank Piessens, Dmitry Evtyushkin, Daniel Gruss, https://arxiv.org/pdf/1811.05441.pdf Fixes: b2157399cc98 ("bpf: prevent out-of-bounds speculation") Reported-by: Jann Horn Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 10 +++ kernel/bpf/verifier.c | 185 +++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 189 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 3f84f3e87704..27b74947cd2b 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -148,6 +148,7 @@ struct bpf_verifier_state { /* call stack tracking */ struct bpf_func_state *frame[MAX_CALL_FRAMES]; u32 curframe; + bool speculative; }; #define bpf_get_spilled_reg(slot, frame) \ @@ -167,15 +168,24 @@ struct bpf_verifier_state_list { struct bpf_verifier_state_list *next; }; +/* Possible states for alu_state member. */ +#define BPF_ALU_SANITIZE_SRC 1U +#define BPF_ALU_SANITIZE_DST 2U +#define BPF_ALU_NEG_VALUE (1U << 2) +#define BPF_ALU_SANITIZE (BPF_ALU_SANITIZE_SRC | \ + BPF_ALU_SANITIZE_DST) + struct bpf_insn_aux_data { union { enum bpf_reg_type ptr_type; /* pointer type for load/store insns */ unsigned long map_state; /* pointer/poison value for maps */ s32 call_imm; /* saved imm field of call insn */ + u32 alu_limit; /* limit for add/sub register with pointer */ }; int ctx_field_size; /* the ctx field size for load insn, maybe 0 */ int sanitize_stack_off; /* stack slot to be cleared */ bool seen; /* this insn was processed by the verifier */ + u8 alu_state; /* used in combination with alu_limit */ }; #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8e5da1ce5da4..f6bc62a9ee8e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -710,6 +710,7 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, free_func_state(dst_state->frame[i]); dst_state->frame[i] = NULL; } + dst_state->speculative = src->speculative; dst_state->curframe = src->curframe; for (i = 0; i <= src->curframe; i++) { dst = dst_state->frame[i]; @@ -754,7 +755,8 @@ static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx, } static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, - int insn_idx, int prev_insn_idx) + int insn_idx, int prev_insn_idx, + bool speculative) { struct bpf_verifier_state *cur = env->cur_state; struct bpf_verifier_stack_elem *elem; @@ -772,6 +774,7 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, err = copy_verifier_state(&elem->st, cur); if (err) goto err; + elem->st.speculative |= speculative; if (env->stack_size > BPF_COMPLEXITY_LIMIT_STACK) { verbose(env, "BPF program is too complex\n"); goto err; @@ -3067,6 +3070,102 @@ static bool check_reg_sane_offset(struct bpf_verifier_env *env, return true; } +static struct bpf_insn_aux_data *cur_aux(struct bpf_verifier_env *env) +{ + return &env->insn_aux_data[env->insn_idx]; +} + +static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg, + u32 *ptr_limit, u8 opcode, bool off_is_neg) +{ + bool mask_to_left = (opcode == BPF_ADD && off_is_neg) || + (opcode == BPF_SUB && !off_is_neg); + u32 off; + + switch (ptr_reg->type) { + case PTR_TO_STACK: + off = ptr_reg->off + ptr_reg->var_off.value; + if (mask_to_left) + *ptr_limit = MAX_BPF_STACK + off; + else + *ptr_limit = -off; + return 0; + case PTR_TO_MAP_VALUE: + if (mask_to_left) { + *ptr_limit = ptr_reg->umax_value + ptr_reg->off; + } else { + off = ptr_reg->smin_value + ptr_reg->off; + *ptr_limit = ptr_reg->map_ptr->value_size - off; + } + return 0; + default: + return -EINVAL; + } +} + +static int sanitize_ptr_alu(struct bpf_verifier_env *env, + struct bpf_insn *insn, + const struct bpf_reg_state *ptr_reg, + struct bpf_reg_state *dst_reg, + bool off_is_neg) +{ + struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_insn_aux_data *aux = cur_aux(env); + bool ptr_is_dst_reg = ptr_reg == dst_reg; + u8 opcode = BPF_OP(insn->code); + u32 alu_state, alu_limit; + struct bpf_reg_state tmp; + bool ret; + + if (env->allow_ptr_leaks || BPF_SRC(insn->code) == BPF_K) + return 0; + + /* We already marked aux for masking from non-speculative + * paths, thus we got here in the first place. We only care + * to explore bad access from here. + */ + if (vstate->speculative) + goto do_sim; + + alu_state = off_is_neg ? BPF_ALU_NEG_VALUE : 0; + alu_state |= ptr_is_dst_reg ? + BPF_ALU_SANITIZE_SRC : BPF_ALU_SANITIZE_DST; + + if (retrieve_ptr_limit(ptr_reg, &alu_limit, opcode, off_is_neg)) + return 0; + + /* If we arrived here from different branches with different + * limits to sanitize, then this won't work. + */ + if (aux->alu_state && + (aux->alu_state != alu_state || + aux->alu_limit != alu_limit)) + return -EACCES; + + /* Corresponding fixup done in fixup_bpf_calls(). */ + aux->alu_state = alu_state; + aux->alu_limit = alu_limit; + +do_sim: + /* Simulate and find potential out-of-bounds access under + * speculative execution from truncation as a result of + * masking when off was not within expected range. If off + * sits in dst, then we temporarily need to move ptr there + * to simulate dst (== 0) +/-= ptr. Needed, for example, + * for cases where we use K-based arithmetic in one direction + * and truncated reg-based in the other in order to explore + * bad access. + */ + if (!ptr_is_dst_reg) { + tmp = *dst_reg; + *dst_reg = *ptr_reg; + } + ret = push_stack(env, env->insn_idx + 1, env->insn_idx, true); + if (!ptr_is_dst_reg) + *dst_reg = tmp; + return !ret ? -EFAULT : 0; +} + /* Handles arithmetic on a pointer and a scalar: computes new min/max and var_off. * Caller should also handle BPF_MOV case separately. * If we return -EACCES, caller may want to try again treating pointer as a @@ -3087,6 +3186,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, umin_ptr = ptr_reg->umin_value, umax_ptr = ptr_reg->umax_value; u32 dst = insn->dst_reg, src = insn->src_reg; u8 opcode = BPF_OP(insn->code); + int ret; dst_reg = ®s[dst]; @@ -3142,6 +3242,11 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, switch (opcode) { case BPF_ADD: + ret = sanitize_ptr_alu(env, insn, ptr_reg, dst_reg, smin_val < 0); + if (ret < 0) { + verbose(env, "R%d tried to add from different maps or paths\n", dst); + return ret; + } /* We can take a fixed offset as long as it doesn't overflow * the s32 'off' field */ @@ -3192,6 +3297,11 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, } break; case BPF_SUB: + ret = sanitize_ptr_alu(env, insn, ptr_reg, dst_reg, smin_val < 0); + if (ret < 0) { + verbose(env, "R%d tried to sub from different maps or paths\n", dst); + return ret; + } if (dst_reg == off_reg) { /* scalar -= pointer. Creates an unknown scalar */ verbose(env, "R%d tried to subtract pointer from scalar\n", @@ -4389,7 +4499,8 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, } } - other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx); + other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx, + false); if (!other_branch) return -EFAULT; other_branch_regs = other_branch->frame[other_branch->curframe]->regs; @@ -5499,6 +5610,12 @@ static bool states_equal(struct bpf_verifier_env *env, if (old->curframe != cur->curframe) return false; + /* Verification state from speculative execution simulation + * must never prune a non-speculative execution one. + */ + if (old->speculative && !cur->speculative) + return false; + /* for states to be equal callsites have to be the same * and all frame states need to be equivalent */ @@ -5700,6 +5817,7 @@ static int do_check(struct bpf_verifier_env *env) if (!state) return -ENOMEM; state->curframe = 0; + state->speculative = false; state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL); if (!state->frame[0]) { kfree(state); @@ -5739,8 +5857,10 @@ static int do_check(struct bpf_verifier_env *env) /* found equivalent state, can prune the search */ if (env->log.level) { if (do_print_state) - verbose(env, "\nfrom %d to %d: safe\n", - env->prev_insn_idx, env->insn_idx); + verbose(env, "\nfrom %d to %d%s: safe\n", + env->prev_insn_idx, env->insn_idx, + env->cur_state->speculative ? + " (speculative execution)" : ""); else verbose(env, "%d: safe\n", env->insn_idx); } @@ -5757,8 +5877,10 @@ static int do_check(struct bpf_verifier_env *env) if (env->log.level > 1) verbose(env, "%d:", env->insn_idx); else - verbose(env, "\nfrom %d to %d:", - env->prev_insn_idx, env->insn_idx); + verbose(env, "\nfrom %d to %d%s:", + env->prev_insn_idx, env->insn_idx, + env->cur_state->speculative ? + " (speculative execution)" : ""); print_verifier_state(env, state->frame[state->curframe]); do_print_state = false; } @@ -6750,6 +6872,57 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) continue; } + if (insn->code == (BPF_ALU64 | BPF_ADD | BPF_X) || + insn->code == (BPF_ALU64 | BPF_SUB | BPF_X)) { + const u8 code_add = BPF_ALU64 | BPF_ADD | BPF_X; + const u8 code_sub = BPF_ALU64 | BPF_SUB | BPF_X; + struct bpf_insn insn_buf[16]; + struct bpf_insn *patch = &insn_buf[0]; + bool issrc, isneg; + u32 off_reg; + + aux = &env->insn_aux_data[i + delta]; + if (!aux->alu_state) + continue; + + isneg = aux->alu_state & BPF_ALU_NEG_VALUE; + issrc = (aux->alu_state & BPF_ALU_SANITIZE) == + BPF_ALU_SANITIZE_SRC; + + off_reg = issrc ? insn->src_reg : insn->dst_reg; + if (isneg) + *patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1); + *patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit - 1); + *patch++ = BPF_ALU64_REG(BPF_SUB, BPF_REG_AX, off_reg); + *patch++ = BPF_ALU64_REG(BPF_OR, BPF_REG_AX, off_reg); + *patch++ = BPF_ALU64_IMM(BPF_NEG, BPF_REG_AX, 0); + *patch++ = BPF_ALU64_IMM(BPF_ARSH, BPF_REG_AX, 63); + if (issrc) { + *patch++ = BPF_ALU64_REG(BPF_AND, BPF_REG_AX, + off_reg); + insn->src_reg = BPF_REG_AX; + } else { + *patch++ = BPF_ALU64_REG(BPF_AND, off_reg, + BPF_REG_AX); + } + if (isneg) + insn->code = insn->code == code_add ? + code_sub : code_add; + *patch++ = *insn; + if (issrc && isneg) + *patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1); + cnt = patch - insn_buf; + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + continue; + } + if (insn->code != (BPF_JMP | BPF_CALL)) continue; if (insn->src_reg == BPF_PSEUDO_CALL) -- cgit v1.3.1