From 955d3411a17f590364238bd0d3329b61f20c1cd2 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Sun, 30 Dec 2018 12:46:01 +0100
Subject: batman-adv: Avoid WARN on net_device without parent in netns

It is not allowed to use WARN* helpers on potential incorrect input from
the user or transient problems because systems configured as panic_on_warn
will reboot due to such a problem.

A NULL return value of __dev_get_by_index can be caused by various problems
which can either be related to the system configuration or problems
(incorrectly returned network namespaces) in other (virtual) net_device
drivers. batman-adv should not cause a (harmful) WARN in this situation and
instead only report it via a simple message.

Fixes: b7eddd0b3950 ("batman-adv: prevent using any virtual device created on batman-adv as hard-interface")
Reported-by: syzbot+c764de0fcfadca9a8595@syzkaller.appspotmail.com
Reported-by: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 net/batman-adv/hard-interface.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c
index 508f4416dfc9..415d494cbe22 100644
--- a/net/batman-adv/hard-interface.c
+++ b/net/batman-adv/hard-interface.c
@@ -20,7 +20,6 @@
 #include "main.h"
 
 #include <linux/atomic.h>
-#include <linux/bug.h>
 #include <linux/byteorder/generic.h>
 #include <linux/errno.h>
 #include <linux/gfp.h>
@@ -179,8 +178,10 @@ static bool batadv_is_on_batman_iface(const struct net_device *net_dev)
 	parent_dev = __dev_get_by_index((struct net *)parent_net,
 					dev_get_iflink(net_dev));
 	/* if we got a NULL parent_dev there is something broken.. */
-	if (WARN(!parent_dev, "Cannot find parent device"))
+	if (!parent_dev) {
+		pr_err("Cannot find parent device\n");
 		return false;
+	}
 
 	if (batadv_mutual_parents(net_dev, net, parent_dev, parent_net))
 		return false;
-- 
cgit v1.2.3-70-g09d2


From 9114daa825fc3f335f9bea3313ce667090187280 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Mon, 31 Dec 2018 22:31:01 +0100
Subject: batman-adv: Force mac header to start of data on xmit

The caller of ndo_start_xmit may not already have called
skb_reset_mac_header. The returned value of skb_mac_header/eth_hdr
therefore can be in the wrong position and even outside the current skbuff.
This for example happens when the user binds to the device using a
PF_PACKET-SOCK_RAW with enabled qdisc-bypass:

  int opt = 4;
  setsockopt(sock, SOL_PACKET, PACKET_QDISC_BYPASS, &opt, sizeof(opt));

Since eth_hdr is used all over the codebase, the batadv_interface_tx
function must always take care of resetting it.

Fixes: c6c8fea29769 ("net: Add batman-adv meshing protocol")
Reported-by: syzbot+9d7405c7faa390e60b4e@syzkaller.appspotmail.com
Reported-by: syzbot+7d20bc3f1ddddc0f9079@syzkaller.appspotmail.com
Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 net/batman-adv/soft-interface.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c
index 5db5a0a4c959..b85ca809e509 100644
--- a/net/batman-adv/soft-interface.c
+++ b/net/batman-adv/soft-interface.c
@@ -221,6 +221,8 @@ static netdev_tx_t batadv_interface_tx(struct sk_buff *skb,
 
 	netif_trans_update(soft_iface);
 	vid = batadv_get_vid(skb, 0);
+
+	skb_reset_mac_header(skb);
 	ethhdr = eth_hdr(skb);
 
 	switch (ntohs(ethhdr->h_proto)) {
-- 
cgit v1.2.3-70-g09d2


From 355b00d1e14051c13aea48c1c5430c486fed2d7a Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 4 Jan 2019 14:17:00 +0100
Subject: xfrm: policy: use hlist rcu variants on inexact insert, part 2

This function was modeled on the 'exact' insert one, which did not use
the rcu variant either.

When I fixed the 'exact' insert I forgot to propagate this to my
development tree, so the inexact variant retained the bug.

Fixes: 9cf545ebd591d ("xfrm: policy: store inexact policies in a tree ordered by destination address")
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/xfrm/xfrm_policy.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 934492bad8e0..628b389af2ba 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -856,9 +856,9 @@ static void xfrm_policy_inexact_list_reinsert(struct net *net,
 		}
 
 		if (newpos)
-			hlist_add_behind(&policy->bydst, newpos);
+			hlist_add_behind_rcu(&policy->bydst, newpos);
 		else
-			hlist_add_head(&policy->bydst, &n->hhead);
+			hlist_add_head_rcu(&policy->bydst, &n->hhead);
 
 		/* paranoia checks follow.
 		 * Check that the reinserted policy matches at least
-- 
cgit v1.2.3-70-g09d2


From 7a474c36586f4277f930ab7e6865c97e44dfc3bc Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 4 Jan 2019 14:17:01 +0100
Subject: xfrm: policy: increment xfrm_hash_generation on hash rebuild

Hash rebuild will re-set all the inexact entries, then re-insert them.
Lookups that can occur in parallel will therefore not find any policies.

This was safe when lookups were still guarded by rwlock.
After rcu-ification, lookups check the hash_generation seqcount to detect
when a hash resize takes place.  Hash rebuild missed the needed increment.

Hash resizes and hash rebuilds cannot occur in parallel (both acquire
hash_resize_mutex), so just increment xfrm_hash_generation, like resize.

Fixes: a7c44247f704e3 ("xfrm: policy: make xfrm_policy_lookup_bytype lockless")
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/xfrm/xfrm_policy.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 628b389af2ba..d8fba27a4bfb 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -1235,6 +1235,7 @@ static void xfrm_hash_rebuild(struct work_struct *work)
 	} while (read_seqretry(&net->xfrm.policy_hthresh.lock, seq));
 
 	spin_lock_bh(&net->xfrm.xfrm_policy_lock);
+	write_seqcount_begin(&xfrm_policy_hash_generation);
 
 	/* make sure that we can insert the indirect policies again before
 	 * we start with destructive action.
@@ -1334,6 +1335,7 @@ static void xfrm_hash_rebuild(struct work_struct *work)
 
 out_unlock:
 	__xfrm_policy_inexact_flush(net);
+	write_seqcount_end(&xfrm_policy_hash_generation);
 	spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
 
 	mutex_unlock(&hash_resize_mutex);
-- 
cgit v1.2.3-70-g09d2


From 1548bc4e0512700cf757192c106b3a20ab639223 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 4 Jan 2019 14:17:02 +0100
Subject: xfrm: policy: delete inexact policies from inexact list on hash
 rebuild

An xfrm hash rebuild has to reset the inexact policy list before the
policies get re-inserted: A change of hash thresholds will result in
policies to get moved from inexact tree to the policy hash table.

If the thresholds are increased again later, they get moved from hash
table to inexact tree.

We must unlink all policies from the inexact tree before re-insertion.

Otherwise 'migrate' may find policies that are in main hash table a
second time, when it searches the inexact lists.

Furthermore, re-insertion without deletion can cause elements ->next to
point back to itself, causing soft lockups or double-frees.

Reported-by: syzbot+9d971dd21eb26567036b@syzkaller.appspotmail.com
Fixes: 9cf545ebd591da ("xfrm: policy: store inexact policies in a tree ordered by destination address")
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/xfrm/xfrm_policy.c | 23 ++++++++++-------------
 1 file changed, 10 insertions(+), 13 deletions(-)

(limited to 'net')

diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index d8fba27a4bfb..24dfd1e47cf0 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -680,16 +680,6 @@ static void xfrm_hash_resize(struct work_struct *work)
 	mutex_unlock(&hash_resize_mutex);
 }
 
-static void xfrm_hash_reset_inexact_table(struct net *net)
-{
-	struct xfrm_pol_inexact_bin *b;
-
-	lockdep_assert_held(&net->xfrm.xfrm_policy_lock);
-
-	list_for_each_entry(b, &net->xfrm.inexact_bins, inexact_bins)
-		INIT_HLIST_HEAD(&b->hhead);
-}
-
 /* Make sure *pol can be inserted into fastbin.
  * Useful to check that later insert requests will be sucessful
  * (provided xfrm_policy_lock is held throughout).
@@ -1279,10 +1269,14 @@ static void xfrm_hash_rebuild(struct work_struct *work)
 	}
 
 	/* reset the bydst and inexact table in all directions */
-	xfrm_hash_reset_inexact_table(net);
-
 	for (dir = 0; dir < XFRM_POLICY_MAX; dir++) {
-		INIT_HLIST_HEAD(&net->xfrm.policy_inexact[dir]);
+		struct hlist_node *n;
+
+		hlist_for_each_entry_safe(policy, n,
+					  &net->xfrm.policy_inexact[dir],
+					  bydst_inexact_list)
+			hlist_del_init(&policy->bydst_inexact_list);
+
 		hmask = net->xfrm.policy_bydst[dir].hmask;
 		odst = net->xfrm.policy_bydst[dir].table;
 		for (i = hmask; i >= 0; i--)
@@ -1314,6 +1308,9 @@ static void xfrm_hash_rebuild(struct work_struct *work)
 		newpos = NULL;
 		chain = policy_hash_bysel(net, &policy->selector,
 					  policy->family, dir);
+
+		hlist_del_rcu(&policy->bydst);
+
 		if (!chain) {
 			void *p = xfrm_policy_inexact_insert(policy, dir, 0);
 
-- 
cgit v1.2.3-70-g09d2


From 1d38900cb85d5d311dbd23c2c93294527b82cd2b Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 4 Jan 2019 14:17:03 +0100
Subject: xfrm: policy: fix reinsertion on node merge

"newpos" has wrong scope.  It must be NULL on each iteration of the loop.
Otherwise, when policy is to be inserted at the start, we would instead
insert at point found by the previous loop-iteration instead.

Also, we need to unlink the policy before we reinsert it to the new node,
else we can get next-points-to-self loops.

Because policies are only ordered by priority it is irrelevant which policy
is "more recent" except when two policies have same priority.
(the more recent one is placed after the older one).

In these cases, we can use the ->pos id number to know which one is the
'older': the higher the id, the more recent the policy.

So we only need to unlink all policies from the node that is about to be
removed, and insert them to the replacement node.

Fixes: 9cf545ebd591da ("xfrm: policy: store inexact policies in a tree ordered by destination address")
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/xfrm/xfrm_policy.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 24dfd1e47cf0..e691683223ee 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -823,13 +823,13 @@ static void xfrm_policy_inexact_list_reinsert(struct net *net,
 					      u16 family)
 {
 	unsigned int matched_s, matched_d;
-	struct hlist_node *newpos = NULL;
 	struct xfrm_policy *policy, *p;
 
 	matched_s = 0;
 	matched_d = 0;
 
 	list_for_each_entry_reverse(policy, &net->xfrm.policy_all, walk.all) {
+		struct hlist_node *newpos = NULL;
 		bool matches_s, matches_d;
 
 		if (!policy->bydst_reinsert)
@@ -839,7 +839,10 @@ static void xfrm_policy_inexact_list_reinsert(struct net *net,
 
 		policy->bydst_reinsert = false;
 		hlist_for_each_entry(p, &n->hhead, bydst) {
-			if (policy->priority >= p->priority)
+			if (policy->priority > p->priority)
+				newpos = &p->bydst;
+			else if (policy->priority == p->priority &&
+				 policy->pos > p->pos)
 				newpos = &p->bydst;
 			else
 				break;
@@ -955,12 +958,11 @@ static void xfrm_policy_inexact_node_merge(struct net *net,
 						  family);
 	}
 
-	hlist_for_each_entry(tmp, &v->hhead, bydst)
-		tmp->bydst_reinsert = true;
-	hlist_for_each_entry(tmp, &n->hhead, bydst)
+	hlist_for_each_entry(tmp, &v->hhead, bydst) {
 		tmp->bydst_reinsert = true;
+		hlist_del_rcu(&tmp->bydst);
+	}
 
-	INIT_HLIST_HEAD(&n->hhead);
 	xfrm_policy_inexact_list_reinsert(net, n, family);
 }
 
-- 
cgit v1.2.3-70-g09d2


From 12750abad517a991c4568969bc748db302ab52cd Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 4 Jan 2019 14:17:05 +0100
Subject: xfrm: policy: fix infinite loop when merging src-nodes

With very small change to test script we can trigger softlockup due to
bogus assignment of 'p' (policy to be examined) on restart.

Previously the two to-be-merged nodes had same address/prefixlength pair,
so no erase/reinsert was necessary, we only had to append the list from
node a to b.

If prefix lengths are different, the node has to be deleted and re-inserted
into the tree, with the updated prefix length.  This was broken; due to
bogus update to 'p' this loops forever.

Add a 'restart' label and use that instead.

While at it, don't perform the unneeded reinserts of the policies that
are already sorted into the 'new' node.

A previous patch in this series made xfrm_policy_inexact_list_reinsert()
use the relative position indicator to sort policies according to age in
case priorities are identical.

Fixes: 6ac098b2a9d30 ("xfrm: policy: add 2nd-level saddr trees for inexact policies")
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/xfrm/xfrm_policy.c                     | 15 +++++++--------
 tools/testing/selftests/net/xfrm_policy.sh |  4 ++--
 2 files changed, 9 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index e691683223ee..8cfd75b62396 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -886,12 +886,13 @@ static void xfrm_policy_inexact_node_reinsert(struct net *net,
 					      struct rb_root *new,
 					      u16 family)
 {
-	struct rb_node **p, *parent = NULL;
 	struct xfrm_pol_inexact_node *node;
+	struct rb_node **p, *parent;
 
 	/* we should not have another subtree here */
 	WARN_ON_ONCE(!RB_EMPTY_ROOT(&n->root));
-
+restart:
+	parent = NULL;
 	p = &new->rb_node;
 	while (*p) {
 		u8 prefixlen;
@@ -911,12 +912,11 @@ static void xfrm_policy_inexact_node_reinsert(struct net *net,
 		} else {
 			struct xfrm_policy *tmp;
 
-			hlist_for_each_entry(tmp, &node->hhead, bydst)
-				tmp->bydst_reinsert = true;
-			hlist_for_each_entry(tmp, &n->hhead, bydst)
+			hlist_for_each_entry(tmp, &n->hhead, bydst) {
 				tmp->bydst_reinsert = true;
+				hlist_del_rcu(&tmp->bydst);
+			}
 
-			INIT_HLIST_HEAD(&node->hhead);
 			xfrm_policy_inexact_list_reinsert(net, node, family);
 
 			if (node->prefixlen == n->prefixlen) {
@@ -928,8 +928,7 @@ static void xfrm_policy_inexact_node_reinsert(struct net *net,
 			kfree_rcu(n, rcu);
 			n = node;
 			n->prefixlen = prefixlen;
-			*p = new->rb_node;
-			parent = NULL;
+			goto restart;
 		}
 	}
 
diff --git a/tools/testing/selftests/net/xfrm_policy.sh b/tools/testing/selftests/net/xfrm_policy.sh
index 8ce54600d4d1..71d7fdc513c1 100755
--- a/tools/testing/selftests/net/xfrm_policy.sh
+++ b/tools/testing/selftests/net/xfrm_policy.sh
@@ -78,8 +78,8 @@ do_overlap()
     # adds a new node in the 10.0.0.0/24 tree (dst node exists).
     ip -net $ns xfrm policy add src 10.2.0.0/24 dst 10.0.0.0/24 dir fwd priority 200 action block
 
-    # adds a 10.2.0.0/24 node, but for different dst.
-    ip -net $ns xfrm policy add src 10.2.0.0/24 dst 10.0.1.0/24 dir fwd priority 200 action block
+    # adds a 10.2.0.0/23 node, but for different dst.
+    ip -net $ns xfrm policy add src 10.2.0.0/23 dst 10.0.1.0/24 dir fwd priority 200 action block
 
     # dst now overlaps with the 10.0.1.0/24 ESP policy in fwd.
     # kernel must 'promote' existing one (10.0.0.0/24) to 10.0.0.0/23.
-- 
cgit v1.2.3-70-g09d2


From dd9ee3444014e8f28c0eefc9fffc9ac9c5248c12 Mon Sep 17 00:00:00 2001
From: Su Yanjun <suyj.fnst@cn.fujitsu.com>
Date: Sun, 6 Jan 2019 21:31:20 -0500
Subject: vti4: Fix a ipip packet processing bug in 'IPCOMP' virtual tunnel

Recently we run a network test over ipcomp virtual tunnel.We find that
if a ipv4 packet needs fragment, then the peer can't receive
it.

We deep into the code and find that when packet need fragment the smaller
fragment will be encapsulated by ipip not ipcomp. So when the ipip packet
goes into xfrm, it's skb->dev is not properly set. The ipv4 reassembly code
always set skb'dev to the last fragment's dev. After ipv4 defrag processing,
when the kernel rp_filter parameter is set, the skb will be drop by -EXDEV
error.

This patch adds compatible support for the ipip process in ipcomp virtual tunnel.

Signed-off-by: Su Yanjun <suyj.fnst@cn.fujitsu.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/ipv4/ip_vti.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)

(limited to 'net')

diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index d7b43e700023..68a21bf75dd0 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -74,6 +74,33 @@ drop:
 	return 0;
 }
 
+static int vti_input_ipip(struct sk_buff *skb, int nexthdr, __be32 spi,
+		     int encap_type)
+{
+	struct ip_tunnel *tunnel;
+	const struct iphdr *iph = ip_hdr(skb);
+	struct net *net = dev_net(skb->dev);
+	struct ip_tunnel_net *itn = net_generic(net, vti_net_id);
+
+	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
+				  iph->saddr, iph->daddr, 0);
+	if (tunnel) {
+		if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
+			goto drop;
+
+		XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = tunnel;
+
+		skb->dev = tunnel->dev;
+
+		return xfrm_input(skb, nexthdr, spi, encap_type);
+	}
+
+	return -EINVAL;
+drop:
+	kfree_skb(skb);
+	return 0;
+}
+
 static int vti_rcv(struct sk_buff *skb)
 {
 	XFRM_SPI_SKB_CB(skb)->family = AF_INET;
@@ -82,6 +109,14 @@ static int vti_rcv(struct sk_buff *skb)
 	return vti_input(skb, ip_hdr(skb)->protocol, 0, 0);
 }
 
+static int vti_rcv_ipip(struct sk_buff *skb)
+{
+	XFRM_SPI_SKB_CB(skb)->family = AF_INET;
+	XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr);
+
+	return vti_input_ipip(skb, ip_hdr(skb)->protocol, ip_hdr(skb)->saddr, 0);
+}
+
 static int vti_rcv_cb(struct sk_buff *skb, int err)
 {
 	unsigned short family;
@@ -435,6 +470,12 @@ static struct xfrm4_protocol vti_ipcomp4_protocol __read_mostly = {
 	.priority	=	100,
 };
 
+static struct xfrm_tunnel ipip_handler __read_mostly = {
+	.handler	=	vti_rcv_ipip,
+	.err_handler	=	vti4_err,
+	.priority	=	0,
+};
+
 static int __net_init vti_init_net(struct net *net)
 {
 	int err;
@@ -603,6 +644,13 @@ static int __init vti_init(void)
 	if (err < 0)
 		goto xfrm_proto_comp_failed;
 
+	msg = "ipip tunnel";
+	err = xfrm4_tunnel_register(&ipip_handler, AF_INET);
+	if (err < 0) {
+		pr_info("%s: cant't register tunnel\n",__func__);
+		goto xfrm_tunnel_failed;
+	}
+
 	msg = "netlink interface";
 	err = rtnl_link_register(&vti_link_ops);
 	if (err < 0)
@@ -612,6 +660,8 @@ static int __init vti_init(void)
 
 rtnl_link_failed:
 	xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP);
+xfrm_tunnel_failed:
+	xfrm4_tunnel_deregister(&ipip_handler, AF_INET);
 xfrm_proto_comp_failed:
 	xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH);
 xfrm_proto_ah_failed:
-- 
cgit v1.2.3-70-g09d2


From 35e6103861a3a970de6c84688c6e7a1f65b164ca Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 9 Jan 2019 14:37:34 +0100
Subject: xfrm: refine validation of template and selector families

The check assumes that in transport mode, the first templates family
must match the address family of the policy selector.

Syzkaller managed to build a template using MODE_ROUTEOPTIMIZATION,
with ipv4-in-ipv6 chain, leading to following splat:

BUG: KASAN: stack-out-of-bounds in xfrm_state_find+0x1db/0x1854
Read of size 4 at addr ffff888063e57aa0 by task a.out/2050
 xfrm_state_find+0x1db/0x1854
 xfrm_tmpl_resolve+0x100/0x1d0
 xfrm_resolve_and_create_bundle+0x108/0x1000 [..]

Problem is that addresses point into flowi4 struct, but xfrm_state_find
treats them as being ipv6 because it uses templ->encap_family is used
(AF_INET6 in case of reproducer) rather than family (AF_INET).

This patch inverts the logic: Enforce 'template family must match
selector' EXCEPT for tunnel and BEET mode.

In BEET and Tunnel mode, xfrm_tmpl_resolve_one will have remote/local
address pointers changed to point at the addresses found in the template,
rather than the flowi ones, so no oob read will occur.

Reported-by: 3ntr0py1337@gmail.com
Reported-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/xfrm/xfrm_user.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 277c1c46fe94..c6d26afcf89d 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -1488,10 +1488,15 @@ static int validate_tmpl(int nr, struct xfrm_user_tmpl *ut, u16 family)
 		if (!ut[i].family)
 			ut[i].family = family;
 
-		if ((ut[i].mode == XFRM_MODE_TRANSPORT) &&
-		    (ut[i].family != prev_family))
-			return -EINVAL;
-
+		switch (ut[i].mode) {
+		case XFRM_MODE_TUNNEL:
+		case XFRM_MODE_BEET:
+			break;
+		default:
+			if (ut[i].family != prev_family)
+				return -EINVAL;
+			break;
+		}
 		if (ut[i].mode >= XFRM_MODE_MAX)
 			return -EINVAL;
 
-- 
cgit v1.2.3-70-g09d2


From e2612cd496e7b465711d219ea6118893d7253f52 Mon Sep 17 00:00:00 2001
From: Benedict Wong <benedictwong@google.com>
Date: Mon, 14 Jan 2019 11:24:38 -0800
Subject: xfrm: Make set-mark default behavior backward compatible
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fixes 9b42c1f179a6, which changed the default route lookup behavior for
tunnel mode SAs in the outbound direction to use the skb mark, whereas
previously mark=0 was used if the output mark was unspecified. In
mark-based routing schemes such as Android’s, this change in default
behavior causes routing loops or lookup failures.

This patch restores the default behavior of using a 0 mark while still
incorporating the skb mark if the SET_MARK (and SET_MARK_MASK) is
specified.

Tested with additions to Android's kernel unit test suite:
https://android-review.googlesource.com/c/kernel/tests/+/860150

Fixes: 9b42c1f179a6 ("xfrm: Extend the output_mark to support input direction and masking")
Signed-off-by: Benedict Wong <benedictwong@google.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/xfrm/xfrm_policy.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 8cfd75b62396..ba0a4048c846 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -2600,7 +2600,10 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy,
 		dst_copy_metrics(dst1, dst);
 
 		if (xfrm[i]->props.mode != XFRM_MODE_TRANSPORT) {
-			__u32 mark = xfrm_smark_get(fl->flowi_mark, xfrm[i]);
+			__u32 mark = 0;
+
+			if (xfrm[i]->props.smark.v || xfrm[i]->props.smark.m)
+				mark = xfrm_smark_get(fl->flowi_mark, xfrm[i]);
 
 			family = xfrm[i]->props.family;
 			dst = xfrm_dst_lookup(xfrm[i], tos, fl->flowi_oif,
-- 
cgit v1.2.3-70-g09d2


From 12c44aba6618b7f6c437076e5722237190f6cd5f Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 14 Jan 2019 14:28:48 +0100
Subject: netfilter: nft_compat: use refcnt_t type for nft_xt reference count

Using standard integer type was fine while all operations on it were
guarded by the nftnl subsys mutex.

This isn't true anymore:
1. transactions are guarded only by a pernet mutex, so concurrent
   rule manipulation in different netns is racy
2. the ->destroy hook runs from a work queue after the transaction
   mutex has been released already.

cpu0                           cpu1 (net 1)        cpu2 (net 2)
 kworker
    nft_compat->destroy        nft_compat->init    nft_compat->init
      if (--nft_xt->ref == 0)   nft_xt->ref++        nft_xt->ref++

Switch to refcount_t.  Doing this however only fixes a minor aspect,
nft_compat also performs linked-list operations in an unsafe way.

This is addressed in the next two patches.

Fixes: f102d66b335a ("netfilter: nf_tables: use dedicated mutex to guard transactions")
Fixes: 0935d5588400 ("netfilter: nf_tables: asynchronous release")
Reported-by: Taehee Yoo <ap420073@gmail.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nft_compat.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c
index 7334e0b80a5e..acc85acad31b 100644
--- a/net/netfilter/nft_compat.c
+++ b/net/netfilter/nft_compat.c
@@ -26,7 +26,7 @@
 struct nft_xt {
 	struct list_head	head;
 	struct nft_expr_ops	ops;
-	unsigned int		refcnt;
+	refcount_t		refcnt;
 
 	/* Unlike other expressions, ops doesn't have static storage duration.
 	 * nft core assumes they do.  We use kfree_rcu so that nft core can
@@ -45,7 +45,7 @@ struct nft_xt_match_priv {
 
 static bool nft_xt_put(struct nft_xt *xt)
 {
-	if (--xt->refcnt == 0) {
+	if (refcount_dec_and_test(&xt->refcnt)) {
 		list_del(&xt->head);
 		kfree_rcu(xt, rcu_head);
 		return true;
@@ -273,7 +273,7 @@ nft_target_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
 		return -EINVAL;
 
 	nft_xt = container_of(expr->ops, struct nft_xt, ops);
-	nft_xt->refcnt++;
+	refcount_inc(&nft_xt->refcnt);
 	return 0;
 }
 
@@ -486,7 +486,7 @@ __nft_match_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
 		return ret;
 
 	nft_xt = container_of(expr->ops, struct nft_xt, ops);
-	nft_xt->refcnt++;
+	refcount_inc(&nft_xt->refcnt);
 	return 0;
 }
 
@@ -789,7 +789,7 @@ nft_match_select_ops(const struct nft_ctx *ctx,
 		goto err;
 	}
 
-	nft_match->refcnt = 0;
+	refcount_set(&nft_match->refcnt, 0);
 	nft_match->ops.type = &nft_match_type;
 	nft_match->ops.eval = nft_match_eval;
 	nft_match->ops.init = nft_match_init;
@@ -893,7 +893,7 @@ nft_target_select_ops(const struct nft_ctx *ctx,
 		goto err;
 	}
 
-	nft_target->refcnt = 0;
+	refcount_set(&nft_target->refcnt, 0);
 	nft_target->ops.type = &nft_target_type;
 	nft_target->ops.size = NFT_EXPR_SIZE(XT_ALIGN(target->targetsize));
 	nft_target->ops.init = nft_target_init;
@@ -964,7 +964,7 @@ static void __exit nft_compat_module_exit(void)
 	list_for_each_entry_safe(xt, next, &nft_target_list, head) {
 		struct xt_target *target = xt->ops.data;
 
-		if (WARN_ON_ONCE(xt->refcnt))
+		if (WARN_ON_ONCE(refcount_read(&xt->refcnt)))
 			continue;
 		module_put(target->me);
 		kfree(xt);
@@ -973,7 +973,7 @@ static void __exit nft_compat_module_exit(void)
 	list_for_each_entry_safe(xt, next, &nft_match_list, head) {
 		struct xt_match *match = xt->ops.data;
 
-		if (WARN_ON_ONCE(xt->refcnt))
+		if (WARN_ON_ONCE(refcount_read(&xt->refcnt)))
 			continue;
 		module_put(match->me);
 		kfree(xt);
-- 
cgit v1.2.3-70-g09d2


From cf52572ebbd7189a1966c2b5fc34b97078cd1dce Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 14 Jan 2019 14:28:49 +0100
Subject: netfilter: nft_compat: make lists per netns

There are two problems with nft_compat since the netlink config
plane uses a per-netns mutex:

1. Concurrent add/del accesses to the same list
2. accesses to a list element after it has been free'd already.

This patch fixes the first problem.

Freeing occurs from a work queue, after transaction mutexes have been
released, i.e., it still possible for a new transaction (even from
same net ns) to find the to-be-deleted expression in the list.

The ->destroy functions are not allowed to have any such side effects,
i.e. the list_del() in the destroy function is not allowed.

This part of the problem is solved in the next patch.
I tried to make this work by serializing list access via mutex
and by moving list_del() to a deactivate callback, but
Taehee spotted following race on this approach:

  NET #0                          NET #1
   >select_ops()
   ->init()
                                   ->select_ops()
   ->deactivate()
   ->destroy()
      nft_xt_put()
       kfree_rcu(xt, rcu_head);
                                   ->init() <-- use-after-free occurred.

Unfortunately, we can't increment reference count in
select_ops(), because we can't undo the refcount increase in
case a different expression fails in the same batch.

(The destroy hook will only be called in case the expression
 was initialized successfully).

Fixes: f102d66b335a ("netfilter: nf_tables: use dedicated mutex to guard transactions")
Reported-by: Taehee Yoo <ap420073@gmail.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nft_compat.c | 129 +++++++++++++++++++++++++++++++--------------
 1 file changed, 89 insertions(+), 40 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c
index acc85acad31b..abed3490a8f8 100644
--- a/net/netfilter/nft_compat.c
+++ b/net/netfilter/nft_compat.c
@@ -22,6 +22,7 @@
 #include <linux/netfilter_bridge/ebtables.h>
 #include <linux/netfilter_arp/arp_tables.h>
 #include <net/netfilter/nf_tables.h>
+#include <net/netns/generic.h>
 
 struct nft_xt {
 	struct list_head	head;
@@ -43,6 +44,20 @@ struct nft_xt_match_priv {
 	void *info;
 };
 
+struct nft_compat_net {
+	struct list_head nft_target_list;
+	struct list_head nft_match_list;
+};
+
+static unsigned int nft_compat_net_id __read_mostly;
+static struct nft_expr_type nft_match_type;
+static struct nft_expr_type nft_target_type;
+
+static struct nft_compat_net *nft_compat_pernet(struct net *net)
+{
+	return net_generic(net, nft_compat_net_id);
+}
+
 static bool nft_xt_put(struct nft_xt *xt)
 {
 	if (refcount_dec_and_test(&xt->refcnt)) {
@@ -734,10 +749,6 @@ static const struct nfnetlink_subsystem nfnl_compat_subsys = {
 	.cb		= nfnl_nft_compat_cb,
 };
 
-static LIST_HEAD(nft_match_list);
-
-static struct nft_expr_type nft_match_type;
-
 static bool nft_match_cmp(const struct xt_match *match,
 			  const char *name, u32 rev, u32 family)
 {
@@ -749,6 +760,7 @@ static const struct nft_expr_ops *
 nft_match_select_ops(const struct nft_ctx *ctx,
 		     const struct nlattr * const tb[])
 {
+	struct nft_compat_net *cn;
 	struct nft_xt *nft_match;
 	struct xt_match *match;
 	unsigned int matchsize;
@@ -765,8 +777,10 @@ nft_match_select_ops(const struct nft_ctx *ctx,
 	rev = ntohl(nla_get_be32(tb[NFTA_MATCH_REV]));
 	family = ctx->family;
 
+	cn = nft_compat_pernet(ctx->net);
+
 	/* Re-use the existing match if it's already loaded. */
-	list_for_each_entry(nft_match, &nft_match_list, head) {
+	list_for_each_entry(nft_match, &cn->nft_match_list, head) {
 		struct xt_match *match = nft_match->ops.data;
 
 		if (nft_match_cmp(match, mt_name, rev, family))
@@ -810,7 +824,7 @@ nft_match_select_ops(const struct nft_ctx *ctx,
 
 	nft_match->ops.size = matchsize;
 
-	list_add(&nft_match->head, &nft_match_list);
+	list_add(&nft_match->head, &cn->nft_match_list);
 
 	return &nft_match->ops;
 err:
@@ -826,10 +840,6 @@ static struct nft_expr_type nft_match_type __read_mostly = {
 	.owner		= THIS_MODULE,
 };
 
-static LIST_HEAD(nft_target_list);
-
-static struct nft_expr_type nft_target_type;
-
 static bool nft_target_cmp(const struct xt_target *tg,
 			   const char *name, u32 rev, u32 family)
 {
@@ -841,6 +851,7 @@ static const struct nft_expr_ops *
 nft_target_select_ops(const struct nft_ctx *ctx,
 		      const struct nlattr * const tb[])
 {
+	struct nft_compat_net *cn;
 	struct nft_xt *nft_target;
 	struct xt_target *target;
 	char *tg_name;
@@ -861,8 +872,9 @@ nft_target_select_ops(const struct nft_ctx *ctx,
 	    strcmp(tg_name, "standard") == 0)
 		return ERR_PTR(-EINVAL);
 
+	cn = nft_compat_pernet(ctx->net);
 	/* Re-use the existing target if it's already loaded. */
-	list_for_each_entry(nft_target, &nft_target_list, head) {
+	list_for_each_entry(nft_target, &cn->nft_target_list, head) {
 		struct xt_target *target = nft_target->ops.data;
 
 		if (!target->target)
@@ -907,7 +919,7 @@ nft_target_select_ops(const struct nft_ctx *ctx,
 	else
 		nft_target->ops.eval = nft_target_eval_xt;
 
-	list_add(&nft_target->head, &nft_target_list);
+	list_add(&nft_target->head, &cn->nft_target_list);
 
 	return &nft_target->ops;
 err:
@@ -923,13 +935,74 @@ static struct nft_expr_type nft_target_type __read_mostly = {
 	.owner		= THIS_MODULE,
 };
 
+static int __net_init nft_compat_init_net(struct net *net)
+{
+	struct nft_compat_net *cn = nft_compat_pernet(net);
+
+	INIT_LIST_HEAD(&cn->nft_target_list);
+	INIT_LIST_HEAD(&cn->nft_match_list);
+
+	return 0;
+}
+
+static void __net_exit nft_compat_exit_net(struct net *net)
+{
+	struct nft_compat_net *cn = nft_compat_pernet(net);
+	struct nft_xt *xt, *next;
+
+	if (list_empty(&cn->nft_match_list) &&
+	    list_empty(&cn->nft_target_list))
+		return;
+
+	/* If there was an error that caused nft_xt expr to not be initialized
+	 * fully and noone else requested the same expression later, the lists
+	 * contain 0-refcount entries that still hold module reference.
+	 *
+	 * Clean them here.
+	 */
+	mutex_lock(&net->nft.commit_mutex);
+	list_for_each_entry_safe(xt, next, &cn->nft_target_list, head) {
+		struct xt_target *target = xt->ops.data;
+
+		list_del_init(&xt->head);
+
+		if (refcount_read(&xt->refcnt))
+			continue;
+		module_put(target->me);
+		kfree(xt);
+	}
+
+	list_for_each_entry_safe(xt, next, &cn->nft_match_list, head) {
+		struct xt_match *match = xt->ops.data;
+
+		list_del_init(&xt->head);
+
+		if (refcount_read(&xt->refcnt))
+			continue;
+		module_put(match->me);
+		kfree(xt);
+	}
+	mutex_unlock(&net->nft.commit_mutex);
+}
+
+static struct pernet_operations nft_compat_net_ops = {
+	.init	= nft_compat_init_net,
+	.exit	= nft_compat_exit_net,
+	.id	= &nft_compat_net_id,
+	.size	= sizeof(struct nft_compat_net),
+};
+
 static int __init nft_compat_module_init(void)
 {
 	int ret;
 
+	ret = register_pernet_subsys(&nft_compat_net_ops);
+	if (ret < 0)
+		goto err_target;
+
 	ret = nft_register_expr(&nft_match_type);
 	if (ret < 0)
-		return ret;
+		goto err_pernet;
 
 	ret = nft_register_expr(&nft_target_type);
 	if (ret < 0)
@@ -942,45 +1015,21 @@ static int __init nft_compat_module_init(void)
 	}
 
 	return ret;
-
 err_target:
 	nft_unregister_expr(&nft_target_type);
 err_match:
 	nft_unregister_expr(&nft_match_type);
+err_pernet:
+	unregister_pernet_subsys(&nft_compat_net_ops);
 	return ret;
 }
 
 static void __exit nft_compat_module_exit(void)
 {
-	struct nft_xt *xt, *next;
-
-	/* list should be empty here, it can be non-empty only in case there
-	 * was an error that caused nft_xt expr to not be initialized fully
-	 * and noone else requested the same expression later.
-	 *
-	 * In this case, the lists contain 0-refcount entries that still
-	 * hold module reference.
-	 */
-	list_for_each_entry_safe(xt, next, &nft_target_list, head) {
-		struct xt_target *target = xt->ops.data;
-
-		if (WARN_ON_ONCE(refcount_read(&xt->refcnt)))
-			continue;
-		module_put(target->me);
-		kfree(xt);
-	}
-
-	list_for_each_entry_safe(xt, next, &nft_match_list, head) {
-		struct xt_match *match = xt->ops.data;
-
-		if (WARN_ON_ONCE(refcount_read(&xt->refcnt)))
-			continue;
-		module_put(match->me);
-		kfree(xt);
-	}
 	nfnetlink_subsys_unregister(&nfnl_compat_subsys);
 	nft_unregister_expr(&nft_target_type);
 	nft_unregister_expr(&nft_match_type);
+	unregister_pernet_subsys(&nft_compat_net_ops);
 }
 
 MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_NFT_COMPAT);
-- 
cgit v1.2.3-70-g09d2


From b2e3d68d1251a051a620f9086e18f7ffa6833b5b Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 14 Jan 2019 14:28:50 +0100
Subject: netfilter: nft_compat: destroy function must not have side effects

The nft_compat destroy function deletes the nft_xt object from a list.
This isn't allowed anymore. Destroy functions are called asynchronously,
i.e. next batch can find the object that has a pending ->destroy()
invocation:

cpu0                       cpu1
 worker
   ->destroy               for_each_entry()
	                     if (x == ...
			        return x->ops;
     list_del(x)
     kfree_rcu(x)
                           expr->ops->... // ops was free'd

To resolve this, the list_del needs to occur before the transaction
mutex gets released.  nf_tables has a 'deactivate' hook for this
purpose, so use that to unlink the object from the list.

Fixes: 0935d5588400 ("netfilter: nf_tables: asynchronous release")
Reported-by: Taehee Yoo <ap420073@gmail.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nft_compat.c | 48 +++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 47 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c
index abed3490a8f8..5eb269428832 100644
--- a/net/netfilter/nft_compat.c
+++ b/net/netfilter/nft_compat.c
@@ -29,6 +29,9 @@ struct nft_xt {
 	struct nft_expr_ops	ops;
 	refcount_t		refcnt;
 
+	/* used only when transaction mutex is locked */
+	unsigned int		listcnt;
+
 	/* Unlike other expressions, ops doesn't have static storage duration.
 	 * nft core assumes they do.  We use kfree_rcu so that nft core can
 	 * can check expr->ops->size even after nft_compat->destroy() frees
@@ -61,7 +64,7 @@ static struct nft_compat_net *nft_compat_pernet(struct net *net)
 static bool nft_xt_put(struct nft_xt *xt)
 {
 	if (refcount_dec_and_test(&xt->refcnt)) {
-		list_del(&xt->head);
+		WARN_ON_ONCE(!list_empty(&xt->head));
 		kfree_rcu(xt, rcu_head);
 		return true;
 	}
@@ -555,6 +558,43 @@ nft_match_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
 	__nft_match_destroy(ctx, expr, nft_expr_priv(expr));
 }
 
+static void nft_compat_activate(const struct nft_ctx *ctx,
+				const struct nft_expr *expr,
+				struct list_head *h)
+{
+	struct nft_xt *xt = container_of(expr->ops, struct nft_xt, ops);
+
+	if (xt->listcnt == 0)
+		list_add(&xt->head, h);
+
+	xt->listcnt++;
+}
+
+static void nft_compat_activate_mt(const struct nft_ctx *ctx,
+				   const struct nft_expr *expr)
+{
+	struct nft_compat_net *cn = nft_compat_pernet(ctx->net);
+
+	nft_compat_activate(ctx, expr, &cn->nft_match_list);
+}
+
+static void nft_compat_activate_tg(const struct nft_ctx *ctx,
+				   const struct nft_expr *expr)
+{
+	struct nft_compat_net *cn = nft_compat_pernet(ctx->net);
+
+	nft_compat_activate(ctx, expr, &cn->nft_target_list);
+}
+
+static void nft_compat_deactivate(const struct nft_ctx *ctx,
+				  const struct nft_expr *expr)
+{
+	struct nft_xt *xt = container_of(expr->ops, struct nft_xt, ops);
+
+	if (--xt->listcnt == 0)
+		list_del_init(&xt->head);
+}
+
 static void
 nft_match_large_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
 {
@@ -808,6 +848,8 @@ nft_match_select_ops(const struct nft_ctx *ctx,
 	nft_match->ops.eval = nft_match_eval;
 	nft_match->ops.init = nft_match_init;
 	nft_match->ops.destroy = nft_match_destroy;
+	nft_match->ops.activate = nft_compat_activate_mt;
+	nft_match->ops.deactivate = nft_compat_deactivate;
 	nft_match->ops.dump = nft_match_dump;
 	nft_match->ops.validate = nft_match_validate;
 	nft_match->ops.data = match;
@@ -824,6 +866,7 @@ nft_match_select_ops(const struct nft_ctx *ctx,
 
 	nft_match->ops.size = matchsize;
 
+	nft_match->listcnt = 1;
 	list_add(&nft_match->head, &cn->nft_match_list);
 
 	return &nft_match->ops;
@@ -910,6 +953,8 @@ nft_target_select_ops(const struct nft_ctx *ctx,
 	nft_target->ops.size = NFT_EXPR_SIZE(XT_ALIGN(target->targetsize));
 	nft_target->ops.init = nft_target_init;
 	nft_target->ops.destroy = nft_target_destroy;
+	nft_target->ops.activate = nft_compat_activate_tg;
+	nft_target->ops.deactivate = nft_compat_deactivate;
 	nft_target->ops.dump = nft_target_dump;
 	nft_target->ops.validate = nft_target_validate;
 	nft_target->ops.data = target;
@@ -919,6 +964,7 @@ nft_target_select_ops(const struct nft_ctx *ctx,
 	else
 		nft_target->ops.eval = nft_target_eval_xt;
 
+	nft_target->listcnt = 1;
 	list_add(&nft_target->head, &cn->nft_target_list);
 
 	return &nft_target->ops;
-- 
cgit v1.2.3-70-g09d2


From c9e4576743eeda8d24dedc164d65b78877f9a98c Mon Sep 17 00:00:00 2001
From: Yafang Shao <laoar.shao@gmail.com>
Date: Wed, 23 Jan 2019 12:37:19 +0800
Subject: bpf: sock recvbuff must be limited by rmem_max in bpf_setsockopt()

When sock recvbuff is set by bpf_setsockopt(), the value must by
limited by rmem_max. It is the same with sendbuff.

Fixes: 8c4b4c7e9ff0 ("bpf: Add setsockopt helper function to bpf")
Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Lawrence Brakmo <brakmo@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/core/filter.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/core/filter.c b/net/core/filter.c
index 7559d6835ecb..7a54dc11ac2d 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4112,10 +4112,12 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
 		/* Only some socketops are supported */
 		switch (optname) {
 		case SO_RCVBUF:
+			val = min_t(u32, val, sysctl_rmem_max);
 			sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 			sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
 			break;
 		case SO_SNDBUF:
+			val = min_t(u32, val, sysctl_wmem_max);
 			sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
 			sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
 			break;
-- 
cgit v1.2.3-70-g09d2


From 53ab60baa1ac4f20b080a22c13b77b6373922fd7 Mon Sep 17 00:00:00 2001
From: ZhangXiaoxu <zhangxiaoxu5@huawei.com>
Date: Thu, 10 Jan 2019 16:39:06 +0800
Subject: ipvs: Fix signed integer overflow when setsockopt timeout

There is a UBSAN bug report as below:
UBSAN: Undefined behaviour in net/netfilter/ipvs/ip_vs_ctl.c:2227:21
signed integer overflow:
-2147483647 * 1000 cannot be represented in type 'int'

Reproduce program:
	#include <stdio.h>
	#include <sys/types.h>
	#include <sys/socket.h>

	#define IPPROTO_IP 0
	#define IPPROTO_RAW 255

	#define IP_VS_BASE_CTL		(64+1024+64)
	#define IP_VS_SO_SET_TIMEOUT	(IP_VS_BASE_CTL+10)

	/* The argument to IP_VS_SO_GET_TIMEOUT */
	struct ipvs_timeout_t {
		int tcp_timeout;
		int tcp_fin_timeout;
		int udp_timeout;
	};

	int main() {
		int ret = -1;
		int sockfd = -1;
		struct ipvs_timeout_t to;

		sockfd = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
		if (sockfd == -1) {
			printf("socket init error\n");
			return -1;
		}

		to.tcp_timeout = -2147483647;
		to.tcp_fin_timeout = -2147483647;
		to.udp_timeout = -2147483647;

		ret = setsockopt(sockfd,
				 IPPROTO_IP,
				 IP_VS_SO_SET_TIMEOUT,
				 (char *)(&to),
				 sizeof(to));

		printf("setsockopt return %d\n", ret);
		return ret;
	}

Return -EINVAL if the timeout value is negative or max than 'INT_MAX / HZ'.

Signed-off-by: ZhangXiaoxu <zhangxiaoxu5@huawei.com>
Acked-by: Simon Horman <horms@verge.net.au>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/ipvs/ip_vs_ctl.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'net')

diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 432141f04af3..7d6318664eb2 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -2220,6 +2220,18 @@ static int ip_vs_set_timeout(struct netns_ipvs *ipvs, struct ip_vs_timeout_user
 		  u->tcp_fin_timeout,
 		  u->udp_timeout);
 
+#ifdef CONFIG_IP_VS_PROTO_TCP
+	if (u->tcp_timeout < 0 || u->tcp_timeout > (INT_MAX / HZ) ||
+	    u->tcp_fin_timeout < 0 || u->tcp_fin_timeout > (INT_MAX / HZ)) {
+		return -EINVAL;
+	}
+#endif
+
+#ifdef CONFIG_IP_VS_PROTO_UDP
+	if (u->udp_timeout < 0 || u->udp_timeout > (INT_MAX / HZ))
+		return -EINVAL;
+#endif
+
 #ifdef CONFIG_IP_VS_PROTO_TCP
 	if (u->tcp_timeout) {
 		pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP);
-- 
cgit v1.2.3-70-g09d2


From 7d652669b61d702c6e62a39579d17f6881670ab6 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Fri, 25 Jan 2019 08:21:26 +0100
Subject: batman-adv: release station info tidstats

With the addition of TXQ stats in the per-tid statistics the struct
station_info grew significantly. This resulted in stack size warnings
due to the structure itself being above the limit for the warnings.

To work around this, the TID array was allocated dynamically. Also a
function to free this content was introduced with commit 7ea3e110f2f8
("cfg80211: release station info tidstats where needed") but the necessary
changes were not provided for batman-adv's B.A.T.M.A.N. V implementation.

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Fixes: 8689c051a201 ("cfg80211: dynamically allocate per-tid stats for station info")
[sven@narfation.org: add commit message]
Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 net/batman-adv/bat_v_elp.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net')

diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c
index e8090f099eb8..ef0dec20c7d8 100644
--- a/net/batman-adv/bat_v_elp.c
+++ b/net/batman-adv/bat_v_elp.c
@@ -104,6 +104,9 @@ static u32 batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh)
 
 		ret = cfg80211_get_station(real_netdev, neigh->addr, &sinfo);
 
+		/* free the TID stats immediately */
+		cfg80211_sinfo_release_content(&sinfo);
+
 		dev_put(real_netdev);
 		if (ret == -ENOENT) {
 			/* Node is not associated anymore! It would be
-- 
cgit v1.2.3-70-g09d2


From 63346650c1a94a92be61a57416ac88c0a47c4327 Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Thu, 24 Jan 2019 14:18:18 -0800
Subject: netrom: switch to sock timer API

sk_reset_timer() and sk_stop_timer() properly handle
sock refcnt for timer function. Switching to them
could fix a refcounting bug reported by syzbot.

Reported-and-tested-by: syzbot+defa700d16f1bd1b9a05@syzkaller.appspotmail.com
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-hams@vger.kernel.org
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/netrom/nr_timer.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/net/netrom/nr_timer.c b/net/netrom/nr_timer.c
index cbd51ed5a2d7..908e53ab47a4 100644
--- a/net/netrom/nr_timer.c
+++ b/net/netrom/nr_timer.c
@@ -52,21 +52,21 @@ void nr_start_t1timer(struct sock *sk)
 {
 	struct nr_sock *nr = nr_sk(sk);
 
-	mod_timer(&nr->t1timer, jiffies + nr->t1);
+	sk_reset_timer(sk, &nr->t1timer, jiffies + nr->t1);
 }
 
 void nr_start_t2timer(struct sock *sk)
 {
 	struct nr_sock *nr = nr_sk(sk);
 
-	mod_timer(&nr->t2timer, jiffies + nr->t2);
+	sk_reset_timer(sk, &nr->t2timer, jiffies + nr->t2);
 }
 
 void nr_start_t4timer(struct sock *sk)
 {
 	struct nr_sock *nr = nr_sk(sk);
 
-	mod_timer(&nr->t4timer, jiffies + nr->t4);
+	sk_reset_timer(sk, &nr->t4timer, jiffies + nr->t4);
 }
 
 void nr_start_idletimer(struct sock *sk)
@@ -74,37 +74,37 @@ void nr_start_idletimer(struct sock *sk)
 	struct nr_sock *nr = nr_sk(sk);
 
 	if (nr->idle > 0)
-		mod_timer(&nr->idletimer, jiffies + nr->idle);
+		sk_reset_timer(sk, &nr->idletimer, jiffies + nr->idle);
 }
 
 void nr_start_heartbeat(struct sock *sk)
 {
-	mod_timer(&sk->sk_timer, jiffies + 5 * HZ);
+	sk_reset_timer(sk, &sk->sk_timer, jiffies + 5 * HZ);
 }
 
 void nr_stop_t1timer(struct sock *sk)
 {
-	del_timer(&nr_sk(sk)->t1timer);
+	sk_stop_timer(sk, &nr_sk(sk)->t1timer);
 }
 
 void nr_stop_t2timer(struct sock *sk)
 {
-	del_timer(&nr_sk(sk)->t2timer);
+	sk_stop_timer(sk, &nr_sk(sk)->t2timer);
 }
 
 void nr_stop_t4timer(struct sock *sk)
 {
-	del_timer(&nr_sk(sk)->t4timer);
+	sk_stop_timer(sk, &nr_sk(sk)->t4timer);
 }
 
 void nr_stop_idletimer(struct sock *sk)
 {
-	del_timer(&nr_sk(sk)->idletimer);
+	sk_stop_timer(sk, &nr_sk(sk)->idletimer);
 }
 
 void nr_stop_heartbeat(struct sock *sk)
 {
-	del_timer(&sk->sk_timer);
+	sk_stop_timer(sk, &sk->sk_timer);
 }
 
 int nr_t1timer_running(struct sock *sk)
-- 
cgit v1.2.3-70-g09d2


From b0cf029234f9b18e10703ba5147f0389c382bccc Mon Sep 17 00:00:00 2001
From: Bernard Pidoux <f6bvp@free.fr>
Date: Fri, 25 Jan 2019 11:46:40 +0100
Subject: net/rose: fix NULL ax25_cb kernel panic

When an internally generated frame is handled by rose_xmit(),
rose_route_frame() is called:

        if (!rose_route_frame(skb, NULL)) {
                dev_kfree_skb(skb);
                stats->tx_errors++;
                return NETDEV_TX_OK;
        }

We have the same code sequence in Net/Rom where an internally generated
frame is handled by nr_xmit() calling nr_route_frame(skb, NULL).
However, in this function NULL argument is tested while it is not in
rose_route_frame().
Then kernel panic occurs later on when calling ax25cmp() with a NULL
ax25_cb argument as reported many times and recently with syzbot.

We need to test if ax25 is NULL before using it.

Testing:
Built kernel with CONFIG_ROSE=y.

Signed-off-by: Bernard Pidoux <f6bvp@free.fr>
Acked-by: Dmitry Vyukov <dvyukov@google.com>
Reported-by: syzbot+1a2c456a1ea08fa5b5f7@syzkaller.appspotmail.com
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Bernard Pidoux <f6bvp@free.fr>
Cc: linux-hams@vger.kernel.org
Cc: netdev@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rose/rose_route.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'net')

diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c
index 77e9f85a2c92..f2ff21d7df08 100644
--- a/net/rose/rose_route.c
+++ b/net/rose/rose_route.c
@@ -850,6 +850,7 @@ void rose_link_device_down(struct net_device *dev)
 
 /*
  *	Route a frame to an appropriate AX.25 connection.
+ *	A NULL ax25_cb indicates an internally generated frame.
  */
 int rose_route_frame(struct sk_buff *skb, ax25_cb *ax25)
 {
@@ -867,6 +868,10 @@ int rose_route_frame(struct sk_buff *skb, ax25_cb *ax25)
 
 	if (skb->len < ROSE_MIN_LEN)
 		return res;
+
+	if (!ax25)
+		return rose_loopback_queue(skb, NULL);
+
 	frametype = skb->data[2];
 	lci = ((skb->data[0] << 8) & 0xF00) + ((skb->data[1] << 0) & 0x0FF);
 	if (frametype == ROSE_CALL_REQUEST &&
-- 
cgit v1.2.3-70-g09d2


From 50c2936634bcb1db78a8ca63249236810c11a80f Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Sat, 26 Jan 2019 21:12:19 +0100
Subject: decnet: fix DN_IFREQ_SIZE

Digging through the ioctls with Al because of the previous
patches, we found that on 64-bit decnet's dn_dev_ioctl()
is wrong, because struct ifreq::ifr_ifru is actually 24
bytes (not 16 as expected from struct sockaddr) due to the
ifru_map and ifru_settings members.

Clearly, decnet expects the ioctl to be called with a struct
like
  struct ifreq_dn {
    char ifr_name[IFNAMSIZ];
    struct sockaddr_dn ifr_addr;
  };

since it does
  struct ifreq *ifr = ...;
  struct sockaddr_dn *sdn = (struct sockaddr_dn *)&ifr->ifr_addr;

This means that DN_IFREQ_SIZE is too big for what it wants on
64-bit, as it is
  sizeof(struct ifreq) - sizeof(struct sockaddr) +
  sizeof(struct sockaddr_dn)

This assumes that sizeof(struct sockaddr) is the size of ifr_ifru
but that isn't true.

Fix this to use offsetof(struct ifreq, ifr_ifru).

This indeed doesn't really matter much - the result is that we
copy in/out 8 bytes more than we should on 64-bit platforms. In
case the "struct ifreq_dn" lands just on the end of a page though
it might lead to faults.

As far as I can tell, it has been like this forever, so it seems
very likely that nobody cares.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/decnet/dn_dev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c
index d0b3e69c6b39..0962f9201baa 100644
--- a/net/decnet/dn_dev.c
+++ b/net/decnet/dn_dev.c
@@ -56,7 +56,7 @@
 #include <net/dn_neigh.h>
 #include <net/dn_fib.h>
 
-#define DN_IFREQ_SIZE (sizeof(struct ifreq) - sizeof(struct sockaddr) + sizeof(struct sockaddr_dn))
+#define DN_IFREQ_SIZE (offsetof(struct ifreq, ifr_ifru) + sizeof(struct sockaddr_dn))
 
 static char dn_rt_all_end_mcast[ETH_ALEN] = {0xAB,0x00,0x00,0x04,0x00,0x00};
 static char dn_rt_all_rt_mcast[ETH_ALEN]  = {0xAB,0x00,0x00,0x03,0x00,0x00};
-- 
cgit v1.2.3-70-g09d2


From 146820cc240f4389cf33481c058d9493aef95e25 Mon Sep 17 00:00:00 2001
From: Nir Dotan <nird@mellanox.com>
Date: Sun, 27 Jan 2019 09:26:22 +0200
Subject: ip6mr: Fix notifiers call on mroute_clean_tables()

When the MC route socket is closed, mroute_clean_tables() is called to
cleanup existing routes. Mistakenly notifiers call was put on the cleanup
of the unresolved MC route entries cache.
In a case where the MC socket closes before an unresolved route expires,
the notifier call leads to a crash, caused by the driver trying to
increment a non initialized refcount_t object [1] and then when handling
is done, to decrement it [2]. This was detected by a test recently added in
commit 6d4efada3b82 ("selftests: forwarding: Add multicast routing test").

Fix that by putting notifiers call on the resolved entries traversal,
instead of on the unresolved entries traversal.

[1]

[  245.748967] refcount_t: increment on 0; use-after-free.
[  245.754829] WARNING: CPU: 3 PID: 3223 at lib/refcount.c:153 refcount_inc_checked+0x2b/0x30
...
[  245.802357] Hardware name: Mellanox Technologies Ltd. MSN2740/SA001237, BIOS 5.6.5 06/07/2016
[  245.811873] RIP: 0010:refcount_inc_checked+0x2b/0x30
...
[  245.907487] Call Trace:
[  245.910231]  mlxsw_sp_router_fib_event.cold.181+0x42/0x47 [mlxsw_spectrum]
[  245.917913]  notifier_call_chain+0x45/0x7
[  245.922484]  atomic_notifier_call_chain+0x15/0x20
[  245.927729]  call_fib_notifiers+0x15/0x30
[  245.932205]  mroute_clean_tables+0x372/0x3f
[  245.936971]  ip6mr_sk_done+0xb1/0xc0
[  245.940960]  ip6_mroute_setsockopt+0x1da/0x5f0
...

[2]

[  246.128487] refcount_t: underflow; use-after-free.
[  246.133859] WARNING: CPU: 0 PID: 7 at lib/refcount.c:187 refcount_sub_and_test_checked+0x4c/0x60
[  246.183521] Hardware name: Mellanox Technologies Ltd. MSN2740/SA001237, BIOS 5.6.5 06/07/2016
...
[  246.193062] Workqueue: mlxsw_core_ordered mlxsw_sp_router_fibmr_event_work [mlxsw_spectrum]
[  246.202394] RIP: 0010:refcount_sub_and_test_checked+0x4c/0x60
...
[  246.298889] Call Trace:
[  246.301617]  refcount_dec_and_test_checked+0x11/0x20
[  246.307170]  mlxsw_sp_router_fibmr_event_work.cold.196+0x47/0x78 [mlxsw_spectrum]
[  246.315531]  process_one_work+0x1fa/0x3f0
[  246.320005]  worker_thread+0x2f/0x3e0
[  246.324083]  kthread+0x118/0x130
[  246.327683]  ? wq_update_unbound_numa+0x1b0/0x1b0
[  246.332926]  ? kthread_park+0x80/0x80
[  246.337013]  ret_from_fork+0x1f/0x30

Fixes: 088aa3eec2ce ("ip6mr: Support fib notifications")
Signed-off-by: Nir Dotan <nird@mellanox.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6mr.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 30337b38274b..cc01aa3f2b5e 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -1516,6 +1516,9 @@ static void mroute_clean_tables(struct mr_table *mrt, bool all)
 			continue;
 		rhltable_remove(&mrt->mfc_hash, &c->mnode, ip6mr_rht_params);
 		list_del_rcu(&c->list);
+		call_ip6mr_mfc_entry_notifiers(read_pnet(&mrt->net),
+					       FIB_EVENT_ENTRY_DEL,
+					       (struct mfc6_cache *)c, mrt->id);
 		mr6_netlink_event(mrt, (struct mfc6_cache *)c, RTM_DELROUTE);
 		mr_cache_put(c);
 	}
@@ -1524,10 +1527,6 @@ static void mroute_clean_tables(struct mr_table *mrt, bool all)
 		spin_lock_bh(&mfc_unres_lock);
 		list_for_each_entry_safe(c, tmp, &mrt->mfc_unres_queue, list) {
 			list_del(&c->list);
-			call_ip6mr_mfc_entry_notifiers(read_pnet(&mrt->net),
-						       FIB_EVENT_ENTRY_DEL,
-						       (struct mfc6_cache *)c,
-						       mrt->id);
 			mr6_netlink_event(mrt, (struct mfc6_cache *)c,
 					  RTM_DELROUTE);
 			ip6mr_destroy_unres(mrt, (struct mfc6_cache *)c);
-- 
cgit v1.2.3-70-g09d2


From 2035f3ff8eaa29cfb5c8e2160b0f6e85eeb21a95 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 21 Jan 2019 21:54:36 +0100
Subject: netfilter: ebtables: compat: un-break 32bit setsockopt when no rules
 are present

Unlike ip(6)tables ebtables only counts user-defined chains.

The effect is that a 32bit ebtables binary on a 64bit kernel can do
'ebtables -N FOO' only after adding at least one rule, else the request
fails with -EINVAL.

This is a similar fix as done in
3f1e53abff84 ("netfilter: ebtables: don't attempt to allocate 0-sized compat array").

Fixes: 7d7d7e02111e9 ("netfilter: compat: reject huge allocation requests")
Reported-by: Francesco Ruggeri <fruggeri@arista.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/bridge/netfilter/ebtables.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 5e55cef0cec3..6693e209efe8 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -2293,9 +2293,12 @@ static int compat_do_replace(struct net *net, void __user *user,
 
 	xt_compat_lock(NFPROTO_BRIDGE);
 
-	ret = xt_compat_init_offsets(NFPROTO_BRIDGE, tmp.nentries);
-	if (ret < 0)
-		goto out_unlock;
+	if (tmp.nentries) {
+		ret = xt_compat_init_offsets(NFPROTO_BRIDGE, tmp.nentries);
+		if (ret < 0)
+			goto out_unlock;
+	}
+
 	ret = compat_copy_entries(entries_tmp, tmp.entries_size, &state);
 	if (ret < 0)
 		goto out_unlock;
-- 
cgit v1.2.3-70-g09d2


From 1a6a0951fc009f6d9fe8ebea2d2417d80d54097b Mon Sep 17 00:00:00 2001
From: Fernando Fernandez Mancera <ffmancera@riseup.net>
Date: Mon, 21 Jan 2019 12:53:21 +0100
Subject: netfilter: nfnetlink_osf: add missing fmatch check

When we check the tcp options of a packet and it doesn't match the current
fingerprint, the tcp packet option pointer must be restored to its initial
value in order to do the proper tcp options check for the next fingerprint.

Here we can see an example.
Assumming the following fingerprint base with two lines:

S10:64:1:60:M*,S,T,N,W6:      Linux:3.0::Linux 3.0
S20:64:1:60:M*,S,T,N,W7:      Linux:4.19:arch:Linux 4.1

Where TCP options are the last field in the OS signature, all of them overlap
except by the last one, ie. 'W6' versus 'W7'.

In case a packet for Linux 4.19 kicks in, the osf finds no matching because the
TCP options pointer is updated after checking for the TCP options in the first
line.

Therefore, reset pointer back to where it should be.

Fixes: 11eeef41d5f6 ("netfilter: passive OS fingerprint xtables match")
Signed-off-by: Fernando Fernandez Mancera <ffmancera@riseup.net>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nfnetlink_osf.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'net')

diff --git a/net/netfilter/nfnetlink_osf.c b/net/netfilter/nfnetlink_osf.c
index 6f41dd74729d..1f1d90c1716b 100644
--- a/net/netfilter/nfnetlink_osf.c
+++ b/net/netfilter/nfnetlink_osf.c
@@ -66,6 +66,7 @@ static bool nf_osf_match_one(const struct sk_buff *skb,
 			     int ttl_check,
 			     struct nf_osf_hdr_ctx *ctx)
 {
+	const __u8 *optpinit = ctx->optp;
 	unsigned int check_WSS = 0;
 	int fmatch = FMATCH_WRONG;
 	int foptsize, optnum;
@@ -155,6 +156,9 @@ static bool nf_osf_match_one(const struct sk_buff *skb,
 		}
 	}
 
+	if (fmatch != FMATCH_OK)
+		ctx->optp = optpinit;
+
 	return fmatch == FMATCH_OK;
 }
 
-- 
cgit v1.2.3-70-g09d2


From 206b8cc514d7ff2b79dd2d5ad939adc7c493f07a Mon Sep 17 00:00:00 2001
From: Anders Roxell <anders.roxell@linaro.org>
Date: Wed, 23 Jan 2019 12:48:11 +0100
Subject: netfilter: ipt_CLUSTERIP: fix warning unused variable cn
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When CONFIG_PROC_FS isn't set the variable cn isn't used.

net/ipv4/netfilter/ipt_CLUSTERIP.c: In function ‘clusterip_net_exit’:
net/ipv4/netfilter/ipt_CLUSTERIP.c:849:24: warning: unused variable ‘cn’ [-Wunused-variable]
  struct clusterip_net *cn = clusterip_pernet(net);
                        ^~

Rework so the variable 'cn' is declared inside "#ifdef CONFIG_PROC_FS".

Fixes: b12f7bad5ad3 ("netfilter: ipt_CLUSTERIP: remove wrong WARN_ON_ONCE in netns exit routine")
Signed-off-by: Anders Roxell <anders.roxell@linaro.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/ipt_CLUSTERIP.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index b61977db9b7f..2a909e5f9ba0 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -846,9 +846,9 @@ static int clusterip_net_init(struct net *net)
 
 static void clusterip_net_exit(struct net *net)
 {
+#ifdef CONFIG_PROC_FS
 	struct clusterip_net *cn = clusterip_pernet(net);
 
-#ifdef CONFIG_PROC_FS
 	mutex_lock(&cn->mutex);
 	proc_remove(cn->procdir);
 	cn->procdir = NULL;
-- 
cgit v1.2.3-70-g09d2


From 1d79895aef18fa05789995d86d523c9b2ee58a02 Mon Sep 17 00:00:00 2001
From: Jakub Sitnicki <jakub@cloudflare.com>
Date: Mon, 28 Jan 2019 10:13:35 +0100
Subject: sk_msg: Always cancel strp work before freeing the psock

Despite having stopped the parser, we still need to deinitialize it
by calling strp_done so that it cancels its work. Otherwise the worker
thread can run after we have freed the parser, and attempt to access
its workqueue resulting in a use-after-free:

==================================================================
BUG: KASAN: use-after-free in pwq_activate_delayed_work+0x1b/0x1d0
Read of size 8 at addr ffff888069975240 by task kworker/u2:2/93

CPU: 0 PID: 93 Comm: kworker/u2:2 Not tainted 5.0.0-rc2-00335-g28f9d1a3d4fe-dirty #14
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-2.fc27 04/01/2014
Workqueue:            (null) (kstrp)
Call Trace:
 print_address_description+0x6e/0x2b0
 ? pwq_activate_delayed_work+0x1b/0x1d0
 kasan_report+0xfd/0x177
 ? pwq_activate_delayed_work+0x1b/0x1d0
 ? pwq_activate_delayed_work+0x1b/0x1d0
 pwq_activate_delayed_work+0x1b/0x1d0
 ? process_one_work+0x4aa/0x660
 pwq_dec_nr_in_flight+0x9b/0x100
 worker_thread+0x82/0x680
 ? process_one_work+0x660/0x660
 kthread+0x1b9/0x1e0
 ? __kthread_create_on_node+0x250/0x250
 ret_from_fork+0x1f/0x30

Allocated by task 111:
 sk_psock_init+0x3c/0x1b0
 sock_map_link.isra.2+0x103/0x4b0
 sock_map_update_common+0x94/0x270
 sock_map_update_elem+0x145/0x160
 __se_sys_bpf+0x152e/0x1e10
 do_syscall_64+0xb2/0x3e0
 entry_SYSCALL_64_after_hwframe+0x44/0xa9

Freed by task 112:
 kfree+0x7f/0x140
 process_one_work+0x40b/0x660
 worker_thread+0x82/0x680
 kthread+0x1b9/0x1e0
 ret_from_fork+0x1f/0x30

The buggy address belongs to the object at ffff888069975180
 which belongs to the cache kmalloc-512 of size 512
The buggy address is located 192 bytes inside of
 512-byte region [ffff888069975180, ffff888069975380)
The buggy address belongs to the page:
page:ffffea0001a65d00 count:1 mapcount:0 mapping:ffff88806d401280 index:0x0 compound_mapcount: 0
flags: 0x4000000000010200(slab|head)
raw: 4000000000010200 dead000000000100 dead000000000200 ffff88806d401280
raw: 0000000000000000 00000000800c000c 00000001ffffffff 0000000000000000
page dumped because: kasan: bad access detected

Memory state around the buggy address:
 ffff888069975100: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
 ffff888069975180: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
>ffff888069975200: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
                                           ^
 ffff888069975280: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
 ffff888069975300: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
==================================================================

Reported-by: Marek Majkowski <marek@cloudflare.com>
Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com>
Link: https://lore.kernel.org/netdev/CAJPywTLwgXNEZ2dZVoa=udiZmtrWJ0q5SuBW64aYs0Y1khXX3A@mail.gmail.com
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/core/skmsg.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index d6d5c20d7044..8c826603bf36 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -545,8 +545,7 @@ static void sk_psock_destroy_deferred(struct work_struct *gc)
 	struct sk_psock *psock = container_of(gc, struct sk_psock, gc);
 
 	/* No sk_callback_lock since already detached. */
-	if (psock->parser.enabled)
-		strp_done(&psock->parser.strp);
+	strp_done(&psock->parser.strp);
 
 	cancel_work_sync(&psock->work);
 
-- 
cgit v1.2.3-70-g09d2


From 32eb67b93c9e3cd62cb423e30b090cdd4aa8d275 Mon Sep 17 00:00:00 2001
From: Dave Watson <davejwatson@fb.com>
Date: Sun, 27 Jan 2019 00:57:38 +0000
Subject: net: tls: Save iv in tls_rec for async crypto requests

aead_request_set_crypt takes an iv pointer, and we change the iv
soon after setting it.  Some async crypto algorithms don't save the iv,
so we need to save it in the tls_rec for async requests.

Found by hardcoding x64 aesni to use async crypto manager (to test the async
codepath), however I don't think this combination can happen in the wild.
Presumably other hardware offloads will need this fix, but there have been
no user reports.

Fixes: a42055e8d2c30 ("Add support for async encryption of records...")
Signed-off-by: Dave Watson <davejwatson@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tls.h | 2 ++
 net/tls/tls_sw.c  | 4 +++-
 2 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/include/net/tls.h b/include/net/tls.h
index 2a6ac8d642af..1486b60c4de8 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -120,6 +120,8 @@ struct tls_rec {
 	struct scatterlist sg_aead_out[2];
 
 	char aad_space[TLS_AAD_SPACE_SIZE];
+	u8 iv_data[TLS_CIPHER_AES_GCM_128_IV_SIZE +
+		   TLS_CIPHER_AES_GCM_128_SALT_SIZE];
 	struct aead_request aead_req;
 	u8 aead_req_ctx[];
 };
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 11cdc8f7db63..7e963560edef 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -439,6 +439,8 @@ static int tls_do_encryption(struct sock *sk,
 	struct scatterlist *sge = sk_msg_elem(msg_en, start);
 	int rc;
 
+	memcpy(rec->iv_data, tls_ctx->tx.iv, sizeof(rec->iv_data));
+
 	sge->offset += tls_ctx->tx.prepend_size;
 	sge->length -= tls_ctx->tx.prepend_size;
 
@@ -448,7 +450,7 @@ static int tls_do_encryption(struct sock *sk,
 	aead_request_set_ad(aead_req, TLS_AAD_SPACE_SIZE);
 	aead_request_set_crypt(aead_req, rec->sg_aead_in,
 			       rec->sg_aead_out,
-			       data_len, tls_ctx->tx.iv);
+			       data_len, rec->iv_data);
 
 	aead_request_set_callback(aead_req, CRYPTO_TFM_REQ_MAY_BACKLOG,
 				  tls_encrypt_done, sk);
-- 
cgit v1.2.3-70-g09d2


From 1023121375c6b0b3dc00334983c762ba2b76cb19 Mon Sep 17 00:00:00 2001
From: Dave Watson <davejwatson@fb.com>
Date: Sun, 27 Jan 2019 00:59:03 +0000
Subject: net: tls: Fix deadlock in free_resources tx

If there are outstanding async tx requests (when crypto returns EINPROGRESS),
there is a potential deadlock: the tx work acquires the lock, while we
cancel_delayed_work_sync() while holding the lock.  Drop the lock while waiting
for the work to complete.

Fixes: a42055e8d2c30 ("Add support for async encryption of records...")
Signed-off-by: Dave Watson <davejwatson@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tls/tls_sw.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 7e963560edef..bf5b54b513bc 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -1794,7 +1794,9 @@ void tls_sw_free_resources_tx(struct sock *sk)
 	if (atomic_read(&ctx->encrypt_pending))
 		crypto_wait_req(-EINPROGRESS, &ctx->async_wait);
 
+	release_sock(sk);
 	cancel_delayed_work_sync(&ctx->tx_work.work);
+	lock_sock(sk);
 
 	/* Tx whatever records we can transmit and abandon the rest */
 	tls_tx_records(sk, -1);
-- 
cgit v1.2.3-70-g09d2


From 35edfdc77f683c8fd27d7732af06cf6489af60a5 Mon Sep 17 00:00:00 2001
From: Josh Elsasser <jelsasser@appneta.com>
Date: Sat, 26 Jan 2019 14:38:33 -0800
Subject: net: set default network namespace in init_dummy_netdev()

Assign a default net namespace to netdevs created by init_dummy_netdev().
Fixes a NULL pointer dereference caused by busy-polling a socket bound to
an iwlwifi wireless device, which bumps the per-net BUSYPOLLRXPACKETS stat
if napi_poll() received packets:

  BUG: unable to handle kernel NULL pointer dereference at 0000000000000190
  IP: napi_busy_loop+0xd6/0x200
  Call Trace:
    sock_poll+0x5e/0x80
    do_sys_poll+0x324/0x5a0
    SyS_poll+0x6c/0xf0
    do_syscall_64+0x6b/0x1f0
    entry_SYSCALL_64_after_hwframe+0x3d/0xa2

Fixes: 7db6b048da3b ("net: Commonize busy polling code to focus on napi_id instead of socket")
Signed-off-by: Josh Elsasser <jelsasser@appneta.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net')

diff --git a/net/core/dev.c b/net/core/dev.c
index 82f20022259d..8e276e0192a1 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -8712,6 +8712,9 @@ int init_dummy_netdev(struct net_device *dev)
 	set_bit(__LINK_STATE_PRESENT, &dev->state);
 	set_bit(__LINK_STATE_START, &dev->state);
 
+	/* napi_busy_loop stats accounting wants this */
+	dev_net_set(dev, &init_net);
+
 	/* Note : We dont allocate pcpu_refcnt for dummy devices,
 	 * because users of this 'device' dont need to change
 	 * its refcount.
-- 
cgit v1.2.3-70-g09d2


From 63ff03ab786ab1bc6cca01d48eacd22c95b9b3eb Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 25 Jan 2019 22:43:17 +0100
Subject: Revert "socket: fix struct ifreq size in compat ioctl"

This reverts commit 1cebf8f143c2 ("socket: fix struct ifreq
size in compat ioctl"), it's a bugfix for another commit that
I'll revert next.

This is not a 'perfect' revert, I'm keeping some coding style
intact rather than revert to the state with indentation errors.

Cc: stable@vger.kernel.org
Fixes: 1cebf8f143c2 ("socket: fix struct ifreq size in compat ioctl")
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/socket.c | 22 ++++++++--------------
 1 file changed, 8 insertions(+), 14 deletions(-)

(limited to 'net')

diff --git a/net/socket.c b/net/socket.c
index e89884e2197b..63b53af7379b 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -941,8 +941,7 @@ void dlci_ioctl_set(int (*hook) (unsigned int, void __user *))
 EXPORT_SYMBOL(dlci_ioctl_set);
 
 static long sock_do_ioctl(struct net *net, struct socket *sock,
-			  unsigned int cmd, unsigned long arg,
-			  unsigned int ifreq_size)
+			  unsigned int cmd, unsigned long arg)
 {
 	int err;
 	void __user *argp = (void __user *)arg;
@@ -968,11 +967,11 @@ static long sock_do_ioctl(struct net *net, struct socket *sock,
 	} else {
 		struct ifreq ifr;
 		bool need_copyout;
-		if (copy_from_user(&ifr, argp, ifreq_size))
+		if (copy_from_user(&ifr, argp, sizeof(struct ifreq)))
 			return -EFAULT;
 		err = dev_ioctl(net, cmd, &ifr, &need_copyout);
 		if (!err && need_copyout)
-			if (copy_to_user(argp, &ifr, ifreq_size))
+			if (copy_to_user(argp, &ifr, sizeof(struct ifreq)))
 				return -EFAULT;
 	}
 	return err;
@@ -1071,8 +1070,7 @@ static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 			err = open_related_ns(&net->ns, get_net_ns);
 			break;
 		default:
-			err = sock_do_ioctl(net, sock, cmd, arg,
-					    sizeof(struct ifreq));
+			err = sock_do_ioctl(net, sock, cmd, arg);
 			break;
 		}
 	return err;
@@ -2780,8 +2778,7 @@ static int do_siocgstamp(struct net *net, struct socket *sock,
 	int err;
 
 	set_fs(KERNEL_DS);
-	err = sock_do_ioctl(net, sock, cmd, (unsigned long)&ktv,
-			    sizeof(struct compat_ifreq));
+	err = sock_do_ioctl(net, sock, cmd, (unsigned long)&ktv);
 	set_fs(old_fs);
 	if (!err)
 		err = compat_put_timeval(&ktv, up);
@@ -2797,8 +2794,7 @@ static int do_siocgstampns(struct net *net, struct socket *sock,
 	int err;
 
 	set_fs(KERNEL_DS);
-	err = sock_do_ioctl(net, sock, cmd, (unsigned long)&kts,
-			    sizeof(struct compat_ifreq));
+	err = sock_do_ioctl(net, sock, cmd, (unsigned long)&kts);
 	set_fs(old_fs);
 	if (!err)
 		err = compat_put_timespec(&kts, up);
@@ -3109,8 +3105,7 @@ static int routing_ioctl(struct net *net, struct socket *sock,
 	}
 
 	set_fs(KERNEL_DS);
-	ret = sock_do_ioctl(net, sock, cmd, (unsigned long) r,
-			    sizeof(struct compat_ifreq));
+	ret = sock_do_ioctl(net, sock, cmd, (unsigned long) r);
 	set_fs(old_fs);
 
 out:
@@ -3223,8 +3218,7 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock,
 	case SIOCBONDSETHWADDR:
 	case SIOCBONDCHANGEACTIVE:
 	case SIOCGIFNAME:
-		return sock_do_ioctl(net, sock, cmd, arg,
-				     sizeof(struct compat_ifreq));
+		return sock_do_ioctl(net, sock, cmd, arg);
 	}
 
 	return -ENOIOCTLCMD;
-- 
cgit v1.2.3-70-g09d2


From 37ac39bdddc528c998a9f36db36937de923fdf2a Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 25 Jan 2019 22:43:18 +0100
Subject: Revert "kill dev_ifsioc()"

This reverts commit bf4405737f9f ("kill dev_ifsioc()").

This wasn't really unused as implied by the original commit,
it still handles the copy to/from user differently, and the
commit thus caused issues such as
  https://bugzilla.kernel.org/show_bug.cgi?id=199469
and
  https://bugzilla.kernel.org/show_bug.cgi?id=202273

However, deviating from a strict revert, rename dev_ifsioc()
to compat_ifreq_ioctl() to be clearer as to its purpose and
add a comment.

Cc: stable@vger.kernel.org
Fixes: bf4405737f9f ("kill dev_ifsioc()")
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/socket.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 49 insertions(+)

(limited to 'net')

diff --git a/net/socket.c b/net/socket.c
index 63b53af7379b..fbf80f9fb057 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2990,6 +2990,53 @@ static int compat_ifr_data_ioctl(struct net *net, unsigned int cmd,
 	return dev_ioctl(net, cmd, &ifreq, NULL);
 }
 
+static int compat_ifreq_ioctl(struct net *net, struct socket *sock,
+			      unsigned int cmd,
+			      struct compat_ifreq __user *uifr32)
+{
+	struct ifreq __user *uifr;
+	int err;
+
+	/* Handle the fact that while struct ifreq has the same *layout* on
+	 * 32/64 for everything but ifreq::ifru_ifmap and ifreq::ifru_data,
+	 * which are handled elsewhere, it still has different *size* due to
+	 * ifreq::ifru_ifmap (which is 16 bytes on 32 bit, 24 bytes on 64-bit,
+	 * resulting in struct ifreq being 32 and 40 bytes respectively).
+	 * As a result, if the struct happens to be at the end of a page and
+	 * the next page isn't readable/writable, we get a fault. To prevent
+	 * that, copy back and forth to the full size.
+	 */
+
+	uifr = compat_alloc_user_space(sizeof(*uifr));
+	if (copy_in_user(uifr, uifr32, sizeof(*uifr32)))
+		return -EFAULT;
+
+	err = sock_do_ioctl(net, sock, cmd, (unsigned long)uifr);
+
+	if (!err) {
+		switch (cmd) {
+		case SIOCGIFFLAGS:
+		case SIOCGIFMETRIC:
+		case SIOCGIFMTU:
+		case SIOCGIFMEM:
+		case SIOCGIFHWADDR:
+		case SIOCGIFINDEX:
+		case SIOCGIFADDR:
+		case SIOCGIFBRDADDR:
+		case SIOCGIFDSTADDR:
+		case SIOCGIFNETMASK:
+		case SIOCGIFPFLAGS:
+		case SIOCGIFTXQLEN:
+		case SIOCGMIIPHY:
+		case SIOCGMIIREG:
+			if (copy_in_user(uifr32, uifr, sizeof(*uifr32)))
+				err = -EFAULT;
+			break;
+		}
+	}
+	return err;
+}
+
 static int compat_sioc_ifmap(struct net *net, unsigned int cmd,
 			struct compat_ifreq __user *uifr32)
 {
@@ -3209,6 +3256,8 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock,
 	case SIOCGMIIPHY:
 	case SIOCGMIIREG:
 	case SIOCSMIIREG:
+		return compat_ifreq_ioctl(net, sock, cmd, argp);
+
 	case SIOCSARP:
 	case SIOCGARP:
 	case SIOCDARP:
-- 
cgit v1.2.3-70-g09d2


From c6c9fee35dc27362b7bac34b2fc9f5b8ace2e22c Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 25 Jan 2019 22:43:19 +0100
Subject: net: socket: fix SIOCGIFNAME in compat

As reported by Robert O'Callahan in
https://bugzilla.kernel.org/show_bug.cgi?id=202273
reverting the previous changes in this area broke
the SIOCGIFNAME ioctl in compat again (I'd previously
fixed it after his previous report of breakage in
https://bugzilla.kernel.org/show_bug.cgi?id=199469).

This is obviously because I fixed SIOCGIFNAME more or
less by accident.

Fix it explicitly now by making it pass through the
restored compat translation code.

Cc: stable@vger.kernel.org
Fixes: 4cf808e7ac32 ("kill dev_ifname32()")
Reported-by: Robert O'Callahan <robert@ocallahan.org>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/socket.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/socket.c b/net/socket.c
index fbf80f9fb057..473ac8d7c54e 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -3029,6 +3029,7 @@ static int compat_ifreq_ioctl(struct net *net, struct socket *sock,
 		case SIOCGIFTXQLEN:
 		case SIOCGMIIPHY:
 		case SIOCGMIIREG:
+		case SIOCGIFNAME:
 			if (copy_in_user(uifr32, uifr, sizeof(*uifr32)))
 				err = -EFAULT;
 			break;
@@ -3252,6 +3253,7 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock,
 	case SIOCSIFTXQLEN:
 	case SIOCBRADDIF:
 	case SIOCBRDELIF:
+	case SIOCGIFNAME:
 	case SIOCSIFNAME:
 	case SIOCGMIIPHY:
 	case SIOCGMIIREG:
@@ -3266,7 +3268,6 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock,
 	case SIOCBONDRELEASE:
 	case SIOCBONDSETHWADDR:
 	case SIOCBONDCHANGEACTIVE:
-	case SIOCGIFNAME:
 		return sock_do_ioctl(net, sock, cmd, arg);
 	}
 
-- 
cgit v1.2.3-70-g09d2


From 98406133dd9cb9f195676eab540c270dceca879a Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 25 Jan 2019 22:43:20 +0100
Subject: net: socket: make bond ioctls go through compat_ifreq_ioctl()

Same story as before, these use struct ifreq and thus need
to be read with the shorter version to not cause faults.

Cc: stable@vger.kernel.org
Fixes: f92d4fc95341 ("kill bond_ioctl()")
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/socket.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/socket.c b/net/socket.c
index 473ac8d7c54e..d80d87a395ea 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -3258,16 +3258,16 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock,
 	case SIOCGMIIPHY:
 	case SIOCGMIIREG:
 	case SIOCSMIIREG:
+	case SIOCBONDENSLAVE:
+	case SIOCBONDRELEASE:
+	case SIOCBONDSETHWADDR:
+	case SIOCBONDCHANGEACTIVE:
 		return compat_ifreq_ioctl(net, sock, cmd, argp);
 
 	case SIOCSARP:
 	case SIOCGARP:
 	case SIOCDARP:
 	case SIOCATMARK:
-	case SIOCBONDENSLAVE:
-	case SIOCBONDRELEASE:
-	case SIOCBONDSETHWADDR:
-	case SIOCBONDCHANGEACTIVE:
 		return sock_do_ioctl(net, sock, cmd, arg);
 	}
 
-- 
cgit v1.2.3-70-g09d2


From feaf5c796b3f0240f10d0d6d0b686715fd58a05b Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
Date: Mon, 28 Jan 2019 22:23:48 +0100
Subject: net: ip_gre: always reports o_key to userspace

Erspan protocol (version 1 and 2) relies on o_key to configure
session id header field. However TUNNEL_KEY bit is cleared in
erspan_xmit since ERSPAN protocol does not set the key field
of the external GRE header and so the configured o_key is not reported
to userspace. The issue can be triggered with the following reproducer:

$ip link add erspan1 type erspan local 192.168.0.1 remote 192.168.0.2 \
    key 1 seq erspan_ver 1
$ip link set erspan1 up
$ip -d link sh erspan1

erspan1@NONE: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1450 qdisc pfifo_fast state UNKNOWN mode DEFAULT
  link/ether 52:aa:99:95:9a:b5 brd ff:ff:ff:ff:ff:ff promiscuity 0 minmtu 68 maxmtu 1500
  erspan remote 192.168.0.2 local 192.168.0.1 ttl inherit ikey 0.0.0.1 iseq oseq erspan_index 0

Fix the issue adding TUNNEL_KEY bit to the o_flags parameter in
ipgre_fill_info

Fixes: 84e54fe0a5ea ("gre: introduce native tunnel support for ERSPAN")
Signed-off-by: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_gre.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 20a64fe6254b..3978f807fa8b 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -1455,12 +1455,17 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
 {
 	struct ip_tunnel *t = netdev_priv(dev);
 	struct ip_tunnel_parm *p = &t->parms;
+	__be16 o_flags = p->o_flags;
+
+	if ((t->erspan_ver == 1 || t->erspan_ver == 2) &&
+	    !t->collect_md)
+		o_flags |= TUNNEL_KEY;
 
 	if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) ||
 	    nla_put_be16(skb, IFLA_GRE_IFLAGS,
 			 gre_tnl_flags_to_gre_flags(p->i_flags)) ||
 	    nla_put_be16(skb, IFLA_GRE_OFLAGS,
-			 gre_tnl_flags_to_gre_flags(p->o_flags)) ||
+			 gre_tnl_flags_to_gre_flags(o_flags)) ||
 	    nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) ||
 	    nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) ||
 	    nla_put_in_addr(skb, IFLA_GRE_LOCAL, p->iph.saddr) ||
-- 
cgit v1.2.3-70-g09d2


From c706863bc8902d0c2d1a5a27ac8e1ead5d06b79d Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
Date: Mon, 28 Jan 2019 22:23:49 +0100
Subject: net: ip6_gre: always reports o_key to userspace

As Erspan_v4, Erspan_v6 protocol relies on o_key to configure
session id header field. However TUNNEL_KEY bit is cleared in
ip6erspan_tunnel_xmit since ERSPAN protocol does not set the key field
of the external GRE header and so the configured o_key is not reported
to userspace. The issue can be triggered with the following reproducer:

$ip link add ip6erspan1 type ip6erspan local 2000::1 remote 2000::2 \
    key 1 seq erspan_ver 1
$ip link set ip6erspan1 up
ip -d link sh ip6erspan1

ip6erspan1@NONE: <BROADCAST,MULTICAST> mtu 1422 qdisc noop state DOWN mode DEFAULT
    link/ether ba:ff:09:24:c3:0e brd ff:ff:ff:ff:ff:ff promiscuity 0 minmtu 68 maxmtu 1500
    ip6erspan remote 2000::2 local 2000::1 encaplimit 4 flowlabel 0x00000 ikey 0.0.0.1 iseq oseq

Fix the issue adding TUNNEL_KEY bit to the o_flags parameter in
ip6gre_fill_info

Fixes: 5a963eb61b7c ("ip6_gre: Add ERSPAN native tunnel support")
Signed-off-by: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_gre.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 4416368dbd49..801a9a0c217e 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -2098,12 +2098,17 @@ static int ip6gre_fill_info(struct sk_buff *skb, const struct net_device *dev)
 {
 	struct ip6_tnl *t = netdev_priv(dev);
 	struct __ip6_tnl_parm *p = &t->parms;
+	__be16 o_flags = p->o_flags;
+
+	if ((p->erspan_ver == 1 || p->erspan_ver == 2) &&
+	    !p->collect_md)
+		o_flags |= TUNNEL_KEY;
 
 	if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) ||
 	    nla_put_be16(skb, IFLA_GRE_IFLAGS,
 			 gre_tnl_flags_to_gre_flags(p->i_flags)) ||
 	    nla_put_be16(skb, IFLA_GRE_OFLAGS,
-			 gre_tnl_flags_to_gre_flags(p->o_flags)) ||
+			 gre_tnl_flags_to_gre_flags(o_flags)) ||
 	    nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) ||
 	    nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) ||
 	    nla_put_in6_addr(skb, IFLA_GRE_LOCAL, &p->laddr) ||
-- 
cgit v1.2.3-70-g09d2


From ef489749aae508e6f17886775c075f12ff919fb1 Mon Sep 17 00:00:00 2001
From: Yohei Kanemaru <yohei.kanemaru@gmail.com>
Date: Tue, 29 Jan 2019 15:52:34 +0900
Subject: ipv6: sr: clear IP6CB(skb) on SRH ip4ip6 encapsulation

skb->cb may contain data from previous layers (in an observed case
IPv4 with L3 Master Device). In the observed scenario, the data in
IPCB(skb)->frags was misinterpreted as IP6CB(skb)->frag_max_size,
eventually caused an unexpected IPv6 fragmentation in ip6_fragment()
through ip6_finish_output().

This patch clears IP6CB(skb), which potentially contains garbage data,
on the SRH ip4ip6 encapsulation.

Fixes: 32d99d0b6702 ("ipv6: sr: add support for ip4ip6 encapsulation")
Signed-off-by: Yohei Kanemaru <yohei.kanemaru@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/seg6_iptunnel.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c
index 8181ee7e1e27..ee5403cbe655 100644
--- a/net/ipv6/seg6_iptunnel.c
+++ b/net/ipv6/seg6_iptunnel.c
@@ -146,6 +146,8 @@ int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto)
 	} else {
 		ip6_flow_hdr(hdr, 0, flowlabel);
 		hdr->hop_limit = ip6_dst_hoplimit(skb_dst(skb));
+
+		memset(IP6CB(skb), 0, sizeof(*IP6CB(skb)));
 	}
 
 	hdr->nexthdr = NEXTHDR_ROUTING;
-- 
cgit v1.2.3-70-g09d2


From 4522a70db7aa5e77526a4079628578599821b193 Mon Sep 17 00:00:00 2001
From: Jacob Wen <jian.w.wen@oracle.com>
Date: Wed, 30 Jan 2019 14:55:14 +0800
Subject: l2tp: fix reading optional fields of L2TPv3

Use pskb_may_pull() to make sure the optional fields are in skb linear
parts, so we can safely read them later.

It's easy to reproduce the issue with a net driver that supports paged
skb data. Just create a L2TPv3 over IP tunnel and then generates some
network traffic.
Once reproduced, rx err in /sys/kernel/debug/l2tp/tunnels will increase.

Changes in v4:
1. s/l2tp_v3_pull_opt/l2tp_v3_ensure_opt_in_linear/
2. s/tunnel->version != L2TP_HDR_VER_2/tunnel->version == L2TP_HDR_VER_3/
3. Add 'Fixes' in commit messages.

Changes in v3:
1. To keep consistency, move the code out of l2tp_recv_common.
2. Use "net" instead of "net-next", since this is a bug fix.

Changes in v2:
1. Only fix L2TPv3 to make code simple.
   To fix both L2TPv3 and L2TPv2, we'd better refactor l2tp_recv_common.
   It's complicated to do so.
2. Reloading pointers after pskb_may_pull

Fixes: f7faffa3ff8e ("l2tp: Add L2TPv3 protocol support")
Fixes: 0d76751fad77 ("l2tp: Add L2TPv3 IP encapsulation (no UDP) support")
Fixes: a32e0eec7042 ("l2tp: introduce L2TPv3 IP encapsulation support for IPv6")
Signed-off-by: Jacob Wen <jian.w.wen@oracle.com>
Acked-by: Guillaume Nault <gnault@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/l2tp/l2tp_core.c |  4 ++++
 net/l2tp/l2tp_core.h | 20 ++++++++++++++++++++
 net/l2tp/l2tp_ip.c   |  3 +++
 net/l2tp/l2tp_ip6.c  |  3 +++
 4 files changed, 30 insertions(+)

(limited to 'net')

diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c
index 26f1d435696a..dd5ba0c11ab3 100644
--- a/net/l2tp/l2tp_core.c
+++ b/net/l2tp/l2tp_core.c
@@ -884,6 +884,10 @@ static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb)
 		goto error;
 	}
 
+	if (tunnel->version == L2TP_HDR_VER_3 &&
+	    l2tp_v3_ensure_opt_in_linear(session, skb, &ptr, &optr))
+		goto error;
+
 	l2tp_recv_common(session, skb, ptr, optr, hdrflags, length);
 	l2tp_session_dec_refcount(session);
 
diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h
index 9c9afe94d389..b2ce90260c35 100644
--- a/net/l2tp/l2tp_core.h
+++ b/net/l2tp/l2tp_core.h
@@ -301,6 +301,26 @@ static inline bool l2tp_tunnel_uses_xfrm(const struct l2tp_tunnel *tunnel)
 }
 #endif
 
+static inline int l2tp_v3_ensure_opt_in_linear(struct l2tp_session *session, struct sk_buff *skb,
+					       unsigned char **ptr, unsigned char **optr)
+{
+	int opt_len = session->peer_cookie_len + l2tp_get_l2specific_len(session);
+
+	if (opt_len > 0) {
+		int off = *ptr - *optr;
+
+		if (!pskb_may_pull(skb, off + opt_len))
+			return -1;
+
+		if (skb->data != *optr) {
+			*optr = skb->data;
+			*ptr = skb->data + off;
+		}
+	}
+
+	return 0;
+}
+
 #define l2tp_printk(ptr, type, func, fmt, ...)				\
 do {									\
 	if (((ptr)->debug) & (type))					\
diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c
index 35f6f86d4dcc..d4c60523c549 100644
--- a/net/l2tp/l2tp_ip.c
+++ b/net/l2tp/l2tp_ip.c
@@ -165,6 +165,9 @@ static int l2tp_ip_recv(struct sk_buff *skb)
 		print_hex_dump_bytes("", DUMP_PREFIX_OFFSET, ptr, length);
 	}
 
+	if (l2tp_v3_ensure_opt_in_linear(session, skb, &ptr, &optr))
+		goto discard_sess;
+
 	l2tp_recv_common(session, skb, ptr, optr, 0, skb->len);
 	l2tp_session_dec_refcount(session);
 
diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c
index 237f1a4a0b0c..0ae6899edac0 100644
--- a/net/l2tp/l2tp_ip6.c
+++ b/net/l2tp/l2tp_ip6.c
@@ -178,6 +178,9 @@ static int l2tp_ip6_recv(struct sk_buff *skb)
 		print_hex_dump_bytes("", DUMP_PREFIX_OFFSET, ptr, length);
 	}
 
+	if (l2tp_v3_ensure_opt_in_linear(session, skb, &ptr, &optr))
+		goto discard_sess;
+
 	l2tp_recv_common(session, skb, ptr, optr, 0, skb->len);
 	l2tp_session_dec_refcount(session);
 
-- 
cgit v1.2.3-70-g09d2


From 91c524708de6207f59dd3512518d8a1c7b434ee3 Mon Sep 17 00:00:00 2001
From: Jacob Wen <jian.w.wen@oracle.com>
Date: Thu, 31 Jan 2019 15:18:56 +0800
Subject: l2tp: copy 4 more bytes to linear part if necessary

The size of L2TPv2 header with all optional fields is 14 bytes.
l2tp_udp_recv_core only moves 10 bytes to the linear part of a
skb. This may lead to l2tp_recv_common read data outside of a skb.

This patch make sure that there is at least 14 bytes in the linear
part of a skb to meet the maximum need of l2tp_udp_recv_core and
l2tp_recv_common. The minimum size of both PPP HDLC-like frame and
Ethernet frame is larger than 14 bytes, so we are safe to do so.

Also remove L2TP_HDR_SIZE_NOSEQ, it is unused now.

Fixes: fd558d186df2 ("l2tp: Split pppol2tp patch into separate l2tp and ppp parts")
Suggested-by: Guillaume Nault <gnault@redhat.com>
Signed-off-by: Jacob Wen <jian.w.wen@oracle.com>
Acked-by: Guillaume Nault <gnault@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/l2tp/l2tp_core.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c
index dd5ba0c11ab3..fed6becc5daf 100644
--- a/net/l2tp/l2tp_core.c
+++ b/net/l2tp/l2tp_core.c
@@ -83,8 +83,7 @@
 #define L2TP_SLFLAG_S	   0x40000000
 #define L2TP_SL_SEQ_MASK   0x00ffffff
 
-#define L2TP_HDR_SIZE_SEQ		10
-#define L2TP_HDR_SIZE_NOSEQ		6
+#define L2TP_HDR_SIZE_MAX		14
 
 /* Default trace flags */
 #define L2TP_DEFAULT_DEBUG_FLAGS	0
@@ -808,7 +807,7 @@ static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb)
 	__skb_pull(skb, sizeof(struct udphdr));
 
 	/* Short packet? */
-	if (!pskb_may_pull(skb, L2TP_HDR_SIZE_SEQ)) {
+	if (!pskb_may_pull(skb, L2TP_HDR_SIZE_MAX)) {
 		l2tp_info(tunnel, L2TP_MSG_DATA,
 			  "%s: recv short packet (len=%d)\n",
 			  tunnel->name, skb->len);
-- 
cgit v1.2.3-70-g09d2


From 6fa19f5637a6c22bc0999596bcc83bdcac8a4fa6 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 31 Jan 2019 08:47:10 -0800
Subject: rds: fix refcount bug in rds_sock_addref

syzbot was able to catch a bug in rds [1]

The issue here is that the socket might be found in a hash table
but that its refcount has already be set to 0 by another cpu.

We need to use refcount_inc_not_zero() to be safe here.

[1]

refcount_t: increment on 0; use-after-free.
WARNING: CPU: 1 PID: 23129 at lib/refcount.c:153 refcount_inc_checked lib/refcount.c:153 [inline]
WARNING: CPU: 1 PID: 23129 at lib/refcount.c:153 refcount_inc_checked+0x61/0x70 lib/refcount.c:151
Kernel panic - not syncing: panic_on_warn set ...
CPU: 1 PID: 23129 Comm: syz-executor3 Not tainted 5.0.0-rc4+ #53
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
 __dump_stack lib/dump_stack.c:77 [inline]
 dump_stack+0x1db/0x2d0 lib/dump_stack.c:113
 panic+0x2cb/0x65c kernel/panic.c:214
 __warn.cold+0x20/0x48 kernel/panic.c:571
 report_bug+0x263/0x2b0 lib/bug.c:186
 fixup_bug arch/x86/kernel/traps.c:178 [inline]
 fixup_bug arch/x86/kernel/traps.c:173 [inline]
 do_error_trap+0x11b/0x200 arch/x86/kernel/traps.c:271
 do_invalid_op+0x37/0x50 arch/x86/kernel/traps.c:290
 invalid_op+0x14/0x20 arch/x86/entry/entry_64.S:973
RIP: 0010:refcount_inc_checked lib/refcount.c:153 [inline]
RIP: 0010:refcount_inc_checked+0x61/0x70 lib/refcount.c:151
Code: 1d 51 63 c8 06 31 ff 89 de e8 eb 1b f2 fd 84 db 75 dd e8 a2 1a f2 fd 48 c7 c7 60 9f 81 88 c6 05 31 63 c8 06 01 e8 af 65 bb fd <0f> 0b eb c1 90 66 2e 0f 1f 84 00 00 00 00 00 55 48 89 e5 41 54 49
RSP: 0018:ffff8880a0cbf1e8 EFLAGS: 00010282
RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffffc90006113000
RDX: 000000000001047d RSI: ffffffff81685776 RDI: 0000000000000005
RBP: ffff8880a0cbf1f8 R08: ffff888097c9e100 R09: ffffed1015ce5021
R10: ffffed1015ce5020 R11: ffff8880ae728107 R12: ffff8880723c20c0
R13: ffff8880723c24b0 R14: dffffc0000000000 R15: ffffed1014197e64
 sock_hold include/net/sock.h:647 [inline]
 rds_sock_addref+0x19/0x20 net/rds/af_rds.c:675
 rds_find_bound+0x97c/0x1080 net/rds/bind.c:82
 rds_recv_incoming+0x3be/0x1430 net/rds/recv.c:362
 rds_loop_xmit+0xf3/0x2a0 net/rds/loop.c:96
 rds_send_xmit+0x1355/0x2a10 net/rds/send.c:355
 rds_sendmsg+0x323c/0x44e0 net/rds/send.c:1368
 sock_sendmsg_nosec net/socket.c:621 [inline]
 sock_sendmsg+0xdd/0x130 net/socket.c:631
 __sys_sendto+0x387/0x5f0 net/socket.c:1788
 __do_sys_sendto net/socket.c:1800 [inline]
 __se_sys_sendto net/socket.c:1796 [inline]
 __x64_sys_sendto+0xe1/0x1a0 net/socket.c:1796
 do_syscall_64+0x1a3/0x800 arch/x86/entry/common.c:290
 entry_SYSCALL_64_after_hwframe+0x49/0xbe
RIP: 0033:0x458089
Code: 6d b7 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 3b b7 fb ff c3 66 2e 0f 1f 84 00 00 00 00
RSP: 002b:00007fc266df8c78 EFLAGS: 00000246 ORIG_RAX: 000000000000002c
RAX: ffffffffffffffda RBX: 0000000000000006 RCX: 0000000000458089
RDX: 0000000000000000 RSI: 00000000204b3fff RDI: 0000000000000005
RBP: 000000000073bf00 R08: 00000000202b4000 R09: 0000000000000010
R10: 0000000000000000 R11: 0000000000000246 R12: 00007fc266df96d4
R13: 00000000004c56e4 R14: 00000000004d94a8 R15: 00000000ffffffff

Fixes: cc4dfb7f70a3 ("rds: fix two RCU related problems")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Cc: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Cc: Santosh Shilimkar <santosh.shilimkar@oracle.com>
Cc: rds-devel@oss.oracle.com
Cc: Cong Wang <xiyou.wangcong@gmail.com>
Acked-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rds/bind.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/rds/bind.c b/net/rds/bind.c
index 762d2c6788a3..17c9d9f0c848 100644
--- a/net/rds/bind.c
+++ b/net/rds/bind.c
@@ -78,10 +78,10 @@ struct rds_sock *rds_find_bound(const struct in6_addr *addr, __be16 port,
 	__rds_create_bind_key(key, addr, port, scope_id);
 	rcu_read_lock();
 	rs = rhashtable_lookup(&bind_hash_table, key, ht_parms);
-	if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD))
-		rds_sock_addref(rs);
-	else
+	if (rs && (sock_flag(rds_rs_to_sk(rs), SOCK_DEAD) ||
+		   !refcount_inc_not_zero(&rds_rs_to_sk(rs)->sk_refcnt)))
 		rs = NULL;
+
 	rcu_read_unlock();
 
 	rdsdebug("returning rs %p for %pI6c:%u\n", rs, addr,
-- 
cgit v1.2.3-70-g09d2


From 9d0f50b80222dc273e67e4e14410fcfa4130a90c Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Tue, 29 Jan 2019 11:10:57 +0100
Subject: mac80211: ensure that mgmt tx skbs have tailroom for encryption

Some drivers use IEEE80211_KEY_FLAG_SW_MGMT_TX to indicate that management
frames need to be software encrypted. Since normal data packets are still
encrypted by the hardware, crypto_tx_tailroom_needed_cnt gets decremented
after key upload to hw. This can lead to passing skbs to ccmp_encrypt_skb,
which don't have the necessary tailroom for software encryption.

Change the code to add tailroom for encrypted management packets, even if
crypto_tx_tailroom_needed_cnt is 0.

Cc: stable@vger.kernel.org
Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/tx.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index f170d6c6629a..928f13a208b0 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -1938,9 +1938,16 @@ static int ieee80211_skb_resize(struct ieee80211_sub_if_data *sdata,
 				int head_need, bool may_encrypt)
 {
 	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_hdr *hdr;
+	bool enc_tailroom;
 	int tail_need = 0;
 
-	if (may_encrypt && sdata->crypto_tx_tailroom_needed_cnt) {
+	hdr = (struct ieee80211_hdr *) skb->data;
+	enc_tailroom = may_encrypt &&
+		       (sdata->crypto_tx_tailroom_needed_cnt ||
+			ieee80211_is_mgmt(hdr->frame_control));
+
+	if (enc_tailroom) {
 		tail_need = IEEE80211_ENCRYPT_TAILROOM;
 		tail_need -= skb_tailroom(skb);
 		tail_need = max_t(int, tail_need, 0);
@@ -1948,8 +1955,7 @@ static int ieee80211_skb_resize(struct ieee80211_sub_if_data *sdata,
 
 	if (skb_cloned(skb) &&
 	    (!ieee80211_hw_check(&local->hw, SUPPORTS_CLONED_SKBS) ||
-	     !skb_clone_writable(skb, ETH_HLEN) ||
-	     (may_encrypt && sdata->crypto_tx_tailroom_needed_cnt)))
+	     !skb_clone_writable(skb, ETH_HLEN) || enc_tailroom))
 		I802_DEBUG_INC(local->tx_expand_skb_head_cloned);
 	else if (head_need || tail_need)
 		I802_DEBUG_INC(local->tx_expand_skb_head);
-- 
cgit v1.2.3-70-g09d2


From e005bd7ddea06784c1eb91ac5bb6b171a94f3b05 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 1 Feb 2019 11:09:54 +0100
Subject: cfg80211: call disconnect_wk when AP stops

Since we now prevent regulatory restore during STA disconnect
if concurrent AP interfaces are active, we need to reschedule
this check when the AP state changes. This fixes never doing
a restore when an AP is the last interface to stop. Or to put
it another way: we need to re-check after anything we check
here changes.

Cc: stable@vger.kernel.org
Fixes: 113f3aaa81bd ("cfg80211: Prevent regulatory restore during STA disconnect in concurrent interfaces")
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/ap.c   | 2 ++
 net/wireless/core.h | 2 ++
 net/wireless/sme.c  | 2 +-
 3 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/wireless/ap.c b/net/wireless/ap.c
index 882d97bdc6bf..550ac9d827fe 100644
--- a/net/wireless/ap.c
+++ b/net/wireless/ap.c
@@ -41,6 +41,8 @@ int __cfg80211_stop_ap(struct cfg80211_registered_device *rdev,
 		cfg80211_sched_dfs_chan_update(rdev);
 	}
 
+	schedule_work(&cfg80211_disconnect_work);
+
 	return err;
 }
 
diff --git a/net/wireless/core.h b/net/wireless/core.h
index c5d6f3418601..f6b40563dc63 100644
--- a/net/wireless/core.h
+++ b/net/wireless/core.h
@@ -445,6 +445,8 @@ void cfg80211_process_wdev_events(struct wireless_dev *wdev);
 bool cfg80211_does_bw_fit_range(const struct ieee80211_freq_range *freq_range,
 				u32 center_freq_khz, u32 bw_khz);
 
+extern struct work_struct cfg80211_disconnect_work;
+
 /**
  * cfg80211_chandef_dfs_usable - checks if chandef is DFS usable
  * @wiphy: the wiphy to validate against
diff --git a/net/wireless/sme.c b/net/wireless/sme.c
index f741d8376a46..7d34cb884840 100644
--- a/net/wireless/sme.c
+++ b/net/wireless/sme.c
@@ -667,7 +667,7 @@ static void disconnect_work(struct work_struct *work)
 	rtnl_unlock();
 }
 
-static DECLARE_WORK(cfg80211_disconnect_work, disconnect_work);
+DECLARE_WORK(cfg80211_disconnect_work, disconnect_work);
 
 
 /*
-- 
cgit v1.2.3-70-g09d2


From ba59fb0273076637f0add4311faa990a5eec27c0 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Fri, 1 Feb 2019 15:15:22 +0100
Subject: sctp: walk the list of asoc safely

In sctp_sendmesg(), when walking the list of endpoint associations, the
association can be dropped from the list, making the list corrupt.
Properly handle this by using list_for_each_entry_safe()

Fixes: 4910280503f3 ("sctp: add support for snd flag SCTP_SENDALL process in sendmsg")
Reported-by: Secunia Research <vuln@secunia.com>
Tested-by: Secunia Research <vuln@secunia.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/socket.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index f93c3cf9e567..65d6d04546ae 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -2027,7 +2027,7 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len)
 	struct sctp_endpoint *ep = sctp_sk(sk)->ep;
 	struct sctp_transport *transport = NULL;
 	struct sctp_sndrcvinfo _sinfo, *sinfo;
-	struct sctp_association *asoc;
+	struct sctp_association *asoc, *tmp;
 	struct sctp_cmsgs cmsgs;
 	union sctp_addr *daddr;
 	bool new = false;
@@ -2053,7 +2053,7 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len)
 
 	/* SCTP_SENDALL process */
 	if ((sflags & SCTP_SENDALL) && sctp_style(sk, UDP)) {
-		list_for_each_entry(asoc, &ep->asocs, asocs) {
+		list_for_each_entry_safe(asoc, tmp, &ep->asocs, asocs) {
 			err = sctp_sendmsg_check_sflags(asoc, sflags, msg,
 							msg_len);
 			if (err == 0)
-- 
cgit v1.2.3-70-g09d2


From 14d22d4d61e40623a7c5816728bfe55c322e779a Mon Sep 17 00:00:00 2001
From: Ursula Braun <ubraun@linux.ibm.com>
Date: Wed, 30 Jan 2019 18:51:00 +0100
Subject: net/smc: fix another sizeof to int comparison

Comparing an int to a size, which is unsigned, causes the int to become
unsigned, giving the wrong result. kernel_sendmsg can return a negative
error code.

Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/smc_clc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c
index 776e9dfc915d..d53fd588d1f5 100644
--- a/net/smc/smc_clc.c
+++ b/net/smc/smc_clc.c
@@ -378,7 +378,7 @@ int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info)
 	vec.iov_len = sizeof(struct smc_clc_msg_decline);
 	len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1,
 			     sizeof(struct smc_clc_msg_decline));
-	if (len < sizeof(struct smc_clc_msg_decline))
+	if (len < 0 || len < sizeof(struct smc_clc_msg_decline))
 		len = -EPROTO;
 	return len > 0 ? 0 : len;
 }
-- 
cgit v1.2.3-70-g09d2


From ca8dc1334a71d6081b09e18d55198a27e28fd44b Mon Sep 17 00:00:00 2001
From: Hans Wippel <hwippel@linux.ibm.com>
Date: Wed, 30 Jan 2019 18:51:01 +0100
Subject: net/smc: allow 16 byte pnetids in netlink policy

Currently, users can only send pnetids with a maximum length of 15 bytes
over the SMC netlink interface although the maximum pnetid length is 16
bytes. This patch changes the SMC netlink policy to accept 16 byte
pnetids.

Signed-off-by: Hans Wippel <hwippel@linux.ibm.com>
Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/smc_pnet.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c
index 7cb3e4f07c10..632c3109dee5 100644
--- a/net/smc/smc_pnet.c
+++ b/net/smc/smc_pnet.c
@@ -27,7 +27,7 @@
 static struct nla_policy smc_pnet_policy[SMC_PNETID_MAX + 1] = {
 	[SMC_PNETID_NAME] = {
 		.type = NLA_NUL_STRING,
-		.len = SMC_MAX_PNETID_LEN - 1
+		.len = SMC_MAX_PNETID_LEN
 	},
 	[SMC_PNETID_ETHNAME] = {
 		.type = NLA_NUL_STRING,
-- 
cgit v1.2.3-70-g09d2


From 77f838ace755d2f466536c44dac6c856f62cd901 Mon Sep 17 00:00:00 2001
From: Karsten Graul <kgraul@linux.ibm.com>
Date: Wed, 30 Jan 2019 18:51:02 +0100
Subject: net/smc: prevent races between smc_lgr_terminate() and
 smc_conn_free()

To prevent races between smc_lgr_terminate() and smc_conn_free() add an
extra check of the lgr field before accessing it, and cancel a delayed
free_work when a new smc connection is created.
This fixes the problem that free_work cleared the lgr variable but
smc_lgr_terminate() or smc_conn_free() still access it in parallel.

Signed-off-by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/smc_core.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'net')

diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index 35c1cdc93e1c..097c798983ca 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -128,6 +128,8 @@ static void smc_lgr_unregister_conn(struct smc_connection *conn)
 {
 	struct smc_link_group *lgr = conn->lgr;
 
+	if (!lgr)
+		return;
 	write_lock_bh(&lgr->conns_lock);
 	if (conn->alert_token_local) {
 		__smc_lgr_unregister_conn(conn);
@@ -628,6 +630,8 @@ int smc_conn_create(struct smc_sock *smc, bool is_smcd, int srv_first_contact,
 			local_contact = SMC_REUSE_CONTACT;
 			conn->lgr = lgr;
 			smc_lgr_register_conn(conn); /* add smc conn to lgr */
+			if (delayed_work_pending(&lgr->free_work))
+				cancel_delayed_work(&lgr->free_work);
 			write_unlock_bh(&lgr->conns_lock);
 			break;
 		}
-- 
cgit v1.2.3-70-g09d2


From 6889b36da78a21a312d8b462c1fa25a03c2ff192 Mon Sep 17 00:00:00 2001
From: Karsten Graul <kgraul@linux.ibm.com>
Date: Wed, 30 Jan 2019 18:51:03 +0100
Subject: net/smc: don't wait for send buffer space when data was already sent

When there is no more send buffer space and at least 1 byte was already
sent then return to user space. The wait is only done when no data was
sent by the sendmsg() call.
This fixes smc_tx_sendmsg() which tried to always send all user data and
started to wait for free send buffer space when needed. During this wait
the user space program was blocked in the sendmsg() call and hence not
able to receive incoming data. When both sides were in such a situation
then the connection stalled forever.

Signed-off-by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/smc_tx.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c
index d8366ed51757..f99951f3f7fd 100644
--- a/net/smc/smc_tx.c
+++ b/net/smc/smc_tx.c
@@ -165,12 +165,11 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len)
 			conn->local_tx_ctrl.prod_flags.urg_data_pending = 1;
 
 		if (!atomic_read(&conn->sndbuf_space) || conn->urg_tx_pend) {
+			if (send_done)
+				return send_done;
 			rc = smc_tx_wait(smc, msg->msg_flags);
-			if (rc) {
-				if (send_done)
-					return send_done;
+			if (rc)
 				goto out_err;
-			}
 			continue;
 		}
 
-- 
cgit v1.2.3-70-g09d2


From 51c5aba3b672c4285fca052817f34b22dc79dda7 Mon Sep 17 00:00:00 2001
From: Karsten Graul <kgraul@linux.ibm.com>
Date: Wed, 30 Jan 2019 18:51:04 +0100
Subject: net/smc: recvmsg and splice_read should return 0 after shutdown

When a socket was connected and is now shut down for read, return 0 to
indicate end of data in recvmsg and splice_read (like TCP) and do not
return ENOTCONN. This behavior is required by the socket api.

Signed-off-by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/af_smc.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index c4e56602e0c6..b04a813fc865 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -1505,6 +1505,11 @@ static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
 
 	smc = smc_sk(sk);
 	lock_sock(sk);
+	if (sk->sk_state == SMC_CLOSED && (sk->sk_shutdown & RCV_SHUTDOWN)) {
+		/* socket was connected before, no more data to read */
+		rc = 0;
+		goto out;
+	}
 	if ((sk->sk_state == SMC_INIT) ||
 	    (sk->sk_state == SMC_LISTEN) ||
 	    (sk->sk_state == SMC_CLOSED))
@@ -1840,7 +1845,11 @@ static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos,
 
 	smc = smc_sk(sk);
 	lock_sock(sk);
-
+	if (sk->sk_state == SMC_CLOSED && (sk->sk_shutdown & RCV_SHUTDOWN)) {
+		/* socket was connected before, no more data to read */
+		rc = 0;
+		goto out;
+	}
 	if (sk->sk_state == SMC_INIT ||
 	    sk->sk_state == SMC_LISTEN ||
 	    sk->sk_state == SMC_CLOSED)
-- 
cgit v1.2.3-70-g09d2


From 33f3fcc290671590821ff3c0c9396db1ec9b7d4c Mon Sep 17 00:00:00 2001
From: Karsten Graul <kgraul@linux.ibm.com>
Date: Wed, 30 Jan 2019 18:51:05 +0100
Subject: net/smc: do not wait under send_lock

smc_cdc_get_free_slot() might wait for free transfer buffers when using
SMC-R. This wait should not be done under the send_lock, which is a
spin_lock. This fixes a cpu loop in parallel threads waiting for the
send_lock.

Signed-off-by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/smc_tx.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c
index f99951f3f7fd..36af3de731b9 100644
--- a/net/smc/smc_tx.c
+++ b/net/smc/smc_tx.c
@@ -488,25 +488,23 @@ static int smcr_tx_sndbuf_nonempty(struct smc_connection *conn)
 	struct smc_wr_buf *wr_buf;
 	int rc;
 
-	spin_lock_bh(&conn->send_lock);
 	rc = smc_cdc_get_free_slot(conn, &wr_buf, &pend);
 	if (rc < 0) {
 		if (rc == -EBUSY) {
 			struct smc_sock *smc =
 				container_of(conn, struct smc_sock, conn);
 
-			if (smc->sk.sk_err == ECONNABORTED) {
-				rc = sock_error(&smc->sk);
-				goto out_unlock;
-			}
+			if (smc->sk.sk_err == ECONNABORTED)
+				return sock_error(&smc->sk);
 			rc = 0;
 			if (conn->alert_token_local) /* connection healthy */
 				mod_delayed_work(system_wq, &conn->tx_work,
 						 SMC_TX_WORK_DELAY);
 		}
-		goto out_unlock;
+		return rc;
 	}
 
+	spin_lock_bh(&conn->send_lock);
 	if (!conn->local_tx_ctrl.prod_flags.urg_data_present) {
 		rc = smc_tx_rdma_writes(conn);
 		if (rc) {
-- 
cgit v1.2.3-70-g09d2


From 2dee25af42f99b476d5916aeac5c994e4dcc910b Mon Sep 17 00:00:00 2001
From: Karsten Graul <kgraul@linux.ibm.com>
Date: Wed, 30 Jan 2019 18:51:06 +0100
Subject: net/smc: call smc_cdc_msg_send() under send_lock

Call smc_cdc_msg_send() under the connection send_lock to make sure all
send operations for one connection are serialized.

Signed-off-by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/smc_cdc.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c
index db83332ac1c8..1c5333d494e9 100644
--- a/net/smc/smc_cdc.c
+++ b/net/smc/smc_cdc.c
@@ -125,7 +125,10 @@ static int smcr_cdc_get_slot_and_msg_send(struct smc_connection *conn)
 	if (rc)
 		return rc;
 
-	return smc_cdc_msg_send(conn, wr_buf, pend);
+	spin_lock_bh(&conn->send_lock);
+	rc = smc_cdc_msg_send(conn, wr_buf, pend);
+	spin_unlock_bh(&conn->send_lock);
+	return rc;
 }
 
 int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn)
-- 
cgit v1.2.3-70-g09d2


From e5f3aa04dbfd92154f21edfeb7a1676a45918928 Mon Sep 17 00:00:00 2001
From: Karsten Graul <kgraul@linux.ibm.com>
Date: Wed, 30 Jan 2019 18:51:07 +0100
Subject: net/smc: use device link provided in qp_context

The device field of the IB event structure does not always point to the
SMC IB device. Load the pointer from the qp_context which is always
provided to smc_ib_qp_event_handler() in the priv field. And for qp
events the affected port is given in the qp structure of the ibevent,
derive it from there.

Signed-off-by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/smc_ib.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c
index e519ef29c0ff..76487a16934e 100644
--- a/net/smc/smc_ib.c
+++ b/net/smc/smc_ib.c
@@ -289,8 +289,8 @@ int smc_ib_create_protection_domain(struct smc_link *lnk)
 
 static void smc_ib_qp_event_handler(struct ib_event *ibevent, void *priv)
 {
-	struct smc_ib_device *smcibdev =
-		(struct smc_ib_device *)ibevent->device;
+	struct smc_link *lnk = (struct smc_link *)priv;
+	struct smc_ib_device *smcibdev = lnk->smcibdev;
 	u8 port_idx;
 
 	switch (ibevent->event) {
@@ -298,7 +298,7 @@ static void smc_ib_qp_event_handler(struct ib_event *ibevent, void *priv)
 	case IB_EVENT_GID_CHANGE:
 	case IB_EVENT_PORT_ERR:
 	case IB_EVENT_QP_ACCESS_ERR:
-		port_idx = ibevent->element.port_num - 1;
+		port_idx = ibevent->element.qp->port - 1;
 		set_bit(port_idx, &smcibdev->port_event_mask);
 		schedule_work(&smcibdev->port_event_work);
 		break;
-- 
cgit v1.2.3-70-g09d2


From 46ad02229d6b4ee276eb295ca08f039993f323c8 Mon Sep 17 00:00:00 2001
From: Karsten Graul <kgraul@linux.ibm.com>
Date: Wed, 30 Jan 2019 18:51:08 +0100
Subject: net/smc: fix use of variable in cleared area

Do not use pend->idx as index for the arrays because its value is
located in the cleared area. Use the existing local variable instead.
Without this fix the wrong area might be cleared.

Signed-off-by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/smc_wr.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
index c2694750a6a8..1dc88c32d6bb 100644
--- a/net/smc/smc_wr.c
+++ b/net/smc/smc_wr.c
@@ -218,10 +218,10 @@ int smc_wr_tx_put_slot(struct smc_link *link,
 		u32 idx = pend->idx;
 
 		/* clear the full struct smc_wr_tx_pend including .priv */
-		memset(&link->wr_tx_pends[pend->idx], 0,
-		       sizeof(link->wr_tx_pends[pend->idx]));
-		memset(&link->wr_tx_bufs[pend->idx], 0,
-		       sizeof(link->wr_tx_bufs[pend->idx]));
+		memset(&link->wr_tx_pends[idx], 0,
+		       sizeof(link->wr_tx_pends[idx]));
+		memset(&link->wr_tx_bufs[idx], 0,
+		       sizeof(link->wr_tx_bufs[idx]));
 		test_and_clear_bit(idx, link->wr_tx_mask);
 		return 1;
 	}
-- 
cgit v1.2.3-70-g09d2


From 9b1f19d810e92d6cdc68455fbc22d9f961a58ce1 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 30 Jan 2019 11:39:41 -0800
Subject: dccp: fool proof ccid_hc_[rt]x_parse_options()

Similarly to commit 276bdb82dedb ("dccp: check ccid before dereferencing")
it is wise to test for a NULL ccid.

kasan: CONFIG_KASAN_INLINE enabled
kasan: GPF could be caused by NULL-ptr deref or user memory access
general protection fault: 0000 [#1] PREEMPT SMP KASAN
CPU: 1 PID: 16 Comm: ksoftirqd/1 Not tainted 5.0.0-rc3+ #37
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
RIP: 0010:ccid_hc_tx_parse_options net/dccp/ccid.h:205 [inline]
RIP: 0010:dccp_parse_options+0x8d9/0x12b0 net/dccp/options.c:233
Code: c5 0f b6 75 b3 80 38 00 0f 85 d6 08 00 00 48 b9 00 00 00 00 00 fc ff df 48 8b 45 b8 4c 8b b8 f8 07 00 00 4c 89 f8 48 c1 e8 03 <80> 3c 08 00 0f 85 95 08 00 00 48 b8 00 00 00 00 00 fc ff df 4d 8b
kobject: 'loop5' (0000000080f78fc1): kobject_uevent_env
RSP: 0018:ffff8880a94df0b8 EFLAGS: 00010246
RAX: 0000000000000000 RBX: ffff8880858ac723 RCX: dffffc0000000000
RDX: 0000000000000100 RSI: 0000000000000007 RDI: 0000000000000001
RBP: ffff8880a94df140 R08: 0000000000000001 R09: ffff888061b83a80
R10: ffffed100c370752 R11: ffff888061b83a97 R12: 0000000000000026
R13: 0000000000000001 R14: 0000000000000000 R15: 0000000000000000
FS:  0000000000000000(0000) GS:ffff8880ae700000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f0defa33518 CR3: 000000008db5e000 CR4: 00000000001406e0
kobject: 'loop5' (0000000080f78fc1): fill_kobj_path: path = '/devices/virtual/block/loop5'
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
 dccp_rcv_state_process+0x2b6/0x1af6 net/dccp/input.c:654
 dccp_v4_do_rcv+0x100/0x190 net/dccp/ipv4.c:688
 sk_backlog_rcv include/net/sock.h:936 [inline]
 __sk_receive_skb+0x3a9/0xea0 net/core/sock.c:473
 dccp_v4_rcv+0x10cb/0x1f80 net/dccp/ipv4.c:880
 ip_protocol_deliver_rcu+0xb6/0xa20 net/ipv4/ip_input.c:208
 ip_local_deliver_finish+0x23b/0x390 net/ipv4/ip_input.c:234
 NF_HOOK include/linux/netfilter.h:289 [inline]
 NF_HOOK include/linux/netfilter.h:283 [inline]
 ip_local_deliver+0x1f0/0x740 net/ipv4/ip_input.c:255
 dst_input include/net/dst.h:450 [inline]
 ip_rcv_finish+0x1f4/0x2f0 net/ipv4/ip_input.c:414
 NF_HOOK include/linux/netfilter.h:289 [inline]
 NF_HOOK include/linux/netfilter.h:283 [inline]
 ip_rcv+0xed/0x620 net/ipv4/ip_input.c:524
 __netif_receive_skb_one_core+0x160/0x210 net/core/dev.c:4973
 __netif_receive_skb+0x2c/0x1c0 net/core/dev.c:5083
 process_backlog+0x206/0x750 net/core/dev.c:5923
 napi_poll net/core/dev.c:6346 [inline]
 net_rx_action+0x76d/0x1930 net/core/dev.c:6412
 __do_softirq+0x30b/0xb11 kernel/softirq.c:292
 run_ksoftirqd kernel/softirq.c:654 [inline]
 run_ksoftirqd+0x8e/0x110 kernel/softirq.c:646
 smpboot_thread_fn+0x6ab/0xa10 kernel/smpboot.c:164
 kthread+0x357/0x430 kernel/kthread.c:246
 ret_from_fork+0x3a/0x50 arch/x86/entry/entry_64.S:352
Modules linked in:
---[ end trace 58a0ba03bea2c376 ]---
RIP: 0010:ccid_hc_tx_parse_options net/dccp/ccid.h:205 [inline]
RIP: 0010:dccp_parse_options+0x8d9/0x12b0 net/dccp/options.c:233
Code: c5 0f b6 75 b3 80 38 00 0f 85 d6 08 00 00 48 b9 00 00 00 00 00 fc ff df 48 8b 45 b8 4c 8b b8 f8 07 00 00 4c 89 f8 48 c1 e8 03 <80> 3c 08 00 0f 85 95 08 00 00 48 b8 00 00 00 00 00 fc ff df 4d 8b
RSP: 0018:ffff8880a94df0b8 EFLAGS: 00010246
RAX: 0000000000000000 RBX: ffff8880858ac723 RCX: dffffc0000000000
RDX: 0000000000000100 RSI: 0000000000000007 RDI: 0000000000000001
RBP: ffff8880a94df140 R08: 0000000000000001 R09: ffff888061b83a80
R10: ffffed100c370752 R11: ffff888061b83a97 R12: 0000000000000026
R13: 0000000000000001 R14: 0000000000000000 R15: 0000000000000000
FS:  0000000000000000(0000) GS:ffff8880ae700000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f0defa33518 CR3: 0000000009871000 CR4: 00000000001406e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Cc: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dccp/ccid.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h
index 6eb837a47b5c..baaaeb2b2c42 100644
--- a/net/dccp/ccid.h
+++ b/net/dccp/ccid.h
@@ -202,7 +202,7 @@ static inline void ccid_hc_tx_packet_recv(struct ccid *ccid, struct sock *sk,
 static inline int ccid_hc_tx_parse_options(struct ccid *ccid, struct sock *sk,
 					   u8 pkt, u8 opt, u8 *val, u8 len)
 {
-	if (ccid->ccid_ops->ccid_hc_tx_parse_options == NULL)
+	if (!ccid || !ccid->ccid_ops->ccid_hc_tx_parse_options)
 		return 0;
 	return ccid->ccid_ops->ccid_hc_tx_parse_options(sk, pkt, opt, val, len);
 }
@@ -214,7 +214,7 @@ static inline int ccid_hc_tx_parse_options(struct ccid *ccid, struct sock *sk,
 static inline int ccid_hc_rx_parse_options(struct ccid *ccid, struct sock *sk,
 					   u8 pkt, u8 opt, u8 *val, u8 len)
 {
-	if (ccid->ccid_ops->ccid_hc_rx_parse_options == NULL)
+	if (!ccid || !ccid->ccid_ops->ccid_hc_rx_parse_options)
 		return 0;
 	return ccid->ccid_ops->ccid_hc_rx_parse_options(sk, pkt, opt, val, len);
 }
-- 
cgit v1.2.3-70-g09d2


From 22b5c0b63f32568e130fa2df4ba23efce3eb495b Mon Sep 17 00:00:00 2001
From: Stefano Garzarella <sgarzare@redhat.com>
Date: Fri, 1 Feb 2019 12:42:06 +0100
Subject: vsock/virtio: fix kernel panic after device hot-unplug

virtio_vsock_remove() invokes the vsock_core_exit() also if there
are opened sockets for the AF_VSOCK protocol family. In this way
the vsock "transport" pointer is set to NULL, triggering the
kernel panic at the first socket activity.

This patch move the vsock_core_init()/vsock_core_exit() in the
virtio_vsock respectively in module_init and module_exit functions,
that cannot be invoked until there are open sockets.

Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1609699
Reported-by: Yan Fu <yafu@redhat.com>
Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/vmw_vsock/virtio_transport.c | 26 ++++++++++++++++++--------
 1 file changed, 18 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index 5d3cce9e8744..9dae54698737 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -75,6 +75,9 @@ static u32 virtio_transport_get_local_cid(void)
 {
 	struct virtio_vsock *vsock = virtio_vsock_get();
 
+	if (!vsock)
+		return VMADDR_CID_ANY;
+
 	return vsock->guest_cid;
 }
 
@@ -584,10 +587,6 @@ static int virtio_vsock_probe(struct virtio_device *vdev)
 
 	virtio_vsock_update_guest_cid(vsock);
 
-	ret = vsock_core_init(&virtio_transport.transport);
-	if (ret < 0)
-		goto out_vqs;
-
 	vsock->rx_buf_nr = 0;
 	vsock->rx_buf_max_nr = 0;
 	atomic_set(&vsock->queued_replies, 0);
@@ -618,8 +617,6 @@ static int virtio_vsock_probe(struct virtio_device *vdev)
 	mutex_unlock(&the_virtio_vsock_mutex);
 	return 0;
 
-out_vqs:
-	vsock->vdev->config->del_vqs(vsock->vdev);
 out:
 	kfree(vsock);
 	mutex_unlock(&the_virtio_vsock_mutex);
@@ -669,7 +666,6 @@ static void virtio_vsock_remove(struct virtio_device *vdev)
 
 	mutex_lock(&the_virtio_vsock_mutex);
 	the_virtio_vsock = NULL;
-	vsock_core_exit();
 	mutex_unlock(&the_virtio_vsock_mutex);
 
 	vdev->config->del_vqs(vdev);
@@ -702,14 +698,28 @@ static int __init virtio_vsock_init(void)
 	virtio_vsock_workqueue = alloc_workqueue("virtio_vsock", 0, 0);
 	if (!virtio_vsock_workqueue)
 		return -ENOMEM;
+
 	ret = register_virtio_driver(&virtio_vsock_driver);
 	if (ret)
-		destroy_workqueue(virtio_vsock_workqueue);
+		goto out_wq;
+
+	ret = vsock_core_init(&virtio_transport.transport);
+	if (ret)
+		goto out_vdr;
+
+	return 0;
+
+out_vdr:
+	unregister_virtio_driver(&virtio_vsock_driver);
+out_wq:
+	destroy_workqueue(virtio_vsock_workqueue);
 	return ret;
+
 }
 
 static void __exit virtio_vsock_exit(void)
 {
+	vsock_core_exit();
 	unregister_virtio_driver(&virtio_vsock_driver);
 	destroy_workqueue(virtio_vsock_workqueue);
 }
-- 
cgit v1.2.3-70-g09d2


From 85965487abc540368393a15491e6e7fcd230039d Mon Sep 17 00:00:00 2001
From: Stefano Garzarella <sgarzare@redhat.com>
Date: Fri, 1 Feb 2019 12:42:07 +0100
Subject: vsock/virtio: reset connected sockets on device removal

When the virtio transport device disappear, we should reset all
connected sockets in order to inform the users.

Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/vmw_vsock/virtio_transport.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net')

diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index 9dae54698737..15eb5d3d4750 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -634,6 +634,9 @@ static void virtio_vsock_remove(struct virtio_device *vdev)
 	flush_work(&vsock->event_work);
 	flush_work(&vsock->send_pkt_work);
 
+	/* Reset all connected sockets when the device disappear */
+	vsock_for_each_connected_socket(virtio_vsock_reset_sock);
+
 	vdev->config->reset(vdev);
 
 	mutex_lock(&vsock->rx_lock);
-- 
cgit v1.2.3-70-g09d2


From cfe4bd7a257f6d6f81d3458d8c9d9ec4957539e6 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Mon, 4 Feb 2019 03:27:58 +0800
Subject: sctp: check and update stream->out_curr when allocating stream_out

Now when using stream reconfig to add out streams, stream->out
will get re-allocated, and all old streams' information will
be copied to the new ones and the old ones will be freed.

So without stream->out_curr updated, next time when trying to
send from stream->out_curr stream, a panic would be caused.

This patch is to check and update stream->out_curr when
allocating stream_out.

v1->v2:
  - define fa_index() to get elem index from stream->out_curr.
v2->v3:
  - repost with no change.

Fixes: 5bbbbe32a431 ("sctp: introduce stream scheduler foundations")
Reported-by: Ying Xu <yinxu@redhat.com>
Reported-by: syzbot+e33a3a138267ca119c7d@syzkaller.appspotmail.com
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/stream.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

(limited to 'net')

diff --git a/net/sctp/stream.c b/net/sctp/stream.c
index 80e0ae5534ec..f24633114dfd 100644
--- a/net/sctp/stream.c
+++ b/net/sctp/stream.c
@@ -84,6 +84,19 @@ static void fa_zero(struct flex_array *fa, size_t index, size_t count)
 	}
 }
 
+static size_t fa_index(struct flex_array *fa, void *elem, size_t count)
+{
+	size_t index = 0;
+
+	while (count--) {
+		if (elem == flex_array_get(fa, index))
+			break;
+		index++;
+	}
+
+	return index;
+}
+
 /* Migrates chunks from stream queues to new stream queues if needed,
  * but not across associations. Also, removes those chunks to streams
  * higher than the new max.
@@ -147,6 +160,13 @@ static int sctp_stream_alloc_out(struct sctp_stream *stream, __u16 outcnt,
 
 	if (stream->out) {
 		fa_copy(out, stream->out, 0, min(outcnt, stream->outcnt));
+		if (stream->out_curr) {
+			size_t index = fa_index(stream->out, stream->out_curr,
+						stream->outcnt);
+
+			BUG_ON(index == stream->outcnt);
+			stream->out_curr = flex_array_get(out, index);
+		}
 		fa_free(stream->out);
 	}
 
-- 
cgit v1.2.3-70-g09d2


From 4e35c1cb9460240e983a01745b5f29fe3a4d8e39 Mon Sep 17 00:00:00 2001
From: Martynas Pumputis <martynas@weave.works>
Date: Tue, 29 Jan 2019 15:51:42 +0100
Subject: netfilter: nf_nat: skip nat clash resolution for same-origin entries

It is possible that two concurrent packets originating from the same
socket of a connection-less protocol (e.g. UDP) can end up having
different IP_CT_DIR_REPLY tuples which results in one of the packets
being dropped.

To illustrate this, consider the following simplified scenario:

1. Packet A and B are sent at the same time from two different threads
   by same UDP socket.  No matching conntrack entry exists yet.
   Both packets cause allocation of a new conntrack entry.
2. get_unique_tuple gets called for A.  No clashing entry found.
   conntrack entry for A is added to main conntrack table.
3. get_unique_tuple is called for B and will find that the reply
   tuple of B is already taken by A.
   It will allocate a new UDP source port for B to resolve the clash.
4. conntrack entry for B cannot be added to main conntrack table
   because its ORIGINAL direction is clashing with A and the REPLY
   directions of A and B are not the same anymore due to UDP source
   port reallocation done in step 3.

This patch modifies nf_conntrack_tuple_taken so it doesn't consider
colliding reply tuples if the IP_CT_DIR_ORIGINAL tuples are equal.

[ Florian: simplify patch to not use .allow_clash setting
  and always ignore identical flows ]

Signed-off-by: Martynas Pumputis <martynas@weave.works>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_conntrack_core.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'net')

diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 741b533148ba..db4d46332e86 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1007,6 +1007,22 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
 		}
 
 		if (nf_ct_key_equal(h, tuple, zone, net)) {
+			/* Tuple is taken already, so caller will need to find
+			 * a new source port to use.
+			 *
+			 * Only exception:
+			 * If the *original tuples* are identical, then both
+			 * conntracks refer to the same flow.
+			 * This is a rare situation, it can occur e.g. when
+			 * more than one UDP packet is sent from same socket
+			 * in different threads.
+			 *
+			 * Let nf_ct_resolve_clash() deal with this later.
+			 */
+			if (nf_ct_tuple_equal(&ignored_conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
+					      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple))
+				continue;
+
 			NF_CT_STAT_INC_ATOMIC(net, found);
 			rcu_read_unlock();
 			return 1;
-- 
cgit v1.2.3-70-g09d2


From f6ac8585897684374a19863fff21186a05805286 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Sat, 2 Feb 2019 10:49:13 +0100
Subject: netfilter: nf_tables: unbind set in rule from commit path

Anonymous sets that are bound to rules from the same transaction trigger
a kernel splat from the abort path due to double set list removal and
double free.

This patch updates the logic to search for the transaction that is
responsible for creating the set and disable the set list removal and
release, given the rule is now responsible for this. Lookup is reverse
since the transaction that adds the set is likely to be at the tail of
the list.

Moreover, this patch adds the unbind step to deliver the event from the
commit path.  This should not be done from the worker thread, since we
have no guarantees of in-order delivery to the listener.

This patch removes the assumption that both activate and deactivate
callbacks need to be provided.

Fixes: cd5125d8f518 ("netfilter: nf_tables: split set destruction in deactivate and destroy phase")
Reported-by: Mikhail Morfikov <mmorfikov@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h | 17 ++++++--
 net/netfilter/nf_tables_api.c     | 85 +++++++++++++++++++--------------------
 net/netfilter/nft_compat.c        |  6 ++-
 net/netfilter/nft_dynset.c        | 18 ++++-----
 net/netfilter/nft_immediate.c     |  6 ++-
 net/netfilter/nft_lookup.c        | 18 ++++-----
 net/netfilter/nft_objref.c        | 18 ++++-----
 7 files changed, 85 insertions(+), 83 deletions(-)

(limited to 'net')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 841835a387e1..b4984bbbe157 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -469,9 +469,7 @@ struct nft_set_binding {
 int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set,
 		       struct nft_set_binding *binding);
 void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set,
-			  struct nft_set_binding *binding);
-void nf_tables_rebind_set(const struct nft_ctx *ctx, struct nft_set *set,
-			  struct nft_set_binding *binding);
+			  struct nft_set_binding *binding, bool commit);
 void nf_tables_destroy_set(const struct nft_ctx *ctx, struct nft_set *set);
 
 /**
@@ -721,6 +719,13 @@ struct nft_expr_type {
 #define NFT_EXPR_STATEFUL		0x1
 #define NFT_EXPR_GC			0x2
 
+enum nft_trans_phase {
+	NFT_TRANS_PREPARE,
+	NFT_TRANS_ABORT,
+	NFT_TRANS_COMMIT,
+	NFT_TRANS_RELEASE
+};
+
 /**
  *	struct nft_expr_ops - nf_tables expression operations
  *
@@ -750,7 +755,8 @@ struct nft_expr_ops {
 	void				(*activate)(const struct nft_ctx *ctx,
 						    const struct nft_expr *expr);
 	void				(*deactivate)(const struct nft_ctx *ctx,
-						      const struct nft_expr *expr);
+						      const struct nft_expr *expr,
+						      enum nft_trans_phase phase);
 	void				(*destroy)(const struct nft_ctx *ctx,
 						   const struct nft_expr *expr);
 	void				(*destroy_clone)(const struct nft_ctx *ctx,
@@ -1323,12 +1329,15 @@ struct nft_trans_rule {
 struct nft_trans_set {
 	struct nft_set			*set;
 	u32				set_id;
+	bool				bound;
 };
 
 #define nft_trans_set(trans)	\
 	(((struct nft_trans_set *)trans->data)->set)
 #define nft_trans_set_id(trans)	\
 	(((struct nft_trans_set *)trans->data)->set_id)
+#define nft_trans_set_bound(trans)	\
+	(((struct nft_trans_set *)trans->data)->bound)
 
 struct nft_trans_chain {
 	bool				update;
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index fb07f6cfc719..5a92f23f179f 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -116,6 +116,23 @@ static void nft_trans_destroy(struct nft_trans *trans)
 	kfree(trans);
 }
 
+static void nft_set_trans_bind(const struct nft_ctx *ctx, struct nft_set *set)
+{
+	struct net *net = ctx->net;
+	struct nft_trans *trans;
+
+	if (!nft_set_is_anonymous(set))
+		return;
+
+	list_for_each_entry_reverse(trans, &net->nft.commit_list, list) {
+		if (trans->msg_type == NFT_MSG_NEWSET &&
+		    nft_trans_set(trans) == set) {
+			nft_trans_set_bound(trans) = true;
+			break;
+		}
+	}
+}
+
 static int nf_tables_register_hook(struct net *net,
 				   const struct nft_table *table,
 				   struct nft_chain *chain)
@@ -211,18 +228,6 @@ static int nft_delchain(struct nft_ctx *ctx)
 	return err;
 }
 
-/* either expr ops provide both activate/deactivate, or neither */
-static bool nft_expr_check_ops(const struct nft_expr_ops *ops)
-{
-	if (!ops)
-		return true;
-
-	if (WARN_ON_ONCE((!ops->activate ^ !ops->deactivate)))
-		return false;
-
-	return true;
-}
-
 static void nft_rule_expr_activate(const struct nft_ctx *ctx,
 				   struct nft_rule *rule)
 {
@@ -238,14 +243,15 @@ static void nft_rule_expr_activate(const struct nft_ctx *ctx,
 }
 
 static void nft_rule_expr_deactivate(const struct nft_ctx *ctx,
-				     struct nft_rule *rule)
+				     struct nft_rule *rule,
+				     enum nft_trans_phase phase)
 {
 	struct nft_expr *expr;
 
 	expr = nft_expr_first(rule);
 	while (expr != nft_expr_last(rule) && expr->ops) {
 		if (expr->ops->deactivate)
-			expr->ops->deactivate(ctx, expr);
+			expr->ops->deactivate(ctx, expr, phase);
 
 		expr = nft_expr_next(expr);
 	}
@@ -296,7 +302,7 @@ static int nft_delrule(struct nft_ctx *ctx, struct nft_rule *rule)
 		nft_trans_destroy(trans);
 		return err;
 	}
-	nft_rule_expr_deactivate(ctx, rule);
+	nft_rule_expr_deactivate(ctx, rule, NFT_TRANS_PREPARE);
 
 	return 0;
 }
@@ -1929,9 +1935,6 @@ static int nf_tables_delchain(struct net *net, struct sock *nlsk,
  */
 int nft_register_expr(struct nft_expr_type *type)
 {
-	if (!nft_expr_check_ops(type->ops))
-		return -EINVAL;
-
 	nfnl_lock(NFNL_SUBSYS_NFTABLES);
 	if (type->family == NFPROTO_UNSPEC)
 		list_add_tail_rcu(&type->list, &nf_tables_expressions);
@@ -2079,10 +2082,6 @@ static int nf_tables_expr_parse(const struct nft_ctx *ctx,
 			err = PTR_ERR(ops);
 			goto err1;
 		}
-		if (!nft_expr_check_ops(ops)) {
-			err = -EINVAL;
-			goto err1;
-		}
 	} else
 		ops = type->ops;
 
@@ -2511,7 +2510,7 @@ static void nf_tables_rule_destroy(const struct nft_ctx *ctx,
 static void nf_tables_rule_release(const struct nft_ctx *ctx,
 				   struct nft_rule *rule)
 {
-	nft_rule_expr_deactivate(ctx, rule);
+	nft_rule_expr_deactivate(ctx, rule, NFT_TRANS_RELEASE);
 	nf_tables_rule_destroy(ctx, rule);
 }
 
@@ -3708,39 +3707,30 @@ int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set,
 bind:
 	binding->chain = ctx->chain;
 	list_add_tail_rcu(&binding->list, &set->bindings);
+	nft_set_trans_bind(ctx, set);
+
 	return 0;
 }
 EXPORT_SYMBOL_GPL(nf_tables_bind_set);
 
-void nf_tables_rebind_set(const struct nft_ctx *ctx, struct nft_set *set,
-			  struct nft_set_binding *binding)
-{
-	if (list_empty(&set->bindings) && nft_set_is_anonymous(set) &&
-	    nft_is_active(ctx->net, set))
-		list_add_tail_rcu(&set->list, &ctx->table->sets);
-
-	list_add_tail_rcu(&binding->list, &set->bindings);
-}
-EXPORT_SYMBOL_GPL(nf_tables_rebind_set);
-
 void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set,
-		          struct nft_set_binding *binding)
+			  struct nft_set_binding *binding, bool event)
 {
 	list_del_rcu(&binding->list);
 
-	if (list_empty(&set->bindings) && nft_set_is_anonymous(set) &&
-	    nft_is_active(ctx->net, set))
+	if (list_empty(&set->bindings) && nft_set_is_anonymous(set)) {
 		list_del_rcu(&set->list);
+		if (event)
+			nf_tables_set_notify(ctx, set, NFT_MSG_DELSET,
+					     GFP_KERNEL);
+	}
 }
 EXPORT_SYMBOL_GPL(nf_tables_unbind_set);
 
 void nf_tables_destroy_set(const struct nft_ctx *ctx, struct nft_set *set)
 {
-	if (list_empty(&set->bindings) && nft_set_is_anonymous(set) &&
-	    nft_is_active(ctx->net, set)) {
-		nf_tables_set_notify(ctx, set, NFT_MSG_DELSET, GFP_ATOMIC);
+	if (list_empty(&set->bindings) && nft_set_is_anonymous(set))
 		nft_set_destroy(set);
-	}
 }
 EXPORT_SYMBOL_GPL(nf_tables_destroy_set);
 
@@ -6535,6 +6525,9 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
 			nf_tables_rule_notify(&trans->ctx,
 					      nft_trans_rule(trans),
 					      NFT_MSG_DELRULE);
+			nft_rule_expr_deactivate(&trans->ctx,
+						 nft_trans_rule(trans),
+						 NFT_TRANS_COMMIT);
 			break;
 		case NFT_MSG_NEWSET:
 			nft_clear(net, nft_trans_set(trans));
@@ -6621,7 +6614,8 @@ static void nf_tables_abort_release(struct nft_trans *trans)
 		nf_tables_rule_destroy(&trans->ctx, nft_trans_rule(trans));
 		break;
 	case NFT_MSG_NEWSET:
-		nft_set_destroy(nft_trans_set(trans));
+		if (!nft_trans_set_bound(trans))
+			nft_set_destroy(nft_trans_set(trans));
 		break;
 	case NFT_MSG_NEWSETELEM:
 		nft_set_elem_destroy(nft_trans_elem_set(trans),
@@ -6682,7 +6676,9 @@ static int __nf_tables_abort(struct net *net)
 		case NFT_MSG_NEWRULE:
 			trans->ctx.chain->use--;
 			list_del_rcu(&nft_trans_rule(trans)->list);
-			nft_rule_expr_deactivate(&trans->ctx, nft_trans_rule(trans));
+			nft_rule_expr_deactivate(&trans->ctx,
+						 nft_trans_rule(trans),
+						 NFT_TRANS_ABORT);
 			break;
 		case NFT_MSG_DELRULE:
 			trans->ctx.chain->use++;
@@ -6692,7 +6688,8 @@ static int __nf_tables_abort(struct net *net)
 			break;
 		case NFT_MSG_NEWSET:
 			trans->ctx.table->use--;
-			list_del_rcu(&nft_trans_set(trans)->list);
+			if (!nft_trans_set_bound(trans))
+				list_del_rcu(&nft_trans_set(trans)->list);
 			break;
 		case NFT_MSG_DELSET:
 			trans->ctx.table->use++;
diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c
index 5eb269428832..0732a2fc697c 100644
--- a/net/netfilter/nft_compat.c
+++ b/net/netfilter/nft_compat.c
@@ -587,10 +587,14 @@ static void nft_compat_activate_tg(const struct nft_ctx *ctx,
 }
 
 static void nft_compat_deactivate(const struct nft_ctx *ctx,
-				  const struct nft_expr *expr)
+				  const struct nft_expr *expr,
+				  enum nft_trans_phase phase)
 {
 	struct nft_xt *xt = container_of(expr->ops, struct nft_xt, ops);
 
+	if (phase == NFT_TRANS_COMMIT)
+		return;
+
 	if (--xt->listcnt == 0)
 		list_del_init(&xt->head);
 }
diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c
index 07d4efd3d851..f1172f99752b 100644
--- a/net/netfilter/nft_dynset.c
+++ b/net/netfilter/nft_dynset.c
@@ -235,20 +235,17 @@ err1:
 	return err;
 }
 
-static void nft_dynset_activate(const struct nft_ctx *ctx,
-				const struct nft_expr *expr)
-{
-	struct nft_dynset *priv = nft_expr_priv(expr);
-
-	nf_tables_rebind_set(ctx, priv->set, &priv->binding);
-}
-
 static void nft_dynset_deactivate(const struct nft_ctx *ctx,
-				  const struct nft_expr *expr)
+				  const struct nft_expr *expr,
+				  enum nft_trans_phase phase)
 {
 	struct nft_dynset *priv = nft_expr_priv(expr);
 
-	nf_tables_unbind_set(ctx, priv->set, &priv->binding);
+	if (phase == NFT_TRANS_PREPARE)
+		return;
+
+	nf_tables_unbind_set(ctx, priv->set, &priv->binding,
+			     phase == NFT_TRANS_COMMIT);
 }
 
 static void nft_dynset_destroy(const struct nft_ctx *ctx,
@@ -296,7 +293,6 @@ static const struct nft_expr_ops nft_dynset_ops = {
 	.eval		= nft_dynset_eval,
 	.init		= nft_dynset_init,
 	.destroy	= nft_dynset_destroy,
-	.activate	= nft_dynset_activate,
 	.deactivate	= nft_dynset_deactivate,
 	.dump		= nft_dynset_dump,
 };
diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c
index 0777a93211e2..3f6d1d2a6281 100644
--- a/net/netfilter/nft_immediate.c
+++ b/net/netfilter/nft_immediate.c
@@ -72,10 +72,14 @@ static void nft_immediate_activate(const struct nft_ctx *ctx,
 }
 
 static void nft_immediate_deactivate(const struct nft_ctx *ctx,
-				     const struct nft_expr *expr)
+				     const struct nft_expr *expr,
+				     enum nft_trans_phase phase)
 {
 	const struct nft_immediate_expr *priv = nft_expr_priv(expr);
 
+	if (phase == NFT_TRANS_COMMIT)
+		return;
+
 	return nft_data_release(&priv->data, nft_dreg_to_type(priv->dreg));
 }
 
diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c
index 227b2b15a19c..14496da5141d 100644
--- a/net/netfilter/nft_lookup.c
+++ b/net/netfilter/nft_lookup.c
@@ -121,20 +121,17 @@ static int nft_lookup_init(const struct nft_ctx *ctx,
 	return 0;
 }
 
-static void nft_lookup_activate(const struct nft_ctx *ctx,
-				const struct nft_expr *expr)
-{
-	struct nft_lookup *priv = nft_expr_priv(expr);
-
-	nf_tables_rebind_set(ctx, priv->set, &priv->binding);
-}
-
 static void nft_lookup_deactivate(const struct nft_ctx *ctx,
-				  const struct nft_expr *expr)
+				  const struct nft_expr *expr,
+				  enum nft_trans_phase phase)
 {
 	struct nft_lookup *priv = nft_expr_priv(expr);
 
-	nf_tables_unbind_set(ctx, priv->set, &priv->binding);
+	if (phase == NFT_TRANS_PREPARE)
+		return;
+
+	nf_tables_unbind_set(ctx, priv->set, &priv->binding,
+			     phase == NFT_TRANS_COMMIT);
 }
 
 static void nft_lookup_destroy(const struct nft_ctx *ctx,
@@ -225,7 +222,6 @@ static const struct nft_expr_ops nft_lookup_ops = {
 	.size		= NFT_EXPR_SIZE(sizeof(struct nft_lookup)),
 	.eval		= nft_lookup_eval,
 	.init		= nft_lookup_init,
-	.activate	= nft_lookup_activate,
 	.deactivate	= nft_lookup_deactivate,
 	.destroy	= nft_lookup_destroy,
 	.dump		= nft_lookup_dump,
diff --git a/net/netfilter/nft_objref.c b/net/netfilter/nft_objref.c
index a3185ca2a3a9..ae178e914486 100644
--- a/net/netfilter/nft_objref.c
+++ b/net/netfilter/nft_objref.c
@@ -155,20 +155,17 @@ nla_put_failure:
 	return -1;
 }
 
-static void nft_objref_map_activate(const struct nft_ctx *ctx,
-				    const struct nft_expr *expr)
-{
-	struct nft_objref_map *priv = nft_expr_priv(expr);
-
-	nf_tables_rebind_set(ctx, priv->set, &priv->binding);
-}
-
 static void nft_objref_map_deactivate(const struct nft_ctx *ctx,
-				      const struct nft_expr *expr)
+				      const struct nft_expr *expr,
+				      enum nft_trans_phase phase)
 {
 	struct nft_objref_map *priv = nft_expr_priv(expr);
 
-	nf_tables_unbind_set(ctx, priv->set, &priv->binding);
+	if (phase == NFT_TRANS_PREPARE)
+		return;
+
+	nf_tables_unbind_set(ctx, priv->set, &priv->binding,
+			     phase == NFT_TRANS_COMMIT);
 }
 
 static void nft_objref_map_destroy(const struct nft_ctx *ctx,
@@ -185,7 +182,6 @@ static const struct nft_expr_ops nft_objref_map_ops = {
 	.size		= NFT_EXPR_SIZE(sizeof(struct nft_objref_map)),
 	.eval		= nft_objref_map_eval,
 	.init		= nft_objref_map_init,
-	.activate	= nft_objref_map_activate,
 	.deactivate	= nft_objref_map_deactivate,
 	.destroy	= nft_objref_map_destroy,
 	.dump		= nft_objref_map_dump,
-- 
cgit v1.2.3-70-g09d2


From ad6f317f720f4a3121756c23831a43dda9b095e5 Mon Sep 17 00:00:00 2001
From: Ursula Braun <ubraun@linux.ibm.com>
Date: Mon, 4 Feb 2019 13:44:44 +0100
Subject: net/smc: preallocated memory for rdma work requests

The work requests for rdma writes are built in local variables within
function smc_tx_rdma_write(). This violates the rule that the work
request storage has to stay till the work request is confirmed by
a completion queue response.
This patch introduces preallocated memory for these work requests.
The storage is allocated, once a link (and thus a queue pair) is
established.

Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/smc_cdc.c  | 11 +++--------
 net/smc/smc_cdc.h  |  8 +++++++-
 net/smc/smc_core.h | 20 ++++++++++++++++++++
 net/smc/smc_llc.c  |  3 ++-
 net/smc/smc_tx.c   | 44 ++++++++++++++++++++++----------------------
 net/smc/smc_wr.c   | 38 +++++++++++++++++++++++++++++++++++++-
 net/smc/smc_wr.h   |  1 +
 7 files changed, 92 insertions(+), 33 deletions(-)

(limited to 'net')

diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c
index 1c5333d494e9..b80ef104ab4e 100644
--- a/net/smc/smc_cdc.c
+++ b/net/smc/smc_cdc.c
@@ -21,13 +21,6 @@
 
 /********************************** send *************************************/
 
-struct smc_cdc_tx_pend {
-	struct smc_connection	*conn;		/* socket connection */
-	union smc_host_cursor	cursor;	/* tx sndbuf cursor sent */
-	union smc_host_cursor	p_cursor;	/* rx RMBE cursor produced */
-	u16			ctrl_seq;	/* conn. tx sequence # */
-};
-
 /* handler for send/transmission completion of a CDC msg */
 static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd,
 			       struct smc_link *link,
@@ -61,12 +54,14 @@ static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd,
 
 int smc_cdc_get_free_slot(struct smc_connection *conn,
 			  struct smc_wr_buf **wr_buf,
+			  struct smc_rdma_wr **wr_rdma_buf,
 			  struct smc_cdc_tx_pend **pend)
 {
 	struct smc_link *link = &conn->lgr->lnk[SMC_SINGLE_LINK];
 	int rc;
 
 	rc = smc_wr_tx_get_free_slot(link, smc_cdc_tx_handler, wr_buf,
+				     wr_rdma_buf,
 				     (struct smc_wr_tx_pend_priv **)pend);
 	if (!conn->alert_token_local)
 		/* abnormal termination */
@@ -121,7 +116,7 @@ static int smcr_cdc_get_slot_and_msg_send(struct smc_connection *conn)
 	struct smc_wr_buf *wr_buf;
 	int rc;
 
-	rc = smc_cdc_get_free_slot(conn, &wr_buf, &pend);
+	rc = smc_cdc_get_free_slot(conn, &wr_buf, NULL, &pend);
 	if (rc)
 		return rc;
 
diff --git a/net/smc/smc_cdc.h b/net/smc/smc_cdc.h
index b5bfe38c7f9b..2148da7a26b1 100644
--- a/net/smc/smc_cdc.h
+++ b/net/smc/smc_cdc.h
@@ -270,10 +270,16 @@ static inline void smc_cdc_msg_to_host(struct smc_host_cdc_msg *local,
 		smcr_cdc_msg_to_host(local, peer, conn);
 }
 
-struct smc_cdc_tx_pend;
+struct smc_cdc_tx_pend {
+	struct smc_connection	*conn;		/* socket connection */
+	union smc_host_cursor	cursor;		/* tx sndbuf cursor sent */
+	union smc_host_cursor	p_cursor;	/* rx RMBE cursor produced */
+	u16			ctrl_seq;	/* conn. tx sequence # */
+};
 
 int smc_cdc_get_free_slot(struct smc_connection *conn,
 			  struct smc_wr_buf **wr_buf,
+			  struct smc_rdma_wr **wr_rdma_buf,
 			  struct smc_cdc_tx_pend **pend);
 void smc_cdc_tx_dismiss_slots(struct smc_connection *conn);
 int smc_cdc_msg_send(struct smc_connection *conn, struct smc_wr_buf *wr_buf,
diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
index b00287989a3d..8806d2afa6ed 100644
--- a/net/smc/smc_core.h
+++ b/net/smc/smc_core.h
@@ -52,6 +52,24 @@ enum smc_wr_reg_state {
 	FAILED		/* ib_wr_reg_mr response: failure */
 };
 
+struct smc_rdma_sge {				/* sges for RDMA writes */
+	struct ib_sge		wr_tx_rdma_sge[SMC_IB_MAX_SEND_SGE];
+};
+
+#define SMC_MAX_RDMA_WRITES	2		/* max. # of RDMA writes per
+						 * message send
+						 */
+
+struct smc_rdma_sges {				/* sges per message send */
+	struct smc_rdma_sge	tx_rdma_sge[SMC_MAX_RDMA_WRITES];
+};
+
+struct smc_rdma_wr {				/* work requests per message
+						 * send
+						 */
+	struct ib_rdma_wr	wr_tx_rdma[SMC_MAX_RDMA_WRITES];
+};
+
 struct smc_link {
 	struct smc_ib_device	*smcibdev;	/* ib-device */
 	u8			ibport;		/* port - values 1 | 2 */
@@ -64,6 +82,8 @@ struct smc_link {
 	struct smc_wr_buf	*wr_tx_bufs;	/* WR send payload buffers */
 	struct ib_send_wr	*wr_tx_ibs;	/* WR send meta data */
 	struct ib_sge		*wr_tx_sges;	/* WR send gather meta data */
+	struct smc_rdma_sges	*wr_tx_rdma_sges;/*RDMA WRITE gather meta data*/
+	struct smc_rdma_wr	*wr_tx_rdmas;	/* WR RDMA WRITE */
 	struct smc_wr_tx_pend	*wr_tx_pends;	/* WR send waiting for CQE */
 	/* above four vectors have wr_tx_cnt elements and use the same index */
 	dma_addr_t		wr_tx_dma_addr;	/* DMA address of wr_tx_bufs */
diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c
index a6d3623d06f4..4fd60c522802 100644
--- a/net/smc/smc_llc.c
+++ b/net/smc/smc_llc.c
@@ -166,7 +166,8 @@ static int smc_llc_add_pending_send(struct smc_link *link,
 {
 	int rc;
 
-	rc = smc_wr_tx_get_free_slot(link, smc_llc_tx_handler, wr_buf, pend);
+	rc = smc_wr_tx_get_free_slot(link, smc_llc_tx_handler, wr_buf, NULL,
+				     pend);
 	if (rc < 0)
 		return rc;
 	BUILD_BUG_ON_MSG(
diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c
index 36af3de731b9..2fdfaff60cf9 100644
--- a/net/smc/smc_tx.c
+++ b/net/smc/smc_tx.c
@@ -266,27 +266,23 @@ int smcd_tx_ism_write(struct smc_connection *conn, void *data, size_t len,
 
 /* sndbuf consumer: actual data transfer of one target chunk with RDMA write */
 static int smc_tx_rdma_write(struct smc_connection *conn, int peer_rmbe_offset,
-			     int num_sges, struct ib_sge sges[])
+			     int num_sges, struct ib_rdma_wr *rdma_wr)
 {
 	struct smc_link_group *lgr = conn->lgr;
-	struct ib_rdma_wr rdma_wr;
 	struct smc_link *link;
 	int rc;
 
-	memset(&rdma_wr, 0, sizeof(rdma_wr));
 	link = &lgr->lnk[SMC_SINGLE_LINK];
-	rdma_wr.wr.wr_id = smc_wr_tx_get_next_wr_id(link);
-	rdma_wr.wr.sg_list = sges;
-	rdma_wr.wr.num_sge = num_sges;
-	rdma_wr.wr.opcode = IB_WR_RDMA_WRITE;
-	rdma_wr.remote_addr =
+	rdma_wr->wr.wr_id = smc_wr_tx_get_next_wr_id(link);
+	rdma_wr->wr.num_sge = num_sges;
+	rdma_wr->remote_addr =
 		lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].dma_addr +
 		/* RMBE within RMB */
 		conn->tx_off +
 		/* offset within RMBE */
 		peer_rmbe_offset;
-	rdma_wr.rkey = lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].rkey;
-	rc = ib_post_send(link->roce_qp, &rdma_wr.wr, NULL);
+	rdma_wr->rkey = lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].rkey;
+	rc = ib_post_send(link->roce_qp, &rdma_wr->wr, NULL);
 	if (rc) {
 		conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
 		smc_lgr_terminate(lgr);
@@ -313,24 +309,25 @@ static inline void smc_tx_advance_cursors(struct smc_connection *conn,
 /* SMC-R helper for smc_tx_rdma_writes() */
 static int smcr_tx_rdma_writes(struct smc_connection *conn, size_t len,
 			       size_t src_off, size_t src_len,
-			       size_t dst_off, size_t dst_len)
+			       size_t dst_off, size_t dst_len,
+			       struct smc_rdma_wr *wr_rdma_buf)
 {
 	dma_addr_t dma_addr =
 		sg_dma_address(conn->sndbuf_desc->sgt[SMC_SINGLE_LINK].sgl);
-	struct smc_link *link = &conn->lgr->lnk[SMC_SINGLE_LINK];
 	int src_len_sum = src_len, dst_len_sum = dst_len;
-	struct ib_sge sges[SMC_IB_MAX_SEND_SGE];
 	int sent_count = src_off;
 	int srcchunk, dstchunk;
 	int num_sges;
 	int rc;
 
 	for (dstchunk = 0; dstchunk < 2; dstchunk++) {
+		struct ib_sge *sge =
+			wr_rdma_buf->wr_tx_rdma[dstchunk].wr.sg_list;
+
 		num_sges = 0;
 		for (srcchunk = 0; srcchunk < 2; srcchunk++) {
-			sges[srcchunk].addr = dma_addr + src_off;
-			sges[srcchunk].length = src_len;
-			sges[srcchunk].lkey = link->roce_pd->local_dma_lkey;
+			sge[srcchunk].addr = dma_addr + src_off;
+			sge[srcchunk].length = src_len;
 			num_sges++;
 
 			src_off += src_len;
@@ -343,7 +340,8 @@ static int smcr_tx_rdma_writes(struct smc_connection *conn, size_t len,
 			src_len = dst_len - src_len; /* remainder */
 			src_len_sum += src_len;
 		}
-		rc = smc_tx_rdma_write(conn, dst_off, num_sges, sges);
+		rc = smc_tx_rdma_write(conn, dst_off, num_sges,
+				       &wr_rdma_buf->wr_tx_rdma[dstchunk]);
 		if (rc)
 			return rc;
 		if (dst_len_sum == len)
@@ -402,7 +400,8 @@ static int smcd_tx_rdma_writes(struct smc_connection *conn, size_t len,
 /* sndbuf consumer: prepare all necessary (src&dst) chunks of data transmit;
  * usable snd_wnd as max transmit
  */
-static int smc_tx_rdma_writes(struct smc_connection *conn)
+static int smc_tx_rdma_writes(struct smc_connection *conn,
+			      struct smc_rdma_wr *wr_rdma_buf)
 {
 	size_t len, src_len, dst_off, dst_len; /* current chunk values */
 	union smc_host_cursor sent, prep, prod, cons;
@@ -463,7 +462,7 @@ static int smc_tx_rdma_writes(struct smc_connection *conn)
 					 dst_off, dst_len);
 	else
 		rc = smcr_tx_rdma_writes(conn, len, sent.count, src_len,
-					 dst_off, dst_len);
+					 dst_off, dst_len, wr_rdma_buf);
 	if (rc)
 		return rc;
 
@@ -484,11 +483,12 @@ static int smc_tx_rdma_writes(struct smc_connection *conn)
 static int smcr_tx_sndbuf_nonempty(struct smc_connection *conn)
 {
 	struct smc_cdc_producer_flags *pflags;
+	struct smc_rdma_wr *wr_rdma_buf;
 	struct smc_cdc_tx_pend *pend;
 	struct smc_wr_buf *wr_buf;
 	int rc;
 
-	rc = smc_cdc_get_free_slot(conn, &wr_buf, &pend);
+	rc = smc_cdc_get_free_slot(conn, &wr_buf, &wr_rdma_buf, &pend);
 	if (rc < 0) {
 		if (rc == -EBUSY) {
 			struct smc_sock *smc =
@@ -506,7 +506,7 @@ static int smcr_tx_sndbuf_nonempty(struct smc_connection *conn)
 
 	spin_lock_bh(&conn->send_lock);
 	if (!conn->local_tx_ctrl.prod_flags.urg_data_present) {
-		rc = smc_tx_rdma_writes(conn);
+		rc = smc_tx_rdma_writes(conn, wr_rdma_buf);
 		if (rc) {
 			smc_wr_tx_put_slot(&conn->lgr->lnk[SMC_SINGLE_LINK],
 					   (struct smc_wr_tx_pend_priv *)pend);
@@ -533,7 +533,7 @@ static int smcd_tx_sndbuf_nonempty(struct smc_connection *conn)
 
 	spin_lock_bh(&conn->send_lock);
 	if (!pflags->urg_data_present)
-		rc = smc_tx_rdma_writes(conn);
+		rc = smc_tx_rdma_writes(conn, NULL);
 	if (!rc)
 		rc = smcd_cdc_msg_send(conn);
 
diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
index 1dc88c32d6bb..253aa75dc2b6 100644
--- a/net/smc/smc_wr.c
+++ b/net/smc/smc_wr.c
@@ -160,6 +160,7 @@ static inline int smc_wr_tx_get_free_slot_index(struct smc_link *link, u32 *idx)
  * @link:		Pointer to smc_link used to later send the message.
  * @handler:		Send completion handler function pointer.
  * @wr_buf:		Out value returns pointer to message buffer.
+ * @wr_rdma_buf:	Out value returns pointer to rdma work request.
  * @wr_pend_priv:	Out value returns pointer serving as handler context.
  *
  * Return: 0 on success, or -errno on error.
@@ -167,6 +168,7 @@ static inline int smc_wr_tx_get_free_slot_index(struct smc_link *link, u32 *idx)
 int smc_wr_tx_get_free_slot(struct smc_link *link,
 			    smc_wr_tx_handler handler,
 			    struct smc_wr_buf **wr_buf,
+			    struct smc_rdma_wr **wr_rdma_buf,
 			    struct smc_wr_tx_pend_priv **wr_pend_priv)
 {
 	struct smc_wr_tx_pend *wr_pend;
@@ -204,6 +206,8 @@ int smc_wr_tx_get_free_slot(struct smc_link *link,
 	wr_ib = &link->wr_tx_ibs[idx];
 	wr_ib->wr_id = wr_id;
 	*wr_buf = &link->wr_tx_bufs[idx];
+	if (wr_rdma_buf)
+		*wr_rdma_buf = &link->wr_tx_rdmas[idx];
 	*wr_pend_priv = &wr_pend->priv;
 	return 0;
 }
@@ -465,12 +469,26 @@ static void smc_wr_init_sge(struct smc_link *lnk)
 			lnk->wr_tx_dma_addr + i * SMC_WR_BUF_SIZE;
 		lnk->wr_tx_sges[i].length = SMC_WR_TX_SIZE;
 		lnk->wr_tx_sges[i].lkey = lnk->roce_pd->local_dma_lkey;
+		lnk->wr_tx_rdma_sges[i].tx_rdma_sge[0].wr_tx_rdma_sge[0].lkey =
+			lnk->roce_pd->local_dma_lkey;
+		lnk->wr_tx_rdma_sges[i].tx_rdma_sge[0].wr_tx_rdma_sge[1].lkey =
+			lnk->roce_pd->local_dma_lkey;
+		lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge[0].lkey =
+			lnk->roce_pd->local_dma_lkey;
+		lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge[1].lkey =
+			lnk->roce_pd->local_dma_lkey;
 		lnk->wr_tx_ibs[i].next = NULL;
 		lnk->wr_tx_ibs[i].sg_list = &lnk->wr_tx_sges[i];
 		lnk->wr_tx_ibs[i].num_sge = 1;
 		lnk->wr_tx_ibs[i].opcode = IB_WR_SEND;
 		lnk->wr_tx_ibs[i].send_flags =
 			IB_SEND_SIGNALED | IB_SEND_SOLICITED;
+		lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.opcode = IB_WR_RDMA_WRITE;
+		lnk->wr_tx_rdmas[i].wr_tx_rdma[1].wr.opcode = IB_WR_RDMA_WRITE;
+		lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.sg_list =
+			lnk->wr_tx_rdma_sges[i].tx_rdma_sge[0].wr_tx_rdma_sge;
+		lnk->wr_tx_rdmas[i].wr_tx_rdma[1].wr.sg_list =
+			lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge;
 	}
 	for (i = 0; i < lnk->wr_rx_cnt; i++) {
 		lnk->wr_rx_sges[i].addr =
@@ -521,8 +539,12 @@ void smc_wr_free_link_mem(struct smc_link *lnk)
 	lnk->wr_tx_mask = NULL;
 	kfree(lnk->wr_tx_sges);
 	lnk->wr_tx_sges = NULL;
+	kfree(lnk->wr_tx_rdma_sges);
+	lnk->wr_tx_rdma_sges = NULL;
 	kfree(lnk->wr_rx_sges);
 	lnk->wr_rx_sges = NULL;
+	kfree(lnk->wr_tx_rdmas);
+	lnk->wr_tx_rdmas = NULL;
 	kfree(lnk->wr_rx_ibs);
 	lnk->wr_rx_ibs = NULL;
 	kfree(lnk->wr_tx_ibs);
@@ -552,10 +574,20 @@ int smc_wr_alloc_link_mem(struct smc_link *link)
 				  GFP_KERNEL);
 	if (!link->wr_rx_ibs)
 		goto no_mem_wr_tx_ibs;
+	link->wr_tx_rdmas = kcalloc(SMC_WR_BUF_CNT,
+				    sizeof(link->wr_tx_rdmas[0]),
+				    GFP_KERNEL);
+	if (!link->wr_tx_rdmas)
+		goto no_mem_wr_rx_ibs;
+	link->wr_tx_rdma_sges = kcalloc(SMC_WR_BUF_CNT,
+					sizeof(link->wr_tx_rdma_sges[0]),
+					GFP_KERNEL);
+	if (!link->wr_tx_rdma_sges)
+		goto no_mem_wr_tx_rdmas;
 	link->wr_tx_sges = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_tx_sges[0]),
 				   GFP_KERNEL);
 	if (!link->wr_tx_sges)
-		goto no_mem_wr_rx_ibs;
+		goto no_mem_wr_tx_rdma_sges;
 	link->wr_rx_sges = kcalloc(SMC_WR_BUF_CNT * 3,
 				   sizeof(link->wr_rx_sges[0]),
 				   GFP_KERNEL);
@@ -579,6 +611,10 @@ no_mem_wr_rx_sges:
 	kfree(link->wr_rx_sges);
 no_mem_wr_tx_sges:
 	kfree(link->wr_tx_sges);
+no_mem_wr_tx_rdma_sges:
+	kfree(link->wr_tx_rdma_sges);
+no_mem_wr_tx_rdmas:
+	kfree(link->wr_tx_rdmas);
 no_mem_wr_rx_ibs:
 	kfree(link->wr_rx_ibs);
 no_mem_wr_tx_ibs:
diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h
index 1d85bb14fd6f..09bf32fd3959 100644
--- a/net/smc/smc_wr.h
+++ b/net/smc/smc_wr.h
@@ -85,6 +85,7 @@ void smc_wr_add_dev(struct smc_ib_device *smcibdev);
 
 int smc_wr_tx_get_free_slot(struct smc_link *link, smc_wr_tx_handler handler,
 			    struct smc_wr_buf **wr_buf,
+			    struct smc_rdma_wr **wrs,
 			    struct smc_wr_tx_pend_priv **wr_pend_priv);
 int smc_wr_tx_put_slot(struct smc_link *link,
 		       struct smc_wr_tx_pend_priv *wr_pend_priv);
-- 
cgit v1.2.3-70-g09d2


From b8649efad879c69c7ab1f19ce8814fcabef1f72b Mon Sep 17 00:00:00 2001
From: Ursula Braun <ubraun@linux.ibm.com>
Date: Mon, 4 Feb 2019 13:44:45 +0100
Subject: net/smc: fix sender_free computation

In some scenarios a separate consumer cursor update is necessary.
The decision is made in smc_tx_consumer_cursor_update(). The
sender_free computation could be wrong:

The rx confirmed cursor is always smaller than or equal to the
rx producer cursor. The parameters in the smc_curs_diff() call
have to be exchanged, otherwise sender_free might even be negative.

And if more data arrives local_rx_ctrl.prod might be updated, enabling
a cursor difference between local_rx_ctrl.prod and rx confirmed cursor
larger than the RMB size. This case is not covered by smc_curs_diff().
Thus function smc_curs_diff_large() is introduced here.

If a recvmsg() is processed in parallel, local_tx_ctrl.cons might
change during smc_cdc_msg_send. Make sure rx_curs_confirmed is updated
with the actually sent local_tx_ctrl.cons value.

Fixes: e82f2e31f559 ("net/smc: optimize consumer cursor updates")
Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/smc_cdc.c |  5 +++--
 net/smc/smc_cdc.h | 26 +++++++++++++++++++++++++-
 net/smc/smc_tx.c  |  3 ++-
 3 files changed, 30 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c
index b80ef104ab4e..a712c9f8699b 100644
--- a/net/smc/smc_cdc.c
+++ b/net/smc/smc_cdc.c
@@ -91,6 +91,7 @@ int smc_cdc_msg_send(struct smc_connection *conn,
 		     struct smc_wr_buf *wr_buf,
 		     struct smc_cdc_tx_pend *pend)
 {
+	union smc_host_cursor cfed;
 	struct smc_link *link;
 	int rc;
 
@@ -102,10 +103,10 @@ int smc_cdc_msg_send(struct smc_connection *conn,
 	conn->local_tx_ctrl.seqno = conn->tx_cdc_seq;
 	smc_host_msg_to_cdc((struct smc_cdc_msg *)wr_buf,
 			    &conn->local_tx_ctrl, conn);
+	smc_curs_copy(&cfed, &((struct smc_host_cdc_msg *)wr_buf)->cons, conn);
 	rc = smc_wr_tx_send(link, (struct smc_wr_tx_pend_priv *)pend);
 	if (!rc)
-		smc_curs_copy(&conn->rx_curs_confirmed,
-			      &conn->local_tx_ctrl.cons, conn);
+		smc_curs_copy(&conn->rx_curs_confirmed, &cfed, conn);
 
 	return rc;
 }
diff --git a/net/smc/smc_cdc.h b/net/smc/smc_cdc.h
index 2148da7a26b1..271e2524dc8f 100644
--- a/net/smc/smc_cdc.h
+++ b/net/smc/smc_cdc.h
@@ -160,7 +160,9 @@ static inline void smcd_curs_copy(union smcd_cdc_cursor *tgt,
 #endif
 }
 
-/* calculate cursor difference between old and new, where old <= new */
+/* calculate cursor difference between old and new, where old <= new and
+ * difference cannot exceed size
+ */
 static inline int smc_curs_diff(unsigned int size,
 				union smc_host_cursor *old,
 				union smc_host_cursor *new)
@@ -185,6 +187,28 @@ static inline int smc_curs_comp(unsigned int size,
 	return smc_curs_diff(size, old, new);
 }
 
+/* calculate cursor difference between old and new, where old <= new and
+ * difference may exceed size
+ */
+static inline int smc_curs_diff_large(unsigned int size,
+				      union smc_host_cursor *old,
+				      union smc_host_cursor *new)
+{
+	if (old->wrap < new->wrap)
+		return min_t(int,
+			     (size - old->count) + new->count +
+			     (new->wrap - old->wrap - 1) * size,
+			     size);
+
+	if (old->wrap > new->wrap) /* wrap has switched from 0xffff to 0x0000 */
+		return min_t(int,
+			     (size - old->count) + new->count +
+			     (new->wrap + 0xffff - old->wrap) * size,
+			     size);
+
+	return max_t(int, 0, (new->count - old->count));
+}
+
 static inline void smc_host_cursor_to_cdc(union smc_cdc_cursor *peer,
 					  union smc_host_cursor *local,
 					  struct smc_connection *conn)
diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c
index 2fdfaff60cf9..f93f3580c100 100644
--- a/net/smc/smc_tx.c
+++ b/net/smc/smc_tx.c
@@ -595,7 +595,8 @@ void smc_tx_consumer_update(struct smc_connection *conn, bool force)
 	if (to_confirm > conn->rmbe_update_limit) {
 		smc_curs_copy(&prod, &conn->local_rx_ctrl.prod, conn);
 		sender_free = conn->rmb_desc->len -
-			      smc_curs_diff(conn->rmb_desc->len, &prod, &cfed);
+			      smc_curs_diff_large(conn->rmb_desc->len,
+						  &cfed, &prod);
 	}
 
 	if (conn->local_rx_ctrl.prod_flags.cons_curs_upd_req ||
-- 
cgit v1.2.3-70-g09d2


From a5e04318c83a31925300af1ce358dbc1a708b732 Mon Sep 17 00:00:00 2001
From: Ursula Braun <ubraun@linux.ibm.com>
Date: Mon, 4 Feb 2019 13:44:46 +0100
Subject: net/smc: delete rkey first before switching to unused

Once RMBs are flagged as unused they are candidates for reuse.
Thus the LLC DELETE RKEY operaton should be made before flagging
the RMB as unused.

Fixes: c7674c001b11 ("net/smc: unregister rkeys of unused buffer")
Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/smc_core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index 097c798983ca..aa1c551cee81 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -302,13 +302,13 @@ static void smc_buf_unuse(struct smc_connection *conn,
 		conn->sndbuf_desc->used = 0;
 	if (conn->rmb_desc) {
 		if (!conn->rmb_desc->regerr) {
-			conn->rmb_desc->used = 0;
 			if (!lgr->is_smcd) {
 				/* unregister rmb with peer */
 				smc_llc_do_delete_rkey(
 						&lgr->lnk[SMC_SINGLE_LINK],
 						conn->rmb_desc);
 			}
+			conn->rmb_desc->used = 0;
 		} else {
 			/* buf registration failed, reuse not possible */
 			write_lock_bh(&lgr->rmbs_lock);
-- 
cgit v1.2.3-70-g09d2


From 84b799a292ebc03de388f8573a169de6eb12c340 Mon Sep 17 00:00:00 2001
From: Ursula Braun <ubraun@linux.ibm.com>
Date: Mon, 4 Feb 2019 13:44:47 +0100
Subject: net/smc: correct state change for peer closing

If some kind of closing is received from the peer while still in
state SMC_INIT, it means the peer has had an active connection and
closed the socket quickly before listen_work finished. This should
not result in a shortcut from state SMC_INIT to state SMC_CLOSED.
This patch adds the socket to the accept queue in state
SMC_APPCLOSEWAIT1. The socket reaches state SMC_CLOSED once being
accepted and closed with smc_release().

Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/smc_close.c | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c
index ea2b87f29469..e39cadda1bf5 100644
--- a/net/smc/smc_close.c
+++ b/net/smc/smc_close.c
@@ -345,14 +345,7 @@ static void smc_close_passive_work(struct work_struct *work)
 
 	switch (sk->sk_state) {
 	case SMC_INIT:
-		if (atomic_read(&conn->bytes_to_rcv) ||
-		    (rxflags->peer_done_writing &&
-		     !smc_cdc_rxed_any_close(conn))) {
-			sk->sk_state = SMC_APPCLOSEWAIT1;
-		} else {
-			sk->sk_state = SMC_CLOSED;
-			sock_put(sk); /* passive closing */
-		}
+		sk->sk_state = SMC_APPCLOSEWAIT1;
 		break;
 	case SMC_ACTIVE:
 		sk->sk_state = SMC_APPCLOSEWAIT1;
-- 
cgit v1.2.3-70-g09d2


From c1f7e02979edd7a3a3e69fe04be60b1d650dc8a7 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Mon, 4 Feb 2019 14:50:38 +0000
Subject: net: cls_flower: Remove filter from mask before freeing it

In fl_change(), when adding a new rule (i.e. fold == NULL), a driver may
reject the new rule, for example due to resource exhaustion. By that
point, the new rule was already assigned a mask, and it was added to
that mask's hash table. The clean-up path that's invoked as a result of
the rejection however neglects to undo the hash table addition, and
proceeds to free the new rule, thus leaving a dangling pointer in the
hash table.

Fix by removing fnew from the mask's hash table before it is freed.

Fixes: 35cc3cefc4de ("net/sched: cls_flower: Reject duplicated rules also under skip_sw")
Signed-off-by: Petr Machata <petrm@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_flower.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index f6aa57fbbbaf..12ca9d13db83 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -1371,7 +1371,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
 	if (!tc_skip_hw(fnew->flags)) {
 		err = fl_hw_replace_filter(tp, fnew, extack);
 		if (err)
-			goto errout_mask;
+			goto errout_mask_ht;
 	}
 
 	if (!tc_in_hw(fnew->flags))
@@ -1401,6 +1401,10 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
 	kfree(mask);
 	return 0;
 
+errout_mask_ht:
+	rhashtable_remove_fast(&fnew->mask->ht, &fnew->ht_node,
+			       fnew->mask->filter_ht_params);
+
 errout_mask:
 	fl_mask_put(head, fnew->mask, false);
 
-- 
cgit v1.2.3-70-g09d2


From 17ab4f61b8cd6f9c38e9d0b935d86d73b5d0d2b5 Mon Sep 17 00:00:00 2001
From: Rundong Ge <rdong.ge@gmail.com>
Date: Sat, 2 Feb 2019 14:29:35 +0000
Subject: net: dsa: slave: Don't propagate flag changes on down slave
 interfaces

The unbalance of master's promiscuity or allmulti will happen after ifdown
and ifup a slave interface which is in a bridge.

When we ifdown a slave interface , both the 'dsa_slave_close' and
'dsa_slave_change_rx_flags' will clear the master's flags. The flags
of master will be decrease twice.
In the other hand, if we ifup the slave interface again, since the
slave's flags were cleared the 'dsa_slave_open' won't set the master's
flag, only 'dsa_slave_change_rx_flags' that triggered by 'br_add_if'
will set the master's flags. The flags of master is increase once.

Only propagating flag changes when a slave interface is up makes
sure this does not happen. The 'vlan_dev_change_rx_flags' had the
same problem and was fixed, and changes here follows that fix.

Fixes: 91da11f870f0 ("net: Distributed Switch Architecture protocol support")
Signed-off-by: Rundong Ge <rdong.ge@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dsa/slave.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index a3fcc1d01615..b5e44825d173 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -140,11 +140,14 @@ static int dsa_slave_close(struct net_device *dev)
 static void dsa_slave_change_rx_flags(struct net_device *dev, int change)
 {
 	struct net_device *master = dsa_slave_to_master(dev);
-
-	if (change & IFF_ALLMULTI)
-		dev_set_allmulti(master, dev->flags & IFF_ALLMULTI ? 1 : -1);
-	if (change & IFF_PROMISC)
-		dev_set_promiscuity(master, dev->flags & IFF_PROMISC ? 1 : -1);
+	if (dev->flags & IFF_UP) {
+		if (change & IFF_ALLMULTI)
+			dev_set_allmulti(master,
+					 dev->flags & IFF_ALLMULTI ? 1 : -1);
+		if (change & IFF_PROMISC)
+			dev_set_promiscuity(master,
+					    dev->flags & IFF_PROMISC ? 1 : -1);
+	}
 }
 
 static void dsa_slave_set_rx_mode(struct net_device *dev)
-- 
cgit v1.2.3-70-g09d2


From c8101f7729daee251f4f6505f9d135ec08e1342f Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Sat, 2 Feb 2019 17:53:29 +0000
Subject: net: dsa: Fix lockdep false positive splat

Creating a macvtap on a DSA-backed interface results in the following
splat when lockdep is enabled:

[   19.638080] IPv6: ADDRCONF(NETDEV_CHANGE): lan0: link becomes ready
[   23.041198] device lan0 entered promiscuous mode
[   23.043445] device eth0 entered promiscuous mode
[   23.049255]
[   23.049557] ============================================
[   23.055021] WARNING: possible recursive locking detected
[   23.060490] 5.0.0-rc3-00013-g56c857a1b8d3 #118 Not tainted
[   23.066132] --------------------------------------------
[   23.071598] ip/2861 is trying to acquire lock:
[   23.076171] 00000000f61990cb (_xmit_ETHER){+...}, at: dev_set_rx_mode+0x1c/0x38
[   23.083693]
[   23.083693] but task is already holding lock:
[   23.089696] 00000000ecf0c3b4 (_xmit_ETHER){+...}, at: dev_uc_add+0x24/0x70
[   23.096774]
[   23.096774] other info that might help us debug this:
[   23.103494]  Possible unsafe locking scenario:
[   23.103494]
[   23.109584]        CPU0
[   23.112093]        ----
[   23.114601]   lock(_xmit_ETHER);
[   23.117917]   lock(_xmit_ETHER);
[   23.121233]
[   23.121233]  *** DEADLOCK ***
[   23.121233]
[   23.127325]  May be due to missing lock nesting notation
[   23.127325]
[   23.134315] 2 locks held by ip/2861:
[   23.137987]  #0: 000000003b766c72 (rtnl_mutex){+.+.}, at: rtnetlink_rcv_msg+0x338/0x4e0
[   23.146231]  #1: 00000000ecf0c3b4 (_xmit_ETHER){+...}, at: dev_uc_add+0x24/0x70
[   23.153757]
[   23.153757] stack backtrace:
[   23.158243] CPU: 0 PID: 2861 Comm: ip Not tainted 5.0.0-rc3-00013-g56c857a1b8d3 #118
[   23.166212] Hardware name: Globalscale Marvell ESPRESSOBin Board (DT)
[   23.172843] Call trace:
[   23.175358]  dump_backtrace+0x0/0x188
[   23.179116]  show_stack+0x14/0x20
[   23.182524]  dump_stack+0xb4/0xec
[   23.185928]  __lock_acquire+0x123c/0x1860
[   23.190048]  lock_acquire+0xc8/0x248
[   23.193724]  _raw_spin_lock_bh+0x40/0x58
[   23.197755]  dev_set_rx_mode+0x1c/0x38
[   23.201607]  dev_set_promiscuity+0x3c/0x50
[   23.205820]  dsa_slave_change_rx_flags+0x5c/0x70
[   23.210567]  __dev_set_promiscuity+0x148/0x1e0
[   23.215136]  __dev_set_rx_mode+0x74/0x98
[   23.219167]  dev_uc_add+0x54/0x70
[   23.222575]  macvlan_open+0x170/0x1d0
[   23.226336]  __dev_open+0xe0/0x160
[   23.229830]  __dev_change_flags+0x16c/0x1b8
[   23.234132]  dev_change_flags+0x20/0x60
[   23.238074]  do_setlink+0x2d0/0xc50
[   23.241658]  __rtnl_newlink+0x5f8/0x6e8
[   23.245601]  rtnl_newlink+0x50/0x78
[   23.249184]  rtnetlink_rcv_msg+0x360/0x4e0
[   23.253397]  netlink_rcv_skb+0xe8/0x130
[   23.257338]  rtnetlink_rcv+0x14/0x20
[   23.261012]  netlink_unicast+0x190/0x210
[   23.265043]  netlink_sendmsg+0x288/0x350
[   23.269075]  sock_sendmsg+0x18/0x30
[   23.272659]  ___sys_sendmsg+0x29c/0x2c8
[   23.276602]  __sys_sendmsg+0x60/0xb8
[   23.280276]  __arm64_sys_sendmsg+0x1c/0x28
[   23.284488]  el0_svc_common+0xd8/0x138
[   23.288340]  el0_svc_handler+0x24/0x80
[   23.292192]  el0_svc+0x8/0xc

This looks fairly harmless (no actual deadlock occurs), and is
fixed in a similar way to c6894dec8ea9 ("bridge: fix lockdep
addr_list_lock false positive splat") by putting the addr_list_lock
in its own lockdep class.

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dsa/master.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'net')

diff --git a/net/dsa/master.c b/net/dsa/master.c
index 71bb15f491c8..54f5551fb799 100644
--- a/net/dsa/master.c
+++ b/net/dsa/master.c
@@ -205,6 +205,8 @@ static void dsa_master_reset_mtu(struct net_device *dev)
 	rtnl_unlock();
 }
 
+static struct lock_class_key dsa_master_addr_list_lock_key;
+
 int dsa_master_setup(struct net_device *dev, struct dsa_port *cpu_dp)
 {
 	int ret;
@@ -218,6 +220,8 @@ int dsa_master_setup(struct net_device *dev, struct dsa_port *cpu_dp)
 	wmb();
 
 	dev->dsa_ptr = cpu_dp;
+	lockdep_set_class(&dev->addr_list_lock,
+			  &dsa_master_addr_list_lock_key);
 
 	ret = dsa_master_ethtool_setup(dev);
 	if (ret)
-- 
cgit v1.2.3-70-g09d2


From 15df03c661cb362366ecfc3a21820cb934f3e4ca Mon Sep 17 00:00:00 2001
From: Eli Cooper <elicooper@gmx.com>
Date: Mon, 21 Jan 2019 18:45:27 +0800
Subject: netfilter: ipv6: Don't preserve original oif for loopback address

Commit 508b09046c0f ("netfilter: ipv6: Preserve link scope traffic
original oif") made ip6_route_me_harder() keep the original oif for
link-local and multicast packets. However, it also affected packets
for the loopback address because it used rt6_need_strict().

REDIRECT rules in the OUTPUT chain rewrite the destination to loopback
address; thus its oif should not be preserved. This commit fixes the bug
that redirected local packets are being dropped. Actually the packet was
not exactly dropped; Instead it was sent out to the original oif rather
than lo. When a packet with daddr ::1 is sent to the router, it is
effectively dropped.

Fixes: 508b09046c0f ("netfilter: ipv6: Preserve link scope traffic original oif")
Signed-off-by: Eli Cooper <elicooper@gmx.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv6/netfilter.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c
index 8b075f0bc351..6d0b1f3e927b 100644
--- a/net/ipv6/netfilter.c
+++ b/net/ipv6/netfilter.c
@@ -23,9 +23,11 @@ int ip6_route_me_harder(struct net *net, struct sk_buff *skb)
 	struct sock *sk = sk_to_full_sk(skb->sk);
 	unsigned int hh_len;
 	struct dst_entry *dst;
+	int strict = (ipv6_addr_type(&iph->daddr) &
+		      (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL));
 	struct flowi6 fl6 = {
 		.flowi6_oif = sk && sk->sk_bound_dev_if ? sk->sk_bound_dev_if :
-			rt6_need_strict(&iph->daddr) ? skb_dst(skb)->dev->ifindex : 0,
+			strict ? skb_dst(skb)->dev->ifindex : 0,
 		.flowi6_mark = skb->mark,
 		.flowi6_uid = sock_net_uid(net, sk),
 		.daddr = iph->daddr,
-- 
cgit v1.2.3-70-g09d2


From 947e492c0fc2132ae5fca081a9c2952ccaab0404 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 5 Feb 2019 12:16:18 +0100
Subject: netfilter: nft_compat: don't use refcount_inc on newly allocated
 entry

When I moved the refcount to refcount_t type I missed the fact that
refcount_inc() will result in use-after-free warning with
CONFIG_REFCOUNT_FULL=y builds.

The correct fix would be to init the reference count to 1 at allocation
time, but, unfortunately we cannot do this, as we can't undo that
in case something else fails later in the batch.

So only solution I see is to special-case the 'new entry' condition
and replace refcount_inc() with a "delayed" refcount_set(1) in this case,
as done here.

The .activate callback can be removed to simplify things, we only
need to make sure that deactivate() decrements/unlinks the entry
from the list at end of transaction phase (commit or abort).

Fixes: 12c44aba6618 ("netfilter: nft_compat: use refcnt_t type for nft_xt reference count")
Reported-by: Jordan Glover <Golden_Miller83@protonmail.ch>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nft_compat.c | 62 +++++++++++++++++-----------------------------
 1 file changed, 23 insertions(+), 39 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c
index 0732a2fc697c..fe64df848365 100644
--- a/net/netfilter/nft_compat.c
+++ b/net/netfilter/nft_compat.c
@@ -61,6 +61,21 @@ static struct nft_compat_net *nft_compat_pernet(struct net *net)
 	return net_generic(net, nft_compat_net_id);
 }
 
+static void nft_xt_get(struct nft_xt *xt)
+{
+	/* refcount_inc() warns on 0 -> 1 transition, but we can't
+	 * init the reference count to 1 in .select_ops -- we can't
+	 * undo such an increase when another expression inside the same
+	 * rule fails afterwards.
+	 */
+	if (xt->listcnt == 0)
+		refcount_set(&xt->refcnt, 1);
+	else
+		refcount_inc(&xt->refcnt);
+
+	xt->listcnt++;
+}
+
 static bool nft_xt_put(struct nft_xt *xt)
 {
 	if (refcount_dec_and_test(&xt->refcnt)) {
@@ -291,7 +306,7 @@ nft_target_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
 		return -EINVAL;
 
 	nft_xt = container_of(expr->ops, struct nft_xt, ops);
-	refcount_inc(&nft_xt->refcnt);
+	nft_xt_get(nft_xt);
 	return 0;
 }
 
@@ -504,7 +519,7 @@ __nft_match_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
 		return ret;
 
 	nft_xt = container_of(expr->ops, struct nft_xt, ops);
-	refcount_inc(&nft_xt->refcnt);
+	nft_xt_get(nft_xt);
 	return 0;
 }
 
@@ -558,45 +573,16 @@ nft_match_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
 	__nft_match_destroy(ctx, expr, nft_expr_priv(expr));
 }
 
-static void nft_compat_activate(const struct nft_ctx *ctx,
-				const struct nft_expr *expr,
-				struct list_head *h)
-{
-	struct nft_xt *xt = container_of(expr->ops, struct nft_xt, ops);
-
-	if (xt->listcnt == 0)
-		list_add(&xt->head, h);
-
-	xt->listcnt++;
-}
-
-static void nft_compat_activate_mt(const struct nft_ctx *ctx,
-				   const struct nft_expr *expr)
-{
-	struct nft_compat_net *cn = nft_compat_pernet(ctx->net);
-
-	nft_compat_activate(ctx, expr, &cn->nft_match_list);
-}
-
-static void nft_compat_activate_tg(const struct nft_ctx *ctx,
-				   const struct nft_expr *expr)
-{
-	struct nft_compat_net *cn = nft_compat_pernet(ctx->net);
-
-	nft_compat_activate(ctx, expr, &cn->nft_target_list);
-}
-
 static void nft_compat_deactivate(const struct nft_ctx *ctx,
 				  const struct nft_expr *expr,
 				  enum nft_trans_phase phase)
 {
 	struct nft_xt *xt = container_of(expr->ops, struct nft_xt, ops);
 
-	if (phase == NFT_TRANS_COMMIT)
-		return;
-
-	if (--xt->listcnt == 0)
-		list_del_init(&xt->head);
+	if (phase == NFT_TRANS_ABORT || phase == NFT_TRANS_COMMIT) {
+		if (--xt->listcnt == 0)
+			list_del_init(&xt->head);
+	}
 }
 
 static void
@@ -852,7 +838,6 @@ nft_match_select_ops(const struct nft_ctx *ctx,
 	nft_match->ops.eval = nft_match_eval;
 	nft_match->ops.init = nft_match_init;
 	nft_match->ops.destroy = nft_match_destroy;
-	nft_match->ops.activate = nft_compat_activate_mt;
 	nft_match->ops.deactivate = nft_compat_deactivate;
 	nft_match->ops.dump = nft_match_dump;
 	nft_match->ops.validate = nft_match_validate;
@@ -870,7 +855,7 @@ nft_match_select_ops(const struct nft_ctx *ctx,
 
 	nft_match->ops.size = matchsize;
 
-	nft_match->listcnt = 1;
+	nft_match->listcnt = 0;
 	list_add(&nft_match->head, &cn->nft_match_list);
 
 	return &nft_match->ops;
@@ -957,7 +942,6 @@ nft_target_select_ops(const struct nft_ctx *ctx,
 	nft_target->ops.size = NFT_EXPR_SIZE(XT_ALIGN(target->targetsize));
 	nft_target->ops.init = nft_target_init;
 	nft_target->ops.destroy = nft_target_destroy;
-	nft_target->ops.activate = nft_compat_activate_tg;
 	nft_target->ops.deactivate = nft_compat_deactivate;
 	nft_target->ops.dump = nft_target_dump;
 	nft_target->ops.validate = nft_target_validate;
@@ -968,7 +952,7 @@ nft_target_select_ops(const struct nft_ctx *ctx,
 	else
 		nft_target->ops.eval = nft_target_eval_xt;
 
-	nft_target->listcnt = 1;
+	nft_target->listcnt = 0;
 	list_add(&nft_target->head, &cn->nft_target_list);
 
 	return &nft_target->ops;
-- 
cgit v1.2.3-70-g09d2


From 6dce3c20ac429e7a651d728e375853370c796e8d Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 4 Feb 2019 08:36:06 -0800
Subject: rxrpc: bad unlock balance in rxrpc_recvmsg

When either "goto wait_interrupted;" or "goto wait_error;"
paths are taken, socket lock has already been released.

This patch fixes following syzbot splat :

WARNING: bad unlock balance detected!
5.0.0-rc4+ #59 Not tainted
-------------------------------------
syz-executor223/8256 is trying to release lock (sk_lock-AF_RXRPC) at:
[<ffffffff86651353>] rxrpc_recvmsg+0x6d3/0x3099 net/rxrpc/recvmsg.c:598
but there are no more locks to release!

other info that might help us debug this:
1 lock held by syz-executor223/8256:
 #0: 00000000fa9ed0f4 (slock-AF_RXRPC){+...}, at: spin_lock_bh include/linux/spinlock.h:334 [inline]
 #0: 00000000fa9ed0f4 (slock-AF_RXRPC){+...}, at: release_sock+0x20/0x1c0 net/core/sock.c:2798

stack backtrace:
CPU: 1 PID: 8256 Comm: syz-executor223 Not tainted 5.0.0-rc4+ #59
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
 __dump_stack lib/dump_stack.c:77 [inline]
 dump_stack+0x172/0x1f0 lib/dump_stack.c:113
 print_unlock_imbalance_bug kernel/locking/lockdep.c:3391 [inline]
 print_unlock_imbalance_bug.cold+0x114/0x123 kernel/locking/lockdep.c:3368
 __lock_release kernel/locking/lockdep.c:3601 [inline]
 lock_release+0x67e/0xa00 kernel/locking/lockdep.c:3860
 sock_release_ownership include/net/sock.h:1471 [inline]
 release_sock+0x183/0x1c0 net/core/sock.c:2808
 rxrpc_recvmsg+0x6d3/0x3099 net/rxrpc/recvmsg.c:598
 sock_recvmsg_nosec net/socket.c:794 [inline]
 sock_recvmsg net/socket.c:801 [inline]
 sock_recvmsg+0xd0/0x110 net/socket.c:797
 __sys_recvfrom+0x1ff/0x350 net/socket.c:1845
 __do_sys_recvfrom net/socket.c:1863 [inline]
 __se_sys_recvfrom net/socket.c:1859 [inline]
 __x64_sys_recvfrom+0xe1/0x1a0 net/socket.c:1859
 do_syscall_64+0x103/0x610 arch/x86/entry/common.c:290
 entry_SYSCALL_64_after_hwframe+0x49/0xbe
RIP: 0033:0x446379
Code: e8 2c b3 02 00 48 83 c4 18 c3 0f 1f 80 00 00 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 2b 09 fc ff c3 66 2e 0f 1f 84 00 00 00 00
RSP: 002b:00007fe5da89fd98 EFLAGS: 00000246 ORIG_RAX: 000000000000002d
RAX: ffffffffffffffda RBX: 00000000006dbc28 RCX: 0000000000446379
RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000003
RBP: 00000000006dbc20 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 00000000006dbc2c
R13: 0000000000000000 R14: 0000000000000000 R15: 20c49ba5e353f7cf

Fixes: 248f219cb8bc ("rxrpc: Rewrite the data and ack handling code")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: David Howells <dhowells@redhat.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rxrpc/recvmsg.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c
index eaf19ebaa964..3f7bb11f3290 100644
--- a/net/rxrpc/recvmsg.c
+++ b/net/rxrpc/recvmsg.c
@@ -596,6 +596,7 @@ error_requeue_call:
 	}
 error_no_call:
 	release_sock(&rx->sk);
+error_trace:
 	trace_rxrpc_recvmsg(call, rxrpc_recvmsg_return, 0, 0, 0, ret);
 	return ret;
 
@@ -604,7 +605,7 @@ wait_interrupted:
 wait_error:
 	finish_wait(sk_sleep(&rx->sk), &wait);
 	call = NULL;
-	goto error_no_call;
+	goto error_trace;
 }
 
 /**
-- 
cgit v1.2.3-70-g09d2


From e248aa7be86e8179f20ac0931774ecd746f3f5bf Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 25 Jan 2019 16:54:54 -0500
Subject: svcrdma: Remove max_sge check at connect time

Two and a half years ago, the client was changed to use gathered
Send for larger inline messages, in commit 655fec6987b ("xprtrdma:
Use gathered Send for large inline messages"). Several fixes were
required because there are a few in-kernel device drivers whose
max_sge is 3, and these were broken by the change.

Apparently my memory is going, because some time later, I submitted
commit 25fd86eca11c ("svcrdma: Don't overrun the SGE array in
svc_rdma_send_ctxt"), and after that, commit f3c1fd0ee294 ("svcrdma:
Reduce max_send_sges"). These too incorrectly assumed in-kernel
device drivers would have more than a few Send SGEs available.

The fix for the server side is not the same. This is because the
fundamental problem on the server is that, whether or not the client
has provisioned a chunk for the RPC reply, the server must squeeze
even the most complex RPC replies into a single RDMA Send. Failing
in the send path because of Send SGE exhaustion should never be an
option.

Therefore, instead of failing when the send path runs out of SGEs,
switch to using a bounce buffer mechanism to handle RPC replies that
are too complex for the device to send directly. That allows us to
remove the max_sge check to enable drivers with small max_sge to
work again.

Reported-by: Don Dutile <ddutile@redhat.com>
Fixes: 25fd86eca11c ("svcrdma: Don't overrun the SGE array in ...")
Cc: stable@vger.kernel.org
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 net/sunrpc/xprtrdma/svc_rdma_sendto.c    | 105 +++++++++++++++++++++++++++++--
 net/sunrpc/xprtrdma/svc_rdma_transport.c |   9 +--
 2 files changed, 102 insertions(+), 12 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index cf51b8f9b15f..1f200119268c 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -537,6 +537,99 @@ void svc_rdma_sync_reply_hdr(struct svcxprt_rdma *rdma,
 				      DMA_TO_DEVICE);
 }
 
+/* If the xdr_buf has more elements than the device can
+ * transmit in a single RDMA Send, then the reply will
+ * have to be copied into a bounce buffer.
+ */
+static bool svc_rdma_pull_up_needed(struct svcxprt_rdma *rdma,
+				    struct xdr_buf *xdr,
+				    __be32 *wr_lst)
+{
+	int elements;
+
+	/* xdr->head */
+	elements = 1;
+
+	/* xdr->pages */
+	if (!wr_lst) {
+		unsigned int remaining;
+		unsigned long pageoff;
+
+		pageoff = xdr->page_base & ~PAGE_MASK;
+		remaining = xdr->page_len;
+		while (remaining) {
+			++elements;
+			remaining -= min_t(u32, PAGE_SIZE - pageoff,
+					   remaining);
+			pageoff = 0;
+		}
+	}
+
+	/* xdr->tail */
+	if (xdr->tail[0].iov_len)
+		++elements;
+
+	/* assume 1 SGE is needed for the transport header */
+	return elements >= rdma->sc_max_send_sges;
+}
+
+/* The device is not capable of sending the reply directly.
+ * Assemble the elements of @xdr into the transport header
+ * buffer.
+ */
+static int svc_rdma_pull_up_reply_msg(struct svcxprt_rdma *rdma,
+				      struct svc_rdma_send_ctxt *ctxt,
+				      struct xdr_buf *xdr, __be32 *wr_lst)
+{
+	unsigned char *dst, *tailbase;
+	unsigned int taillen;
+
+	dst = ctxt->sc_xprt_buf;
+	dst += ctxt->sc_sges[0].length;
+
+	memcpy(dst, xdr->head[0].iov_base, xdr->head[0].iov_len);
+	dst += xdr->head[0].iov_len;
+
+	tailbase = xdr->tail[0].iov_base;
+	taillen = xdr->tail[0].iov_len;
+	if (wr_lst) {
+		u32 xdrpad;
+
+		xdrpad = xdr_padsize(xdr->page_len);
+		if (taillen && xdrpad) {
+			tailbase += xdrpad;
+			taillen -= xdrpad;
+		}
+	} else {
+		unsigned int len, remaining;
+		unsigned long pageoff;
+		struct page **ppages;
+
+		ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT);
+		pageoff = xdr->page_base & ~PAGE_MASK;
+		remaining = xdr->page_len;
+		while (remaining) {
+			len = min_t(u32, PAGE_SIZE - pageoff, remaining);
+
+			memcpy(dst, page_address(*ppages), len);
+			remaining -= len;
+			dst += len;
+			pageoff = 0;
+		}
+	}
+
+	if (taillen)
+		memcpy(dst, tailbase, taillen);
+
+	ctxt->sc_sges[0].length += xdr->len;
+	ib_dma_sync_single_for_device(rdma->sc_pd->device,
+				      ctxt->sc_sges[0].addr,
+				      ctxt->sc_sges[0].length,
+				      DMA_TO_DEVICE);
+
+	return 0;
+}
+
 /* svc_rdma_map_reply_msg - Map the buffer holding RPC message
  * @rdma: controlling transport
  * @ctxt: send_ctxt for the Send WR
@@ -559,8 +652,10 @@ int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma,
 	u32 xdr_pad;
 	int ret;
 
-	if (++ctxt->sc_cur_sge_no >= rdma->sc_max_send_sges)
-		return -EIO;
+	if (svc_rdma_pull_up_needed(rdma, xdr, wr_lst))
+		return svc_rdma_pull_up_reply_msg(rdma, ctxt, xdr, wr_lst);
+
+	++ctxt->sc_cur_sge_no;
 	ret = svc_rdma_dma_map_buf(rdma, ctxt,
 				   xdr->head[0].iov_base,
 				   xdr->head[0].iov_len);
@@ -591,8 +686,7 @@ int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma,
 	while (remaining) {
 		len = min_t(u32, PAGE_SIZE - page_off, remaining);
 
-		if (++ctxt->sc_cur_sge_no >= rdma->sc_max_send_sges)
-			return -EIO;
+		++ctxt->sc_cur_sge_no;
 		ret = svc_rdma_dma_map_page(rdma, ctxt, *ppages++,
 					    page_off, len);
 		if (ret < 0)
@@ -606,8 +700,7 @@ int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma,
 	len = xdr->tail[0].iov_len;
 tail:
 	if (len) {
-		if (++ctxt->sc_cur_sge_no >= rdma->sc_max_send_sges)
-			return -EIO;
+		++ctxt->sc_cur_sge_no;
 		ret = svc_rdma_dma_map_buf(rdma, ctxt, base, len);
 		if (ret < 0)
 			return ret;
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 924c17d46903..57f86c63a463 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -419,12 +419,9 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
 	/* Transport header, head iovec, tail iovec */
 	newxprt->sc_max_send_sges = 3;
 	/* Add one SGE per page list entry */
-	newxprt->sc_max_send_sges += svcrdma_max_req_size / PAGE_SIZE;
-	if (newxprt->sc_max_send_sges > dev->attrs.max_send_sge) {
-		pr_err("svcrdma: too few Send SGEs available (%d needed)\n",
-		       newxprt->sc_max_send_sges);
-		goto errout;
-	}
+	newxprt->sc_max_send_sges += (svcrdma_max_req_size / PAGE_SIZE) + 1;
+	if (newxprt->sc_max_send_sges > dev->attrs.max_send_sge)
+		newxprt->sc_max_send_sges = dev->attrs.max_send_sge;
 	newxprt->sc_max_req_size = svcrdma_max_req_size;
 	newxprt->sc_max_requests = svcrdma_max_requests;
 	newxprt->sc_max_bc_requests = svcrdma_max_bc_requests;
-- 
cgit v1.2.3-70-g09d2


From 00670cb8a73b10b10d3c40f045c15411715e4465 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Wed, 6 Feb 2019 18:35:15 +0300
Subject: net: dsa: Fix NULL checking in dsa_slave_set_eee()

This function can't succeed if dp->pl is NULL.  It will Oops inside the
call to return phylink_ethtool_get_eee(dp->pl, e);

Fixes: 1be52e97ed3e ("dsa: slave: eee: Allow ports to use phylink")
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Vivien Didelot <vivien.didelot@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dsa/slave.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index b5e44825d173..a1c9fe155057 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -642,7 +642,7 @@ static int dsa_slave_set_eee(struct net_device *dev, struct ethtool_eee *e)
 	int ret;
 
 	/* Port's PHY and MAC both need to be EEE capable */
-	if (!dev->phydev && !dp->pl)
+	if (!dev->phydev || !dp->pl)
 		return -ENODEV;
 
 	if (!ds->ops->set_mac_eee)
@@ -662,7 +662,7 @@ static int dsa_slave_get_eee(struct net_device *dev, struct ethtool_eee *e)
 	int ret;
 
 	/* Port's PHY and MAC both need to be EEE capable */
-	if (!dev->phydev && !dp->pl)
+	if (!dev->phydev || !dp->pl)
 		return -ENODEV;
 
 	if (!ds->ops->get_mac_eee)
-- 
cgit v1.2.3-70-g09d2


From 173656accaf583698bac3f9e269884ba60d51ef4 Mon Sep 17 00:00:00 2001
From: Hangbin Liu <liuhangbin@gmail.com>
Date: Thu, 7 Feb 2019 18:36:11 +0800
Subject: sit: check if IPv6 enabled before calling
 ip6_err_gen_icmpv6_unreach()

If we disabled IPv6 from the kernel command line (ipv6.disable=1), we should
not call ip6_err_gen_icmpv6_unreach(). This:

  ip link add sit1 type sit local 192.0.2.1 remote 192.0.2.2 ttl 1
  ip link set sit1 up
  ip addr add 198.51.100.1/24 dev sit1
  ping 198.51.100.2

if IPv6 is disabled at boot time, will crash the kernel.

v2: there's no need to use in6_dev_get(), use __in6_dev_get() instead,
    as we only need to check that idev exists and we are under
    rcu_read_lock() (from netif_receive_skb_internal()).

Reported-by: Jianlin Shi <jishi@redhat.com>
Fixes: ca15a078bd90 ("sit: generate icmpv6 error when receiving icmpv4 error")
Cc: Oussama Ghorbel <ghorbel@pivasoftware.com>
Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Reviewed-by: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/sit.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 1e03305c0549..e8a1dabef803 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -546,7 +546,8 @@ static int ipip6_err(struct sk_buff *skb, u32 info)
 	}
 
 	err = 0;
-	if (!ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4, type, data_len))
+	if (__in6_dev_get(skb->dev) &&
+	    !ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4, type, data_len))
 		goto out;
 
 	if (t->parms.iph.daddr == 0)
-- 
cgit v1.2.3-70-g09d2