Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next

Pull networking updates from David Miller: 1) More iov_iter conversion work from Al Viro. [ The "crypto: switch af_alg_make_sg() to iov_iter" commit was wrong, and this pull actually adds an extra commit on top of the branch I'm pulling to fix that up, so that the pre-merge state is ok. - Linus ] 2) Various optimizations to the ipv4 forwarding information base trie lookup implementation. From Alexander Duyck. 3) Remove sock_iocb altogether, from CHristoph Hellwig. 4) Allow congestion control algorithm selection via routing metrics. From Daniel Borkmann. 5) Make ipv4 uncached route list per-cpu, from Eric Dumazet. 6) Handle rfs hash collisions more gracefully, also from Eric Dumazet. 7) Add xmit_more support to r8169, e1000, and e1000e drivers. From Florian Westphal. 8) Transparent Ethernet Bridging support for GRO, from Jesse Gross. 9) Add BPF packet actions to packet scheduler, from Jiri Pirko. 10) Add support for uniqu flow IDs to openvswitch, from Joe Stringer. 11) New NetCP ethernet driver, from Muralidharan Karicheri and Wingman Kwok. 12) More sanely handle out-of-window dupacks, which can result in serious ACK storms. From Neal Cardwell. 13) Various rhashtable bug fixes and enhancements, from Herbert Xu, Patrick McHardy, and Thomas Graf. 14) Support xmit_more in be2net, from Sathya Perla. 15) Group Policy extensions for vxlan, from Thomas Graf. 16) Remove Checksum Offload support for vxlan, from Tom Herbert. 17) Like ipv4, support lockless transmit over ipv6 UDP sockets. From Vlad Yasevich. * git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (1494+1 commits) crypto: fix af_alg_make_sg() conversion to iov_iter ipv4: Namespecify TCP PMTU mechanism i40e: Fix for stats init function call in Rx setup tcp: don't include Fast Open option in SYN-ACK on pure SYN-data openvswitch: Only set TUNNEL_VXLAN_OPT if VXLAN-GBP metadata is set ipv6: Make __ipv6_select_ident static ipv6: Fix fragment id assignment on LE arches. bridge: Fix inability to add non-vlan fdb entry net: Mellanox: Delete unnecessary checks before the function call "vunmap" cxgb4: Add support in cxgb4 to get expansion rom version via ethtool ethtool: rename reserved1 memeber in ethtool_drvinfo for expansion ROM version net: dsa: Remove redundant phy_attach() IB/mlx4: Reset flow support for IB kernel ULPs IB/mlx4: Always use the correct port for mirrored multicast attachments net/bonding: Fix potential bad memory access during bonding events tipc: remove tipc_snprintf tipc: nl compat add noop and remove legacy nl framework tipc: convert legacy nl stats show to nl compat tipc: convert legacy nl net id get to nl compat tipc: convert legacy nl net id set to nl compat ...
author: Linus Torvalds <torvalds@linux-foundation.org> 2015-02-10 20:01:30 -0800
committer: Linus Torvalds <torvalds@linux-foundation.org> 2015-02-10 20:01:30 -0800
commit: c5ce28df0e7c01a1de23c36ebdefcd803f2b6cbb (patch)
tree: 9830baf38832769e1cf621708889111bbe3c93df /net/sched
parent: 29afc4e9a408f2304e09c6dd0dbcfbd2356d0faa (diff)
parent: 9399f0c51489ae8c16d6559b82a452fdc1895e91 (diff)
14 files changed, 481 insertions, 38 deletions
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index c54c9d9d1ffb..899d0319f2b2 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -698,6 +698,30 @@ config NET_ACT_VLAN
 	  To compile this code as a module, choose M here: the
 	  module will be called act_vlan.
 
+config NET_ACT_BPF
+        tristate "BPF based action"
+        depends on NET_CLS_ACT
+        ---help---
+	  Say Y here to execute BPF code on packets. The BPF code will decide
+	  if the packet should be dropped or not.
+
+	  If unsure, say N.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called act_bpf.
+
+config NET_ACT_CONNMARK
+        tristate "Netfilter Connection Mark Retriever"
+        depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES
+        depends on NF_CONNTRACK && NF_CONNTRACK_MARK
+        ---help---
+	  Say Y here to allow retrieving of conn mark
+
+	  If unsure, say N.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called act_connmark.
+
 config NET_CLS_IND
 	bool "Incoming device classification"
 	depends on NET_CLS_U32 || NET_CLS_FW
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 679f24ae7f93..7ca7f4c1b8c2 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -17,6 +17,8 @@ obj-$(CONFIG_NET_ACT_SIMP)	+= act_simple.o
 obj-$(CONFIG_NET_ACT_SKBEDIT)	+= act_skbedit.o
 obj-$(CONFIG_NET_ACT_CSUM)	+= act_csum.o
 obj-$(CONFIG_NET_ACT_VLAN)	+= act_vlan.o
+obj-$(CONFIG_NET_ACT_BPF)	+= act_bpf.o
+obj-$(CONFIG_NET_ACT_CONNMARK)	+= act_connmark.o
 obj-$(CONFIG_NET_SCH_FIFO)	+= sch_fifo.o
 obj-$(CONFIG_NET_SCH_CBQ)	+= sch_cbq.o
 obj-$(CONFIG_NET_SCH_HTB)	+= sch_htb.o
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
new file mode 100644
index 000000000000..82c5d7fc1988
--- /dev/null
+++ b/net/sched/act_bpf.c
@@ -0,0 +1,208 @@
+/*
+ * Copyright (c) 2015 Jiri Pirko <jiri@resnulli.us>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/filter.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+
+#include <linux/tc_act/tc_bpf.h>
+#include <net/tc_act/tc_bpf.h>
+
+#define BPF_TAB_MASK     15
+
+static int tcf_bpf(struct sk_buff *skb, const struct tc_action *a,
+		   struct tcf_result *res)
+{
+	struct tcf_bpf *b = a->priv;
+	int action;
+	int filter_res;
+
+	spin_lock(&b->tcf_lock);
+	b->tcf_tm.lastuse = jiffies;
+	bstats_update(&b->tcf_bstats, skb);
+	action = b->tcf_action;
+
+	filter_res = BPF_PROG_RUN(b->filter, skb);
+	if (filter_res == 0) {
+		/* Return code 0 from the BPF program
+		 * is being interpreted as a drop here.
+		 */
+		action = TC_ACT_SHOT;
+		b->tcf_qstats.drops++;
+	}
+
+	spin_unlock(&b->tcf_lock);
+	return action;
+}
+
+static int tcf_bpf_dump(struct sk_buff *skb, struct tc_action *a,
+			int bind, int ref)
+{
+	unsigned char *tp = skb_tail_pointer(skb);
+	struct tcf_bpf *b = a->priv;
+	struct tc_act_bpf opt = {
+		.index    = b->tcf_index,
+		.refcnt   = b->tcf_refcnt - ref,
+		.bindcnt  = b->tcf_bindcnt - bind,
+		.action   = b->tcf_action,
+	};
+	struct tcf_t t;
+	struct nlattr *nla;
+
+	if (nla_put(skb, TCA_ACT_BPF_PARMS, sizeof(opt), &opt))
+		goto nla_put_failure;
+
+	if (nla_put_u16(skb, TCA_ACT_BPF_OPS_LEN, b->bpf_num_ops))
+		goto nla_put_failure;
+
+	nla = nla_reserve(skb, TCA_ACT_BPF_OPS, b->bpf_num_ops *
+			  sizeof(struct sock_filter));
+	if (!nla)
+		goto nla_put_failure;
+
+	memcpy(nla_data(nla), b->bpf_ops, nla_len(nla));
+
+	t.install = jiffies_to_clock_t(jiffies - b->tcf_tm.install);
+	t.lastuse = jiffies_to_clock_t(jiffies - b->tcf_tm.lastuse);
+	t.expires = jiffies_to_clock_t(b->tcf_tm.expires);
+	if (nla_put(skb, TCA_ACT_BPF_TM, sizeof(t), &t))
+		goto nla_put_failure;
+	return skb->len;
+
+nla_put_failure:
+	nlmsg_trim(skb, tp);
+	return -1;
+}
+
+static const struct nla_policy act_bpf_policy[TCA_ACT_BPF_MAX + 1] = {
+	[TCA_ACT_BPF_PARMS]	= { .len = sizeof(struct tc_act_bpf) },
+	[TCA_ACT_BPF_OPS_LEN]	= { .type = NLA_U16 },
+	[TCA_ACT_BPF_OPS]	= { .type = NLA_BINARY,
+				    .len = sizeof(struct sock_filter) * BPF_MAXINSNS },
+};
+
+static int tcf_bpf_init(struct net *net, struct nlattr *nla,
+			struct nlattr *est, struct tc_action *a,
+			int ovr, int bind)
+{
+	struct nlattr *tb[TCA_ACT_BPF_MAX + 1];
+	struct tc_act_bpf *parm;
+	struct tcf_bpf *b;
+	u16 bpf_size, bpf_num_ops;
+	struct sock_filter *bpf_ops;
+	struct sock_fprog_kern tmp;
+	struct bpf_prog *fp;
+	int ret;
+
+	if (!nla)
+		return -EINVAL;
+
+	ret = nla_parse_nested(tb, TCA_ACT_BPF_MAX, nla, act_bpf_policy);
+	if (ret < 0)
+		return ret;
+
+	if (!tb[TCA_ACT_BPF_PARMS] ||
+	    !tb[TCA_ACT_BPF_OPS_LEN] || !tb[TCA_ACT_BPF_OPS])
+		return -EINVAL;
+	parm = nla_data(tb[TCA_ACT_BPF_PARMS]);
+
+	bpf_num_ops = nla_get_u16(tb[TCA_ACT_BPF_OPS_LEN]);
+	if (bpf_num_ops	> BPF_MAXINSNS || bpf_num_ops == 0)
+		return -EINVAL;
+
+	bpf_size = bpf_num_ops * sizeof(*bpf_ops);
+	if (bpf_size != nla_len(tb[TCA_ACT_BPF_OPS]))
+		return -EINVAL;
+
+	bpf_ops = kzalloc(bpf_size, GFP_KERNEL);
+	if (!bpf_ops)
+		return -ENOMEM;
+
+	memcpy(bpf_ops, nla_data(tb[TCA_ACT_BPF_OPS]), bpf_size);
+
+	tmp.len = bpf_num_ops;
+	tmp.filter = bpf_ops;
+
+	ret = bpf_prog_create(&fp, &tmp);
+	if (ret)
+		goto free_bpf_ops;
+
+	if (!tcf_hash_check(parm->index, a, bind)) {
+		ret = tcf_hash_create(parm->index, est, a, sizeof(*b), bind);
+		if (ret)
+			goto destroy_fp;
+
+		ret = ACT_P_CREATED;
+	} else {
+		if (bind)
+			goto destroy_fp;
+		tcf_hash_release(a, bind);
+		if (!ovr) {
+			ret = -EEXIST;
+			goto destroy_fp;
+		}
+	}
+
+	b = to_bpf(a);
+	spin_lock_bh(&b->tcf_lock);
+	b->tcf_action = parm->action;
+	b->bpf_num_ops = bpf_num_ops;
+	b->bpf_ops = bpf_ops;
+	b->filter = fp;
+	spin_unlock_bh(&b->tcf_lock);
+
+	if (ret == ACT_P_CREATED)
+		tcf_hash_insert(a);
+	return ret;
+
+destroy_fp:
+	bpf_prog_destroy(fp);
+free_bpf_ops:
+	kfree(bpf_ops);
+	return ret;
+}
+
+static void tcf_bpf_cleanup(struct tc_action *a, int bind)
+{
+	struct tcf_bpf *b = a->priv;
+
+	bpf_prog_destroy(b->filter);
+}
+
+static struct tc_action_ops act_bpf_ops = {
+	.kind =		"bpf",
+	.type =		TCA_ACT_BPF,
+	.owner =	THIS_MODULE,
+	.act =		tcf_bpf,
+	.dump =		tcf_bpf_dump,
+	.cleanup =	tcf_bpf_cleanup,
+	.init =		tcf_bpf_init,
+};
+
+static int __init bpf_init_module(void)
+{
+	return tcf_register_action(&act_bpf_ops, BPF_TAB_MASK);
+}
+
+static void __exit bpf_cleanup_module(void)
+{
+	tcf_unregister_action(&act_bpf_ops);
+}
+
+module_init(bpf_init_module);
+module_exit(bpf_cleanup_module);
+
+MODULE_AUTHOR("Jiri Pirko <jiri@resnulli.us>");
+MODULE_DESCRIPTION("TC BPF based action");
+MODULE_LICENSE("GPL v2");
diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c
new file mode 100644
index 000000000000..8e472518f9f6
--- /dev/null
+++ b/net/sched/act_connmark.c
@@ -0,0 +1,192 @@
+/*
+ * net/sched/act_connmark.c  netfilter connmark retriever action
+ * skb mark is over-written
+ *
+ * Copyright (c) 2011 Felix Fietkau <nbd@openwrt.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+*/
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/pkt_cls.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <net/act_api.h>
+#include <uapi/linux/tc_act/tc_connmark.h>
+#include <net/tc_act/tc_connmark.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+
+#define CONNMARK_TAB_MASK     3
+
+static int tcf_connmark(struct sk_buff *skb, const struct tc_action *a,
+			struct tcf_result *res)
+{
+	const struct nf_conntrack_tuple_hash *thash;
+	struct nf_conntrack_tuple tuple;
+	enum ip_conntrack_info ctinfo;
+	struct tcf_connmark_info *ca = a->priv;
+	struct nf_conn *c;
+	int proto;
+
+	spin_lock(&ca->tcf_lock);
+	ca->tcf_tm.lastuse = jiffies;
+	bstats_update(&ca->tcf_bstats, skb);
+
+	if (skb->protocol == htons(ETH_P_IP)) {
+		if (skb->len < sizeof(struct iphdr))
+			goto out;
+
+		proto = NFPROTO_IPV4;
+	} else if (skb->protocol == htons(ETH_P_IPV6)) {
+		if (skb->len < sizeof(struct ipv6hdr))
+			goto out;
+
+		proto = NFPROTO_IPV6;
+	} else {
+		goto out;
+	}
+
+	c = nf_ct_get(skb, &ctinfo);
+	if (c) {
+		skb->mark = c->mark;
+		/* using overlimits stats to count how many packets marked */
+		ca->tcf_qstats.overlimits++;
+		nf_ct_put(c);
+		goto out;
+	}
+
+	if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb),
+			       proto, &tuple))
+		goto out;
+
+	thash = nf_conntrack_find_get(dev_net(skb->dev), ca->zone, &tuple);
+	if (!thash)
+		goto out;
+
+	c = nf_ct_tuplehash_to_ctrack(thash);
+	/* using overlimits stats to count how many packets marked */
+	ca->tcf_qstats.overlimits++;
+	skb->mark = c->mark;
+	nf_ct_put(c);
+
+out:
+	skb->nfct = NULL;
+	spin_unlock(&ca->tcf_lock);
+	return ca->tcf_action;
+}
+
+static const struct nla_policy connmark_policy[TCA_CONNMARK_MAX + 1] = {
+	[TCA_CONNMARK_PARMS] = { .len = sizeof(struct tc_connmark) },
+};
+
+static int tcf_connmark_init(struct net *net, struct nlattr *nla,
+			     struct nlattr *est, struct tc_action *a,
+			     int ovr, int bind)
+{
+	struct nlattr *tb[TCA_CONNMARK_MAX + 1];
+	struct tcf_connmark_info *ci;
+	struct tc_connmark *parm;
+	int ret = 0;
+
+	if (!nla)
+		return -EINVAL;
+
+	ret = nla_parse_nested(tb, TCA_CONNMARK_MAX, nla, connmark_policy);
+	if (ret < 0)
+		return ret;
+
+	parm = nla_data(tb[TCA_CONNMARK_PARMS]);
+
+	if (!tcf_hash_check(parm->index, a, bind)) {
+		ret = tcf_hash_create(parm->index, est, a, sizeof(*ci), bind);
+		if (ret)
+			return ret;
+
+		ci = to_connmark(a);
+		ci->tcf_action = parm->action;
+		ci->zone = parm->zone;
+
+		tcf_hash_insert(a);
+		ret = ACT_P_CREATED;
+	} else {
+		ci = to_connmark(a);
+		if (bind)
+			return 0;
+		tcf_hash_release(a, bind);
+		if (!ovr)
+			return -EEXIST;
+		/* replacing action and zone */
+		ci->tcf_action = parm->action;
+		ci->zone = parm->zone;
+	}
+
+	return ret;
+}
+
+static inline int tcf_connmark_dump(struct sk_buff *skb, struct tc_action *a,
+				    int bind, int ref)
+{
+	unsigned char *b = skb_tail_pointer(skb);
+	struct tcf_connmark_info *ci = a->priv;
+
+	struct tc_connmark opt = {
+		.index   = ci->tcf_index,
+		.refcnt  = ci->tcf_refcnt - ref,
+		.bindcnt = ci->tcf_bindcnt - bind,
+		.action  = ci->tcf_action,
+		.zone   = ci->zone,
+	};
+	struct tcf_t t;
+
+	if (nla_put(skb, TCA_CONNMARK_PARMS, sizeof(opt), &opt))
+		goto nla_put_failure;
+
+	t.install = jiffies_to_clock_t(jiffies - ci->tcf_tm.install);
+	t.lastuse = jiffies_to_clock_t(jiffies - ci->tcf_tm.lastuse);
+	t.expires = jiffies_to_clock_t(ci->tcf_tm.expires);
+	if (nla_put(skb, TCA_CONNMARK_TM, sizeof(t), &t))
+		goto nla_put_failure;
+
+	return skb->len;
+nla_put_failure:
+	nlmsg_trim(skb, b);
+	return -1;
+}
+
+static struct tc_action_ops act_connmark_ops = {
+	.kind		=	"connmark",
+	.type		=	TCA_ACT_CONNMARK,
+	.owner		=	THIS_MODULE,
+	.act		=	tcf_connmark,
+	.dump		=	tcf_connmark_dump,
+	.init		=	tcf_connmark_init,
+};
+
+static int __init connmark_init_module(void)
+{
+	return tcf_register_action(&act_connmark_ops, CONNMARK_TAB_MASK);
+}
+
+static void __exit connmark_cleanup_module(void)
+{
+	tcf_unregister_action(&act_connmark_ops);
+}
+
+module_init(connmark_init_module);
+module_exit(connmark_cleanup_module);
+MODULE_AUTHOR("Felix Fietkau <nbd@openwrt.org>");
+MODULE_DESCRIPTION("Connection tracking mark restoring");
+MODULE_LICENSE("GPL");
+
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index edbf40dac709..4cd5cf1aedf8 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -509,7 +509,7 @@ static int tcf_csum(struct sk_buff *skb,
 	if (unlikely(action == TC_ACT_SHOT))
 		goto drop;
 
-	switch (skb->protocol) {
+	switch (tc_skb_protocol(skb)) {
 	case cpu_to_be16(ETH_P_IP):
 		if (!tcf_csum_ipv4(skb, update_flags))
 			goto drop;
diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c
index 5aed341406c2..fc399db86f11 100644
--- a/net/sched/cls_basic.c
+++ b/net/sched/cls_basic.c
@@ -65,9 +65,12 @@ static unsigned long basic_get(struct tcf_proto *tp, u32 handle)
 	if (head == NULL)
 		return 0UL;
 
-	list_for_each_entry(f, &head->flist, link)
-		if (f->handle == handle)
+	list_for_each_entry(f, &head->flist, link) {
+		if (f->handle == handle) {
 			l = (unsigned long) f;
+			break;
+		}
+	}
 
 	return l;
 }
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index f59adf8a4cd7..5f3ee9e4b5bf 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -37,7 +37,7 @@ struct cls_bpf_prog {
 	struct tcf_result res;
 	struct list_head link;
 	u32 handle;
-	u16 bpf_len;
+	u16 bpf_num_ops;
 	struct tcf_proto *tp;
 	struct rcu_head rcu;
 };
@@ -160,7 +160,7 @@ static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp,
 	struct tcf_exts exts;
 	struct sock_fprog_kern tmp;
 	struct bpf_prog *fp;
-	u16 bpf_size, bpf_len;
+	u16 bpf_size, bpf_num_ops;
 	u32 classid;
 	int ret;
 
@@ -173,13 +173,13 @@ static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp,
 		return ret;
 
 	classid = nla_get_u32(tb[TCA_BPF_CLASSID]);
-	bpf_len = nla_get_u16(tb[TCA_BPF_OPS_LEN]);
-	if (bpf_len > BPF_MAXINSNS || bpf_len == 0) {
+	bpf_num_ops = nla_get_u16(tb[TCA_BPF_OPS_LEN]);
+	if (bpf_num_ops > BPF_MAXINSNS || bpf_num_ops == 0) {
 		ret = -EINVAL;
 		goto errout;
 	}
 
-	bpf_size = bpf_len * sizeof(*bpf_ops);
+	bpf_size = bpf_num_ops * sizeof(*bpf_ops);
 	if (bpf_size != nla_len(tb[TCA_BPF_OPS])) {
 		ret = -EINVAL;
 		goto errout;
@@ -193,14 +193,14 @@ static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp,
 
 	memcpy(bpf_ops, nla_data(tb[TCA_BPF_OPS]), bpf_size);
 
-	tmp.len = bpf_len;
+	tmp.len = bpf_num_ops;
 	tmp.filter = bpf_ops;
 
 	ret = bpf_prog_create(&fp, &tmp);
 	if (ret)
 		goto errout_free;
 
-	prog->bpf_len = bpf_len;
+	prog->bpf_num_ops = bpf_num_ops;
 	prog->bpf_ops = bpf_ops;
 	prog->filter = fp;
 	prog->res.classid = classid;
@@ -314,10 +314,10 @@ static int cls_bpf_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
 
 	if (nla_put_u32(skb, TCA_BPF_CLASSID, prog->res.classid))
 		goto nla_put_failure;
-	if (nla_put_u16(skb, TCA_BPF_OPS_LEN, prog->bpf_len))
+	if (nla_put_u16(skb, TCA_BPF_OPS_LEN, prog->bpf_num_ops))
 		goto nla_put_failure;
 
-	nla = nla_reserve(skb, TCA_BPF_OPS, prog->bpf_len *
+	nla = nla_reserve(skb, TCA_BPF_OPS, prog->bpf_num_ops *
 			  sizeof(struct sock_filter));
 	if (nla == NULL)
 		goto nla_put_failure;
diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c
index 15d68f24a521..461410394d08 100644
--- a/net/sched/cls_flow.c
+++ b/net/sched/cls_flow.c
@@ -77,7 +77,7 @@ static u32 flow_get_dst(const struct sk_buff *skb, const struct flow_keys *flow)
 {
 	if (flow->dst)
 		return ntohl(flow->dst);
-	return addr_fold(skb_dst(skb)) ^ (__force u16)skb->protocol;
+	return addr_fold(skb_dst(skb)) ^ (__force u16) tc_skb_protocol(skb);
 }
 
 static u32 flow_get_proto(const struct sk_buff *skb, const struct flow_keys *flow)
@@ -98,7 +98,7 @@ static u32 flow_get_proto_dst(const struct sk_buff *skb, const struct flow_keys
 	if (flow->ports)
 		return ntohs(flow->port16[1]);
 
-	return addr_fold(skb_dst(skb)) ^ (__force u16)skb->protocol;
+	return addr_fold(skb_dst(skb)) ^ (__force u16) tc_skb_protocol(skb);
 }
 
 static u32 flow_get_iif(const struct sk_buff *skb)
@@ -144,7 +144,7 @@ static u32 flow_get_nfct(const struct sk_buff *skb)
 
 static u32 flow_get_nfct_src(const struct sk_buff *skb, const struct flow_keys *flow)
 {
-	switch (skb->protocol) {
+	switch (tc_skb_protocol(skb)) {
 	case htons(ETH_P_IP):
 		return ntohl(CTTUPLE(skb, src.u3.ip));
 	case htons(ETH_P_IPV6):
@@ -156,7 +156,7 @@ fallback:
 
 static u32 flow_get_nfct_dst(const struct sk_buff *skb, const struct flow_keys *flow)
 {
-	switch (skb->protocol) {
+	switch (tc_skb_protocol(skb)) {
 	case htons(ETH_P_IP):
 		return ntohl(CTTUPLE(skb, dst.u3.ip));
 	case htons(ETH_P_IPV6):
diff --git a/net/sched/em_ipset.c b/net/sched/em_ipset.c
index 5b4a4efe468c..a3d79c8bf3b8 100644
--- a/net/sched/em_ipset.c
+++ b/net/sched/em_ipset.c
@@ -59,7 +59,7 @@ static int em_ipset_match(struct sk_buff *skb, struct tcf_ematch *em,
 	struct net_device *dev, *indev = NULL;
 	int ret, network_offset;
 
-	switch (skb->protocol) {
+	switch (tc_skb_protocol(skb)) {
 	case htons(ETH_P_IP):
 		acpar.family = NFPROTO_IPV4;
 		if (!pskb_network_may_pull(skb, sizeof(struct iphdr)))
diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c
index c8f8c399b99a..b5294ce20cd4 100644
--- a/net/sched/em_meta.c
+++ b/net/sched/em_meta.c
@@ -176,7 +176,7 @@ META_COLLECTOR(int_vlan_tag)
 {
 	unsigned short tag;
 
-	tag = vlan_tx_tag_get(skb);
+	tag = skb_vlan_tag_get(skb);
 	if (!tag && __vlan_get_tag(skb, &tag))
 		*err = -1;
 	else
@@ -197,7 +197,7 @@ META_COLLECTOR(int_priority)
 META_COLLECTOR(int_protocol)
 {
 	/* Let userspace take care of the byte ordering */
-	dst->value = skb->protocol;
+	dst->value = tc_skb_protocol(skb);
 }
 
 META_COLLECTOR(int_pkttype)
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 76f402e05bd6..243b7d169d61 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -1807,7 +1807,7 @@ done:
 int tc_classify_compat(struct sk_buff *skb, const struct tcf_proto *tp,
 		       struct tcf_result *res)
 {
-	__be16 protocol = skb->protocol;
+	__be16 protocol = tc_skb_protocol(skb);
 	int err;
 
 	for (; tp; tp = rcu_dereference_bh(tp->next)) {
diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c
index 227114f27f94..66700a6116aa 100644
--- a/net/sched/sch_dsmark.c
+++ b/net/sched/sch_dsmark.c
@@ -203,7 +203,7 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	pr_debug("%s(skb %p,sch %p,[qdisc %p])\n", __func__, skb, sch, p);
 
 	if (p->set_tc_index) {
-		switch (skb->protocol) {
+		switch (tc_skb_protocol(skb)) {
 		case htons(ETH_P_IP):
 			if (skb_cow_head(skb, sizeof(struct iphdr)))
 				goto drop;
@@ -289,7 +289,7 @@ static struct sk_buff *dsmark_dequeue(struct Qdisc *sch)
 	index = skb->tc_index & (p->indices - 1);
 	pr_debug("index %d->%d\n", skb->tc_index, index);
 
-	switch (skb->protocol) {
+	switch (tc_skb_protocol(skb)) {
 	case htons(ETH_P_IP):
 		ipv4_change_dsfield(ip_hdr(skb), p->mask[index],
 				    p->value[index]);
@@ -306,7 +306,7 @@ static struct sk_buff *dsmark_dequeue(struct Qdisc *sch)
 		 */
 		if (p->mask[index] != 0xff || p->value[index])
 			pr_warn("%s: unsupported protocol %d\n",
-				__func__, ntohs(skb->protocol));
+				__func__, ntohs(tc_skb_protocol(skb)));
 		break;
 	}
 
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index 333cd94ba381..dfcea20e3171 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -1,7 +1,7 @@
 /*
  * net/sched/sch_fq.c Fair Queue Packet Scheduler (per flow pacing)
  *
- *  Copyright (C) 2013 Eric Dumazet <edumazet@google.com>
+ *  Copyright (C) 2013-2015 Eric Dumazet <edumazet@google.com>
  *
  *	This program is free software; you can redistribute it and/or
  *	modify it under the terms of the GNU General Public License
@@ -52,6 +52,7 @@
 #include <net/pkt_sched.h>
 #include <net/sock.h>
 #include <net/tcp_states.h>
+#include <net/tcp.h>
 
 /*
  * Per flow structure, dynamically allocated
@@ -92,6 +93,7 @@ struct fq_sched_data {
 	u32		flow_refill_delay;
 	u32		flow_max_rate;	/* optional max rate per flow */
 	u32		flow_plimit;	/* max packets per flow */
+	u32		orphan_mask;	/* mask for orphaned skb */
 	struct rb_root	*fq_root;
 	u8		rate_enable;
 	u8		fq_trees_log;
@@ -222,11 +224,20 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
 	if (unlikely((skb->priority & TC_PRIO_MAX) == TC_PRIO_CONTROL))
 		return &q->internal;
 
-	if (unlikely(!sk)) {
+	/* SYNACK messages are attached to a listener socket.
+	 * 1) They are not part of a 'flow' yet
+	 * 2) We do not want to rate limit them (eg SYNFLOOD attack),
+	 *    especially if the listener set SO_MAX_PACING_RATE
+	 * 3) We pretend they are orphaned
+	 */
+	if (!sk || sk->sk_state == TCP_LISTEN) {
+		unsigned long hash = skb_get_hash(skb) & q->orphan_mask;
+
 		/* By forcing low order bit to 1, we make sure to not
 		 * collide with a local flow (socket pointers are word aligned)
 		 */
-		sk = (struct sock *)(skb_get_hash(skb) | 1L);
+		sk = (struct sock *)((hash << 1) | 1UL);
+		skb_orphan(skb);
 	}
 
 	root = &q->fq_root[hash_32((u32)(long)sk, q->fq_trees_log)];
@@ -445,7 +456,9 @@ begin:
 		goto begin;
 	}
 
-	if (unlikely(f->head && now < f->time_next_packet)) {
+	skb = f->head;
+	if (unlikely(skb && now < f->time_next_packet &&
+		     !skb_is_tcp_pure_ack(skb))) {
 		head->first = f->next;
 		fq_flow_set_throttled(q, f);
 		goto begin;
@@ -464,14 +477,17 @@ begin:
 		goto begin;
 	}
 	prefetch(&skb->end);
-	f->time_next_packet = now;
 	f->credit -= qdisc_pkt_len(skb);
 
 	if (f->credit > 0 || !q->rate_enable)
 		goto out;
 
+	/* Do not pace locally generated ack packets */
+	if (skb_is_tcp_pure_ack(skb))
+		goto out;
+
 	rate = q->flow_max_rate;
-	if (skb->sk && skb->sk->sk_state != TCP_TIME_WAIT)
+	if (skb->sk)
 		rate = min(skb->sk->sk_pacing_rate, rate);
 
 	if (rate != ~0U) {
@@ -704,6 +720,9 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt)
 		q->flow_refill_delay = usecs_to_jiffies(usecs_delay);
 	}
 
+	if (tb[TCA_FQ_ORPHAN_MASK])
+		q->orphan_mask = nla_get_u32(tb[TCA_FQ_ORPHAN_MASK]);
+
 	if (!err) {
 		sch_tree_unlock(sch);
 		err = fq_resize(sch, fq_log);
@@ -749,6 +768,7 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt)
 	q->delayed		= RB_ROOT;
 	q->fq_root		= NULL;
 	q->fq_trees_log		= ilog2(1024);
+	q->orphan_mask		= 1024 - 1;
 	qdisc_watchdog_init(&q->watchdog, sch);
 
 	if (opt)
@@ -778,6 +798,7 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
 	    nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE, q->flow_max_rate) ||
 	    nla_put_u32(skb, TCA_FQ_FLOW_REFILL_DELAY,
 			jiffies_to_usecs(q->flow_refill_delay)) ||
+	    nla_put_u32(skb, TCA_FQ_ORPHAN_MASK, q->orphan_mask) ||
 	    nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log))
 		goto nla_put_failure;
 
diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c
index 6ada42396a24..e02687185a59 100644
--- a/net/sched/sch_teql.c
+++ b/net/sched/sch_teql.c
@@ -122,13 +122,6 @@ teql_peek(struct Qdisc *sch)
 	return NULL;
 }
 
-static inline void
-teql_neigh_release(struct neighbour *n)
-{
-	if (n)
-		neigh_release(n);
-}
-
 static void
 teql_reset(struct Qdisc *sch)
 {
@@ -249,8 +242,8 @@ __teql_resolve(struct sk_buff *skb, struct sk_buff *skb_res,
 		char haddr[MAX_ADDR_LEN];
 
 		neigh_ha_snapshot(haddr, n, dev);
-		err = dev_hard_header(skb, dev, ntohs(skb->protocol), haddr,
-				      NULL, skb->len);
+		err = dev_hard_header(skb, dev, ntohs(tc_skb_protocol(skb)),
+				      haddr, NULL, skb->len);
 
 		if (err < 0)
 			err = -EINVAL;
author	Linus Torvalds <torvalds@linux-foundation.org>	2015-02-10 20:01:30 -0800
committer	Linus Torvalds <torvalds@linux-foundation.org>	2015-02-10 20:01:30 -0800
commit	c5ce28df0e7c01a1de23c36ebdefcd803f2b6cbb (patch)
tree	9830baf38832769e1cf621708889111bbe3c93df /net/sched
parent	29afc4e9a408f2304e09c6dd0dbcfbd2356d0faa (diff)
parent	9399f0c51489ae8c16d6559b82a452fdc1895e91 (diff)