From 87775312a86bcf213e3b21f6f7c79e2e00d96f7b Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Thu, 2 Jul 2015 16:30:24 +0200 Subject: net-ipv6: Delete an unnecessary check before the function call "free_percpu" The free_percpu() function tests whether its argument is NULL and then returns immediately. Thus the test around the call is not needed. This issue was detected by using the Coccinelle software. Signed-off-by: Markus Elfring Signed-off-by: David S. Miller --- net/ipv6/route.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'net/ipv6/route.c') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 1a1122a6bbf5..6090969937f8 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -369,10 +369,7 @@ static void ip6_dst_destroy(struct dst_entry *dst) struct inet6_dev *idev; dst_destroy_metrics_generic(dst); - - if (rt->rt6i_pcpu) - free_percpu(rt->rt6i_pcpu); - + free_percpu(rt->rt6i_pcpu); rt6_uncached_list_del(rt); idev = rt->rt6i_idev; -- cgit v1.2.3-70-g09d2 From 19e42e45150672124b6a4341e2bc7982d247f0ac Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Tue, 21 Jul 2015 10:43:48 +0200 Subject: ipv6: support for fib route lwtunnel encap attributes This patch adds support in ipv6 fib functions to parse Netlink RTA encap attributes and attach encap state data to rt6_info. Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 3 +++ net/ipv6/ip6_fib.c | 2 ++ net/ipv6/route.c | 33 ++++++++++++++++++++++++++++++--- 3 files changed, 35 insertions(+), 3 deletions(-) (limited to 'net/ipv6/route.c') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 3b76849c190f..276328e3daa6 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -51,6 +51,8 @@ struct fib6_config { struct nlattr *fc_mp; struct nl_info fc_nlinfo; + struct nlattr *fc_encap; + u16 fc_encap_type; }; struct fib6_node { @@ -131,6 +133,7 @@ struct rt6_info { /* more non-fragment space at head required */ unsigned short rt6i_nfheader_len; u8 rt6i_protocol; + struct lwtunnel_state *rt6i_lwtstate; }; static inline struct inet6_dev *ip6_dst_idev(struct dst_entry *dst) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 55d19861ab20..d715f2e0c4e7 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include @@ -177,6 +178,7 @@ static void rt6_free_pcpu(struct rt6_info *non_pcpu_rt) static void rt6_release(struct rt6_info *rt) { if (atomic_dec_and_test(&rt->rt6i_ref)) { + lwtunnel_state_put(rt->rt6i_lwtstate); rt6_free_pcpu(rt); dst_free(&rt->dst); } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 6090969937f8..b3431b79dfb1 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -58,6 +58,7 @@ #include #include #include +#include #include @@ -1770,6 +1771,17 @@ int ip6_route_add(struct fib6_config *cfg) rt->dst.output = ip6_output; + if (cfg->fc_encap) { + struct lwtunnel_state *lwtstate; + + err = lwtunnel_build_state(dev, cfg->fc_encap_type, + cfg->fc_encap, &lwtstate); + if (err) + goto out; + lwtunnel_state_get(lwtstate); + rt->rt6i_lwtstate = lwtstate; + } + ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); rt->rt6i_dst.plen = cfg->fc_dst_len; if (rt->rt6i_dst.plen == 128) @@ -2595,6 +2607,8 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { [RTA_METRICS] = { .type = NLA_NESTED }, [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, [RTA_PREF] = { .type = NLA_U8 }, + [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, + [RTA_ENCAP] = { .type = NLA_NESTED }, }; static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, @@ -2689,6 +2703,12 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, cfg->fc_flags |= RTF_PREF(pref); } + if (tb[RTA_ENCAP]) + cfg->fc_encap = tb[RTA_ENCAP]; + + if (tb[RTA_ENCAP_TYPE]) + cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]); + err = 0; errout: return err; @@ -2721,6 +2741,10 @@ beginning: r_cfg.fc_gateway = nla_get_in6_addr(nla); r_cfg.fc_flags |= RTF_GATEWAY; } + r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP); + nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE); + if (nla) + r_cfg.fc_encap_type = nla_get_u16(nla); } err = add ? ip6_route_add(&r_cfg) : ip6_route_del(&r_cfg); if (err) { @@ -2783,7 +2807,7 @@ static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh) return ip6_route_add(&cfg); } -static inline size_t rt6_nlmsg_size(void) +static inline size_t rt6_nlmsg_size(struct rt6_info *rt) { return NLMSG_ALIGN(sizeof(struct rtmsg)) + nla_total_size(16) /* RTA_SRC */ @@ -2797,7 +2821,8 @@ static inline size_t rt6_nlmsg_size(void) + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */ + nla_total_size(sizeof(struct rta_cacheinfo)) + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */ - + nla_total_size(1); /* RTA_PREF */ + + nla_total_size(1) /* RTA_PREF */ + + lwtunnel_get_encap_size(rt->rt6i_lwtstate); } static int rt6_fill_node(struct net *net, @@ -2945,6 +2970,8 @@ static int rt6_fill_node(struct net *net, if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags))) goto nla_put_failure; + lwtunnel_fill_encap(skb, rt->rt6i_lwtstate); + nlmsg_end(skb, nlh); return 0; @@ -3071,7 +3098,7 @@ void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info) err = -ENOBUFS; seq = info->nlh ? info->nlh->nlmsg_seq : 0; - skb = nlmsg_new(rt6_nlmsg_size(), gfp_any()); + skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); if (!skb) goto errout; -- cgit v1.2.3-70-g09d2 From 74a0f2fe8ed51e3adbb1c882be04672fe7bb6996 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Tue, 21 Jul 2015 10:43:51 +0200 Subject: ipv6: rt6_info output redirect to tunnel output This is similar to ipv4 redirect of dst output to lwtunnel output function for encapsulation and xmit. Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- net/ipv6/route.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/ipv6/route.c') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index b3431b79dfb1..7f2214f8fde7 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1780,6 +1780,7 @@ int ip6_route_add(struct fib6_config *cfg) goto out; lwtunnel_state_get(lwtstate); rt->rt6i_lwtstate = lwtstate; + rt->dst.output = lwtunnel_output6; } ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); -- cgit v1.2.3-70-g09d2 From 6673a9f4e35c1f0e9976cd4e88042f87674a6b02 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Fri, 24 Jul 2015 10:59:41 +0200 Subject: ipv6: use lwtunnel_output6() only if flag redirect is set This function make sense only when LWTUNNEL_STATE_OUTPUT_REDIRECT is set. The check is already done in IPv4. CC: Thomas Graf CC: Roopa Prabhu Fixes: 74a0f2fe8ed5 ("ipv6: rt6_info output redirect to tunnel output") Signed-off-by: Nicolas Dichtel Acked-by: Thomas Graf Acked-by: Roopa Prabhu Signed-off-by: David S. Miller --- net/ipv6/route.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/ipv6/route.c') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 7f2214f8fde7..f91d2637072b 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1780,7 +1780,8 @@ int ip6_route_add(struct fib6_config *cfg) goto out; lwtunnel_state_get(lwtstate); rt->rt6i_lwtstate = lwtstate; - rt->dst.output = lwtunnel_output6; + if (lwtunnel_output_redirect(rt->rt6i_lwtstate)) + rt->dst.output = lwtunnel_output6; } ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); -- cgit v1.2.3-70-g09d2 From d943659508a4fb883507fdd3f998329e70a8f922 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Fri, 24 Jul 2015 12:28:35 +0200 Subject: ipv6: copy lwtstate in ip6_rt_copy_init() We need to copy this field (ip6_rt_cache_alloc() and ip6_rt_pcpu_alloc() use ip6_rt_copy_init() to build a dst). CC: Thomas Graf CC: Roopa Prabhu Fixes: 19e42e451506 ("ipv6: support for fib route lwtunnel encap attributes") Signed-off-by: Nicolas Dichtel Acked-by: Thomas Graf Acked-by: Roopa Prabhu Signed-off-by: David S. Miller --- net/ipv6/route.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net/ipv6/route.c') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index f91d2637072b..fbe27fb6bd3f 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2161,6 +2161,10 @@ static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort) #endif rt->rt6i_prefsrc = ort->rt6i_prefsrc; rt->rt6i_table = ort->rt6i_table; + if (ort->rt6i_lwtstate) { + lwtunnel_state_get(ort->rt6i_lwtstate); + rt->rt6i_lwtstate = ort->rt6i_lwtstate; + } } #ifdef CONFIG_IPV6_ROUTE_INFO -- cgit v1.2.3-70-g09d2 From 5a6228a0b472062646434cd2536d109c102b606e Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Fri, 24 Jul 2015 12:28:36 +0200 Subject: lwtunnel: change prototype of lwtunnel_state_get() It saves some lines and simplify a bit the code when the state is returning by this function. It's also useful to handle a NULL entry. To avoid too long lines, I've also renamed lwtunnel_state_get() and lwtunnel_state_put() to lwtstate_get() and lwtstate_put(). CC: Thomas Graf CC: Roopa Prabhu Signed-off-by: Nicolas Dichtel Acked-by: Thomas Graf Acked-by: Roopa Prabhu Signed-off-by: David S. Miller --- include/net/lwtunnel.h | 16 +++++++++++----- net/ipv4/fib_semantics.c | 9 ++++----- net/ipv4/route.c | 9 ++------- net/ipv6/ip6_fib.c | 2 +- net/ipv6/route.c | 8 ++------ 5 files changed, 20 insertions(+), 24 deletions(-) (limited to 'net/ipv6/route.c') diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h index 918e03c1dafa..b02039081b04 100644 --- a/include/net/lwtunnel.h +++ b/include/net/lwtunnel.h @@ -35,12 +35,16 @@ extern const struct lwtunnel_encap_ops __rcu * lwtun_encaps[LWTUNNEL_ENCAP_MAX+1]; #ifdef CONFIG_LWTUNNEL -static inline void lwtunnel_state_get(struct lwtunnel_state *lws) +static inline struct lwtunnel_state * +lwtstate_get(struct lwtunnel_state *lws) { - atomic_inc(&lws->refcnt); + if (lws) + atomic_inc(&lws->refcnt); + + return lws; } -static inline void lwtunnel_state_put(struct lwtunnel_state *lws) +static inline void lwtstate_put(struct lwtunnel_state *lws) { if (!lws) return; @@ -74,11 +78,13 @@ int lwtunnel_output6(struct sock *sk, struct sk_buff *skb); #else -static inline void lwtunnel_state_get(struct lwtunnel_state *lws) +static inline struct lwtunnel_state * +lwtstate_get(struct lwtunnel_state *lws) { + return lws; } -static inline void lwtunnel_state_put(struct lwtunnel_state *lws) +static inline void lwtstate_put(struct lwtunnel_state *lws) { } diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index d4c6732cfbfa..65e00399a9a6 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -209,7 +209,7 @@ static void free_fib_info_rcu(struct rcu_head *head) change_nexthops(fi) { if (nexthop_nh->nh_dev) dev_put(nexthop_nh->nh_dev); - lwtunnel_state_put(nexthop_nh->nh_lwtstate); + lwtstate_put(nexthop_nh->nh_lwtstate); free_nh_exceptions(nexthop_nh); rt_fibinfo_free_cpus(nexthop_nh->nh_pcpu_rth_output); rt_fibinfo_free(&nexthop_nh->nh_rth_input); @@ -514,8 +514,8 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, nla, &lwtstate); if (ret) goto errout; - lwtunnel_state_get(lwtstate); - nexthop_nh->nh_lwtstate = lwtstate; + nexthop_nh->nh_lwtstate = + lwtstate_get(lwtstate); } } @@ -971,8 +971,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg) if (err) goto failure; - lwtunnel_state_get(lwtstate); - nh->nh_lwtstate = lwtstate; + nh->nh_lwtstate = lwtstate_get(lwtstate); } nh->nh_oif = cfg->fc_oif; nh->nh_gw = cfg->fc_gw; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 519ec232818d..11096396ef4a 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1358,7 +1358,7 @@ static void ipv4_dst_destroy(struct dst_entry *dst) list_del(&rt->rt_uncached); spin_unlock_bh(&ul->lock); } - lwtunnel_state_put(rt->rt_lwtstate); + lwtstate_put(rt->rt_lwtstate); } void rt_flush_dev(struct net_device *dev) @@ -1407,12 +1407,7 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, #ifdef CONFIG_IP_ROUTE_CLASSID rt->dst.tclassid = nh->nh_tclassid; #endif - if (nh->nh_lwtstate) { - lwtunnel_state_get(nh->nh_lwtstate); - rt->rt_lwtstate = nh->nh_lwtstate; - } else { - rt->rt_lwtstate = NULL; - } + rt->rt_lwtstate = lwtstate_get(nh->nh_lwtstate); if (unlikely(fnhe)) cached = rt_bind_exception(rt, fnhe, daddr); else if (!(rt->dst.flags & DST_NOCACHE)) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index d715f2e0c4e7..5693b5eb8482 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -178,7 +178,7 @@ static void rt6_free_pcpu(struct rt6_info *non_pcpu_rt) static void rt6_release(struct rt6_info *rt) { if (atomic_dec_and_test(&rt->rt6i_ref)) { - lwtunnel_state_put(rt->rt6i_lwtstate); + lwtstate_put(rt->rt6i_lwtstate); rt6_free_pcpu(rt); dst_free(&rt->dst); } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index fbe27fb6bd3f..c9b2b9fe83fc 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1778,8 +1778,7 @@ int ip6_route_add(struct fib6_config *cfg) cfg->fc_encap, &lwtstate); if (err) goto out; - lwtunnel_state_get(lwtstate); - rt->rt6i_lwtstate = lwtstate; + rt->rt6i_lwtstate = lwtstate_get(lwtstate); if (lwtunnel_output_redirect(rt->rt6i_lwtstate)) rt->dst.output = lwtunnel_output6; } @@ -2161,10 +2160,7 @@ static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort) #endif rt->rt6i_prefsrc = ort->rt6i_prefsrc; rt->rt6i_table = ort->rt6i_table; - if (ort->rt6i_lwtstate) { - lwtunnel_state_get(ort->rt6i_lwtstate); - rt->rt6i_lwtstate = ort->rt6i_lwtstate; - } + rt->rt6i_lwtstate = lwtstate_get(ort->rt6i_lwtstate); } #ifdef CONFIG_IPV6_ROUTE_INFO -- cgit v1.2.3-70-g09d2 From 990edb428c2c85c22ca770330437db7183cbe8b5 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 24 Jul 2015 09:57:42 -0700 Subject: ipv6: Re-arrange code in rt6_probe() It is a prep work for the next patch to remove write_lock from rt6_probe(). 1. Reduce the number of if(neigh) check. From 4 to 1. 2. Bring the write_(un)lock() closer to the operations that the lock is protecting. Hopefully, the above make rt6_probe() more readable. Signed-off-by: Martin KaFai Lau Cc: Hannes Frederic Sowa Cc: Julian Anastasov Cc: YOSHIFUJI Hideaki Signed-off-by: David S. Miller --- net/ipv6/route.c | 44 ++++++++++++++++++++------------------------ 1 file changed, 20 insertions(+), 24 deletions(-) (limited to 'net/ipv6/route.c') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index c9b2b9fe83fc..0ef52623fc77 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -545,6 +545,7 @@ static void rt6_probe_deferred(struct work_struct *w) static void rt6_probe(struct rt6_info *rt) { + struct __rt6_probe_work *work; struct neighbour *neigh; /* * Okay, this does not seem to be appropriate @@ -559,34 +560,29 @@ static void rt6_probe(struct rt6_info *rt) rcu_read_lock_bh(); neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway); if (neigh) { + work = NULL; write_lock(&neigh->lock); - if (neigh->nud_state & NUD_VALID) - goto out; - } - - if (!neigh || - time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) { - struct __rt6_probe_work *work; - - work = kmalloc(sizeof(*work), GFP_ATOMIC); - - if (neigh && work) - __neigh_set_probe_once(neigh); - - if (neigh) - write_unlock(&neigh->lock); - - if (work) { - INIT_WORK(&work->work, rt6_probe_deferred); - work->target = rt->rt6i_gateway; - dev_hold(rt->dst.dev); - work->dev = rt->dst.dev; - schedule_work(&work->work); + if (!(neigh->nud_state & NUD_VALID) && + time_after(jiffies, + neigh->updated + + rt->rt6i_idev->cnf.rtr_probe_interval)) { + work = kmalloc(sizeof(*work), GFP_ATOMIC); + if (work) + __neigh_set_probe_once(neigh); } - } else { -out: write_unlock(&neigh->lock); + } else { + work = kmalloc(sizeof(*work), GFP_ATOMIC); + } + + if (work) { + INIT_WORK(&work->work, rt6_probe_deferred); + work->target = rt->rt6i_gateway; + dev_hold(rt->dst.dev); + work->dev = rt->dst.dev; + schedule_work(&work->work); } + rcu_read_unlock_bh(); } #else -- cgit v1.2.3-70-g09d2 From 8d6c31bf574177c8de48dd1387d96e1ec3a8b8bc Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 24 Jul 2015 09:57:43 -0700 Subject: ipv6: Avoid rt6_probe() taking writer lock in the fast path The patch checks neigh->nud_state before acquiring the writer lock. Note that rt6_probe() is only used in CONFIG_IPV6_ROUTER_PREF. 40 udpflood processes and a /64 gateway route are used. The gateway has NUD_PERMANENT. Each of them is run for 30s. At the end, the total number of finished sendto(): Before: 55M After: 95M Signed-off-by: Martin KaFai Lau Cc: Hannes Frederic Sowa CC: Julian Anastasov CC: YOSHIFUJI Hideaki Signed-off-by: David S. Miller --- net/ipv6/route.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net/ipv6/route.c') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 0ef52623fc77..54fccf0d705d 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -560,6 +560,9 @@ static void rt6_probe(struct rt6_info *rt) rcu_read_lock_bh(); neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway); if (neigh) { + if (neigh->nud_state & NUD_VALID) + goto out; + work = NULL; write_lock(&neigh->lock); if (!(neigh->nud_state & NUD_VALID) && @@ -583,6 +586,7 @@ static void rt6_probe(struct rt6_info *rt) schedule_work(&work->work); } +out: rcu_read_unlock_bh(); } #else -- cgit v1.2.3-70-g09d2 From 330567b71d8716704b189454553c2696e1eceb6c Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 7 Aug 2015 10:54:28 +0200 Subject: ipv6: don't reject link-local nexthop on other interface 48ed7b26faa7 ("ipv6: reject locally assigned nexthop addresses") is too strict; it rejects following corner-case: ip -6 route add default via fe80::1:2:3 dev eth1 [ where fe80::1:2:3 is assigned to a local interface, but not eth1 ] Fix this by restricting search to given device if nh is linklocal. Joint work with Hannes Frederic Sowa. Fixes: 48ed7b26faa7 ("ipv6: reject locally assigned nexthop addresses") Signed-off-by: Hannes Frederic Sowa Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/ipv6/route.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net/ipv6/route.c') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 6090969937f8..9de4d2bcd916 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1831,6 +1831,7 @@ int ip6_route_add(struct fib6_config *cfg) int gwa_type; gw_addr = &cfg->fc_gateway; + gwa_type = ipv6_addr_type(gw_addr); /* if gw_addr is local we will fail to detect this in case * address is still TENTATIVE (DAD in progress). rt6_lookup() @@ -1838,11 +1839,12 @@ int ip6_route_add(struct fib6_config *cfg) * prefix route was assigned to, which might be non-loopback. */ err = -EINVAL; - if (ipv6_chk_addr_and_flags(net, gw_addr, NULL, 0, 0)) + if (ipv6_chk_addr_and_flags(net, gw_addr, + gwa_type & IPV6_ADDR_LINKLOCAL ? + dev : NULL, 0, 0)) goto out; rt->rt6i_gateway = *gw_addr; - gwa_type = ipv6_addr_type(gw_addr); if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) { struct rt6_info *grt; -- cgit v1.2.3-70-g09d2 From cea45e208d700e9d633a636384a49f19cda979b7 Mon Sep 17 00:00:00 2001 From: Andy Gospodarek Date: Thu, 13 Aug 2015 10:39:00 -0400 Subject: net: track link status of ipv6 nexthops Add support to track current link status of ipv6 nexthops to match recent changes that added support for ipv4 nexthops. This takes a simple approach to track linkdown status for next-hops and simply checks the dev for the dst entry and sets proper flags that to be used in the netlink message. v2: drop use of rt6i_nhflags since it is not needed right now Signed-off-by: Andy Gospodarek Signed-off-by: Dinesh Dutt Signed-off-by: David S. Miller --- net/ipv6/route.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/ipv6/route.c') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index c0fa61eba8f2..370f72785385 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2887,6 +2887,8 @@ static int rt6_fill_node(struct net *net, else rtm->rtm_type = RTN_UNICAST; rtm->rtm_flags = 0; + if (!netif_carrier_ok(rt->dst.dev)) + rtm->rtm_flags |= RTNH_F_LINKDOWN; rtm->rtm_scope = RT_SCOPE_UNIVERSE; rtm->rtm_protocol = rt->rt6i_protocol; if (rt->rt6i_flags & RTF_DYNAMIC) -- cgit v1.2.3-70-g09d2 From 35103d11173b8fea874183f8aa508ae71234d299 Mon Sep 17 00:00:00 2001 From: Andy Gospodarek Date: Thu, 13 Aug 2015 10:39:01 -0400 Subject: net: ipv6 sysctl option to ignore routes when nexthop link is down Like the ipv4 patch with a similar title, this adds a sysctl to allow the user to change routing behavior based on whether or not the interface associated with the nexthop was an up or down link. The default setting preserves the current behavior, but anyone that enables it will notice that nexthops on down interfaces will no longer be selected: net.ipv6.conf.all.ignore_routes_with_linkdown = 0 net.ipv6.conf.default.ignore_routes_with_linkdown = 0 net.ipv6.conf.lo.ignore_routes_with_linkdown = 0 ... When the above sysctls are set, not only will link status be reported to userspace, but an indication that a nexthop is dead and will not be used is also reported. 1000::/8 via 7000::2 dev p7p1 metric 1024 dead linkdown pref medium 1000::/8 via 8000::2 dev p8p1 metric 1024 pref medium 7000::/8 dev p7p1 proto kernel metric 256 dead linkdown pref medium 8000::/8 dev p8p1 proto kernel metric 256 pref medium 9000::/8 via 8000::2 dev p8p1 metric 2048 pref medium 9000::/8 via 7000::2 dev p7p1 metric 1024 dead linkdown pref medium fe80::/64 dev p7p1 proto kernel metric 256 dead linkdown pref medium fe80::/64 dev p8p1 proto kernel metric 256 pref medium This also adds devconf support and notification when sysctl values change. v2: drop use of rt6i_nhflags since it is not needed right now Signed-off-by: Andy Gospodarek Signed-off-by: Dinesh Dutt Signed-off-by: David S. Miller --- include/linux/ipv6.h | 1 + include/uapi/linux/ipv6.h | 1 + net/ipv6/addrconf.c | 105 +++++++++++++++++++++++++++++++++++++++++++++- net/ipv6/route.c | 11 ++++- 4 files changed, 116 insertions(+), 2 deletions(-) (limited to 'net/ipv6/route.c') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index cb9dcad72372..f1f32af6d9b9 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -31,6 +31,7 @@ struct ipv6_devconf { __s32 accept_ra_defrtr; __s32 accept_ra_min_hop_limit; __s32 accept_ra_pinfo; + __s32 ignore_routes_with_linkdown; #ifdef CONFIG_IPV6_ROUTER_PREF __s32 accept_ra_rtr_pref; __s32 rtr_probe_interval; diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h index 80f3b74446a1..38b4fef20219 100644 --- a/include/uapi/linux/ipv6.h +++ b/include/uapi/linux/ipv6.h @@ -173,6 +173,7 @@ enum { DEVCONF_STABLE_SECRET, DEVCONF_USE_OIF_ADDRS_ONLY, DEVCONF_ACCEPT_RA_MIN_HOP_LIMIT, + DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN, DEVCONF_MAX }; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 53e3a9d756b0..5dfbac72f1ab 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -214,6 +214,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { .initialized = false, }, .use_oif_addrs_only = 0, + .ignore_routes_with_linkdown = 0, }; static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { @@ -257,6 +258,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { .initialized = false, }, .use_oif_addrs_only = 0, + .ignore_routes_with_linkdown = 0, }; /* Check if a valid qdisc is available */ @@ -472,6 +474,9 @@ static int inet6_netconf_msgsize_devconf(int type) if (type == -1 || type == NETCONFA_PROXY_NEIGH) size += nla_total_size(4); + if (type == -1 || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) + size += nla_total_size(4); + return size; } @@ -508,6 +513,11 @@ static int inet6_netconf_fill_devconf(struct sk_buff *skb, int ifindex, nla_put_s32(skb, NETCONFA_PROXY_NEIGH, devconf->proxy_ndp) < 0) goto nla_put_failure; + if ((type == -1 || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) && + nla_put_s32(skb, NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN, + devconf->ignore_routes_with_linkdown) < 0) + goto nla_put_failure; + nlmsg_end(skb, nlh); return 0; @@ -544,6 +554,7 @@ static const struct nla_policy devconf_ipv6_policy[NETCONFA_MAX+1] = { [NETCONFA_IFINDEX] = { .len = sizeof(int) }, [NETCONFA_FORWARDING] = { .len = sizeof(int) }, [NETCONFA_PROXY_NEIGH] = { .len = sizeof(int) }, + [NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN] = { .len = sizeof(int) }, }; static int inet6_netconf_get_devconf(struct sk_buff *in_skb, @@ -766,6 +777,63 @@ static int addrconf_fixup_forwarding(struct ctl_table *table, int *p, int newf) rt6_purge_dflt_routers(net); return 1; } + +static void addrconf_linkdown_change(struct net *net, __s32 newf) +{ + struct net_device *dev; + struct inet6_dev *idev; + + for_each_netdev(net, dev) { + idev = __in6_dev_get(dev); + if (idev) { + int changed = (!idev->cnf.ignore_routes_with_linkdown) ^ (!newf); + + idev->cnf.ignore_routes_with_linkdown = newf; + if (changed) + inet6_netconf_notify_devconf(dev_net(dev), + NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN, + dev->ifindex, + &idev->cnf); + } + } +} + +static int addrconf_fixup_linkdown(struct ctl_table *table, int *p, int newf) +{ + struct net *net; + int old; + + if (!rtnl_trylock()) + return restart_syscall(); + + net = (struct net *)table->extra2; + old = *p; + *p = newf; + + if (p == &net->ipv6.devconf_dflt->ignore_routes_with_linkdown) { + if ((!newf) ^ (!old)) + inet6_netconf_notify_devconf(net, + NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN, + NETCONFA_IFINDEX_DEFAULT, + net->ipv6.devconf_dflt); + rtnl_unlock(); + return 0; + } + + if (p == &net->ipv6.devconf_all->ignore_routes_with_linkdown) { + net->ipv6.devconf_dflt->ignore_routes_with_linkdown = newf; + addrconf_linkdown_change(net, newf); + if ((!newf) ^ (!old)) + inet6_netconf_notify_devconf(net, + NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN, + NETCONFA_IFINDEX_ALL, + net->ipv6.devconf_all); + } + rtnl_unlock(); + + return 1; +} + #endif /* Nobody refers to this ifaddr, destroy it */ @@ -4616,6 +4684,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_SUPPRESS_FRAG_NDISC] = cnf->suppress_frag_ndisc; array[DEVCONF_ACCEPT_RA_FROM_LOCAL] = cnf->accept_ra_from_local; array[DEVCONF_ACCEPT_RA_MTU] = cnf->accept_ra_mtu; + array[DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN] = cnf->ignore_routes_with_linkdown; /* we omit DEVCONF_STABLE_SECRET for now */ array[DEVCONF_USE_OIF_ADDRS_ONLY] = cnf->use_oif_addrs_only; } @@ -5338,6 +5407,34 @@ out: return err; } +static +int addrconf_sysctl_ignore_routes_with_linkdown(struct ctl_table *ctl, + int write, + void __user *buffer, + size_t *lenp, + loff_t *ppos) +{ + int *valp = ctl->data; + int val = *valp; + loff_t pos = *ppos; + struct ctl_table lctl; + int ret; + + /* ctl->data points to idev->cnf.ignore_routes_when_linkdown + * we should not modify it until we get the rtnl lock. + */ + lctl = *ctl; + lctl.data = &val; + + ret = proc_dointvec(&lctl, write, buffer, lenp, ppos); + + if (write) + ret = addrconf_fixup_linkdown(ctl, valp, val); + if (ret) + *ppos = pos; + return ret; +} + static struct addrconf_sysctl_table { struct ctl_table_header *sysctl_header; @@ -5629,7 +5726,13 @@ static struct addrconf_sysctl_table .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, - + }, + { + .procname = "ignore_routes_with_linkdown", + .data = &ipv6_devconf.ignore_routes_with_linkdown, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = addrconf_sysctl_ignore_routes_with_linkdown, }, { /* sentinel */ diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 370f72785385..1c0217e61357 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -665,6 +665,12 @@ static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict, { int m; bool match_do_rr = false; + struct inet6_dev *idev = rt->rt6i_idev; + struct net_device *dev = rt->dst.dev; + + if (dev && !netif_carrier_ok(dev) && + idev->cnf.ignore_routes_with_linkdown) + goto out; if (rt6_check_expired(rt)) goto out; @@ -2887,8 +2893,11 @@ static int rt6_fill_node(struct net *net, else rtm->rtm_type = RTN_UNICAST; rtm->rtm_flags = 0; - if (!netif_carrier_ok(rt->dst.dev)) + if (!netif_carrier_ok(rt->dst.dev)) { rtm->rtm_flags |= RTNH_F_LINKDOWN; + if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown) + rtm->rtm_flags |= RTNH_F_DEAD; + } rtm->rtm_scope = RT_SCOPE_UNIVERSE; rtm->rtm_protocol = rt->rt6i_protocol; if (rt->rt6i_flags & RTF_DYNAMIC) -- cgit v1.2.3-70-g09d2 From ad706862890171e02df1d7391b05599fb676ec18 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 14 Aug 2015 11:05:52 -0700 Subject: ipv6: Remove un-used argument from ip6_dst_alloc() After 4b32b5ad31a6 ("ipv6: Stop rt6_info from using inet_peer's metrics"), ip6_dst_alloc() does not need the 'table' argument. This patch cleans it up. Signed-off-by: Martin KaFai Lau CC: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/ipv6/route.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) (limited to 'net/ipv6/route.c') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 9de4d2bcd916..c95c3197c186 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -318,8 +318,7 @@ static const struct rt6_info ip6_blk_hole_entry_template = { /* allocate dst with ip6_dst_ops */ static struct rt6_info *__ip6_dst_alloc(struct net *net, struct net_device *dev, - int flags, - struct fib6_table *table) + int flags) { struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev, 0, DST_OBSOLETE_FORCE_CHK, flags); @@ -336,10 +335,9 @@ static struct rt6_info *__ip6_dst_alloc(struct net *net, static struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev, - int flags, - struct fib6_table *table) + int flags) { - struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags, table); + struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags); if (rt) { rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC); @@ -950,8 +948,7 @@ static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort, if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU)) ort = (struct rt6_info *)ort->dst.from; - rt = __ip6_dst_alloc(dev_net(ort->dst.dev), ort->dst.dev, - 0, ort->rt6i_table); + rt = __ip6_dst_alloc(dev_net(ort->dst.dev), ort->dst.dev, 0); if (!rt) return NULL; @@ -983,8 +980,7 @@ static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt) struct rt6_info *pcpu_rt; pcpu_rt = __ip6_dst_alloc(dev_net(rt->dst.dev), - rt->dst.dev, rt->dst.flags, - rt->rt6i_table); + rt->dst.dev, rt->dst.flags); if (!pcpu_rt) return NULL; @@ -1555,7 +1551,7 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev, if (unlikely(!idev)) return ERR_PTR(-ENODEV); - rt = ip6_dst_alloc(net, dev, 0, NULL); + rt = ip6_dst_alloc(net, dev, 0); if (unlikely(!rt)) { in6_dev_put(idev); dst = ERR_PTR(-ENOMEM); @@ -1742,7 +1738,8 @@ int ip6_route_add(struct fib6_config *cfg) if (!table) goto out; - rt = ip6_dst_alloc(net, NULL, (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT, table); + rt = ip6_dst_alloc(net, NULL, + (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT); if (!rt) { err = -ENOMEM; @@ -2399,7 +2396,7 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, { struct net *net = dev_net(idev->dev); struct rt6_info *rt = ip6_dst_alloc(net, net->loopback_dev, - DST_NOCOUNT, NULL); + DST_NOCOUNT); if (!rt) return ERR_PTR(-ENOMEM); -- cgit v1.2.3-70-g09d2 From a73e4195636c17f310b8530643a576f42b82385f Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 14 Aug 2015 11:05:53 -0700 Subject: ipv6: Add rt6_make_pcpu_route() It is a prep work for fixing a potential deadlock when creating a pcpu rt. The current rt6_get_pcpu_route() will also create a pcpu rt if one does not exist. This patch moves the pcpu rt creation logic into another function, rt6_make_pcpu_route(). Signed-off-by: Martin KaFai Lau CC: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/ipv6/route.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) (limited to 'net/ipv6/route.c') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index c95c3197c186..0a82653efc88 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -993,13 +993,21 @@ static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt) /* It should be called with read_lock_bh(&tb6_lock) acquired */ static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt) { - struct rt6_info *pcpu_rt, *prev, **p; + struct rt6_info *pcpu_rt, **p; p = this_cpu_ptr(rt->rt6i_pcpu); pcpu_rt = *p; - if (pcpu_rt) - goto done; + if (pcpu_rt) { + dst_hold(&pcpu_rt->dst); + rt6_dst_from_metrics_check(pcpu_rt); + } + return pcpu_rt; +} + +static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt) +{ + struct rt6_info *pcpu_rt, *prev, **p; pcpu_rt = ip6_rt_pcpu_alloc(rt); if (!pcpu_rt) { @@ -1009,6 +1017,7 @@ static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt) goto done; } + p = this_cpu_ptr(rt->rt6i_pcpu); prev = cmpxchg(p, NULL, pcpu_rt); if (prev) { /* If someone did it before us, return prev instead */ @@ -1093,8 +1102,11 @@ redo_rt6_select: rt->dst.lastuse = jiffies; rt->dst.__use++; pcpu_rt = rt6_get_pcpu_route(rt); - read_unlock_bh(&table->tb6_lock); + if (!pcpu_rt) + pcpu_rt = rt6_make_pcpu_route(rt); + + read_unlock_bh(&table->tb6_lock); return pcpu_rt; } } -- cgit v1.2.3-70-g09d2 From 9c7370a166b4e157137bfbfe2ad296d57147547c Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 14 Aug 2015 11:05:54 -0700 Subject: ipv6: Fix a potential deadlock when creating pcpu rt rt6_make_pcpu_route() is called under read_lock(&table->tb6_lock). rt6_make_pcpu_route() calls ip6_rt_pcpu_alloc(rt) which then calls dst_alloc(). dst_alloc() _may_ call ip6_dst_gc() which takes the write_lock(&tabl->tb6_lock). A visualized version: read_lock(&table->tb6_lock); rt6_make_pcpu_route(); => ip6_rt_pcpu_alloc(); => dst_alloc(); => ip6_dst_gc(); => write_lock(&table->tb6_lock); /* oops */ The fix is to do a read_unlock first before calling ip6_rt_pcpu_alloc(). A reported stack: [141625.537638] INFO: rcu_sched self-detected stall on CPU { 27} (t=60000 jiffies g=4159086 c=4159085 q=2139) [141625.547469] Task dump for CPU 27: [141625.550881] mtr R running task 0 22121 22081 0x00000008 [141625.558069] 0000000000000000 ffff88103f363d98 ffffffff8106e488 000000000000001b [141625.565641] ffffffff81684900 ffff88103f363db8 ffffffff810702b0 0000000008000000 [141625.573220] ffffffff81684900 ffff88103f363de8 ffffffff8108df9f ffff88103f375a00 [141625.580803] Call Trace: [141625.583345] [] sched_show_task+0xc1/0xc6 [141625.589650] [] dump_cpu_task+0x35/0x39 [141625.595144] [] rcu_dump_cpu_stacks+0x6a/0x8c [141625.601320] [] rcu_check_callbacks+0x1f6/0x5d4 [141625.607669] [] update_process_times+0x2a/0x4f [141625.613925] [] tick_sched_handle+0x32/0x3e [141625.619923] [] tick_sched_timer+0x35/0x5c [141625.625830] [] __hrtimer_run_queues+0x8f/0x18d [141625.632171] [] hrtimer_interrupt+0xa0/0x166 [141625.638258] [] local_apic_timer_interrupt+0x4e/0x52 [141625.645036] [] smp_apic_timer_interrupt+0x39/0x4a [141625.651643] [] apic_timer_interrupt+0x68/0x70 [141625.657895] [] ? dst_destroy+0x7c/0xb5 [141625.664188] [] ? fib6_flush_trees+0x20/0x20 [141625.670272] [] ? queue_write_lock_slowpath+0x60/0x6f [141625.677140] [] _raw_write_lock_bh+0x23/0x25 [141625.683218] [] __fib6_clean_all+0x40/0x82 [141625.689124] [] ? fib6_flush_trees+0x20/0x20 [141625.695207] [] fib6_clean_all+0xe/0x10 [141625.700854] [] fib6_run_gc+0x79/0xc8 [141625.706329] [] ip6_dst_gc+0x85/0xf9 [141625.711718] [] dst_alloc+0x55/0x159 [141625.717105] [] __ip6_dst_alloc.isra.32+0x19/0x63 [141625.723620] [] ip6_pol_route+0x36a/0x3e8 [141625.729441] [] ip6_pol_route_output+0x11/0x13 [141625.735700] [] fib6_rule_action+0xa7/0x1bf [141625.741698] [] ? ip6_pol_route_input+0x17/0x17 [141625.748043] [] fib_rules_lookup+0xb5/0x12a [141625.754050] [] ? poll_select_copy_remaining+0xf9/0xf9 [141625.761002] [] fib6_rule_lookup+0x37/0x5c [141625.766914] [] ? ip6_pol_route_input+0x17/0x17 [141625.773260] [] ip6_route_output+0x7a/0x82 [141625.779177] [] ip6_dst_lookup_tail+0x53/0x112 [141625.785437] [] ip6_dst_lookup_flow+0x2a/0x6b [141625.791604] [] rawv6_sendmsg+0x407/0x9b6 [141625.797423] [] ? do_ipv6_setsockopt.isra.8+0xd87/0xde2 [141625.804464] [] inet_sendmsg+0x57/0x8e [141625.810028] [] sock_sendmsg+0x2e/0x3c [141625.815588] [] SyS_sendto+0xfe/0x143 [141625.821063] [] ? rawv6_setsockopt+0x5e/0x67 [141625.827146] [] ? sock_common_setsockopt+0xf/0x11 [141625.833660] [] ? SyS_setsockopt+0x81/0xa2 [141625.839565] [] entry_SYSCALL_64_fastpath+0x12/0x6a Fixes: d52d3997f843 ("pv6: Create percpu rt6_info") Signed-off-by: Martin KaFai Lau CC: Hannes Frederic Sowa Reported-by: Steinar H. Gunderson Signed-off-by: David S. Miller --- net/ipv6/ip6_fib.c | 2 ++ net/ipv6/route.c | 44 +++++++++++++++++++++++++++++++++----------- 2 files changed, 35 insertions(+), 11 deletions(-) (limited to 'net/ipv6/route.c') diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 55d19861ab20..548c6237b1e7 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -172,6 +172,8 @@ static void rt6_free_pcpu(struct rt6_info *non_pcpu_rt) *ppcpu_rt = NULL; } } + + non_pcpu_rt->rt6i_pcpu = NULL; } static void rt6_release(struct rt6_info *rt) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 0a82653efc88..d15586490cec 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1007,27 +1007,39 @@ static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt) static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt) { + struct fib6_table *table = rt->rt6i_table; struct rt6_info *pcpu_rt, *prev, **p; pcpu_rt = ip6_rt_pcpu_alloc(rt); if (!pcpu_rt) { struct net *net = dev_net(rt->dst.dev); - pcpu_rt = net->ipv6.ip6_null_entry; - goto done; + dst_hold(&net->ipv6.ip6_null_entry->dst); + return net->ipv6.ip6_null_entry; } - p = this_cpu_ptr(rt->rt6i_pcpu); - prev = cmpxchg(p, NULL, pcpu_rt); - if (prev) { - /* If someone did it before us, return prev instead */ + read_lock_bh(&table->tb6_lock); + if (rt->rt6i_pcpu) { + p = this_cpu_ptr(rt->rt6i_pcpu); + prev = cmpxchg(p, NULL, pcpu_rt); + if (prev) { + /* If someone did it before us, return prev instead */ + dst_destroy(&pcpu_rt->dst); + pcpu_rt = prev; + } + } else { + /* rt has been removed from the fib6 tree + * before we have a chance to acquire the read_lock. + * In this case, don't brother to create a pcpu rt + * since rt is going away anyway. The next + * dst_check() will trigger a re-lookup. + */ dst_destroy(&pcpu_rt->dst); - pcpu_rt = prev; + pcpu_rt = rt; } - -done: dst_hold(&pcpu_rt->dst); rt6_dst_from_metrics_check(pcpu_rt); + read_unlock_bh(&table->tb6_lock); return pcpu_rt; } @@ -1103,11 +1115,21 @@ redo_rt6_select: rt->dst.__use++; pcpu_rt = rt6_get_pcpu_route(rt); - if (!pcpu_rt) + if (pcpu_rt) { + read_unlock_bh(&table->tb6_lock); + } else { + /* We have to do the read_unlock first + * because rt6_make_pcpu_route() may trigger + * ip6_dst_gc() which will take the write_lock. + */ + dst_hold(&rt->dst); + read_unlock_bh(&table->tb6_lock); pcpu_rt = rt6_make_pcpu_route(rt); + dst_release(&rt->dst); + } - read_unlock_bh(&table->tb6_lock); return pcpu_rt; + } } -- cgit v1.2.3-70-g09d2 From 2536862311d2276454ddef9dc36d6551a4b400fd Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 17 Aug 2015 13:42:24 -0700 Subject: lwt: Add support to redirect dst.input This patch adds the capability to redirect dst input in the same way that dst output is redirected by LWT. Also, save the original dst.input and and dst.out when setting up lwtunnel redirection. These can be called by the client as a pass- through. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/lwtunnel.h | 30 ++++++++++++++++++++++++++- net/core/lwtunnel.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++ net/ipv4/route.c | 8 +++++++- net/ipv6/route.c | 8 +++++++- 4 files changed, 98 insertions(+), 3 deletions(-) (limited to 'net/ipv6/route.c') diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h index 33bd30963a95..e25b60eb262d 100644 --- a/include/net/lwtunnel.h +++ b/include/net/lwtunnel.h @@ -11,12 +11,15 @@ #define LWTUNNEL_HASH_SIZE (1 << LWTUNNEL_HASH_BITS) /* lw tunnel state flags */ -#define LWTUNNEL_STATE_OUTPUT_REDIRECT 0x1 +#define LWTUNNEL_STATE_OUTPUT_REDIRECT BIT(0) +#define LWTUNNEL_STATE_INPUT_REDIRECT BIT(1) struct lwtunnel_state { __u16 type; __u16 flags; atomic_t refcnt; + int (*orig_output)(struct sock *sk, struct sk_buff *skb); + int (*orig_input)(struct sk_buff *); int len; __u8 data[0]; }; @@ -25,6 +28,7 @@ struct lwtunnel_encap_ops { int (*build_state)(struct net_device *dev, struct nlattr *encap, struct lwtunnel_state **ts); int (*output)(struct sock *sk, struct sk_buff *skb); + int (*input)(struct sk_buff *skb); int (*fill_encap)(struct sk_buff *skb, struct lwtunnel_state *lwtstate); int (*get_encap_size)(struct lwtunnel_state *lwtstate); @@ -58,6 +62,13 @@ static inline bool lwtunnel_output_redirect(struct lwtunnel_state *lwtstate) return false; } +static inline bool lwtunnel_input_redirect(struct lwtunnel_state *lwtstate) +{ + if (lwtstate && (lwtstate->flags & LWTUNNEL_STATE_INPUT_REDIRECT)) + return true; + + return false; +} int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *op, unsigned int num); int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *op, @@ -72,6 +83,8 @@ struct lwtunnel_state *lwtunnel_state_alloc(int hdr_len); int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b); int lwtunnel_output(struct sock *sk, struct sk_buff *skb); int lwtunnel_output6(struct sock *sk, struct sk_buff *skb); +int lwtunnel_input(struct sk_buff *skb); +int lwtunnel_input6(struct sk_buff *skb); #else @@ -90,6 +103,11 @@ static inline bool lwtunnel_output_redirect(struct lwtunnel_state *lwtstate) return false; } +static inline bool lwtunnel_input_redirect(struct lwtunnel_state *lwtstate) +{ + return false; +} + static inline int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *op, unsigned int num) { @@ -142,6 +160,16 @@ static inline int lwtunnel_output6(struct sock *sk, struct sk_buff *skb) return -EOPNOTSUPP; } +static inline int lwtunnel_input(struct sk_buff *skb) +{ + return -EOPNOTSUPP; +} + +static inline int lwtunnel_input6(struct sk_buff *skb) +{ + return -EOPNOTSUPP; +} + #endif #endif /* __NET_LWTUNNEL_H */ diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c index 5d6d8e3d450a..3331585174d9 100644 --- a/net/core/lwtunnel.c +++ b/net/core/lwtunnel.c @@ -241,3 +241,58 @@ int lwtunnel_output(struct sock *sk, struct sk_buff *skb) return __lwtunnel_output(sk, skb, lwtstate); } EXPORT_SYMBOL(lwtunnel_output); + +int __lwtunnel_input(struct sk_buff *skb, + struct lwtunnel_state *lwtstate) +{ + const struct lwtunnel_encap_ops *ops; + int ret = -EINVAL; + + if (!lwtstate) + goto drop; + + if (lwtstate->type == LWTUNNEL_ENCAP_NONE || + lwtstate->type > LWTUNNEL_ENCAP_MAX) + return 0; + + ret = -EOPNOTSUPP; + rcu_read_lock(); + ops = rcu_dereference(lwtun_encaps[lwtstate->type]); + if (likely(ops && ops->input)) + ret = ops->input(skb); + rcu_read_unlock(); + + if (ret == -EOPNOTSUPP) + goto drop; + + return ret; + +drop: + kfree_skb(skb); + + return ret; +} + +int lwtunnel_input6(struct sk_buff *skb) +{ + struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); + struct lwtunnel_state *lwtstate = NULL; + + if (rt) + lwtstate = rt->rt6i_lwtstate; + + return __lwtunnel_input(skb, lwtstate); +} +EXPORT_SYMBOL(lwtunnel_input6); + +int lwtunnel_input(struct sk_buff *skb) +{ + struct rtable *rt = (struct rtable *)skb_dst(skb); + struct lwtunnel_state *lwtstate = NULL; + + if (rt) + lwtstate = rt->rt_lwtstate; + + return __lwtunnel_input(skb, lwtstate); +} +EXPORT_SYMBOL(lwtunnel_input); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 2c89d294b669..2403e85107f0 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1631,8 +1631,14 @@ static int __mkroute_input(struct sk_buff *skb, rth->dst.output = ip_output; rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag); - if (lwtunnel_output_redirect(rth->rt_lwtstate)) + if (lwtunnel_output_redirect(rth->rt_lwtstate)) { + rth->rt_lwtstate->orig_output = rth->dst.output; rth->dst.output = lwtunnel_output; + } + if (lwtunnel_input_redirect(rth->rt_lwtstate)) { + rth->rt_lwtstate->orig_input = rth->dst.input; + rth->dst.input = lwtunnel_input; + } skb_dst_set(skb, &rth->dst); out: err = 0; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 1c0217e61357..c3733049715e 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1785,8 +1785,14 @@ int ip6_route_add(struct fib6_config *cfg) if (err) goto out; rt->rt6i_lwtstate = lwtstate_get(lwtstate); - if (lwtunnel_output_redirect(rt->rt6i_lwtstate)) + if (lwtunnel_output_redirect(rt->rt6i_lwtstate)) { + rt->rt6i_lwtstate->orig_output = rt->dst.output; rt->dst.output = lwtunnel_output6; + } + if (lwtunnel_input_redirect(rt->rt6i_lwtstate)) { + rt->rt6i_lwtstate->orig_input = rt->dst.input; + rt->dst.input = lwtunnel_input6; + } } ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); -- cgit v1.2.3-70-g09d2 From 61adedf3e3f1d3f032c5a6a299978d91eff6d555 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Thu, 20 Aug 2015 13:56:25 +0200 Subject: route: move lwtunnel state to dst_entry Currently, the lwtunnel state resides in per-protocol data. This is a problem if we encapsulate ipv6 traffic in an ipv4 tunnel (or vice versa). The xmit function of the tunnel does not know whether the packet has been routed to it by ipv4 or ipv6, yet it needs the lwtstate data. Moving the lwtstate data to dst_entry makes such inter-protocol tunneling possible. As a bonus, this brings a nice diffstat. Signed-off-by: Jiri Benc Acked-by: Roopa Prabhu Acked-by: Thomas Graf Signed-off-by: David S. Miller --- drivers/net/vrf.c | 1 - drivers/net/vxlan.c | 4 +-- include/net/dst.h | 3 +- include/net/dst_metadata.h | 15 +++------ include/net/ip6_fib.h | 1 - include/net/lwtunnel.h | 12 -------- include/net/route.h | 1 - net/core/dst.c | 3 ++ net/core/filter.c | 2 +- net/core/lwtunnel.c | 70 ++++++------------------------------------ net/ipv4/ip_gre.c | 2 +- net/ipv4/route.c | 20 +++++------- net/ipv6/ila.c | 14 +++------ net/ipv6/ip6_fib.c | 1 - net/ipv6/route.c | 20 ++++++------ net/mpls/mpls_iptunnel.c | 7 ++--- net/openvswitch/vport-netdev.c | 2 +- 17 files changed, 48 insertions(+), 130 deletions(-) (limited to 'net/ipv6/route.c') diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index dbeffe789185..b3d9c5546c79 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -295,7 +295,6 @@ static struct rtable *vrf_rtable_create(struct net_device *dev) rth->rt_uses_gateway = 0; INIT_LIST_HEAD(&rth->rt_uncached); rth->rt_uncached_list = NULL; - rth->rt_lwtstate = NULL; } return rth; diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index ebeb3def06c5..93613ffd8d7e 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -1909,7 +1909,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, u32 flags = vxlan->flags; /* FIXME: Support IPv6 */ - info = skb_tunnel_info(skb, AF_INET); + info = skb_tunnel_info(skb); if (rdst) { dst_port = rdst->remote_port ? rdst->remote_port : vxlan->cfg.dst_port; @@ -2105,7 +2105,7 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) struct vxlan_fdb *f; /* FIXME: Support IPv6 */ - info = skb_tunnel_info(skb, AF_INET); + info = skb_tunnel_info(skb); skb_reset_mac_header(skb); eth = eth_hdr(skb); diff --git a/include/net/dst.h b/include/net/dst.h index 2578811cef51..0a9a723f6c19 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -44,6 +44,7 @@ struct dst_entry { #else void *__pad1; #endif + struct lwtunnel_state *lwtstate; int (*input)(struct sk_buff *); int (*output)(struct sock *sk, struct sk_buff *skb); @@ -89,7 +90,7 @@ struct dst_entry { * (L1_CACHE_SIZE would be too much) */ #ifdef CONFIG_64BIT - long __pad_to_align_refcnt[2]; + long __pad_to_align_refcnt[1]; #endif /* * __refcnt wants to be on a different cache line from diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index 075f523ff23f..2cb52d562272 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -23,22 +23,17 @@ static inline struct metadata_dst *skb_metadata_dst(struct sk_buff *skb) return NULL; } -static inline struct ip_tunnel_info *skb_tunnel_info(struct sk_buff *skb, - int family) +static inline struct ip_tunnel_info *skb_tunnel_info(struct sk_buff *skb) { struct metadata_dst *md_dst = skb_metadata_dst(skb); - struct rtable *rt; + struct dst_entry *dst; if (md_dst) return &md_dst->u.tun_info; - switch (family) { - case AF_INET: - rt = (struct rtable *)skb_dst(skb); - if (rt && rt->rt_lwtstate) - return lwt_tun_info(rt->rt_lwtstate); - break; - } + dst = skb_dst(skb); + if (dst && dst->lwtstate) + return lwt_tun_info(dst->lwtstate); return NULL; } diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 276328e3daa6..063d30474cf6 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -133,7 +133,6 @@ struct rt6_info { /* more non-fragment space at head required */ unsigned short rt6i_nfheader_len; u8 rt6i_protocol; - struct lwtunnel_state *rt6i_lwtstate; }; static inline struct inet6_dev *ip6_dst_idev(struct dst_entry *dst) diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h index cfee53916ba5..843489884448 100644 --- a/include/net/lwtunnel.h +++ b/include/net/lwtunnel.h @@ -87,9 +87,7 @@ int lwtunnel_get_encap_size(struct lwtunnel_state *lwtstate); struct lwtunnel_state *lwtunnel_state_alloc(int hdr_len); int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b); int lwtunnel_output(struct sock *sk, struct sk_buff *skb); -int lwtunnel_output6(struct sock *sk, struct sk_buff *skb); int lwtunnel_input(struct sk_buff *skb); -int lwtunnel_input6(struct sk_buff *skb); #else @@ -164,21 +162,11 @@ static inline int lwtunnel_output(struct sock *sk, struct sk_buff *skb) return -EOPNOTSUPP; } -static inline int lwtunnel_output6(struct sock *sk, struct sk_buff *skb) -{ - return -EOPNOTSUPP; -} - static inline int lwtunnel_input(struct sk_buff *skb) { return -EOPNOTSUPP; } -static inline int lwtunnel_input6(struct sk_buff *skb) -{ - return -EOPNOTSUPP; -} - #endif #endif /* __NET_LWTUNNEL_H */ diff --git a/include/net/route.h b/include/net/route.h index 6dda2c1bf8c6..395d79bb556c 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -66,7 +66,6 @@ struct rtable { struct list_head rt_uncached; struct uncached_list *rt_uncached_list; - struct lwtunnel_state *rt_lwtstate; }; static inline bool rt_is_input_route(const struct rtable *rt) diff --git a/net/core/dst.c b/net/core/dst.c index f8694d1b8702..50dcdbb0ee46 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -184,6 +185,7 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops, #ifdef CONFIG_IP_ROUTE_CLASSID dst->tclassid = 0; #endif + dst->lwtstate = NULL; atomic_set(&dst->__refcnt, initial_ref); dst->__use = 0; dst->lastuse = jiffies; @@ -264,6 +266,7 @@ again: kfree(dst); else kmem_cache_free(dst->ops->kmem_cachep, dst); + lwtstate_put(dst->lwtstate); dst = child; if (dst) { diff --git a/net/core/filter.c b/net/core/filter.c index 379568562ffb..b4adc961413f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1489,7 +1489,7 @@ static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) { struct sk_buff *skb = (struct sk_buff *) (long) r1; struct bpf_tunnel_key *to = (struct bpf_tunnel_key *) (long) r2; - struct ip_tunnel_info *info = skb_tunnel_info(skb, AF_INET); + struct ip_tunnel_info *info = skb_tunnel_info(skb); if (unlikely(size != sizeof(struct bpf_tunnel_key) || flags || !info)) return -EINVAL; diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c index 3331585174d9..e924c2e08554 100644 --- a/net/core/lwtunnel.c +++ b/net/core/lwtunnel.c @@ -179,14 +179,16 @@ int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b) } EXPORT_SYMBOL(lwtunnel_cmp_encap); -int __lwtunnel_output(struct sock *sk, struct sk_buff *skb, - struct lwtunnel_state *lwtstate) +int lwtunnel_output(struct sock *sk, struct sk_buff *skb) { + struct dst_entry *dst = skb_dst(skb); const struct lwtunnel_encap_ops *ops; + struct lwtunnel_state *lwtstate; int ret = -EINVAL; - if (!lwtstate) + if (!dst) goto drop; + lwtstate = dst->lwtstate; if (lwtstate->type == LWTUNNEL_ENCAP_NONE || lwtstate->type > LWTUNNEL_ENCAP_MAX) @@ -209,47 +211,18 @@ drop: return ret; } - -int lwtunnel_output6(struct sock *sk, struct sk_buff *skb) -{ - struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); - struct lwtunnel_state *lwtstate = NULL; - - if (rt) { - lwtstate = rt->rt6i_lwtstate; - skb->dev = rt->dst.dev; - } - - skb->protocol = htons(ETH_P_IPV6); - - return __lwtunnel_output(sk, skb, lwtstate); -} -EXPORT_SYMBOL(lwtunnel_output6); - -int lwtunnel_output(struct sock *sk, struct sk_buff *skb) -{ - struct rtable *rt = (struct rtable *)skb_dst(skb); - struct lwtunnel_state *lwtstate = NULL; - - if (rt) { - lwtstate = rt->rt_lwtstate; - skb->dev = rt->dst.dev; - } - - skb->protocol = htons(ETH_P_IP); - - return __lwtunnel_output(sk, skb, lwtstate); -} EXPORT_SYMBOL(lwtunnel_output); -int __lwtunnel_input(struct sk_buff *skb, - struct lwtunnel_state *lwtstate) +int lwtunnel_input(struct sk_buff *skb) { + struct dst_entry *dst = skb_dst(skb); const struct lwtunnel_encap_ops *ops; + struct lwtunnel_state *lwtstate; int ret = -EINVAL; - if (!lwtstate) + if (!dst) goto drop; + lwtstate = dst->lwtstate; if (lwtstate->type == LWTUNNEL_ENCAP_NONE || lwtstate->type > LWTUNNEL_ENCAP_MAX) @@ -272,27 +245,4 @@ drop: return ret; } - -int lwtunnel_input6(struct sk_buff *skb) -{ - struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); - struct lwtunnel_state *lwtstate = NULL; - - if (rt) - lwtstate = rt->rt6i_lwtstate; - - return __lwtunnel_input(skb, lwtstate); -} -EXPORT_SYMBOL(lwtunnel_input6); - -int lwtunnel_input(struct sk_buff *skb) -{ - struct rtable *rt = (struct rtable *)skb_dst(skb); - struct lwtunnel_state *lwtstate = NULL; - - if (rt) - lwtstate = rt->rt_lwtstate; - - return __lwtunnel_input(skb, lwtstate); -} EXPORT_SYMBOL(lwtunnel_input); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 5193618b2600..1bf328182697 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -521,7 +521,7 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev) __be16 df, flags; int err; - tun_info = skb_tunnel_info(skb, AF_INET); + tun_info = skb_tunnel_info(skb); if (unlikely(!tun_info || tun_info->mode != IP_TUNNEL_INFO_TX)) goto err_free_skb; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 2403e85107f0..f3087aaa6dd8 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1359,7 +1359,6 @@ static void ipv4_dst_destroy(struct dst_entry *dst) list_del(&rt->rt_uncached); spin_unlock_bh(&ul->lock); } - lwtstate_put(rt->rt_lwtstate); } void rt_flush_dev(struct net_device *dev) @@ -1408,7 +1407,7 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, #ifdef CONFIG_IP_ROUTE_CLASSID rt->dst.tclassid = nh->nh_tclassid; #endif - rt->rt_lwtstate = lwtstate_get(nh->nh_lwtstate); + rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate); if (unlikely(fnhe)) cached = rt_bind_exception(rt, fnhe, daddr); else if (!(rt->dst.flags & DST_NOCACHE)) @@ -1494,7 +1493,6 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, rth->rt_gateway = 0; rth->rt_uses_gateway = 0; INIT_LIST_HEAD(&rth->rt_uncached); - rth->rt_lwtstate = NULL; if (our) { rth->dst.input= ip_local_deliver; rth->rt_flags |= RTCF_LOCAL; @@ -1624,19 +1622,18 @@ static int __mkroute_input(struct sk_buff *skb, rth->rt_gateway = 0; rth->rt_uses_gateway = 0; INIT_LIST_HEAD(&rth->rt_uncached); - rth->rt_lwtstate = NULL; RT_CACHE_STAT_INC(in_slow_tot); rth->dst.input = ip_forward; rth->dst.output = ip_output; rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag); - if (lwtunnel_output_redirect(rth->rt_lwtstate)) { - rth->rt_lwtstate->orig_output = rth->dst.output; + if (lwtunnel_output_redirect(rth->dst.lwtstate)) { + rth->dst.lwtstate->orig_output = rth->dst.output; rth->dst.output = lwtunnel_output; } - if (lwtunnel_input_redirect(rth->rt_lwtstate)) { - rth->rt_lwtstate->orig_input = rth->dst.input; + if (lwtunnel_input_redirect(rth->dst.lwtstate)) { + rth->dst.lwtstate->orig_input = rth->dst.input; rth->dst.input = lwtunnel_input; } skb_dst_set(skb, &rth->dst); @@ -1695,7 +1692,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, by fib_lookup. */ - tun_info = skb_tunnel_info(skb, AF_INET); + tun_info = skb_tunnel_info(skb); if (tun_info && tun_info->mode == IP_TUNNEL_INFO_RX) fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id; else @@ -1815,7 +1812,6 @@ local_input: rth->rt_gateway = 0; rth->rt_uses_gateway = 0; INIT_LIST_HEAD(&rth->rt_uncached); - rth->rt_lwtstate = NULL; RT_CACHE_STAT_INC(in_slow_tot); if (res.type == RTN_UNREACHABLE) { @@ -2006,7 +2002,6 @@ add: rth->rt_gateway = 0; rth->rt_uses_gateway = 0; INIT_LIST_HEAD(&rth->rt_uncached); - rth->rt_lwtstate = NULL; RT_CACHE_STAT_INC(out_slow_tot); if (flags & RTCF_LOCAL) @@ -2029,7 +2024,7 @@ add: } rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0); - if (lwtunnel_output_redirect(rth->rt_lwtstate)) + if (lwtunnel_output_redirect(rth->dst.lwtstate)) rth->dst.output = lwtunnel_output; return rth; @@ -2293,7 +2288,6 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or rt->rt_uses_gateway = ort->rt_uses_gateway; INIT_LIST_HEAD(&rt->rt_uncached); - rt->rt_lwtstate = NULL; dst_free(new); } diff --git a/net/ipv6/ila.c b/net/ipv6/ila.c index 2540ab4b76d1..f011c3d5ca40 100644 --- a/net/ipv6/ila.c +++ b/net/ipv6/ila.c @@ -89,16 +89,13 @@ static void update_ipv6_locator(struct sk_buff *skb, struct ila_params *p) static int ila_output(struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); - struct rt6_info *rt6 = NULL; if (skb->protocol != htons(ETH_P_IPV6)) goto drop; - rt6 = (struct rt6_info *)dst; + update_ipv6_locator(skb, ila_params_lwtunnel(dst->lwtstate)); - update_ipv6_locator(skb, ila_params_lwtunnel(rt6->rt6i_lwtstate)); - - return rt6->rt6i_lwtstate->orig_output(sk, skb); + return dst->lwtstate->orig_output(sk, skb); drop: kfree_skb(skb); @@ -108,16 +105,13 @@ drop: static int ila_input(struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); - struct rt6_info *rt6 = NULL; if (skb->protocol != htons(ETH_P_IPV6)) goto drop; - rt6 = (struct rt6_info *)dst; - - update_ipv6_locator(skb, ila_params_lwtunnel(rt6->rt6i_lwtstate)); + update_ipv6_locator(skb, ila_params_lwtunnel(dst->lwtstate)); - return rt6->rt6i_lwtstate->orig_input(skb); + return dst->lwtstate->orig_input(skb); drop: kfree_skb(skb); diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 5693b5eb8482..865e777ae20c 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -178,7 +178,6 @@ static void rt6_free_pcpu(struct rt6_info *non_pcpu_rt) static void rt6_release(struct rt6_info *rt) { if (atomic_dec_and_test(&rt->rt6i_ref)) { - lwtstate_put(rt->rt6i_lwtstate); rt6_free_pcpu(rt); dst_free(&rt->dst); } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index c3733049715e..e6bbcdee7707 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1784,14 +1784,14 @@ int ip6_route_add(struct fib6_config *cfg) cfg->fc_encap, &lwtstate); if (err) goto out; - rt->rt6i_lwtstate = lwtstate_get(lwtstate); - if (lwtunnel_output_redirect(rt->rt6i_lwtstate)) { - rt->rt6i_lwtstate->orig_output = rt->dst.output; - rt->dst.output = lwtunnel_output6; + rt->dst.lwtstate = lwtstate_get(lwtstate); + if (lwtunnel_output_redirect(rt->dst.lwtstate)) { + rt->dst.lwtstate->orig_output = rt->dst.output; + rt->dst.output = lwtunnel_output; } - if (lwtunnel_input_redirect(rt->rt6i_lwtstate)) { - rt->rt6i_lwtstate->orig_input = rt->dst.input; - rt->dst.input = lwtunnel_input6; + if (lwtunnel_input_redirect(rt->dst.lwtstate)) { + rt->dst.lwtstate->orig_input = rt->dst.input; + rt->dst.input = lwtunnel_input; } } @@ -2174,7 +2174,7 @@ static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort) #endif rt->rt6i_prefsrc = ort->rt6i_prefsrc; rt->rt6i_table = ort->rt6i_table; - rt->rt6i_lwtstate = lwtstate_get(ort->rt6i_lwtstate); + rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate); } #ifdef CONFIG_IPV6_ROUTE_INFO @@ -2838,7 +2838,7 @@ static inline size_t rt6_nlmsg_size(struct rt6_info *rt) + nla_total_size(sizeof(struct rta_cacheinfo)) + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */ + nla_total_size(1) /* RTA_PREF */ - + lwtunnel_get_encap_size(rt->rt6i_lwtstate); + + lwtunnel_get_encap_size(rt->dst.lwtstate); } static int rt6_fill_node(struct net *net, @@ -2991,7 +2991,7 @@ static int rt6_fill_node(struct net *net, if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags))) goto nla_put_failure; - lwtunnel_fill_encap(skb, rt->rt6i_lwtstate); + lwtunnel_fill_encap(skb, rt->dst.lwtstate); nlmsg_end(skb, nlh); return 0; diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c index 276f8c992218..3da5ca3ba563 100644 --- a/net/mpls/mpls_iptunnel.c +++ b/net/mpls/mpls_iptunnel.c @@ -48,7 +48,6 @@ int mpls_output(struct sock *sk, struct sk_buff *skb) struct dst_entry *dst = skb_dst(skb); struct rtable *rt = NULL; struct rt6_info *rt6 = NULL; - struct lwtunnel_state *lwtstate = NULL; int err = 0; bool bos; int i; @@ -58,11 +57,9 @@ int mpls_output(struct sock *sk, struct sk_buff *skb) if (skb->protocol == htons(ETH_P_IP)) { ttl = ip_hdr(skb)->ttl; rt = (struct rtable *)dst; - lwtstate = rt->rt_lwtstate; } else if (skb->protocol == htons(ETH_P_IPV6)) { ttl = ipv6_hdr(skb)->hop_limit; rt6 = (struct rt6_info *)dst; - lwtstate = rt6->rt6i_lwtstate; } else { goto drop; } @@ -72,12 +69,12 @@ int mpls_output(struct sock *sk, struct sk_buff *skb) /* Find the output device */ out_dev = dst->dev; if (!mpls_output_possible(out_dev) || - !lwtstate || skb_warn_if_lro(skb)) + !dst->lwtstate || skb_warn_if_lro(skb)) goto drop; skb_forward_csum(skb); - tun_encap_info = mpls_lwtunnel_encap(lwtstate); + tun_encap_info = mpls_lwtunnel_encap(dst->lwtstate); /* Verify the destination can hold the packet */ new_header_size = mpls_encap_size(tun_encap_info); diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c index 4b70aaa4a746..a75011505039 100644 --- a/net/openvswitch/vport-netdev.c +++ b/net/openvswitch/vport-netdev.c @@ -57,7 +57,7 @@ static void netdev_port_receive(struct vport *vport, struct sk_buff *skb) skb_push(skb, ETH_HLEN); ovs_skb_postpush_rcsum(skb, skb->data, ETH_HLEN); - ovs_vport_receive(vport, skb, skb_tunnel_info(skb, AF_INET)); + ovs_vport_receive(vport, skb, skb_tunnel_info(skb)); return; error: -- cgit v1.2.3-70-g09d2 From 06e9d040ba08b0f645783ff958384d5837b3fa3a Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Thu, 20 Aug 2015 13:56:26 +0200 Subject: ipv6: drop metadata dst in ip6_route_input The fix in commit 48fb6b554501 is incomplete, as now ip6_route_input can be called with non-NULL dst if it's a metadata dst and the reference is leaked. Drop the reference. Fixes: 48fb6b554501 ("ipv6: fix crash over flow-based vxlan device") Fixes: ee122c79d422 ("vxlan: Flow based tunneling") CC: Wei-Chun Chao CC: Thomas Graf Signed-off-by: Jiri Benc Acked-by: Thomas Graf Signed-off-by: David S. Miller --- net/ipv6/route.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/ipv6/route.c') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index e6bbcdee7707..0947ad0b3de8 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1140,6 +1140,7 @@ void ip6_route_input(struct sk_buff *skb) .flowi6_proto = iph->nexthdr, }; + skb_dst_drop(skb); skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags)); } -- cgit v1.2.3-70-g09d2 From ab450605b35caa768ca33e86db9403229bf42be4 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Thu, 20 Aug 2015 13:56:27 +0200 Subject: ipv6: ndisc: inherit metadata dst when creating ndisc requests If output device wants to see the dst, inherit the dst of the original skb in the ndisc request. This is an IPv6 counterpart of commit 0accfc268f4d ("arp: Inherit metadata dst when creating ARP requests"). Signed-off-by: Jiri Benc Acked-by: Thomas Graf Signed-off-by: David S. Miller --- include/net/ndisc.h | 3 ++- net/ipv6/addrconf.c | 2 +- net/ipv6/ndisc.c | 10 +++++++--- net/ipv6/route.c | 2 +- 4 files changed, 11 insertions(+), 6 deletions(-) (limited to 'net/ipv6/route.c') diff --git a/include/net/ndisc.h b/include/net/ndisc.h index b3a7751251b4..aba5695fadb0 100644 --- a/include/net/ndisc.h +++ b/include/net/ndisc.h @@ -182,7 +182,8 @@ int ndisc_rcv(struct sk_buff *skb); void ndisc_send_ns(struct net_device *dev, struct neighbour *neigh, const struct in6_addr *solicit, - const struct in6_addr *daddr, const struct in6_addr *saddr); + const struct in6_addr *daddr, const struct in6_addr *saddr, + struct sk_buff *oskb); void ndisc_send_rs(struct net_device *dev, const struct in6_addr *saddr, const struct in6_addr *daddr); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 59242399b0b5..0f08d3b9e238 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -3656,7 +3656,7 @@ static void addrconf_dad_work(struct work_struct *w) /* send a neighbour solicitation for our addr */ addrconf_addr_solict_mult(&ifp->addr, &mcaddr); - ndisc_send_ns(ifp->idev->dev, NULL, &ifp->addr, &mcaddr, &in6addr_any); + ndisc_send_ns(ifp->idev->dev, NULL, &ifp->addr, &mcaddr, &in6addr_any, NULL); out: in6_ifa_put(ifp); rtnl_unlock(); diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index b3054611f88a..13d3c2beb93e 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -553,7 +553,8 @@ static void ndisc_send_unsol_na(struct net_device *dev) void ndisc_send_ns(struct net_device *dev, struct neighbour *neigh, const struct in6_addr *solicit, - const struct in6_addr *daddr, const struct in6_addr *saddr) + const struct in6_addr *daddr, const struct in6_addr *saddr, + struct sk_buff *oskb) { struct sk_buff *skb; struct in6_addr addr_buf; @@ -589,6 +590,9 @@ void ndisc_send_ns(struct net_device *dev, struct neighbour *neigh, ndisc_fill_addr_option(skb, ND_OPT_SOURCE_LL_ADDR, dev->dev_addr); + if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE) && oskb) + skb_dst_copy(skb, oskb); + ndisc_send_skb(skb, daddr, saddr); } @@ -675,12 +679,12 @@ static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb) "%s: trying to ucast probe in NUD_INVALID: %pI6\n", __func__, target); } - ndisc_send_ns(dev, neigh, target, target, saddr); + ndisc_send_ns(dev, neigh, target, target, saddr, skb); } else if ((probes -= NEIGH_VAR(neigh->parms, APP_PROBES)) < 0) { neigh_app_ns(neigh); } else { addrconf_addr_solict_mult(target, &mcaddr); - ndisc_send_ns(dev, NULL, target, &mcaddr, saddr); + ndisc_send_ns(dev, NULL, target, &mcaddr, saddr, skb); } } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 0947ad0b3de8..c4f3b9fcca9d 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -538,7 +538,7 @@ static void rt6_probe_deferred(struct work_struct *w) container_of(w, struct __rt6_probe_work, work); addrconf_addr_solict_mult(&work->target, &mcaddr); - ndisc_send_ns(work->dev, NULL, &work->target, &mcaddr, NULL); + ndisc_send_ns(work->dev, NULL, &work->target, &mcaddr, NULL, NULL); dev_put(work->dev); kfree(work); } -- cgit v1.2.3-70-g09d2 From 904af04d30f303d96902584206457128c3051d8d Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Thu, 20 Aug 2015 13:56:31 +0200 Subject: ipv6: route: extend flow representation with tunnel key Use flowi_tunnel in flowi6 similarly to what is done with IPv4. This complements commit 1b7179d3adff ("route: Extend flow representation with tunnel key"). Signed-off-by: Jiri Benc Acked-by: Thomas Graf Signed-off-by: David S. Miller --- include/net/flow.h | 1 + net/ipv6/route.c | 6 ++++++ 2 files changed, 7 insertions(+) (limited to 'net/ipv6/route.c') diff --git a/include/net/flow.h b/include/net/flow.h index f305588fc162..9e0297c4c11d 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -130,6 +130,7 @@ struct flowi6 { #define flowi6_proto __fl_common.flowic_proto #define flowi6_flags __fl_common.flowic_flags #define flowi6_secid __fl_common.flowic_secid +#define flowi6_tun_key __fl_common.flowic_tun_key struct in6_addr daddr; struct in6_addr saddr; __be32 flowlabel; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index c4f3b9fcca9d..6c0fe4c7ce8d 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -54,11 +54,13 @@ #include #include #include +#include #include #include #include #include #include +#include #include @@ -1131,6 +1133,7 @@ void ip6_route_input(struct sk_buff *skb) const struct ipv6hdr *iph = ipv6_hdr(skb); struct net *net = dev_net(skb->dev); int flags = RT6_LOOKUP_F_HAS_SADDR; + struct ip_tunnel_info *tun_info; struct flowi6 fl6 = { .flowi6_iif = skb->dev->ifindex, .daddr = iph->daddr, @@ -1140,6 +1143,9 @@ void ip6_route_input(struct sk_buff *skb) .flowi6_proto = iph->nexthdr, }; + tun_info = skb_tunnel_info(skb); + if (tun_info && tun_info->mode == IP_TUNNEL_INFO_RX) + fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id; skb_dst_drop(skb); skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags)); } -- cgit v1.2.3-70-g09d2 From 127eb7cd3c210afead788991a30950a9e36759ea Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 24 Aug 2015 09:45:41 -0700 Subject: lwt: Add cfg argument to build_state Add cfg and family arguments to lwt build state functions. cfg is a void pointer and will either be a pointer to a fib_config or fib6_config structure. The family parameter indicates which one (either AF_INET or AF_INET6). LWT encpasulation implementation may use the fib configuration to build the LWT state. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/lwtunnel.h | 3 +++ net/core/lwtunnel.c | 5 +++-- net/ipv4/fib_semantics.c | 17 ++++++++++------- net/ipv4/ip_tunnel_core.c | 2 ++ net/ipv6/ila.c | 1 + net/ipv6/route.c | 3 ++- net/mpls/mpls_iptunnel.c | 1 + 7 files changed, 22 insertions(+), 10 deletions(-) (limited to 'net/ipv6/route.c') diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h index 843489884448..fce0e35e74d0 100644 --- a/include/net/lwtunnel.h +++ b/include/net/lwtunnel.h @@ -26,6 +26,7 @@ struct lwtunnel_state { struct lwtunnel_encap_ops { int (*build_state)(struct net_device *dev, struct nlattr *encap, + unsigned int family, const void *cfg, struct lwtunnel_state **ts); int (*output)(struct sock *sk, struct sk_buff *skb); int (*input)(struct sk_buff *skb); @@ -80,6 +81,7 @@ int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *op, unsigned int num); int lwtunnel_build_state(struct net_device *dev, u16 encap_type, struct nlattr *encap, + unsigned int family, const void *cfg, struct lwtunnel_state **lws); int lwtunnel_fill_encap(struct sk_buff *skb, struct lwtunnel_state *lwtstate); @@ -130,6 +132,7 @@ static inline int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *op, static inline int lwtunnel_build_state(struct net_device *dev, u16 encap_type, struct nlattr *encap, + unsigned int family, const void *cfg, struct lwtunnel_state **lws) { return -EOPNOTSUPP; diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c index e924c2e08554..dfb1a9ca0835 100644 --- a/net/core/lwtunnel.c +++ b/net/core/lwtunnel.c @@ -72,7 +72,8 @@ int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *ops, EXPORT_SYMBOL(lwtunnel_encap_del_ops); int lwtunnel_build_state(struct net_device *dev, u16 encap_type, - struct nlattr *encap, struct lwtunnel_state **lws) + struct nlattr *encap, unsigned int family, + const void *cfg, struct lwtunnel_state **lws) { const struct lwtunnel_encap_ops *ops; int ret = -EINVAL; @@ -85,7 +86,7 @@ int lwtunnel_build_state(struct net_device *dev, u16 encap_type, rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[encap_type]); if (likely(ops && ops->build_state)) - ret = ops->build_state(dev, encap, lws); + ret = ops->build_state(dev, encap, family, cfg, lws); rcu_read_unlock(); return ret; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 01f1c7dcd329..1b2d01170a4d 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -511,7 +511,8 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, dev = __dev_get_by_index(net, cfg->fc_oif); ret = lwtunnel_build_state(dev, nla_get_u16( nla_entype), - nla, &lwtstate); + nla, AF_INET, cfg, + &lwtstate); if (ret) goto errout; nexthop_nh->nh_lwtstate = @@ -535,7 +536,8 @@ errout: static int fib_encap_match(struct net *net, u16 encap_type, struct nlattr *encap, - int oif, const struct fib_nh *nh) + int oif, const struct fib_nh *nh, + const struct fib_config *cfg) { struct lwtunnel_state *lwtstate; struct net_device *dev = NULL; @@ -546,8 +548,8 @@ static int fib_encap_match(struct net *net, u16 encap_type, if (oif) dev = __dev_get_by_index(net, oif); - ret = lwtunnel_build_state(dev, encap_type, - encap, &lwtstate); + ret = lwtunnel_build_state(dev, encap_type, encap, + AF_INET, cfg, &lwtstate); if (!ret) { result = lwtunnel_cmp_encap(lwtstate, nh->nh_lwtstate); lwtstate_free(lwtstate); @@ -571,7 +573,7 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi) if (cfg->fc_encap) { if (fib_encap_match(net, cfg->fc_encap_type, cfg->fc_encap, cfg->fc_oif, - fi->fib_nh)) + fi->fib_nh, cfg)) return 1; } if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) && @@ -663,7 +665,7 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi) static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, struct fib_nh *nh) { - int err; + int err = 0; struct net *net; struct net_device *dev; @@ -1005,7 +1007,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg) if (cfg->fc_oif) dev = __dev_get_by_index(net, cfg->fc_oif); err = lwtunnel_build_state(dev, cfg->fc_encap_type, - cfg->fc_encap, &lwtstate); + cfg->fc_encap, AF_INET, cfg, + &lwtstate); if (err) goto failure; diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 289b6c26ce37..934f2ac8ad61 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -204,6 +204,7 @@ static const struct nla_policy ip_tun_policy[LWTUNNEL_IP_MAX + 1] = { }; static int ip_tun_build_state(struct net_device *dev, struct nlattr *attr, + unsigned int family, const void *cfg, struct lwtunnel_state **ts) { struct ip_tunnel_info *tun_info; @@ -311,6 +312,7 @@ static const struct nla_policy ip6_tun_policy[LWTUNNEL_IP6_MAX + 1] = { }; static int ip6_tun_build_state(struct net_device *dev, struct nlattr *attr, + unsigned int family, const void *cfg, struct lwtunnel_state **ts) { struct ip_tunnel_info *tun_info; diff --git a/net/ipv6/ila.c b/net/ipv6/ila.c index f011c3d5ca40..ffe4dcad6088 100644 --- a/net/ipv6/ila.c +++ b/net/ipv6/ila.c @@ -123,6 +123,7 @@ static struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = { }; static int ila_build_state(struct net_device *dev, struct nlattr *nla, + unsigned int family, const void *cfg, struct lwtunnel_state **ts) { struct ila_params *p; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index e476f01add87..df3e353a012d 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1819,7 +1819,8 @@ int ip6_route_add(struct fib6_config *cfg) struct lwtunnel_state *lwtstate; err = lwtunnel_build_state(dev, cfg->fc_encap_type, - cfg->fc_encap, &lwtstate); + cfg->fc_encap, AF_INET6, cfg, + &lwtstate); if (err) goto out; rt->dst.lwtstate = lwtstate_get(lwtstate); diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c index 3da5ca3ba563..21e70bc9af98 100644 --- a/net/mpls/mpls_iptunnel.c +++ b/net/mpls/mpls_iptunnel.c @@ -123,6 +123,7 @@ drop: } static int mpls_build_state(struct net_device *dev, struct nlattr *nla, + unsigned int family, const void *cfg, struct lwtunnel_state **ts) { struct mpls_iptunnel_encap *tun_encap_info; -- cgit v1.2.3-70-g09d2 From 46fa062ad63146dd138ec0f017e71224471e8ea5 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Fri, 28 Aug 2015 20:48:19 +0200 Subject: ip_tunnels: convert the mode field of ip_tunnel_info to flags The mode field holds a single bit of information only (whether the ip_tunnel_info struct is for rx or tx). Change the mode field to bit flags. This allows more mode flags to be added. Signed-off-by: Jiri Benc Acked-by: Alexei Starovoitov Acked-by: Thomas Graf Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- drivers/net/geneve.c | 2 +- drivers/net/vxlan.c | 2 +- include/net/dst_metadata.h | 1 - include/net/ip_tunnels.h | 9 ++------- net/ipv4/ip_gre.c | 2 +- net/ipv4/route.c | 2 +- net/ipv6/route.c | 2 +- 7 files changed, 7 insertions(+), 13 deletions(-) (limited to 'net/ipv6/route.c') diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index 4357bae732d7..4a39c09f144c 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -623,7 +623,7 @@ static netdev_tx_t geneve_xmit(struct sk_buff *skb, struct net_device *dev) if (geneve->collect_md) { info = skb_tunnel_info(skb); - if (unlikely(info && info->mode != IP_TUNNEL_INFO_TX)) { + if (unlikely(info && !(info->mode & IP_TUNNEL_INFO_TX))) { netdev_dbg(dev, "no tunnel metadata\n"); goto tx_error; } diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 30e56cb58884..bd1b8cdf2bf6 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -2113,7 +2113,7 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) } if (vxlan->flags & VXLAN_F_COLLECT_METADATA && - info && info->mode == IP_TUNNEL_INFO_TX) { + info && info->mode & IP_TUNNEL_INFO_TX) { vxlan_xmit_one(skb, dev, NULL, false); return NETDEV_TX_OK; } diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index 60c03326c087..2b83f0d232e0 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -59,7 +59,6 @@ static inline struct metadata_dst *tun_rx_dst(__be16 flags, return NULL; info = &tun_dst->u.tun_info; - info->mode = IP_TUNNEL_INFO_RX; info->key.tun_flags = flags; info->key.tun_id = tunnel_id; info->key.tp_src = 0; diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 224e4ecec91b..9bdb3948798f 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -50,13 +50,8 @@ struct ip_tunnel_key { __be16 tp_dst; }; -/* Indicates whether the tunnel info structure represents receive - * or transmit tunnel parameters. - */ -enum { - IP_TUNNEL_INFO_RX, - IP_TUNNEL_INFO_TX, -}; +/* Flags for ip_tunnel_info mode. */ +#define IP_TUNNEL_INFO_TX 0x01 /* represents tx tunnel parameters */ struct ip_tunnel_info { struct ip_tunnel_key key; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index faf1cde6f8da..1e813a9f9378 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -511,7 +511,7 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev) int err; tun_info = skb_tunnel_info(skb); - if (unlikely(!tun_info || tun_info->mode != IP_TUNNEL_INFO_TX)) + if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX))) goto err_free_skb; key = &tun_info->key; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 6b91879e9cbe..5f4a5565ad8b 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1696,7 +1696,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, */ tun_info = skb_tunnel_info(skb); - if (tun_info && tun_info->mode == IP_TUNNEL_INFO_RX) + if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX)) fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id; else fl4.flowi4_tun_key.tun_id = 0; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index df3e353a012d..308dd5f9158f 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1174,7 +1174,7 @@ void ip6_route_input(struct sk_buff *skb) }; tun_info = skb_tunnel_info(skb); - if (tun_info && tun_info->mode == IP_TUNNEL_INFO_RX) + if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX)) fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id; skb_dst_drop(skb); skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags)); -- cgit v1.2.3-70-g09d2 From 1bb14807bc761a88bb9d319e7bf519eebf4c82ec Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 31 Aug 2015 15:58:45 +0200 Subject: net: fib6: reduce identation in ip6_convert_metrics Reduce the identation a bit, there's no need to artificically have it increased. Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- net/ipv6/route.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'net/ipv6/route.c') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 308dd5f9158f..0261b721b34b 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1711,26 +1711,26 @@ static int ip6_convert_metrics(struct mx6_config *mxc, nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) { int type = nla_type(nla); + u32 val; - if (type) { - u32 val; - - if (unlikely(type > RTAX_MAX)) - goto err; - if (type == RTAX_CC_ALGO) { - char tmp[TCP_CA_NAME_MAX]; + if (!type) + continue; + if (unlikely(type > RTAX_MAX)) + goto err; - nla_strlcpy(tmp, nla, sizeof(tmp)); - val = tcp_ca_get_key_by_name(tmp); - if (val == TCP_CA_UNSPEC) - goto err; - } else { - val = nla_get_u32(nla); - } + if (type == RTAX_CC_ALGO) { + char tmp[TCP_CA_NAME_MAX]; - mp[type - 1] = val; - __set_bit(type - 1, mxc->mx_valid); + nla_strlcpy(tmp, nla, sizeof(tmp)); + val = tcp_ca_get_key_by_name(tmp); + if (val == TCP_CA_UNSPEC) + goto err; + } else { + val = nla_get_u32(nla); } + + mp[type - 1] = val; + __set_bit(type - 1, mxc->mx_valid); } mxc->mx = mp; -- cgit v1.2.3-70-g09d2 From b8d3e4163a3562d7cba486687904383e78e7dd6a Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 31 Aug 2015 15:58:46 +0200 Subject: fib, fib6: reject invalid feature bits Feature bits that are invalid should not be accepted by the kernel, only the lower 4 bits may be configured, but not the remaining ones. Even from these 4, 2 of them are unused. Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/uapi/linux/rtnetlink.h | 11 +++++++---- net/ipv4/fib_semantics.c | 2 ++ net/ipv6/route.c | 2 ++ 3 files changed, 11 insertions(+), 4 deletions(-) (limited to 'net/ipv6/route.c') diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 0d3d3cc43356..702024769c74 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -418,10 +418,13 @@ enum { #define RTAX_MAX (__RTAX_MAX - 1) -#define RTAX_FEATURE_ECN 0x00000001 -#define RTAX_FEATURE_SACK 0x00000002 -#define RTAX_FEATURE_TIMESTAMP 0x00000004 -#define RTAX_FEATURE_ALLFRAG 0x00000008 +#define RTAX_FEATURE_ECN (1 << 0) +#define RTAX_FEATURE_SACK (1 << 1) +#define RTAX_FEATURE_TIMESTAMP (1 << 2) +#define RTAX_FEATURE_ALLFRAG (1 << 3) + +#define RTAX_FEATURE_MASK (RTAX_FEATURE_ECN | RTAX_FEATURE_SACK | \ + RTAX_FEATURE_TIMESTAMP | RTAX_FEATURE_ALLFRAG) struct rta_session { __u8 proto; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 88afbae893f0..115a08e70d43 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -908,6 +908,8 @@ fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg) val = 65535 - 40; if (type == RTAX_MTU && val > 65535 - 15) val = 65535 - 15; + if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK)) + return -EINVAL; fi->fib_metrics[type - 1] = val; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 0261b721b34b..8771530df45e 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1728,6 +1728,8 @@ static int ip6_convert_metrics(struct mx6_config *mxc, } else { val = nla_get_u32(nla); } + if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK)) + goto err; mp[type - 1] = val; __set_bit(type - 1, mxc->mx_valid); -- cgit v1.2.3-70-g09d2 From c3a8d9474684d391b0afc3970d9b249add15ec07 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 31 Aug 2015 15:58:47 +0200 Subject: tcp: use dctcp if enabled on the route to the initiator Currently, the following case doesn't use DCTCP, even if it should: A responder has f.e. Cubic as system wide default, but for a specific route to the initiating host, DCTCP is being set in RTAX_CC_ALGO. The initiating host then uses DCTCP as congestion control, but since the initiator sets ECT(0), tcp_ecn_create_request() doesn't set ecn_ok, and we have to fall back to Reno after 3WHS completes. We were thinking on how to solve this in a minimal, non-intrusive way without bloating tcp_ecn_create_request() needlessly: lets cache the CA ecn option flag in RTAX_FEATURES. In other words, when ECT(0) is set on the SYN packet, set ecn_ok=1 iff route RTAX_FEATURES contains the unexposed (internal-only) DST_FEATURE_ECN_CA. This allows to only do a single metric feature lookup inside tcp_ecn_create_request(). Joint work with Florian Westphal. Signed-off-by: Daniel Borkmann Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/net/dst.h | 6 ++++++ include/net/tcp.h | 2 +- net/core/rtnetlink.c | 6 ++++++ net/ipv4/fib_semantics.c | 6 +++++- net/ipv4/tcp_cong.c | 9 ++++++--- net/ipv4/tcp_input.c | 7 +++++-- net/ipv6/route.c | 9 +++++++-- 7 files changed, 36 insertions(+), 9 deletions(-) (limited to 'net/ipv6/route.c') diff --git a/include/net/dst.h b/include/net/dst.h index 4c4801645371..9261d928303d 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -207,6 +207,12 @@ static inline void dst_metric_set(struct dst_entry *dst, int metric, u32 val) p[metric-1] = val; } +/* Kernel-internal feature bits that are unallocated in user space. */ +#define DST_FEATURE_ECN_CA (1 << 31) + +#define DST_FEATURE_MASK (DST_FEATURE_ECN_CA) +#define DST_FEATURE_ECN_MASK (DST_FEATURE_ECN_CA | RTAX_FEATURE_ECN) + static inline u32 dst_feature(const struct dst_entry *dst, u32 feature) { diff --git a/include/net/tcp.h b/include/net/tcp.h index 4a7b03947a38..0cab28cd43a9 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -888,7 +888,7 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked); extern struct tcp_congestion_ops tcp_reno; struct tcp_congestion_ops *tcp_ca_find_key(u32 key); -u32 tcp_ca_get_key_by_name(const char *name); +u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca); #ifdef CONFIG_INET char *tcp_ca_get_name_by_key(u32 key, char *buffer); #else diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 788ceed39463..a466821d1441 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -678,6 +678,12 @@ int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics) continue; if (nla_put_string(skb, i + 1, name)) goto nla_put_failure; + } else if (i == RTAX_FEATURES - 1) { + u32 user_features = metrics[i] & RTAX_FEATURE_MASK; + + BUILD_BUG_ON(RTAX_FEATURE_MASK & DST_FEATURE_MASK); + if (nla_put_u32(skb, i + 1, user_features)) + goto nla_put_failure; } else { if (nla_put_u32(skb, i + 1, metrics[i])) goto nla_put_failure; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 115a08e70d43..992a9597daf8 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -879,6 +879,7 @@ static bool fib_valid_prefsrc(struct fib_config *cfg, __be32 fib_prefsrc) static int fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg) { + bool ecn_ca = false; struct nlattr *nla; int remaining; @@ -898,7 +899,7 @@ fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg) char tmp[TCP_CA_NAME_MAX]; nla_strlcpy(tmp, nla, sizeof(tmp)); - val = tcp_ca_get_key_by_name(tmp); + val = tcp_ca_get_key_by_name(tmp, &ecn_ca); if (val == TCP_CA_UNSPEC) return -EINVAL; } else { @@ -913,6 +914,9 @@ fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg) fi->fib_metrics[type - 1] = val; } + if (ecn_ca) + fi->fib_metrics[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA; + return 0; } diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index a2ed23c595cf..93c4dc3ab23f 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -114,16 +114,19 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca) } EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control); -u32 tcp_ca_get_key_by_name(const char *name) +u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca) { const struct tcp_congestion_ops *ca; - u32 key; + u32 key = TCP_CA_UNSPEC; might_sleep(); rcu_read_lock(); ca = __tcp_ca_find_autoload(name); - key = ca ? ca->key : TCP_CA_UNSPEC; + if (ca) { + key = ca->key; + *ecn_ca = ca->flags & TCP_CONG_NEEDS_ECN; + } rcu_read_unlock(); return key; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index dc08e2352665..a8f515bb19c4 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6003,14 +6003,17 @@ static void tcp_ecn_create_request(struct request_sock *req, const struct net *net = sock_net(listen_sk); bool th_ecn = th->ece && th->cwr; bool ect, ecn_ok; + u32 ecn_ok_dst; if (!th_ecn) return; ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield); - ecn_ok = net->ipv4.sysctl_tcp_ecn || dst_feature(dst, RTAX_FEATURE_ECN); + ecn_ok_dst = dst_feature(dst, DST_FEATURE_ECN_MASK); + ecn_ok = net->ipv4.sysctl_tcp_ecn || ecn_ok_dst; - if ((!ect && ecn_ok) || tcp_ca_needs_ecn(listen_sk)) + if ((!ect && ecn_ok) || tcp_ca_needs_ecn(listen_sk) || + (ecn_ok_dst & DST_FEATURE_ECN_CA)) inet_rsk(req)->ecn_ok = 1; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 8771530df45e..f45cac6f8356 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1698,6 +1698,7 @@ out: static int ip6_convert_metrics(struct mx6_config *mxc, const struct fib6_config *cfg) { + bool ecn_ca = false; struct nlattr *nla; int remaining; u32 *mp; @@ -1722,7 +1723,7 @@ static int ip6_convert_metrics(struct mx6_config *mxc, char tmp[TCP_CA_NAME_MAX]; nla_strlcpy(tmp, nla, sizeof(tmp)); - val = tcp_ca_get_key_by_name(tmp); + val = tcp_ca_get_key_by_name(tmp, &ecn_ca); if (val == TCP_CA_UNSPEC) goto err; } else { @@ -1735,8 +1736,12 @@ static int ip6_convert_metrics(struct mx6_config *mxc, __set_bit(type - 1, mxc->mx_valid); } - mxc->mx = mp; + if (ecn_ca) { + __set_bit(RTAX_FEATURES - 1, mxc->mx_valid); + mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA; + } + mxc->mx = mp; return 0; err: kfree(mp); -- cgit v1.2.3-70-g09d2 From 6b9ea5a64ed5eeb3f68f2e6fcce0ed1179801d1e Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Tue, 8 Sep 2015 10:53:04 -0700 Subject: ipv6: fix multipath route replace error recovery Problem: The ecmp route replace support for ipv6 in the kernel, deletes the existing ecmp route too early, ie when it installs the first nexthop. If there is an error in installing the subsequent nexthops, its too late to recover the already deleted existing route leaving the fib in an inconsistent state. This patch reduces the possibility of this by doing the following: a) Changes the existing multipath route add code to a two stage process: build rt6_infos + insert them ip6_route_add rt6_info creation code is moved into ip6_route_info_create. b) This ensures that most errors are caught during building rt6_infos and we fail early c) Separates multipath add and del code. Because add needs the special two stage mode in a) and delete essentially does not care. d) In any event if the code fails during inserting a route again, a warning is printed (This should be unlikely) Before the patch: $ip -6 route show 3000:1000:1000:1000::2 via fe80::202:ff:fe00:b dev swp49s0 metric 1024 3000:1000:1000:1000::2 via fe80::202:ff:fe00:d dev swp49s1 metric 1024 3000:1000:1000:1000::2 via fe80::202:ff:fe00:f dev swp49s2 metric 1024 /* Try replacing the route with a duplicate nexthop */ $ip -6 route change 3000:1000:1000:1000::2/128 nexthop via fe80::202:ff:fe00:b dev swp49s0 nexthop via fe80::202:ff:fe00:d dev swp49s1 nexthop via fe80::202:ff:fe00:d dev swp49s1 RTNETLINK answers: File exists $ip -6 route show /* previously added ecmp route 3000:1000:1000:1000::2 dissappears from * kernel */ After the patch: $ip -6 route show 3000:1000:1000:1000::2 via fe80::202:ff:fe00:b dev swp49s0 metric 1024 3000:1000:1000:1000::2 via fe80::202:ff:fe00:d dev swp49s1 metric 1024 3000:1000:1000:1000::2 via fe80::202:ff:fe00:f dev swp49s2 metric 1024 /* Try replacing the route with a duplicate nexthop */ $ip -6 route change 3000:1000:1000:1000::2/128 nexthop via fe80::202:ff:fe00:b dev swp49s0 nexthop via fe80::202:ff:fe00:d dev swp49s1 nexthop via fe80::202:ff:fe00:d dev swp49s1 RTNETLINK answers: File exists $ip -6 route show 3000:1000:1000:1000::2 via fe80::202:ff:fe00:b dev swp49s0 metric 1024 3000:1000:1000:1000::2 via fe80::202:ff:fe00:d dev swp49s1 metric 1024 3000:1000:1000:1000::2 via fe80::202:ff:fe00:f dev swp49s2 metric 1024 Fixes: 27596472473a ("ipv6: fix ECMP route replacement") Signed-off-by: Roopa Prabhu Reviewed-by: Nikolay Aleksandrov Acked-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/ipv6/route.c | 201 ++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 175 insertions(+), 26 deletions(-) (limited to 'net/ipv6/route.c') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index f45cac6f8356..34539d3b843f 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1748,7 +1748,7 @@ static int ip6_convert_metrics(struct mx6_config *mxc, return -EINVAL; } -int ip6_route_add(struct fib6_config *cfg) +int ip6_route_info_create(struct fib6_config *cfg, struct rt6_info **rt_ret) { int err; struct net *net = cfg->fc_nlinfo.nl_net; @@ -1756,7 +1756,6 @@ int ip6_route_add(struct fib6_config *cfg) struct net_device *dev = NULL; struct inet6_dev *idev = NULL; struct fib6_table *table; - struct mx6_config mxc = { .mx = NULL, }; int addr_type; if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128) @@ -1981,6 +1980,32 @@ install_route: cfg->fc_nlinfo.nl_net = dev_net(dev); + *rt_ret = rt; + + return 0; +out: + if (dev) + dev_put(dev); + if (idev) + in6_dev_put(idev); + if (rt) + dst_free(&rt->dst); + + *rt_ret = NULL; + + return err; +} + +int ip6_route_add(struct fib6_config *cfg) +{ + struct mx6_config mxc = { .mx = NULL, }; + struct rt6_info *rt = NULL; + int err; + + err = ip6_route_info_create(cfg, &rt); + if (err) + goto out; + err = ip6_convert_metrics(&mxc, cfg); if (err) goto out; @@ -1988,14 +2013,12 @@ install_route: err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc); kfree(mxc.mx); + return err; out: - if (dev) - dev_put(dev); - if (idev) - in6_dev_put(idev); if (rt) dst_free(&rt->dst); + return err; } @@ -2776,19 +2799,78 @@ errout: return err; } -static int ip6_route_multipath(struct fib6_config *cfg, int add) +struct rt6_nh { + struct rt6_info *rt6_info; + struct fib6_config r_cfg; + struct mx6_config mxc; + struct list_head next; +}; + +static void ip6_print_replace_route_err(struct list_head *rt6_nh_list) +{ + struct rt6_nh *nh; + + list_for_each_entry(nh, rt6_nh_list, next) { + pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6 nexthop %pI6 ifi %d\n", + &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway, + nh->r_cfg.fc_ifindex); + } +} + +static int ip6_route_info_append(struct list_head *rt6_nh_list, + struct rt6_info *rt, struct fib6_config *r_cfg) +{ + struct rt6_nh *nh; + struct rt6_info *rtnh; + int err = -EEXIST; + + list_for_each_entry(nh, rt6_nh_list, next) { + /* check if rt6_info already exists */ + rtnh = nh->rt6_info; + + if (rtnh->dst.dev == rt->dst.dev && + rtnh->rt6i_idev == rt->rt6i_idev && + ipv6_addr_equal(&rtnh->rt6i_gateway, + &rt->rt6i_gateway)) + return err; + } + + nh = kzalloc(sizeof(*nh), GFP_KERNEL); + if (!nh) + return -ENOMEM; + nh->rt6_info = rt; + err = ip6_convert_metrics(&nh->mxc, r_cfg); + if (err) { + kfree(nh); + return err; + } + memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg)); + list_add_tail(&nh->next, rt6_nh_list); + + return 0; +} + +static int ip6_route_multipath_add(struct fib6_config *cfg) { struct fib6_config r_cfg; struct rtnexthop *rtnh; + struct rt6_info *rt; + struct rt6_nh *err_nh; + struct rt6_nh *nh, *nh_safe; int remaining; int attrlen; - int err = 0, last_err = 0; + int err = 1; + int nhn = 0; + int replace = (cfg->fc_nlinfo.nlh && + (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE)); + LIST_HEAD(rt6_nh_list); remaining = cfg->fc_mp_len; -beginning: rtnh = (struct rtnexthop *)cfg->fc_mp; - /* Parse a Multipath Entry */ + /* Parse a Multipath Entry and build a list (rt6_nh_list) of + * rt6_info structs per nexthop + */ while (rtnh_ok(rtnh, remaining)) { memcpy(&r_cfg, cfg, sizeof(*cfg)); if (rtnh->rtnh_ifindex) @@ -2808,22 +2890,32 @@ beginning: if (nla) r_cfg.fc_encap_type = nla_get_u16(nla); } - err = add ? ip6_route_add(&r_cfg) : ip6_route_del(&r_cfg); + + err = ip6_route_info_create(&r_cfg, &rt); + if (err) + goto cleanup; + + err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg); if (err) { - last_err = err; - /* If we are trying to remove a route, do not stop the - * loop when ip6_route_del() fails (because next hop is - * already gone), we should try to remove all next hops. - */ - if (add) { - /* If add fails, we should try to delete all - * next hops that have been already added. - */ - add = 0; - remaining = cfg->fc_mp_len - remaining; - goto beginning; - } + dst_free(&rt->dst); + goto cleanup; + } + + rtnh = rtnh_next(rtnh, &remaining); + } + + err_nh = NULL; + list_for_each_entry(nh, &rt6_nh_list, next) { + err = __ip6_ins_rt(nh->rt6_info, &cfg->fc_nlinfo, &nh->mxc); + /* nh->rt6_info is used or freed at this point, reset to NULL*/ + nh->rt6_info = NULL; + if (err) { + if (replace && nhn) + ip6_print_replace_route_err(&rt6_nh_list); + err_nh = nh; + goto add_errout; } + /* Because each route is added like a single route we remove * these flags after the first nexthop: if there is a collision, * we have already failed to add the first nexthop: @@ -2833,6 +2925,63 @@ beginning: */ cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL | NLM_F_REPLACE); + nhn++; + } + + goto cleanup; + +add_errout: + /* Delete routes that were already added */ + list_for_each_entry(nh, &rt6_nh_list, next) { + if (err_nh == nh) + break; + ip6_route_del(&nh->r_cfg); + } + +cleanup: + list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) { + if (nh->rt6_info) + dst_free(&nh->rt6_info->dst); + if (nh->mxc.mx) + kfree(nh->mxc.mx); + list_del(&nh->next); + kfree(nh); + } + + return err; +} + +static int ip6_route_multipath_del(struct fib6_config *cfg) +{ + struct fib6_config r_cfg; + struct rtnexthop *rtnh; + int remaining; + int attrlen; + int err = 1, last_err = 0; + + remaining = cfg->fc_mp_len; + rtnh = (struct rtnexthop *)cfg->fc_mp; + + /* Parse a Multipath Entry */ + while (rtnh_ok(rtnh, remaining)) { + memcpy(&r_cfg, cfg, sizeof(*cfg)); + if (rtnh->rtnh_ifindex) + r_cfg.fc_ifindex = rtnh->rtnh_ifindex; + + attrlen = rtnh_attrlen(rtnh); + if (attrlen > 0) { + struct nlattr *nla, *attrs = rtnh_attrs(rtnh); + + nla = nla_find(attrs, attrlen, RTA_GATEWAY); + if (nla) { + nla_memcpy(&r_cfg.fc_gateway, nla, 16); + r_cfg.fc_flags |= RTF_GATEWAY; + } + } + err = ip6_route_del(&r_cfg); + if (err) + last_err = err; + rtnh = rtnh_next(rtnh, &remaining); } @@ -2849,7 +2998,7 @@ static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh) return err; if (cfg.fc_mp) - return ip6_route_multipath(&cfg, 0); + return ip6_route_multipath_del(&cfg); else return ip6_route_del(&cfg); } @@ -2864,7 +3013,7 @@ static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh) return err; if (cfg.fc_mp) - return ip6_route_multipath(&cfg, 1); + return ip6_route_multipath_add(&cfg); else return ip6_route_add(&cfg); } -- cgit v1.2.3-70-g09d2 From 52fe51f8523751da0e79c85350c47eb3bb94da5b Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Thu, 10 Sep 2015 06:57:12 +0800 Subject: ipv6: fix ifnullfree.cocci warnings net/ipv6/route.c:2946:3-8: WARNING: NULL check before freeing functions like kfree, debugfs_remove, debugfs_remove_recursive or usb_free_urb is not needed. Maybe consider reorganizing relevant code to avoid passing NULL values. NULL check before some freeing functions is not needed. Based on checkpatch warning "kfree(NULL) is safe this check is probably not required" and kfreeaddr.cocci by Julia Lawall. Generated by: scripts/coccinelle/free/ifnullfree.cocci CC: Roopa Prabhu Signed-off-by: Fengguang Wu Signed-off-by: David S. Miller --- net/ipv6/route.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net/ipv6/route.c') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 34539d3b843f..53617d715188 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2942,8 +2942,7 @@ cleanup: list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) { if (nh->rt6_info) dst_free(&nh->rt6_info->dst); - if (nh->mxc.mx) - kfree(nh->mxc.mx); + kfree(nh->mxc.mx); list_del(&nh->next); kfree(nh); } -- cgit v1.2.3-70-g09d2