From 6596a0229541270fb8d38d989f91b78838e5e9da Mon Sep 17 00:00:00 2001 From: Jiri Bohac Date: Wed, 19 Jan 2022 10:22:53 +0100 Subject: xfrm: fix MTU regression Commit 749439bfac6e1a2932c582e2699f91d329658196 ("ipv6: fix udpv6 sendmsg crash caused by too small MTU") breaks PMTU for xfrm. A Packet Too Big ICMPv6 message received in response to an ESP packet will prevent all further communication through the tunnel if the reported MTU minus the ESP overhead is smaller than 1280. E.g. in a case of a tunnel-mode ESP with sha256/aes the overhead is 92 bytes. Receiving a PTB with MTU of 1371 or less will result in all further packets in the tunnel dropped. A ping through the tunnel fails with "ping: sendmsg: Invalid argument". Apparently the MTU on the xfrm route is smaller than 1280 and fails the check inside ip6_setup_cork() added by 749439bf. We found this by debugging USGv6/ipv6ready failures. Failing tests are: "Phase-2 Interoperability Test Scenario IPsec" / 5.3.11 and 5.4.11 (Tunnel Mode: Fragmentation). Commit b515d2637276a3810d6595e10ab02c13bfd0b63a ("xfrm: xfrm_state_mtu should return at least 1280 for ipv6") attempted to fix this but caused another regression in TCP MSS calculations and had to be reverted. The patch below fixes the situation by dropping the MTU check and instead checking for the underflows described in the 749439bf commit message. Signed-off-by: Jiri Bohac Fixes: 749439bfac6e ("ipv6: fix udpv6 sendmsg crash caused by too small MTU") Signed-off-by: Steffen Klassert --- net/ipv6/ip6_output.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 2995f8d89e7e..4ff110214dea 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1408,8 +1408,6 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, if (np->frag_size) mtu = np->frag_size; } - if (mtu < IPV6_MIN_MTU) - return -EINVAL; cork->base.fragsize = mtu; cork->base.gso_size = ipc6->gso_size; cork->base.tx_flags = 0; @@ -1471,8 +1469,6 @@ static int __ip6_append_data(struct sock *sk, fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len + (opt ? opt->opt_nflen : 0); - maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - - sizeof(struct frag_hdr); headersize = sizeof(struct ipv6hdr) + (opt ? opt->opt_flen + opt->opt_nflen : 0) + @@ -1480,6 +1476,13 @@ static int __ip6_append_data(struct sock *sk, sizeof(struct frag_hdr) : 0) + rt->rt6i_nfheader_len; + if (mtu < fragheaderlen || + ((mtu - fragheaderlen) & ~7) + fragheaderlen < sizeof(struct frag_hdr)) + goto emsgsize; + + maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - + sizeof(struct frag_hdr); + /* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit * the first fragment */ -- cgit v1.2.3-70-g09d2 From a6d95c5a628a09be129f25d5663a7e9db8261f51 Mon Sep 17 00:00:00 2001 From: Jiri Bohac Date: Wed, 26 Jan 2022 16:00:18 +0100 Subject: Revert "xfrm: xfrm_state_mtu should return at least 1280 for ipv6" This reverts commit b515d2637276a3810d6595e10ab02c13bfd0b63a. Commit b515d2637276a3810d6595e10ab02c13bfd0b63a ("xfrm: xfrm_state_mtu should return at least 1280 for ipv6") in v5.14 breaks the TCP MSS calculation in ipsec transport mode, resulting complete stalls of TCP connections. This happens when the (P)MTU is 1280 or slighly larger. The desired formula for the MSS is: MSS = (MTU - ESP_overhead) - IP header - TCP header However, the above commit clamps the (MTU - ESP_overhead) to a minimum of 1280, turning the formula into MSS = max(MTU - ESP overhead, 1280) - IP header - TCP header With the (P)MTU near 1280, the calculated MSS is too large and the resulting TCP packets never make it to the destination because they are over the actual PMTU. The above commit also causes suboptimal double fragmentation in xfrm tunnel mode, as described in https://lore.kernel.org/netdev/20210429202529.codhwpc7w6kbudug@dwarf.suse.cz/ The original problem the above commit was trying to fix is now fixed by commit 6596a0229541270fb8d38d989f91b78838e5e9da ("xfrm: fix MTU regression"). Signed-off-by: Jiri Bohac Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 1 - net/ipv4/esp4.c | 2 +- net/ipv6/esp6.c | 2 +- net/xfrm/xfrm_state.c | 14 ++------------ 4 files changed, 4 insertions(+), 15 deletions(-) (limited to 'net/ipv6') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 743dd1da506e..76aa6f11a540 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1568,7 +1568,6 @@ void xfrm_sad_getinfo(struct net *net, struct xfrmk_sadinfo *si); void xfrm_spd_getinfo(struct net *net, struct xfrmk_spdinfo *si); u32 xfrm_replay_seqhi(struct xfrm_state *x, __be32 net_seq); int xfrm_init_replay(struct xfrm_state *x); -u32 __xfrm_state_mtu(struct xfrm_state *x, int mtu); u32 xfrm_state_mtu(struct xfrm_state *x, int mtu); int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload); int xfrm_init_state(struct xfrm_state *x); diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 851f542928a3..e1b1d080e908 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -671,7 +671,7 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb); u32 padto; - padto = min(x->tfcpad, __xfrm_state_mtu(x, dst->child_mtu_cached)); + padto = min(x->tfcpad, xfrm_state_mtu(x, dst->child_mtu_cached)); if (skb->len < padto) esp.tfclen = padto - skb->len; } diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 8bb2c407b46b..7591160edce1 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -707,7 +707,7 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb) struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb); u32 padto; - padto = min(x->tfcpad, __xfrm_state_mtu(x, dst->child_mtu_cached)); + padto = min(x->tfcpad, xfrm_state_mtu(x, dst->child_mtu_cached)); if (skb->len < padto) esp.tfclen = padto - skb->len; } diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 1ba6fbfe8cdb..b749935152ba 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -2579,7 +2579,7 @@ void xfrm_state_delete_tunnel(struct xfrm_state *x) } EXPORT_SYMBOL(xfrm_state_delete_tunnel); -u32 __xfrm_state_mtu(struct xfrm_state *x, int mtu) +u32 xfrm_state_mtu(struct xfrm_state *x, int mtu) { const struct xfrm_type *type = READ_ONCE(x->type); struct crypto_aead *aead; @@ -2610,17 +2610,7 @@ u32 __xfrm_state_mtu(struct xfrm_state *x, int mtu) return ((mtu - x->props.header_len - crypto_aead_authsize(aead) - net_adj) & ~(blksize - 1)) + net_adj - 2; } -EXPORT_SYMBOL_GPL(__xfrm_state_mtu); - -u32 xfrm_state_mtu(struct xfrm_state *x, int mtu) -{ - mtu = __xfrm_state_mtu(x, mtu); - - if (x->props.family == AF_INET6 && mtu < IPV6_MIN_MTU) - return IPV6_MIN_MTU; - - return mtu; -} +EXPORT_SYMBOL_GPL(xfrm_state_mtu); int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload) { -- cgit v1.2.3-70-g09d2 From a1cdec57e03a1352e92fbbe7974039dda4efcec0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 17 Feb 2022 09:05:02 -0800 Subject: net-timestamp: convert sk->sk_tskey to atomic_t UDP sendmsg() can be lockless, this is causing all kinds of data races. This patch converts sk->sk_tskey to remove one of these races. BUG: KCSAN: data-race in __ip_append_data / __ip_append_data read to 0xffff8881035d4b6c of 4 bytes by task 8877 on cpu 1: __ip_append_data+0x1c1/0x1de0 net/ipv4/ip_output.c:994 ip_make_skb+0x13f/0x2d0 net/ipv4/ip_output.c:1636 udp_sendmsg+0x12bd/0x14c0 net/ipv4/udp.c:1249 inet_sendmsg+0x5f/0x80 net/ipv4/af_inet.c:819 sock_sendmsg_nosec net/socket.c:705 [inline] sock_sendmsg net/socket.c:725 [inline] ____sys_sendmsg+0x39a/0x510 net/socket.c:2413 ___sys_sendmsg net/socket.c:2467 [inline] __sys_sendmmsg+0x267/0x4c0 net/socket.c:2553 __do_sys_sendmmsg net/socket.c:2582 [inline] __se_sys_sendmmsg net/socket.c:2579 [inline] __x64_sys_sendmmsg+0x53/0x60 net/socket.c:2579 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae write to 0xffff8881035d4b6c of 4 bytes by task 8880 on cpu 0: __ip_append_data+0x1d8/0x1de0 net/ipv4/ip_output.c:994 ip_make_skb+0x13f/0x2d0 net/ipv4/ip_output.c:1636 udp_sendmsg+0x12bd/0x14c0 net/ipv4/udp.c:1249 inet_sendmsg+0x5f/0x80 net/ipv4/af_inet.c:819 sock_sendmsg_nosec net/socket.c:705 [inline] sock_sendmsg net/socket.c:725 [inline] ____sys_sendmsg+0x39a/0x510 net/socket.c:2413 ___sys_sendmsg net/socket.c:2467 [inline] __sys_sendmmsg+0x267/0x4c0 net/socket.c:2553 __do_sys_sendmmsg net/socket.c:2582 [inline] __se_sys_sendmmsg net/socket.c:2579 [inline] __x64_sys_sendmmsg+0x53/0x60 net/socket.c:2579 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae value changed: 0x0000054d -> 0x0000054e Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 8880 Comm: syz-executor.5 Not tainted 5.17.0-rc2-syzkaller-00167-gdcb85f85fa6f-dirty #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Fixes: 09c2d251b707 ("net-timestamp: add key to disambiguate concurrent datagrams") Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Reported-by: syzbot Signed-off-by: David S. Miller --- include/net/sock.h | 4 ++-- net/can/j1939/transport.c | 2 +- net/core/skbuff.c | 2 +- net/core/sock.c | 4 ++-- net/ipv4/ip_output.c | 2 +- net/ipv6/ip6_output.c | 2 +- 6 files changed, 8 insertions(+), 8 deletions(-) (limited to 'net/ipv6') diff --git a/include/net/sock.h b/include/net/sock.h index ff9b508d9c5f..50aecd28b355 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -507,7 +507,7 @@ struct sock { #endif u16 sk_tsflags; u8 sk_shutdown; - u32 sk_tskey; + atomic_t sk_tskey; atomic_t sk_zckey; u8 sk_clockid; @@ -2667,7 +2667,7 @@ static inline void _sock_tx_timestamp(struct sock *sk, __u16 tsflags, __sock_tx_timestamp(tsflags, tx_flags); if (tsflags & SOF_TIMESTAMPING_OPT_ID && tskey && tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK) - *tskey = sk->sk_tskey++; + *tskey = atomic_inc_return(&sk->sk_tskey) - 1; } if (unlikely(sock_flag(sk, SOCK_WIFI_STATUS))) *tx_flags |= SKBTX_WIFI_STATUS; diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c index a271688780a2..307ee1174a6e 100644 --- a/net/can/j1939/transport.c +++ b/net/can/j1939/transport.c @@ -2006,7 +2006,7 @@ struct j1939_session *j1939_tp_send(struct j1939_priv *priv, /* set the end-packet for broadcast */ session->pkt.last = session->pkt.total; - skcb->tskey = session->sk->sk_tskey++; + skcb->tskey = atomic_inc_return(&session->sk->sk_tskey) - 1; session->tskey = skcb->tskey; return session; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 9d0388bed0c1..6a15ce3eb1d3 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4730,7 +4730,7 @@ static void __skb_complete_tx_timestamp(struct sk_buff *skb, if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) { serr->ee.ee_data = skb_shinfo(skb)->tskey; if (sk_is_tcp(sk)) - serr->ee.ee_data -= sk->sk_tskey; + serr->ee.ee_data -= atomic_read(&sk->sk_tskey); } err = sock_queue_err_skb(sk, skb); diff --git a/net/core/sock.c b/net/core/sock.c index 4ff806d71921..6eb174805bf0 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -879,9 +879,9 @@ int sock_set_timestamping(struct sock *sk, int optname, if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) return -EINVAL; - sk->sk_tskey = tcp_sk(sk)->snd_una; + atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una); } else { - sk->sk_tskey = 0; + atomic_set(&sk->sk_tskey, 0); } } diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 139cec29ed06..7911916a480b 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -991,7 +991,7 @@ static int __ip_append_data(struct sock *sk, if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP && sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) - tskey = sk->sk_tskey++; + tskey = atomic_inc_return(&sk->sk_tskey) - 1; hh_len = LL_RESERVED_SPACE(rt->dst.dev); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 2995f8d89e7e..304a295de84f 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1465,7 +1465,7 @@ static int __ip6_append_data(struct sock *sk, if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP && sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) - tskey = sk->sk_tskey++; + tskey = atomic_inc_return(&sk->sk_tskey) - 1; hh_len = LL_RESERVED_SPACE(rt->dst.dev); -- cgit v1.2.3-70-g09d2 From cc20cced0598d9a5ff91ae4ab147b3b5e99ee819 Mon Sep 17 00:00:00 2001 From: Tao Liu Date: Fri, 18 Feb 2022 22:35:24 +0800 Subject: gso: do not skip outer ip header in case of ipip and net_failover We encounter a tcp drop issue in our cloud environment. Packet GROed in host forwards to a VM virtio_net nic with net_failover enabled. VM acts as a IPVS LB with ipip encapsulation. The full path like: host gro -> vm virtio_net rx -> net_failover rx -> ipvs fullnat -> ipip encap -> net_failover tx -> virtio_net tx When net_failover transmits a ipip pkt (gso_type = 0x0103, which means SKB_GSO_TCPV4, SKB_GSO_DODGY and SKB_GSO_IPXIP4), there is no gso did because it supports TSO and GSO_IPXIP4. But network_header points to inner ip header. Call Trace: tcp4_gso_segment ------> return NULL inet_gso_segment ------> inner iph, network_header points to ipip_gso_segment inet_gso_segment ------> outer iph skb_mac_gso_segment Afterwards virtio_net transmits the pkt, only inner ip header is modified. And the outer one just keeps unchanged. The pkt will be dropped in remote host. Call Trace: inet_gso_segment ------> inner iph, outer iph is skipped skb_mac_gso_segment __skb_gso_segment validate_xmit_skb validate_xmit_skb_list sch_direct_xmit __qdisc_run __dev_queue_xmit ------> virtio_net dev_hard_start_xmit __dev_queue_xmit ------> net_failover ip_finish_output2 ip_output iptunnel_xmit ip_tunnel_xmit ipip_tunnel_xmit ------> ipip dev_hard_start_xmit __dev_queue_xmit ip_finish_output2 ip_output ip_forward ip_rcv __netif_receive_skb_one_core netif_receive_skb_internal napi_gro_receive receive_buf virtnet_poll net_rx_action The root cause of this issue is specific with the rare combination of SKB_GSO_DODGY and a tunnel device that adds an SKB_GSO_ tunnel option. SKB_GSO_DODGY is set from external virtio_net. We need to reset network header when callbacks.gso_segment() returns NULL. This patch also includes ipv6_gso_segment(), considering SIT, etc. Fixes: cb32f511a70b ("ipip: add GSO/TSO support") Signed-off-by: Tao Liu Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 5 ++++- net/ipv6/ip6_offload.c | 2 ++ 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'net/ipv6') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 9c465bac1eb0..72fde2888ad2 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1376,8 +1376,11 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb, } ops = rcu_dereference(inet_offloads[proto]); - if (likely(ops && ops->callbacks.gso_segment)) + if (likely(ops && ops->callbacks.gso_segment)) { segs = ops->callbacks.gso_segment(skb, features); + if (!segs) + skb->network_header = skb_mac_header(skb) + nhoff - skb->head; + } if (IS_ERR_OR_NULL(segs)) goto out; diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index b29e9ba5e113..5f577e21459b 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -114,6 +114,8 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, if (likely(ops && ops->callbacks.gso_segment)) { skb_reset_transport_header(skb); segs = ops->callbacks.gso_segment(skb, features); + if (!segs) + skb->network_header = skb_mac_header(skb) + nhoff - skb->head; } if (IS_ERR_OR_NULL(segs)) -- cgit v1.2.3-70-g09d2 From 6c0d8833a605e195ae219b5042577ce52bf71fff Mon Sep 17 00:00:00 2001 From: Niels Dossche Date: Wed, 23 Feb 2022 14:19:56 +0100 Subject: ipv6: prevent a possible race condition with lifetimes valid_lft, prefered_lft and tstamp are always accessed under the lock "lock" in other places. Reading these without taking the lock may result in inconsistencies regarding the calculation of the valid and preferred variables since decisions are taken on these fields for those variables. Signed-off-by: Niels Dossche Reviewed-by: David Ahern Signed-off-by: Niels Dossche Link: https://lore.kernel.org/r/20220223131954.6570-1-niels.dossche@ugent.be Signed-off-by: Jakub Kicinski --- net/ipv6/addrconf.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/ipv6') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 3f23da8c0b10..6c8ab3e6e6fe 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -4998,6 +4998,7 @@ static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa, nla_put_s32(skb, IFA_TARGET_NETNSID, args->netnsid)) goto error; + spin_lock_bh(&ifa->lock); if (!((ifa->flags&IFA_F_PERMANENT) && (ifa->prefered_lft == INFINITY_LIFE_TIME))) { preferred = ifa->prefered_lft; @@ -5019,6 +5020,7 @@ static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa, preferred = INFINITY_LIFE_TIME; valid = INFINITY_LIFE_TIME; } + spin_unlock_bh(&ifa->lock); if (!ipv6_addr_any(&ifa->peer_addr)) { if (nla_put_in6_addr(skb, IFA_LOCAL, &ifa->addr) < 0 || -- cgit v1.2.3-70-g09d2 From 9995b408f17ff8c7f11bc725c8aa225ba3a63b1c Mon Sep 17 00:00:00 2001 From: "j.nixdorf@avm.de" Date: Thu, 24 Feb 2022 10:06:49 +0100 Subject: net: ipv6: ensure we call ipv6_mc_down() at most once There are two reasons for addrconf_notify() to be called with NETDEV_DOWN: either the network device is actually going down, or IPv6 was disabled on the interface. If either of them stays down while the other is toggled, we repeatedly call the code for NETDEV_DOWN, including ipv6_mc_down(), while never calling the corresponding ipv6_mc_up() in between. This will cause a new entry in idev->mc_tomb to be allocated for each multicast group the interface is subscribed to, which in turn leaks one struct ifmcaddr6 per nontrivial multicast group the interface is subscribed to. The following reproducer will leak at least $n objects: ip addr add ff2e::4242/32 dev eth0 autojoin sysctl -w net.ipv6.conf.eth0.disable_ipv6=1 for i in $(seq 1 $n); do ip link set up eth0; ip link set down eth0 done Joining groups with IPV6_ADD_MEMBERSHIP (unprivileged) or setting the sysctl net.ipv6.conf.eth0.forwarding to 1 (=> subscribing to ff02::2) can also be used to create a nontrivial idev->mc_list, which will the leak objects with the right up-down-sequence. Based on both sources for NETDEV_DOWN events the interface IPv6 state should be considered: - not ready if the network interface is not ready OR IPv6 is disabled for it - ready if the network interface is ready AND IPv6 is enabled for it The functions ipv6_mc_up() and ipv6_down() should only be run when this state changes. Implement this by remembering when the IPv6 state is ready, and only run ipv6_mc_down() if it actually changed from ready to not ready. The other direction (not ready -> ready) already works correctly, as: - the interface notification triggered codepath for NETDEV_UP / NETDEV_CHANGE returns early if ipv6 is disabled, and - the disable_ipv6=0 triggered codepath skips fully initializing the interface as long as addrconf_link_ready(dev) returns false - calling ipv6_mc_up() repeatedly does not leak anything Fixes: 3ce62a84d53c ("ipv6: exit early in addrconf_notify() if IPv6 is disabled") Signed-off-by: Johannes Nixdorf Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 6c8ab3e6e6fe..f908e2fd30b2 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -3732,6 +3732,7 @@ static int addrconf_ifdown(struct net_device *dev, bool unregister) struct inet6_dev *idev; struct inet6_ifaddr *ifa, *tmp; bool keep_addr = false; + bool was_ready; int state, i; ASSERT_RTNL(); @@ -3797,7 +3798,10 @@ restart: addrconf_del_rs_timer(idev); - /* Step 2: clear flags for stateless addrconf */ + /* Step 2: clear flags for stateless addrconf, repeated down + * detection + */ + was_ready = idev->if_flags & IF_READY; if (!unregister) idev->if_flags &= ~(IF_RS_SENT|IF_RA_RCVD|IF_READY); @@ -3871,7 +3875,7 @@ restart: if (unregister) { ipv6_ac_destroy_dev(idev); ipv6_mc_destroy_dev(idev); - } else { + } else if (was_ready) { ipv6_mc_down(idev); } -- cgit v1.2.3-70-g09d2 From 4ff2980b6bd2aa6b4ded3ce3b7c0ccfab29980af Mon Sep 17 00:00:00 2001 From: Lina Wang Date: Sat, 26 Feb 2022 15:48:01 +0800 Subject: xfrm: fix tunnel model fragmentation behavior in tunnel mode, if outer interface(ipv4) is less, it is easily to let inner IPV6 mtu be less than 1280. If so, a Packet Too Big ICMPV6 message is received. When send again, packets are fragmentized with 1280, they are still rejected with ICMPV6(Packet Too Big) by xfrmi_xmit2(). According to RFC4213 Section3.2.2: if (IPv4 path MTU - 20) is less than 1280 if packet is larger than 1280 bytes Send ICMPv6 "packet too big" with MTU=1280 Drop packet else Encapsulate but do not set the Don't Fragment flag in the IPv4 header. The resulting IPv4 packet might be fragmented by the IPv4 layer on the encapsulator or by some router along the IPv4 path. endif else if packet is larger than (IPv4 path MTU - 20) Send ICMPv6 "packet too big" with MTU = (IPv4 path MTU - 20). Drop packet. else Encapsulate and set the Don't Fragment flag in the IPv4 header. endif endif Packets should be fragmentized with ipv4 outer interface, so change it. After it is fragemtized with ipv4, there will be double fragmenation. No.48 & No.51 are ipv6 fragment packets, No.48 is double fragmentized, then tunneled with IPv4(No.49& No.50), which obey spec. And received peer cannot decrypt it rightly. 48 2002::10 2002::11 1296(length) IPv6 fragment (off=0 more=y ident=0xa20da5bc nxt=50) 49 0x0000 (0) 2002::10 2002::11 1304 IPv6 fragment (off=0 more=y ident=0x7448042c nxt=44) 50 0x0000 (0) 2002::10 2002::11 200 ESP (SPI=0x00035000) 51 2002::10 2002::11 180 Echo (ping) request 52 0x56dc 2002::10 2002::11 248 IPv6 fragment (off=1232 more=n ident=0xa20da5bc nxt=50) xfrm6_noneed_fragment has fixed above issues. Finally, it acted like below: 1 0x6206 192.168.1.138 192.168.1.1 1316 Fragmented IP protocol (proto=Encap Security Payload 50, off=0, ID=6206) [Reassembled in #2] 2 0x6206 2002::10 2002::11 88 IPv6 fragment (off=0 more=y ident=0x1f440778 nxt=50) 3 0x0000 2002::10 2002::11 248 ICMPv6 Echo (ping) request Signed-off-by: Lina Wang Signed-off-by: Steffen Klassert --- net/ipv6/xfrm6_output.c | 16 ++++++++++++++++ net/xfrm/xfrm_interface.c | 5 ++++- 2 files changed, 20 insertions(+), 1 deletion(-) (limited to 'net/ipv6') diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index d0d280077721..ad07904642ca 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -45,6 +45,19 @@ static int __xfrm6_output_finish(struct net *net, struct sock *sk, struct sk_buf return xfrm_output(sk, skb); } +static int xfrm6_noneed_fragment(struct sk_buff *skb) +{ + struct frag_hdr *fh; + u8 prevhdr = ipv6_hdr(skb)->nexthdr; + + if (prevhdr != NEXTHDR_FRAGMENT) + return 0; + fh = (struct frag_hdr *)(skb->data + sizeof(struct ipv6hdr)); + if (fh->nexthdr == NEXTHDR_ESP || fh->nexthdr == NEXTHDR_AUTH) + return 1; + return 0; +} + static int __xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); @@ -73,6 +86,9 @@ static int __xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb) xfrm6_local_rxpmtu(skb, mtu); kfree_skb(skb); return -EMSGSIZE; + } else if (toobig && xfrm6_noneed_fragment(skb)) { + skb->ignore_df = 1; + goto skip_frag; } else if (!skb->ignore_df && toobig && skb->sk) { xfrm_local_error(skb, mtu); kfree_skb(skb); diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c index 4e3c62d1ad9e..1e8b26eecb3f 100644 --- a/net/xfrm/xfrm_interface.c +++ b/net/xfrm/xfrm_interface.c @@ -304,7 +304,10 @@ xfrmi_xmit2(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; - icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + if (skb->len > 1280) + icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + else + goto xmit; } else { if (!(ip_hdr(skb)->frag_off & htons(IP_DF))) goto xmit; -- cgit v1.2.3-70-g09d2 From 2d3916f3189172d5c69d33065c3c21119fe539fc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 3 Mar 2022 09:37:28 -0800 Subject: ipv6: fix skb drops in igmp6_event_query() and igmp6_event_report() While investigating on why a synchronize_net() has been added recently in ipv6_mc_down(), I found that igmp6_event_query() and igmp6_event_report() might drop skbs in some cases. Discussion about removing synchronize_net() from ipv6_mc_down() will happen in a different thread. Fixes: f185de28d9ae ("mld: add new workqueues for process mld events") Signed-off-by: Eric Dumazet Cc: Taehee Yoo Cc: Cong Wang Cc: David Ahern Link: https://lore.kernel.org/r/20220303173728.937869-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- include/net/ndisc.h | 4 ++-- net/ipv6/mcast.c | 32 ++++++++++++-------------------- 2 files changed, 14 insertions(+), 22 deletions(-) (limited to 'net/ipv6') diff --git a/include/net/ndisc.h b/include/net/ndisc.h index 53cb8de0e589..47ffb360ddfa 100644 --- a/include/net/ndisc.h +++ b/include/net/ndisc.h @@ -475,9 +475,9 @@ int igmp6_late_init(void); void igmp6_cleanup(void); void igmp6_late_cleanup(void); -int igmp6_event_query(struct sk_buff *skb); +void igmp6_event_query(struct sk_buff *skb); -int igmp6_event_report(struct sk_buff *skb); +void igmp6_event_report(struct sk_buff *skb); #ifdef CONFIG_SYSCTL diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index a8861db52c18..909f937befd7 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -1371,27 +1371,23 @@ static void mld_process_v2(struct inet6_dev *idev, struct mld2_query *mld, } /* called with rcu_read_lock() */ -int igmp6_event_query(struct sk_buff *skb) +void igmp6_event_query(struct sk_buff *skb) { struct inet6_dev *idev = __in6_dev_get(skb->dev); - if (!idev) - return -EINVAL; - - if (idev->dead) { - kfree_skb(skb); - return -ENODEV; - } + if (!idev || idev->dead) + goto out; spin_lock_bh(&idev->mc_query_lock); if (skb_queue_len(&idev->mc_query_queue) < MLD_MAX_SKBS) { __skb_queue_tail(&idev->mc_query_queue, skb); if (!mod_delayed_work(mld_wq, &idev->mc_query_work, 0)) in6_dev_hold(idev); + skb = NULL; } spin_unlock_bh(&idev->mc_query_lock); - - return 0; +out: + kfree_skb(skb); } static void __mld_query_work(struct sk_buff *skb) @@ -1542,27 +1538,23 @@ static void mld_query_work(struct work_struct *work) } /* called with rcu_read_lock() */ -int igmp6_event_report(struct sk_buff *skb) +void igmp6_event_report(struct sk_buff *skb) { struct inet6_dev *idev = __in6_dev_get(skb->dev); - if (!idev) - return -EINVAL; - - if (idev->dead) { - kfree_skb(skb); - return -ENODEV; - } + if (!idev || idev->dead) + goto out; spin_lock_bh(&idev->mc_report_lock); if (skb_queue_len(&idev->mc_report_queue) < MLD_MAX_SKBS) { __skb_queue_tail(&idev->mc_report_queue, skb); if (!mod_delayed_work(mld_wq, &idev->mc_report_work, 0)) in6_dev_hold(idev); + skb = NULL; } spin_unlock_bh(&idev->mc_report_lock); - - return 0; +out: + kfree_skb(skb); } static void __mld_report_work(struct sk_buff *skb) -- cgit v1.2.3-70-g09d2 From ebe48d368e97d007bfeb76fcb065d6cfc4c96645 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Mon, 7 Mar 2022 13:11:39 +0100 Subject: esp: Fix possible buffer overflow in ESP transformation The maximum message size that can be send is bigger than the maximum site that skb_page_frag_refill can allocate. So it is possible to write beyond the allocated buffer. Fix this by doing a fallback to COW in that case. v2: Avoid get get_order() costs as suggested by Linus Torvalds. Fixes: cac2661c53f3 ("esp4: Avoid skb_cow_data whenever possible") Fixes: 03e2a30f6a27 ("esp6: Avoid skb_cow_data whenever possible") Reported-by: valis Signed-off-by: Steffen Klassert --- include/net/esp.h | 2 ++ net/ipv4/esp4.c | 5 +++++ net/ipv6/esp6.c | 5 +++++ 3 files changed, 12 insertions(+) (limited to 'net/ipv6') diff --git a/include/net/esp.h b/include/net/esp.h index 9c5637d41d95..90cd02ff77ef 100644 --- a/include/net/esp.h +++ b/include/net/esp.h @@ -4,6 +4,8 @@ #include +#define ESP_SKB_FRAG_MAXSIZE (PAGE_SIZE << SKB_FRAG_PAGE_ORDER) + struct ip_esp_hdr; static inline struct ip_esp_hdr *ip_esp_hdr(const struct sk_buff *skb) diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index e1b1d080e908..70e6c87fbe3d 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -446,6 +446,7 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * struct page *page; struct sk_buff *trailer; int tailen = esp->tailen; + unsigned int allocsz; /* this is non-NULL only with TCP/UDP Encapsulation */ if (x->encap) { @@ -455,6 +456,10 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * return err; } + allocsz = ALIGN(skb->data_len + tailen, L1_CACHE_BYTES); + if (allocsz > ESP_SKB_FRAG_MAXSIZE) + goto cow; + if (!skb_cloned(skb)) { if (tailen <= skb_tailroom(skb)) { nfrags = 1; diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 7591160edce1..b0ffbcd5432d 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -482,6 +482,7 @@ int esp6_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info struct page *page; struct sk_buff *trailer; int tailen = esp->tailen; + unsigned int allocsz; if (x->encap) { int err = esp6_output_encap(x, skb, esp); @@ -490,6 +491,10 @@ int esp6_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info return err; } + allocsz = ALIGN(skb->data_len + tailen, L1_CACHE_BYTES); + if (allocsz > ESP_SKB_FRAG_MAXSIZE) + goto cow; + if (!skb_cloned(skb)) { if (tailen <= skb_tailroom(skb)) { nfrags = 1; -- cgit v1.2.3-70-g09d2 From 053c8fdf2c930efdff5496960842bbb5c34ad43a Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Mon, 7 Mar 2022 13:11:40 +0100 Subject: esp: Fix BEET mode inter address family tunneling on GSO The xfrm{4,6}_beet_gso_segment() functions did not correctly set the SKB_GSO_IPXIP4 and SKB_GSO_IPXIP6 gso types for the address family tunneling case. Fix this by setting these gso types. Fixes: 384a46ea7bdc7 ("esp4: add gso_segment for esp4 beet mode") Fixes: 7f9e40eb18a99 ("esp6: add gso_segment for esp6 beet mode") Signed-off-by: Steffen Klassert --- net/ipv4/esp4_offload.c | 3 +++ net/ipv6/esp6_offload.c | 3 +++ 2 files changed, 6 insertions(+) (limited to 'net/ipv6') diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index d87f02a6e934..146d4d54830c 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -160,6 +160,9 @@ static struct sk_buff *xfrm4_beet_gso_segment(struct xfrm_state *x, skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4; } + if (proto == IPPROTO_IPV6) + skb_shinfo(skb)->gso_type |= SKB_GSO_IPXIP4; + __skb_pull(skb, skb_transport_offset(skb)); ops = rcu_dereference(inet_offloads[proto]); if (likely(ops && ops->callbacks.gso_segment)) diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c index ba5e81cd569c..e61172d50817 100644 --- a/net/ipv6/esp6_offload.c +++ b/net/ipv6/esp6_offload.c @@ -199,6 +199,9 @@ static struct sk_buff *xfrm6_beet_gso_segment(struct xfrm_state *x, ipv6_skip_exthdr(skb, 0, &proto, &frag); } + if (proto == IPPROTO_IPIP) + skb_shinfo(skb)->gso_type |= SKB_GSO_IPXIP6; + __skb_pull(skb, skb_transport_offset(skb)); ops = rcu_dereference(inet6_offloads[proto]); if (likely(ops && ops->callbacks.gso_segment)) -- cgit v1.2.3-70-g09d2 From 23c7f8d7989e1646aac82f75761b7648c355cb8a Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Mon, 7 Mar 2022 13:11:41 +0100 Subject: net: Fix esp GSO on inter address family tunnels. The esp tunnel GSO handlers use skb_mac_gso_segment to push the inner packet to the segmentation handlers. However, skb_mac_gso_segment takes the Ethernet Protocol ID from 'skb->protocol' which is wrong for inter address family tunnels. We fix this by introducing a new skb_eth_gso_segment function. This function can be used if it is necessary to pass the Ethernet Protocol ID directly to the segmentation handler. First users of this function will be the esp4 and esp6 tunnel segmentation handlers. Fixes: c35fe4106b92 ("xfrm: Add mode handlers for IPsec on layer 2") Signed-off-by: Steffen Klassert --- include/linux/netdevice.h | 2 ++ net/core/gro.c | 25 +++++++++++++++++++++++++ net/ipv4/esp4_offload.c | 3 +-- net/ipv6/esp6_offload.c | 3 +-- 4 files changed, 29 insertions(+), 4 deletions(-) (limited to 'net/ipv6') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 8b5a314db167..f53ea7038441 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4602,6 +4602,8 @@ int skb_csum_hwoffload_help(struct sk_buff *skb, struct sk_buff *__skb_gso_segment(struct sk_buff *skb, netdev_features_t features, bool tx_path); +struct sk_buff *skb_eth_gso_segment(struct sk_buff *skb, + netdev_features_t features, __be16 type); struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb, netdev_features_t features); diff --git a/net/core/gro.c b/net/core/gro.c index a11b286d1495..b7d2b0dc59a2 100644 --- a/net/core/gro.c +++ b/net/core/gro.c @@ -92,6 +92,31 @@ void dev_remove_offload(struct packet_offload *po) } EXPORT_SYMBOL(dev_remove_offload); +/** + * skb_eth_gso_segment - segmentation handler for ethernet protocols. + * @skb: buffer to segment + * @features: features for the output path (see dev->features) + * @type: Ethernet Protocol ID + */ +struct sk_buff *skb_eth_gso_segment(struct sk_buff *skb, + netdev_features_t features, __be16 type) +{ + struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); + struct packet_offload *ptype; + + rcu_read_lock(); + list_for_each_entry_rcu(ptype, &offload_base, list) { + if (ptype->type == type && ptype->callbacks.gso_segment) { + segs = ptype->callbacks.gso_segment(skb, features); + break; + } + } + rcu_read_unlock(); + + return segs; +} +EXPORT_SYMBOL(skb_eth_gso_segment); + /** * skb_mac_gso_segment - mac layer segmentation handler. * @skb: buffer to segment diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index 146d4d54830c..935026f4c807 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -110,8 +110,7 @@ static struct sk_buff *xfrm4_tunnel_gso_segment(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features) { - __skb_push(skb, skb->mac_len); - return skb_mac_gso_segment(skb, features); + return skb_eth_gso_segment(skb, features, htons(ETH_P_IP)); } static struct sk_buff *xfrm4_transport_gso_segment(struct xfrm_state *x, diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c index e61172d50817..3a293838a91d 100644 --- a/net/ipv6/esp6_offload.c +++ b/net/ipv6/esp6_offload.c @@ -145,8 +145,7 @@ static struct sk_buff *xfrm6_tunnel_gso_segment(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features) { - __skb_push(skb, skb->mac_len); - return skb_mac_gso_segment(skb, features); + return skb_eth_gso_segment(skb, features, htons(ETH_P_IPV6)); } static struct sk_buff *xfrm6_transport_gso_segment(struct xfrm_state *x, -- cgit v1.2.3-70-g09d2