summaryrefslogtreecommitdiff
path: root/net/ipv6/ip6_output.c
diff options
context:
space:
mode:
authorJakub Kicinski <kuba@kernel.org>2023-10-25 18:04:31 -0700
committerJakub Kicinski <kuba@kernel.org>2023-10-25 18:04:31 -0700
commitd8c4ef76d7ccd478f8c9a3b7de1ba0b25fdffbee (patch)
tree9283e779aab9e7621bb9ae824dcadb28f699096e /net/ipv6/ip6_output.c
parent8846f9a04b10b7f61214425409838d764df7080d (diff)
parent03d6c848bfb406e9ef6d9846d759e97beaeea113 (diff)
Merge branch 'ipv6-avoid-atomic-fragment-on-gso-output'
Yan Zhai says: ==================== ipv6: avoid atomic fragment on GSO output When the ipv6 stack output a GSO packet, if its gso_size is larger than dst MTU, then all segments would be fragmented. However, it is possible for a GSO packet to have a trailing segment with smaller actual size than both gso_size as well as the MTU, which leads to an "atomic fragment". Atomic fragments are considered harmful in RFC-8021. An Existing report from APNIC also shows that atomic fragments are more likely to be dropped even it is equivalent to a no-op [1]. The series contains following changes: * drop feature RTAX_FEATURE_ALLFRAG, which has been broken. This helps simplifying other changes in this set. * refactor __ip6_finish_output code to separate GSO and non-GSO packet processing, mirroring IPv4 side logic. * avoid generating atomic fragment on GSO packets. Link: https://www.potaroo.net/presentations/2022-03-01-ipv6-frag.pdf [1] V4: https://lore.kernel.org/netdev/cover.1698114636.git.yan@cloudflare.com/ V3: https://lore.kernel.org/netdev/cover.1697779681.git.yan@cloudflare.com/ V2: https://lore.kernel.org/netdev/ZS1%2Fqtr0dZJ35VII@debian.debian/ ==================== Link: https://lore.kernel.org/r/cover.1698156966.git.yan@cloudflare.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Diffstat (limited to 'net/ipv6/ip6_output.c')
-rw-r--r--net/ipv6/ip6_output.c45
1 files changed, 24 insertions, 21 deletions
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 3c7de89d6755..a722a43dd668 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -164,7 +164,13 @@ ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
int err;
skb_mark_not_on_list(segs);
- err = ip6_fragment(net, sk, segs, ip6_finish_output2);
+ /* Last GSO segment can be smaller than gso_size (and MTU).
+ * Adding a fragment header would produce an "atomic fragment",
+ * which is considered harmful (RFC-8021). Avoid that.
+ */
+ err = segs->len > mtu ?
+ ip6_fragment(net, sk, segs, ip6_finish_output2) :
+ ip6_finish_output2(net, sk, segs);
if (err && ret == 0)
ret = err;
}
@@ -172,6 +178,16 @@ ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
return ret;
}
+static int ip6_finish_output_gso(struct net *net, struct sock *sk,
+ struct sk_buff *skb, unsigned int mtu)
+{
+ if (!(IP6CB(skb)->flags & IP6SKB_FAKEJUMBO) &&
+ !skb_gso_validate_network_len(skb, mtu))
+ return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
+
+ return ip6_finish_output2(net, sk, skb);
+}
+
static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
unsigned int mtu;
@@ -185,17 +201,14 @@ static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff
#endif
mtu = ip6_skb_dst_mtu(skb);
- if (skb_is_gso(skb) &&
- !(IP6CB(skb)->flags & IP6SKB_FAKEJUMBO) &&
- !skb_gso_validate_network_len(skb, mtu))
- return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
+ if (skb_is_gso(skb))
+ return ip6_finish_output_gso(net, sk, skb, mtu);
- if ((skb->len > mtu && !skb_is_gso(skb)) ||
- dst_allfrag(skb_dst(skb)) ||
+ if (skb->len > mtu ||
(IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
return ip6_fragment(net, sk, skb, ip6_finish_output2);
- else
- return ip6_finish_output2(net, sk, skb);
+
+ return ip6_finish_output2(net, sk, skb);
}
static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
@@ -1017,9 +1030,6 @@ slow_path:
return err;
fail_toobig:
- if (skb->sk && dst_allfrag(skb_dst(skb)))
- sk_gso_disable(skb->sk);
-
icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
err = -EMSGSIZE;
@@ -1384,10 +1394,7 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
cork->base.mark = ipc6->sockc.mark;
sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);
- if (dst_allfrag(xfrm_dst_path(&rt->dst)))
- cork->base.flags |= IPCORK_ALLFRAG;
cork->base.length = 0;
-
cork->base.transmit_time = ipc6->sockc.transmit_time;
return 0;
@@ -1444,8 +1451,6 @@ static int __ip6_append_data(struct sock *sk,
headersize = sizeof(struct ipv6hdr) +
(opt ? opt->opt_flen + opt->opt_nflen : 0) +
- (dst_allfrag(&rt->dst) ?
- sizeof(struct frag_hdr) : 0) +
rt->rt6i_nfheader_len;
if (mtu <= fragheaderlen ||
@@ -1555,7 +1560,7 @@ emsgsize:
while (length > 0) {
/* Check if the remaining data fits into current packet. */
- copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
+ copy = (cork->length <= mtu ? mtu : maxfraglen) - skb->len;
if (copy < length)
copy = maxfraglen - skb->len;
@@ -1586,7 +1591,7 @@ alloc_new_skb:
*/
datalen = length + fraggap;
- if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
+ if (datalen > (cork->length <= mtu ? mtu : maxfraglen) - fragheaderlen)
datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
fraglen = datalen + fragheaderlen;
pagedlen = 0;
@@ -1835,7 +1840,6 @@ static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork)
struct dst_entry *dst = cork->base.dst;
cork->base.dst = NULL;
- cork->base.flags &= ~IPCORK_ALLFRAG;
skb_dst_set(skb, dst);
}
@@ -1856,7 +1860,6 @@ static void ip6_cork_release(struct inet_cork_full *cork,
if (cork->base.dst) {
dst_release(cork->base.dst);
cork->base.dst = NULL;
- cork->base.flags &= ~IPCORK_ALLFRAG;
}
}