diff options
author | David S. Miller <davem@davemloft.net> | 2017-01-09 15:49:13 -0500 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2017-01-09 15:49:13 -0500 |
commit | 9f2f27a9a518cdcd724bd360a1e25a10e7cd2f82 (patch) | |
tree | 23bd866126b5115cf4a14541f7700117a38efc29 | |
parent | aaa9c1071da4cb231fca0774b01ee5792aa60f8a (diff) | |
parent | 7ba91ecb16824f74ba4fcbc4e88cd4d24a839b25 (diff) |
Merge branch 'icmp-reply-optimize'
Jesper Dangaard Brouer says:
====================
net: optimize ICMP-reply code path
This patchset is optimizing the ICMP-reply code path, for ICMP packets
that gets rate limited. A remote party can easily trigger this code
path by sending packets to port number with no listening service.
Generally the patchset moves the sysctl_icmp_msgs_per_sec ratelimit
checking to earlier in the code path and removes an allocation.
Use-case: The specific case I experienced this being a bottleneck is,
sending UDP packets to a port with no listener, which obviously result
in kernel replying with ICMP Destination Unreachable (type:3), Port
Unreachable (code:3), which cause the bottleneck.
After Eric and Paolo optimized the UDP socket code, the kernels PPS
processing capabilities is lower for no-listen ports, than normal UDP
sockets. This is bad for capacity planning when restarting a service.
UDP no-listen benchmark 8xCPUs using pktgen_sample04_many_flows.sh:
Baseline: 6.6 Mpps
Patch: 14.7 Mpps
Driver mlx5 at 50Gbit/s.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- | net/ipv4/icmp.c | 125 | ||||
-rw-r--r-- | net/ipv6/icmp.c | 68 |
2 files changed, 123 insertions, 70 deletions
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 0777ea949223..fc310db2708b 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -209,19 +209,17 @@ static struct sock *icmp_sk(struct net *net) return *this_cpu_ptr(net->ipv4.icmp_sk); } +/* Called with BH disabled */ static inline struct sock *icmp_xmit_lock(struct net *net) { struct sock *sk; - local_bh_disable(); - sk = icmp_sk(net); if (unlikely(!spin_trylock(&sk->sk_lock.slock))) { /* This can happen if the output path signals a * dst_link_failure() for an outgoing ICMP packet. */ - local_bh_enable(); return NULL; } return sk; @@ -229,7 +227,7 @@ static inline struct sock *icmp_xmit_lock(struct net *net) static inline void icmp_xmit_unlock(struct sock *sk) { - spin_unlock_bh(&sk->sk_lock.slock); + spin_unlock(&sk->sk_lock.slock); } int sysctl_icmp_msgs_per_sec __read_mostly = 1000; @@ -282,6 +280,33 @@ bool icmp_global_allow(void) } EXPORT_SYMBOL(icmp_global_allow); +static bool icmpv4_mask_allow(struct net *net, int type, int code) +{ + if (type > NR_ICMP_TYPES) + return true; + + /* Don't limit PMTU discovery. */ + if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) + return true; + + /* Limit if icmp type is enabled in ratemask. */ + if (!((1 << type) & net->ipv4.sysctl_icmp_ratemask)) + return true; + + return false; +} + +static bool icmpv4_global_allow(struct net *net, int type, int code) +{ + if (icmpv4_mask_allow(net, type, code)) + return true; + + if (icmp_global_allow()) + return true; + + return false; +} + /* * Send an ICMP frame. */ @@ -290,34 +315,22 @@ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt, struct flowi4 *fl4, int type, int code) { struct dst_entry *dst = &rt->dst; + struct inet_peer *peer; bool rc = true; + int vif; - if (type > NR_ICMP_TYPES) - goto out; - - /* Don't limit PMTU discovery. */ - if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) + if (icmpv4_mask_allow(net, type, code)) goto out; /* No rate limit on loopback */ if (dst->dev && (dst->dev->flags&IFF_LOOPBACK)) goto out; - /* Limit if icmp type is enabled in ratemask. */ - if (!((1 << type) & net->ipv4.sysctl_icmp_ratemask)) - goto out; - - rc = false; - if (icmp_global_allow()) { - int vif = l3mdev_master_ifindex(dst->dev); - struct inet_peer *peer; - - peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, vif, 1); - rc = inet_peer_xrlim_allow(peer, - net->ipv4.sysctl_icmp_ratelimit); - if (peer) - inet_putpeer(peer); - } + vif = l3mdev_master_ifindex(dst->dev); + peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, vif, 1); + rc = inet_peer_xrlim_allow(peer, net->ipv4.sysctl_icmp_ratelimit); + if (peer) + inet_putpeer(peer); out: return rc; } @@ -396,13 +409,22 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) struct inet_sock *inet; __be32 daddr, saddr; u32 mark = IP4_REPLY_MARK(net, skb->mark); + int type = icmp_param->data.icmph.type; + int code = icmp_param->data.icmph.code; if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb)) return; + /* Needed by both icmp_global_allow and icmp_xmit_lock */ + local_bh_disable(); + + /* global icmp_msgs_per_sec */ + if (!icmpv4_global_allow(net, type, code)) + goto out_bh_enable; + sk = icmp_xmit_lock(net); if (!sk) - return; + goto out_bh_enable; inet = inet_sk(sk); icmp_param->data.icmph.checksum = 0; @@ -433,12 +455,13 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) rt = ip_route_output_key(net, &fl4); if (IS_ERR(rt)) goto out_unlock; - if (icmpv4_xrlim_allow(net, rt, &fl4, icmp_param->data.icmph.type, - icmp_param->data.icmph.code)) + if (icmpv4_xrlim_allow(net, rt, &fl4, type, code)) icmp_push_reply(icmp_param, &fl4, &ipc, &rt); ip_rt_put(rt); out_unlock: icmp_xmit_unlock(sk); +out_bh_enable: + local_bh_enable(); } #ifdef CONFIG_IP_ROUTE_MULTIPATH @@ -571,7 +594,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) { struct iphdr *iph; int room; - struct icmp_bxm *icmp_param; + struct icmp_bxm icmp_param; struct rtable *rt = skb_rtable(skb_in); struct ipcm_cookie ipc; struct flowi4 fl4; @@ -648,13 +671,16 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) } } - icmp_param = kmalloc(sizeof(*icmp_param), GFP_ATOMIC); - if (!icmp_param) - return; + /* Needed by both icmp_global_allow and icmp_xmit_lock */ + local_bh_disable(); + + /* Check global sysctl_icmp_msgs_per_sec ratelimit */ + if (!icmpv4_global_allow(net, type, code)) + goto out_bh_enable; sk = icmp_xmit_lock(net); if (!sk) - goto out_free; + goto out_bh_enable; /* * Construct source address and options. @@ -681,7 +707,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) iph->tos; mark = IP4_REPLY_MARK(net, skb_in->mark); - if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb_in)) + if (ip_options_echo(&icmp_param.replyopts.opt.opt, skb_in)) goto out_unlock; @@ -689,25 +715,26 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) * Prepare data for ICMP header. */ - icmp_param->data.icmph.type = type; - icmp_param->data.icmph.code = code; - icmp_param->data.icmph.un.gateway = info; - icmp_param->data.icmph.checksum = 0; - icmp_param->skb = skb_in; - icmp_param->offset = skb_network_offset(skb_in); + icmp_param.data.icmph.type = type; + icmp_param.data.icmph.code = code; + icmp_param.data.icmph.un.gateway = info; + icmp_param.data.icmph.checksum = 0; + icmp_param.skb = skb_in; + icmp_param.offset = skb_network_offset(skb_in); inet_sk(sk)->tos = tos; sk->sk_mark = mark; ipc.addr = iph->saddr; - ipc.opt = &icmp_param->replyopts.opt; + ipc.opt = &icmp_param.replyopts.opt; ipc.tx_flags = 0; ipc.ttl = 0; ipc.tos = -1; rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos, mark, - type, code, icmp_param); + type, code, &icmp_param); if (IS_ERR(rt)) goto out_unlock; + /* peer icmp_ratelimit */ if (!icmpv4_xrlim_allow(net, rt, &fl4, type, code)) goto ende; @@ -716,21 +743,21 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) room = dst_mtu(&rt->dst); if (room > 576) room = 576; - room -= sizeof(struct iphdr) + icmp_param->replyopts.opt.opt.optlen; + room -= sizeof(struct iphdr) + icmp_param.replyopts.opt.opt.optlen; room -= sizeof(struct icmphdr); - icmp_param->data_len = skb_in->len - icmp_param->offset; - if (icmp_param->data_len > room) - icmp_param->data_len = room; - icmp_param->head_len = sizeof(struct icmphdr); + icmp_param.data_len = skb_in->len - icmp_param.offset; + if (icmp_param.data_len > room) + icmp_param.data_len = room; + icmp_param.head_len = sizeof(struct icmphdr); - icmp_push_reply(icmp_param, &fl4, &ipc, &rt); + icmp_push_reply(&icmp_param, &fl4, &ipc, &rt); ende: ip_rt_put(rt); out_unlock: icmp_xmit_unlock(sk); -out_free: - kfree(icmp_param); +out_bh_enable: + local_bh_enable(); out:; } EXPORT_SYMBOL(icmp_send); diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 3036f665e6c8..230b5aac9f03 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -110,19 +110,17 @@ static const struct inet6_protocol icmpv6_protocol = { .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, }; +/* Called with BH disabled */ static __inline__ struct sock *icmpv6_xmit_lock(struct net *net) { struct sock *sk; - local_bh_disable(); - sk = icmpv6_sk(net); if (unlikely(!spin_trylock(&sk->sk_lock.slock))) { /* This can happen if the output path (f.e. SIT or * ip6ip6 tunnel) signals dst_link_failure() for an * outgoing ICMP6 packet. */ - local_bh_enable(); return NULL; } return sk; @@ -130,7 +128,7 @@ static __inline__ struct sock *icmpv6_xmit_lock(struct net *net) static __inline__ void icmpv6_xmit_unlock(struct sock *sk) { - spin_unlock_bh(&sk->sk_lock.slock); + spin_unlock(&sk->sk_lock.slock); } /* @@ -168,6 +166,30 @@ static bool is_ineligible(const struct sk_buff *skb) return false; } +static bool icmpv6_mask_allow(int type) +{ + /* Informational messages are not limited. */ + if (type & ICMPV6_INFOMSG_MASK) + return true; + + /* Do not limit pmtu discovery, it would break it. */ + if (type == ICMPV6_PKT_TOOBIG) + return true; + + return false; +} + +static bool icmpv6_global_allow(int type) +{ + if (icmpv6_mask_allow(type)) + return true; + + if (icmp_global_allow()) + return true; + + return false; +} + /* * Check the ICMP output rate limit */ @@ -178,12 +200,7 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type, struct dst_entry *dst; bool res = false; - /* Informational messages are not limited. */ - if (type & ICMPV6_INFOMSG_MASK) - return true; - - /* Do not limit pmtu discovery, it would break it. */ - if (type == ICMPV6_PKT_TOOBIG) + if (icmpv6_mask_allow(type)) return true; /* @@ -200,20 +217,16 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type, } else { struct rt6_info *rt = (struct rt6_info *)dst; int tmo = net->ipv6.sysctl.icmpv6_time; + struct inet_peer *peer; /* Give more bandwidth to wider prefixes. */ if (rt->rt6i_dst.plen < 128) tmo >>= ((128 - rt->rt6i_dst.plen)>>5); - if (icmp_global_allow()) { - struct inet_peer *peer; - - peer = inet_getpeer_v6(net->ipv6.peers, - &fl6->daddr, 1); - res = inet_peer_xrlim_allow(peer, tmo); - if (peer) - inet_putpeer(peer); - } + peer = inet_getpeer_v6(net->ipv6.peers, &fl6->daddr, 1); + res = inet_peer_xrlim_allow(peer, tmo); + if (peer) + inet_putpeer(peer); } dst_release(dst); return res; @@ -474,6 +487,13 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, return; } + /* Needed by both icmp_global_allow and icmpv6_xmit_lock */ + local_bh_disable(); + + /* Check global sysctl_icmp_msgs_per_sec ratelimit */ + if (!icmpv6_global_allow(type)) + goto out_bh_enable; + mip6_addr_swap(skb); memset(&fl6, 0, sizeof(fl6)); @@ -492,7 +512,8 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, sk = icmpv6_xmit_lock(net); if (!sk) - return; + goto out_bh_enable; + sk->sk_mark = mark; np = inet6_sk(sk); @@ -552,6 +573,8 @@ out_dst_release: dst_release(dst); out: icmpv6_xmit_unlock(sk); +out_bh_enable: + local_bh_enable(); } /* Slightly more convenient version of icmp6_send. @@ -665,9 +688,10 @@ static void icmpv6_echo_reply(struct sk_buff *skb) fl6.flowi6_uid = sock_net_uid(net, NULL); security_skb_classify_flow(skb, flowi6_to_flowi(&fl6)); + local_bh_disable(); sk = icmpv6_xmit_lock(net); if (!sk) - return; + goto out_bh_enable; sk->sk_mark = mark; np = inet6_sk(sk); @@ -709,6 +733,8 @@ static void icmpv6_echo_reply(struct sk_buff *skb) dst_release(dst); out: icmpv6_xmit_unlock(sk); +out_bh_enable: + local_bh_enable(); } void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info) |