diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2015-02-05 11:23:45 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2015-02-05 11:23:45 -0800 |
commit | 9d82f5eb3376cbae96ad36a063a9390de1694546 (patch) | |
tree | d52daee3296d28455aff25c98b23fffab5282cd8 /net/ipv4 | |
parent | 14365ea2b868c96e18da73a3f454c7bcdb0627c5 (diff) | |
parent | a409caecb2e17fc475533738dd1c69b32e13fe09 (diff) |
MMerge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net
Pull networking fixes from David Miller:
1) Stretch ACKs can kill performance with Reno and CUBIC congestion
control, largely due to LRO and GRO. Fix from Neal Cardwell.
2) Fix userland breakage because we accidently emit zero length netlink
messages from the bridging code. From Roopa Prabhu.
3) Carry handling in generic csum_tcpudp_nofold is broken, fix from
Karl Beldan.
4) Remove bogus dev_set_net() calls from CAIF driver, from Nicolas
Dichtel.
5) Make sure PPP deflation never returns a length greater then the
output buffer, otherwise we overflow and trigger skb_over_panic().
Fix from Florian Westphal.
6) COSA driver needs VIRT_TO_BUS Kconfig dependencies, from Arnd
Bergmann.
7) Don't increase route cached MTU on datagram too big ICMPs. From Li
Wei.
8) Fix error path leaks in nf_tables, from Pablo Neira Ayuso.
9) Fix bitmask handling regression in netlink that broke things like
acpi userland tools. From Pablo Neira Ayuso.
10) Wrong header pointer passed to param_type2af() in SCTP code, from
Saran Maruti Ramanara.
11) Stacked vlans not handled correctly by vlan_get_protocol(), from
Toshiaki Makita.
12) Add missing DMA memory barrier to xgene driver, from Iyappan
Subramanian.
13) Fix crash in rate estimators, from Eric Dumazet.
14) We've been adding various workarounds, one after another, for the
change which added the per-net tcp_sock. It was meant to reduce
socket contention but added lots of problems.
Reduce this instead to a proper per-cpu socket and that rids us of
all the daemons.
From Eric Dumazet.
15) Fix memory corruption and OOPS in mlx4 driver, from Jack
Morgenstein.
16) When we disabled UFO in the virtio_net device, it introduces some
serious performance regressions. The orignal problem was IPV6
fragment ID generation, so fix that properly instead. From Vlad
Yasevich.
17) sr9700 driver build breaks on xtensa because it defines macros with
the same name as those used by the arch code. Use more unique
names. From Chen Gang.
18) Fix endianness in new virio 1.0 mode of the vhost net driver, from
Michael S Tsirkin.
19) Several sysctls were setting the maxlen attribute incorrectly, from
Sasha Levin.
20) Don't accept an FQ scheduler quantum of zero, that leads to crashes.
From Kenneth Klette Jonassen.
21) Fix dumping of non-existing actions in the packet scheduler
classifier. From Ignacy Gawędzki.
22) Return the write work_done value when doing TX work in the qlcnic
driver.
23) ip6gre_err accesses the info field with the wrong endianness, from
Sabrina Dubroca.
* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net: (54 commits)
sit: fix some __be16/u16 mismatches
ipv6: fix sparse errors in ip6_make_flowlabel()
net: remove some sparse warnings
flow_keys: n_proto type should be __be16
ip6_gre: fix endianness errors in ip6gre_err
qlcnic: Fix NAPI poll routine for Tx completion
amd-xgbe: Set RSS enablement based on hardware features
amd-xgbe: Adjust for zero-based traffic class count
cls_api.c: Fix dumping of non-existing actions' stats.
pkt_sched: fq: avoid hang when quantum 0
net: rds: use correct size for max unacked packets and bytes
vhost/net: fix up num_buffers endian-ness
gianfar: correct the bad expression while writing bit-pattern
net: usb: sr9700: Use 'SR_' prefix for the common register macros
Revert "drivers/net: Disable UFO through virtio"
Revert "drivers/net, ipv6: Select IPv6 fragment idents for virtio UFO packets"
ipv6: Select fragment id during UFO segmentation if not set.
xen-netback: stop the guest rx thread after a fatal error
net/mlx4_core: Fix kernel Oops (mem corruption) when working with more than 80 VFs
isdn: off by one in connect_res()
...
Diffstat (limited to 'net/ipv4')
-rw-r--r-- | net/ipv4/ip_output.c | 29 | ||||
-rw-r--r-- | net/ipv4/route.c | 3 | ||||
-rw-r--r-- | net/ipv4/tcp_bic.c | 2 | ||||
-rw-r--r-- | net/ipv4/tcp_cong.c | 32 | ||||
-rw-r--r-- | net/ipv4/tcp_cubic.c | 39 | ||||
-rw-r--r-- | net/ipv4/tcp_ipv4.c | 37 | ||||
-rw-r--r-- | net/ipv4/tcp_scalable.c | 3 | ||||
-rw-r--r-- | net/ipv4/tcp_veno.c | 2 | ||||
-rw-r--r-- | net/ipv4/tcp_yeah.c | 2 |
9 files changed, 80 insertions, 69 deletions
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index b50861b22b6b..c373c0708d97 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1506,23 +1506,8 @@ static int ip_reply_glue_bits(void *dptr, char *to, int offset, /* * Generic function to send a packet as reply to another packet. * Used to send some TCP resets/acks so far. - * - * Use a fake percpu inet socket to avoid false sharing and contention. */ -static DEFINE_PER_CPU(struct inet_sock, unicast_sock) = { - .sk = { - .__sk_common = { - .skc_refcnt = ATOMIC_INIT(1), - }, - .sk_wmem_alloc = ATOMIC_INIT(1), - .sk_allocation = GFP_ATOMIC, - .sk_flags = (1UL << SOCK_USE_WRITE_QUEUE), - }, - .pmtudisc = IP_PMTUDISC_WANT, - .uc_ttl = -1, -}; - -void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, +void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, const struct ip_options *sopt, __be32 daddr, __be32 saddr, const struct ip_reply_arg *arg, @@ -1532,9 +1517,8 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, struct ipcm_cookie ipc; struct flowi4 fl4; struct rtable *rt = skb_rtable(skb); + struct net *net = sock_net(sk); struct sk_buff *nskb; - struct sock *sk; - struct inet_sock *inet; int err; if (__ip_options_echo(&replyopts.opt.opt, skb, sopt)) @@ -1565,15 +1549,11 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, if (IS_ERR(rt)) return; - inet = &get_cpu_var(unicast_sock); + inet_sk(sk)->tos = arg->tos; - inet->tos = arg->tos; - sk = &inet->sk; sk->sk_priority = skb->priority; sk->sk_protocol = ip_hdr(skb)->protocol; sk->sk_bound_dev_if = arg->bound_dev_if; - sock_net_set(sk, net); - __skb_queue_head_init(&sk->sk_write_queue); sk->sk_sndbuf = sysctl_wmem_default; err = ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0, &ipc, &rt, MSG_DONTWAIT); @@ -1589,13 +1569,10 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, arg->csumoffset) = csum_fold(csum_add(nskb->csum, arg->csum)); nskb->ip_summed = CHECKSUM_NONE; - skb_orphan(nskb); skb_set_queue_mapping(nskb, skb_get_queue_mapping(skb)); ip_push_pending_frames(sk, &fl4); } out: - put_cpu_var(unicast_sock); - ip_rt_put(rt); } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index d58dd0ec3e53..52e1f2bf0ca2 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -966,6 +966,9 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) if (dst->dev->mtu < mtu) return; + if (rt->rt_pmtu && rt->rt_pmtu < mtu) + return; + if (mtu < ip_rt_min_pmtu) mtu = ip_rt_min_pmtu; diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c index bb395d46a389..c037644eafb7 100644 --- a/net/ipv4/tcp_bic.c +++ b/net/ipv4/tcp_bic.c @@ -150,7 +150,7 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) tcp_slow_start(tp, acked); else { bictcp_update(ca, tp->snd_cwnd); - tcp_cong_avoid_ai(tp, ca->cnt); + tcp_cong_avoid_ai(tp, ca->cnt, 1); } } diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 27ead0dd16bc..8670e68e2ce6 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -291,26 +291,32 @@ int tcp_set_congestion_control(struct sock *sk, const char *name) * ABC caps N to 2. Slow start exits when cwnd grows over ssthresh and * returns the leftover acks to adjust cwnd in congestion avoidance mode. */ -void tcp_slow_start(struct tcp_sock *tp, u32 acked) +u32 tcp_slow_start(struct tcp_sock *tp, u32 acked) { u32 cwnd = tp->snd_cwnd + acked; if (cwnd > tp->snd_ssthresh) cwnd = tp->snd_ssthresh + 1; + acked -= cwnd - tp->snd_cwnd; tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp); + + return acked; } EXPORT_SYMBOL_GPL(tcp_slow_start); -/* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd (or alternative w) */ -void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w) +/* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd (or alternative w), + * for every packet that was ACKed. + */ +void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked) { + tp->snd_cwnd_cnt += acked; if (tp->snd_cwnd_cnt >= w) { - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - tp->snd_cwnd_cnt = 0; - } else { - tp->snd_cwnd_cnt++; + u32 delta = tp->snd_cwnd_cnt / w; + + tp->snd_cwnd_cnt -= delta * w; + tp->snd_cwnd += delta; } + tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_cwnd_clamp); } EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai); @@ -329,11 +335,13 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked) return; /* In "safe" area, increase. */ - if (tp->snd_cwnd <= tp->snd_ssthresh) - tcp_slow_start(tp, acked); + if (tp->snd_cwnd <= tp->snd_ssthresh) { + acked = tcp_slow_start(tp, acked); + if (!acked) + return; + } /* In dangerous area, increase slowly. */ - else - tcp_cong_avoid_ai(tp, tp->snd_cwnd); + tcp_cong_avoid_ai(tp, tp->snd_cwnd, acked); } EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid); diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 6b6002416a73..4b276d1ed980 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -93,9 +93,7 @@ struct bictcp { u32 epoch_start; /* beginning of an epoch */ u32 ack_cnt; /* number of acks */ u32 tcp_cwnd; /* estimated tcp cwnd */ -#define ACK_RATIO_SHIFT 4 -#define ACK_RATIO_LIMIT (32u << ACK_RATIO_SHIFT) - u16 delayed_ack; /* estimate the ratio of Packets/ACKs << 4 */ + u16 unused; u8 sample_cnt; /* number of samples to decide curr_rtt */ u8 found; /* the exit point is found? */ u32 round_start; /* beginning of each round */ @@ -114,7 +112,6 @@ static inline void bictcp_reset(struct bictcp *ca) ca->bic_K = 0; ca->delay_min = 0; ca->epoch_start = 0; - ca->delayed_ack = 2 << ACK_RATIO_SHIFT; ca->ack_cnt = 0; ca->tcp_cwnd = 0; ca->found = 0; @@ -205,23 +202,30 @@ static u32 cubic_root(u64 a) /* * Compute congestion window to use. */ -static inline void bictcp_update(struct bictcp *ca, u32 cwnd) +static inline void bictcp_update(struct bictcp *ca, u32 cwnd, u32 acked) { u32 delta, bic_target, max_cnt; u64 offs, t; - ca->ack_cnt++; /* count the number of ACKs */ + ca->ack_cnt += acked; /* count the number of ACKed packets */ if (ca->last_cwnd == cwnd && (s32)(tcp_time_stamp - ca->last_time) <= HZ / 32) return; + /* The CUBIC function can update ca->cnt at most once per jiffy. + * On all cwnd reduction events, ca->epoch_start is set to 0, + * which will force a recalculation of ca->cnt. + */ + if (ca->epoch_start && tcp_time_stamp == ca->last_time) + goto tcp_friendliness; + ca->last_cwnd = cwnd; ca->last_time = tcp_time_stamp; if (ca->epoch_start == 0) { ca->epoch_start = tcp_time_stamp; /* record beginning */ - ca->ack_cnt = 1; /* start counting */ + ca->ack_cnt = acked; /* start counting */ ca->tcp_cwnd = cwnd; /* syn with cubic */ if (ca->last_max_cwnd <= cwnd) { @@ -283,6 +287,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) if (ca->last_max_cwnd == 0 && ca->cnt > 20) ca->cnt = 20; /* increase cwnd 5% per RTT */ +tcp_friendliness: /* TCP Friendly */ if (tcp_friendliness) { u32 scale = beta_scale; @@ -301,7 +306,6 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) } } - ca->cnt = (ca->cnt << ACK_RATIO_SHIFT) / ca->delayed_ack; if (ca->cnt == 0) /* cannot be zero */ ca->cnt = 1; } @@ -317,11 +321,12 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) if (tp->snd_cwnd <= tp->snd_ssthresh) { if (hystart && after(ack, ca->end_seq)) bictcp_hystart_reset(sk); - tcp_slow_start(tp, acked); - } else { - bictcp_update(ca, tp->snd_cwnd); - tcp_cong_avoid_ai(tp, ca->cnt); + acked = tcp_slow_start(tp, acked); + if (!acked) + return; } + bictcp_update(ca, tp->snd_cwnd, acked); + tcp_cong_avoid_ai(tp, ca->cnt, acked); } static u32 bictcp_recalc_ssthresh(struct sock *sk) @@ -411,20 +416,10 @@ static void hystart_update(struct sock *sk, u32 delay) */ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us) { - const struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); u32 delay; - if (icsk->icsk_ca_state == TCP_CA_Open) { - u32 ratio = ca->delayed_ack; - - ratio -= ca->delayed_ack >> ACK_RATIO_SHIFT; - ratio += cnt; - - ca->delayed_ack = clamp(ratio, 1U, ACK_RATIO_LIMIT); - } - /* Some calls are for duplicates without timetamps */ if (rtt_us < 0) return; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a3f72d7fc06c..d22f54482bab 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -683,7 +683,8 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) arg.bound_dev_if = sk->sk_bound_dev_if; arg.tos = ip_hdr(skb)->tos; - ip_send_unicast_reply(net, skb, &TCP_SKB_CB(skb)->header.h4.opt, + ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk), + skb, &TCP_SKB_CB(skb)->header.h4.opt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len); @@ -767,7 +768,8 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, if (oif) arg.bound_dev_if = oif; arg.tos = tos; - ip_send_unicast_reply(net, skb, &TCP_SKB_CB(skb)->header.h4.opt, + ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk), + skb, &TCP_SKB_CB(skb)->header.h4.opt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len); @@ -2428,14 +2430,39 @@ struct proto tcp_prot = { }; EXPORT_SYMBOL(tcp_prot); +static void __net_exit tcp_sk_exit(struct net *net) +{ + int cpu; + + for_each_possible_cpu(cpu) + inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu)); + free_percpu(net->ipv4.tcp_sk); +} + static int __net_init tcp_sk_init(struct net *net) { + int res, cpu; + + net->ipv4.tcp_sk = alloc_percpu(struct sock *); + if (!net->ipv4.tcp_sk) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + struct sock *sk; + + res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW, + IPPROTO_TCP, net); + if (res) + goto fail; + *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk; + } net->ipv4.sysctl_tcp_ecn = 2; return 0; -} -static void __net_exit tcp_sk_exit(struct net *net) -{ +fail: + tcp_sk_exit(net); + + return res; } static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c index 6824afb65d93..333bcb2415ff 100644 --- a/net/ipv4/tcp_scalable.c +++ b/net/ipv4/tcp_scalable.c @@ -25,7 +25,8 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked) if (tp->snd_cwnd <= tp->snd_ssthresh) tcp_slow_start(tp, acked); else - tcp_cong_avoid_ai(tp, min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)); + tcp_cong_avoid_ai(tp, min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT), + 1); } static u32 tcp_scalable_ssthresh(struct sock *sk) diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c index a4d2d2d88dca..112151eeee45 100644 --- a/net/ipv4/tcp_veno.c +++ b/net/ipv4/tcp_veno.c @@ -159,7 +159,7 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked) /* In the "non-congestive state", increase cwnd * every rtt. */ - tcp_cong_avoid_ai(tp, tp->snd_cwnd); + tcp_cong_avoid_ai(tp, tp->snd_cwnd, 1); } else { /* In the "congestive state", increase cwnd * every other rtt. diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c index cd7273218598..17d35662930d 100644 --- a/net/ipv4/tcp_yeah.c +++ b/net/ipv4/tcp_yeah.c @@ -92,7 +92,7 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked) } else { /* Reno */ - tcp_cong_avoid_ai(tp, tp->snd_cwnd); + tcp_cong_avoid_ai(tp, tp->snd_cwnd, 1); } /* The key players are v_vegas.beg_snd_una and v_beg_snd_nxt. |