diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2017-05-02 16:40:27 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2017-05-02 16:40:27 -0700 |
commit | 8d65b08debc7e62b2c6032d7fe7389d895b92cbc (patch) | |
tree | 0c3141b60c3a03cc32742b5750c5e763b9dae489 /net/ipv4 | |
parent | 5a0387a8a8efb90ae7fea1e2e5c62de3efa74691 (diff) | |
parent | 5d15af6778b8e4ed1fd41b040283af278e7a9a72 (diff) |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking updates from David Millar:
"Here are some highlights from the 2065 networking commits that
happened this development cycle:
1) XDP support for IXGBE (John Fastabend) and thunderx (Sunil Kowuri)
2) Add a generic XDP driver, so that anyone can test XDP even if they
lack a networking device whose driver has explicit XDP support
(me).
3) Sparc64 now has an eBPF JIT too (me)
4) Add a BPF program testing framework via BPF_PROG_TEST_RUN (Alexei
Starovoitov)
5) Make netfitler network namespace teardown less expensive (Florian
Westphal)
6) Add symmetric hashing support to nft_hash (Laura Garcia Liebana)
7) Implement NAPI and GRO in netvsc driver (Stephen Hemminger)
8) Support TC flower offload statistics in mlxsw (Arkadi Sharshevsky)
9) Multiqueue support in stmmac driver (Joao Pinto)
10) Remove TCP timewait recycling, it never really could possibly work
well in the real world and timestamp randomization really zaps any
hint of usability this feature had (Soheil Hassas Yeganeh)
11) Support level3 vs level4 ECMP route hashing in ipv4 (Nikolay
Aleksandrov)
12) Add socket busy poll support to epoll (Sridhar Samudrala)
13) Netlink extended ACK support (Johannes Berg, Pablo Neira Ayuso,
and several others)
14) IPSEC hw offload infrastructure (Steffen Klassert)"
* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (2065 commits)
tipc: refactor function tipc_sk_recv_stream()
tipc: refactor function tipc_sk_recvmsg()
net: thunderx: Optimize page recycling for XDP
net: thunderx: Support for XDP header adjustment
net: thunderx: Add support for XDP_TX
net: thunderx: Add support for XDP_DROP
net: thunderx: Add basic XDP support
net: thunderx: Cleanup receive buffer allocation
net: thunderx: Optimize CQE_TX handling
net: thunderx: Optimize RBDR descriptor handling
net: thunderx: Support for page recycling
ipx: call ipxitf_put() in ioctl error path
net: sched: add helpers to handle extended actions
qed*: Fix issues in the ptp filter config implementation.
qede: Fix concurrency issue in PTP Tx path processing.
stmmac: Add support for SIMATIC IOT2000 platform
net: hns: fix ethtool_get_strings overflow in hns driver
tcp: fix wraparound issue in tcp_lp
bpf, arm64: fix jit branch offset related to ldimm64
bpf, arm64: implement jiting of BPF_XADD
...
Diffstat (limited to 'net/ipv4')
53 files changed, 1365 insertions, 910 deletions
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index c6d4238ff94a..f83de23a30e7 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -11,7 +11,7 @@ obj-y := route.o inetpeer.o protocol.o \ tcp_rate.o tcp_recovery.o \ tcp_offload.o datagram.o raw.o udp.o udplite.o \ udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \ - fib_frontend.o fib_semantics.o fib_trie.o \ + fib_frontend.o fib_semantics.o fib_trie.o fib_notifier.o \ inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o obj-$(CONFIG_NET_IP_TUNNEL) += ip_tunnel.o diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 13a9a3297eae..f3dad1661343 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1602,8 +1602,9 @@ static const struct net_protocol igmp_protocol = { }; #endif -static const struct net_protocol tcp_protocol = { +static struct net_protocol tcp_protocol = { .early_demux = tcp_v4_early_demux, + .early_demux_handler = tcp_v4_early_demux, .handler = tcp_v4_rcv, .err_handler = tcp_v4_err, .no_policy = 1, @@ -1611,8 +1612,9 @@ static const struct net_protocol tcp_protocol = { .icmp_strict_tag_validation = 1, }; -static const struct net_protocol udp_protocol = { +static struct net_protocol udp_protocol = { .early_demux = udp_v4_early_demux, + .early_demux_handler = udp_v4_early_demux, .handler = udp_rcv, .err_handler = udp_err, .no_policy = 1, @@ -1723,6 +1725,8 @@ static __net_init int inet_init_net(struct net *net) net->ipv4.sysctl_ip_default_ttl = IPDEFTTL; net->ipv4.sysctl_ip_dynaddr = 0; net->ipv4.sysctl_ip_early_demux = 1; + net->ipv4.sysctl_udp_early_demux = 1; + net->ipv4.sysctl_tcp_early_demux = 1; #ifdef CONFIG_SYSCTL net->ipv4.sysctl_ip_prot_sock = PROT_SOCK; #endif diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 51b27ae09fbd..0937b34c27ca 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -872,7 +872,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) skb->pkt_type != PACKET_HOST) state = NUD_STALE; neigh_update(n, sha, state, - override ? NEIGH_UPDATE_F_OVERRIDE : 0); + override ? NEIGH_UPDATE_F_OVERRIDE : 0, 0); neigh_release(n); } @@ -1033,7 +1033,7 @@ static int arp_req_set(struct net *net, struct arpreq *r, err = neigh_update(neigh, (r->arp_flags & ATF_COM) ? r->arp_ha.sa_data : NULL, state, NEIGH_UPDATE_F_OVERRIDE | - NEIGH_UPDATE_F_ADMIN); + NEIGH_UPDATE_F_ADMIN, 0); neigh_release(neigh); } return err; @@ -1084,7 +1084,7 @@ static int arp_invalidate(struct net_device *dev, __be32 ip) if (neigh->nud_state & ~NUD_NOARP) err = neigh_update(neigh, NULL, NUD_FAILED, NEIGH_UPDATE_F_OVERRIDE| - NEIGH_UPDATE_F_ADMIN); + NEIGH_UPDATE_F_ADMIN, 0); neigh_release(neigh); } diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index cebedd545e5e..df14815a3b8c 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -571,7 +571,8 @@ static int ip_mc_config(struct sock *sk, bool join, const struct in_ifaddr *ifa) return ret; } -static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh) +static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct nlattr *tb[IFA_MAX+1]; @@ -582,7 +583,8 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh) ASSERT_RTNL(); - err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv4_policy); + err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv4_policy, + extack); if (err < 0) goto errout; @@ -752,7 +754,8 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh, struct in_device *in_dev; int err; - err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv4_policy); + err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv4_policy, + NULL); if (err < 0) goto errout; @@ -843,7 +846,8 @@ static struct in_ifaddr *find_matching_ifa(struct in_ifaddr *ifa) return NULL; } -static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh) +static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct in_ifaddr *ifa; @@ -1192,6 +1196,18 @@ out: return done; } +static __be32 in_dev_select_addr(const struct in_device *in_dev, + int scope) +{ + for_primary_ifa(in_dev) { + if (ifa->ifa_scope != RT_SCOPE_LINK && + ifa->ifa_scope <= scope) + return ifa->ifa_local; + } endfor_ifa(in_dev); + + return 0; +} + __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope) { __be32 addr = 0; @@ -1228,13 +1244,9 @@ no_in_dev: if (master_idx && (dev = dev_get_by_index_rcu(net, master_idx)) && (in_dev = __in_dev_get_rcu(dev))) { - for_primary_ifa(in_dev) { - if (ifa->ifa_scope != RT_SCOPE_LINK && - ifa->ifa_scope <= scope) { - addr = ifa->ifa_local; - goto out_unlock; - } - } endfor_ifa(in_dev); + addr = in_dev_select_addr(in_dev, scope); + if (addr) + goto out_unlock; } /* Not loopback addresses on loopback should be preferred @@ -1249,13 +1261,9 @@ no_in_dev: if (!in_dev) continue; - for_primary_ifa(in_dev) { - if (ifa->ifa_scope != RT_SCOPE_LINK && - ifa->ifa_scope <= scope) { - addr = ifa->ifa_local; - goto out_unlock; - } - } endfor_ifa(in_dev); + addr = in_dev_select_addr(in_dev, scope); + if (addr) + goto out_unlock; } out_unlock: rcu_read_unlock(); @@ -1713,7 +1721,7 @@ static int inet_validate_link_af(const struct net_device *dev, if (dev && !__in_dev_get_rtnl(dev)) return -EAFNOSUPPORT; - err = nla_parse_nested(tb, IFLA_INET_MAX, nla, inet_af_policy); + err = nla_parse_nested(tb, IFLA_INET_MAX, nla, inet_af_policy, NULL); if (err < 0) return err; @@ -1741,7 +1749,7 @@ static int inet_set_link_af(struct net_device *dev, const struct nlattr *nla) if (!in_dev) return -EAFNOSUPPORT; - if (nla_parse_nested(tb, IFLA_INET_MAX, nla, NULL) < 0) + if (nla_parse_nested(tb, IFLA_INET_MAX, nla, NULL, NULL) < 0) BUG(); if (tb[IFLA_INET_CONF]) { @@ -1798,6 +1806,9 @@ static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex, if (nla_put_s32(skb, NETCONFA_IFINDEX, ifindex) < 0) goto nla_put_failure; + if (!devconf) + goto out; + if ((all || type == NETCONFA_FORWARDING) && nla_put_s32(skb, NETCONFA_FORWARDING, IPV4_DEVCONF(*devconf, FORWARDING)) < 0) @@ -1819,6 +1830,7 @@ static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex, IPV4_DEVCONF(*devconf, IGNORE_ROUTES_WITH_LINKDOWN)) < 0) goto nla_put_failure; +out: nlmsg_end(skb, nlh); return 0; @@ -1827,8 +1839,8 @@ nla_put_failure: return -EMSGSIZE; } -void inet_netconf_notify_devconf(struct net *net, int type, int ifindex, - struct ipv4_devconf *devconf) +void inet_netconf_notify_devconf(struct net *net, int event, int type, + int ifindex, struct ipv4_devconf *devconf) { struct sk_buff *skb; int err = -ENOBUFS; @@ -1838,7 +1850,7 @@ void inet_netconf_notify_devconf(struct net *net, int type, int ifindex, goto errout; err = inet_netconf_fill_devconf(skb, ifindex, devconf, 0, 0, - RTM_NEWNETCONF, 0, type); + event, 0, type); if (err < 0) { /* -EMSGSIZE implies BUG in inet_netconf_msgsize_devconf() */ WARN_ON(err == -EMSGSIZE); @@ -1861,7 +1873,8 @@ static const struct nla_policy devconf_ipv4_policy[NETCONFA_MAX+1] = { }; static int inet_netconf_get_devconf(struct sk_buff *in_skb, - struct nlmsghdr *nlh) + struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(in_skb->sk); struct nlattr *tb[NETCONFA_MAX+1]; @@ -1874,7 +1887,7 @@ static int inet_netconf_get_devconf(struct sk_buff *in_skb, int err; err = nlmsg_parse(nlh, sizeof(*ncm), tb, NETCONFA_MAX, - devconf_ipv4_policy); + devconf_ipv4_policy, extack); if (err < 0) goto errout; @@ -2017,10 +2030,12 @@ static void inet_forward_change(struct net *net) IPV4_DEVCONF_ALL(net, ACCEPT_REDIRECTS) = !on; IPV4_DEVCONF_DFLT(net, FORWARDING) = on; - inet_netconf_notify_devconf(net, NETCONFA_FORWARDING, + inet_netconf_notify_devconf(net, RTM_NEWNETCONF, + NETCONFA_FORWARDING, NETCONFA_IFINDEX_ALL, net->ipv4.devconf_all); - inet_netconf_notify_devconf(net, NETCONFA_FORWARDING, + inet_netconf_notify_devconf(net, RTM_NEWNETCONF, + NETCONFA_FORWARDING, NETCONFA_IFINDEX_DEFAULT, net->ipv4.devconf_dflt); @@ -2033,7 +2048,8 @@ static void inet_forward_change(struct net *net) in_dev = __in_dev_get_rtnl(dev); if (in_dev) { IN_DEV_CONF_SET(in_dev, FORWARDING, on); - inet_netconf_notify_devconf(net, NETCONFA_FORWARDING, + inet_netconf_notify_devconf(net, RTM_NEWNETCONF, + NETCONFA_FORWARDING, dev->ifindex, &in_dev->cnf); } } @@ -2078,19 +2094,22 @@ static int devinet_conf_proc(struct ctl_table *ctl, int write, if (i == IPV4_DEVCONF_RP_FILTER - 1 && new_value != old_value) { ifindex = devinet_conf_ifindex(net, cnf); - inet_netconf_notify_devconf(net, NETCONFA_RP_FILTER, + inet_netconf_notify_devconf(net, RTM_NEWNETCONF, + NETCONFA_RP_FILTER, ifindex, cnf); } if (i == IPV4_DEVCONF_PROXY_ARP - 1 && new_value != old_value) { ifindex = devinet_conf_ifindex(net, cnf); - inet_netconf_notify_devconf(net, NETCONFA_PROXY_NEIGH, + inet_netconf_notify_devconf(net, RTM_NEWNETCONF, + NETCONFA_PROXY_NEIGH, ifindex, cnf); } if (i == IPV4_DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN - 1 && new_value != old_value) { ifindex = devinet_conf_ifindex(net, cnf); - inet_netconf_notify_devconf(net, NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN, + inet_netconf_notify_devconf(net, RTM_NEWNETCONF, + NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN, ifindex, cnf); } } @@ -2125,7 +2144,7 @@ static int devinet_sysctl_forward(struct ctl_table *ctl, int write, container_of(cnf, struct in_device, cnf); if (*valp) dev_disable_lro(idev->dev); - inet_netconf_notify_devconf(net, + inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_FORWARDING, idev->dev->ifindex, cnf); @@ -2133,7 +2152,8 @@ static int devinet_sysctl_forward(struct ctl_table *ctl, int write, rtnl_unlock(); rt_cache_flush(net); } else - inet_netconf_notify_devconf(net, NETCONFA_FORWARDING, + inet_netconf_notify_devconf(net, RTM_NEWNETCONF, + NETCONFA_FORWARDING, NETCONFA_IFINDEX_DEFAULT, net->ipv4.devconf_dflt); } @@ -2255,7 +2275,8 @@ static int __devinet_sysctl_register(struct net *net, char *dev_name, p->sysctl = t; - inet_netconf_notify_devconf(net, NETCONFA_ALL, ifindex, p); + inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_ALL, + ifindex, p); return 0; free: @@ -2264,16 +2285,18 @@ out: return -ENOBUFS; } -static void __devinet_sysctl_unregister(struct ipv4_devconf *cnf) +static void __devinet_sysctl_unregister(struct net *net, + struct ipv4_devconf *cnf, int ifindex) { struct devinet_sysctl_table *t = cnf->sysctl; - if (!t) - return; + if (t) { + cnf->sysctl = NULL; + unregister_net_sysctl_table(t->sysctl_header); + kfree(t); + } - cnf->sysctl = NULL; - unregister_net_sysctl_table(t->sysctl_header); - kfree(t); + inet_netconf_notify_devconf(net, RTM_DELNETCONF, 0, ifindex, NULL); } static int devinet_sysctl_register(struct in_device *idev) @@ -2295,7 +2318,9 @@ static int devinet_sysctl_register(struct in_device *idev) static void devinet_sysctl_unregister(struct in_device *idev) { - __devinet_sysctl_unregister(&idev->cnf); + struct net *net = dev_net(idev->dev); + + __devinet_sysctl_unregister(net, &idev->cnf, idev->dev->ifindex); neigh_sysctl_unregister(idev->arp_parms); } @@ -2370,9 +2395,9 @@ static __net_init int devinet_init_net(struct net *net) #ifdef CONFIG_SYSCTL err_reg_ctl: - __devinet_sysctl_unregister(dflt); + __devinet_sysctl_unregister(net, dflt, NETCONFA_IFINDEX_DEFAULT); err_reg_dflt: - __devinet_sysctl_unregister(all); + __devinet_sysctl_unregister(net, all, NETCONFA_IFINDEX_ALL); err_reg_all: if (tbl != ctl_forward_entry) kfree(tbl); @@ -2394,8 +2419,10 @@ static __net_exit void devinet_exit_net(struct net *net) tbl = net->ipv4.forw_hdr->ctl_table_arg; unregister_net_sysctl_table(net->ipv4.forw_hdr); - __devinet_sysctl_unregister(net->ipv4.devconf_dflt); - __devinet_sysctl_unregister(net->ipv4.devconf_all); + __devinet_sysctl_unregister(net, net->ipv4.devconf_dflt, + NETCONFA_IFINDEX_DEFAULT); + __devinet_sysctl_unregister(net, net->ipv4.devconf_all, + NETCONFA_IFINDEX_ALL); kfree(tbl); #endif kfree(net->ipv4.devconf_dflt); diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index b1e24446e297..65cc02bd82bc 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -152,21 +152,28 @@ static void esp_output_restore_header(struct sk_buff *skb) } static struct ip_esp_hdr *esp_output_set_extra(struct sk_buff *skb, + struct xfrm_state *x, struct ip_esp_hdr *esph, struct esp_output_extra *extra) { - struct xfrm_state *x = skb_dst(skb)->xfrm; - /* For ESN we move the header forward by 4 bytes to * accomodate the high bits. We will move it back after * encryption. */ if ((x->props.flags & XFRM_STATE_ESN)) { + __u32 seqhi; + struct xfrm_offload *xo = xfrm_offload(skb); + + if (xo) + seqhi = xo->seq.hi; + else + seqhi = XFRM_SKB_CB(skb)->seq.output.hi; + extra->esphoff = (unsigned char *)esph - skb_transport_header(skb); esph = (struct ip_esp_hdr *)((unsigned char *)esph - 4); extra->seqhi = esph->spi; - esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.hi); + esph->seq_no = htonl(seqhi); } esph->spi = x->id.spi; @@ -198,98 +205,56 @@ static void esp_output_fill_trailer(u8 *tail, int tfclen, int plen, __u8 proto) tail[plen - 1] = proto; } -static int esp_output(struct xfrm_state *x, struct sk_buff *skb) +static void esp_output_udp_encap(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp) { - struct esp_output_extra *extra; - int err = -ENOMEM; - struct ip_esp_hdr *esph; - struct crypto_aead *aead; - struct aead_request *req; - struct scatterlist *sg, *dsg; - struct sk_buff *trailer; - struct page *page; - void *tmp; - u8 *iv; - u8 *tail; - u8 *vaddr; - int blksize; - int clen; - int alen; - int plen; - int ivlen; - int tfclen; - int nfrags; - int assoclen; - int extralen; - int tailen; - __be64 seqno; - __u8 proto = *skb_mac_header(skb); - - /* skb is pure payload to encrypt */ - - aead = x->data; - alen = crypto_aead_authsize(aead); - ivlen = crypto_aead_ivsize(aead); - - tfclen = 0; - if (x->tfcpad) { - struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb); - u32 padto; - - padto = min(x->tfcpad, esp4_get_mtu(x, dst->child_mtu_cached)); - if (skb->len < padto) - tfclen = padto - skb->len; + int encap_type; + struct udphdr *uh; + __be32 *udpdata32; + __be16 sport, dport; + struct xfrm_encap_tmpl *encap = x->encap; + struct ip_esp_hdr *esph = esp->esph; + + spin_lock_bh(&x->lock); + sport = encap->encap_sport; + dport = encap->encap_dport; + encap_type = encap->encap_type; + spin_unlock_bh(&x->lock); + + uh = (struct udphdr *)esph; + uh->source = sport; + uh->dest = dport; + uh->len = htons(skb->len + esp->tailen + - skb_transport_offset(skb)); + uh->check = 0; + + switch (encap_type) { + default: + case UDP_ENCAP_ESPINUDP: + esph = (struct ip_esp_hdr *)(uh + 1); + break; + case UDP_ENCAP_ESPINUDP_NON_IKE: + udpdata32 = (__be32 *)(uh + 1); + udpdata32[0] = udpdata32[1] = 0; + esph = (struct ip_esp_hdr *)(udpdata32 + 2); + break; } - blksize = ALIGN(crypto_aead_blocksize(aead), 4); - clen = ALIGN(skb->len + 2 + tfclen, blksize); - plen = clen - skb->len - tfclen; - tailen = tfclen + plen + alen; - assoclen = sizeof(*esph); - extralen = 0; - if (x->props.flags & XFRM_STATE_ESN) { - extralen += sizeof(*extra); - assoclen += sizeof(__be32); - } + *skb_mac_header(skb) = IPPROTO_UDP; + esp->esph = esph; +} - *skb_mac_header(skb) = IPPROTO_ESP; - esph = ip_esp_hdr(skb); +int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp) +{ + u8 *tail; + u8 *vaddr; + int nfrags; + struct page *page; + struct sk_buff *trailer; + int tailen = esp->tailen; /* this is non-NULL only with UDP Encapsulation */ - if (x->encap) { - struct xfrm_encap_tmpl *encap = x->encap; - struct udphdr *uh; - __be32 *udpdata32; - __be16 sport, dport; - int encap_type; - - spin_lock_bh(&x->lock); - sport = encap->encap_sport; - dport = encap->encap_dport; - encap_type = encap->encap_type; - spin_unlock_bh(&x->lock); - - uh = (struct udphdr *)esph; - uh->source = sport; - uh->dest = dport; - uh->len = htons(skb->len + tailen - - skb_transport_offset(skb)); - uh->check = 0; - - switch (encap_type) { - default: - case UDP_ENCAP_ESPINUDP: - esph = (struct ip_esp_hdr *)(uh + 1); - break; - case UDP_ENCAP_ESPINUDP_NON_IKE: - udpdata32 = (__be32 *)(uh + 1); - udpdata32[0] = udpdata32[1] = 0; - esph = (struct ip_esp_hdr *)(udpdata32 + 2); - break; - } - - *skb_mac_header(skb) = IPPROTO_UDP; - } + if (x->encap) + esp_output_udp_encap(x, skb, esp); if (!skb_cloned(skb)) { if (tailen <= skb_availroom(skb)) { @@ -304,6 +269,8 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) struct sock *sk = skb->sk; struct page_frag *pfrag = &x->xfrag; + esp->inplace = false; + allocsize = ALIGN(tailen, L1_CACHE_BYTES); spin_lock_bh(&x->lock); @@ -320,10 +287,12 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) tail = vaddr + pfrag->offset; - esp_output_fill_trailer(tail, tfclen, plen, proto); + esp_output_fill_trailer(tail, esp->tfclen, esp->plen, esp->proto); kunmap_atomic(vaddr); + spin_unlock_bh(&x->lock); + nfrags = skb_shinfo(skb)->nr_frags; __skb_fill_page_desc(skb, nfrags, page, pfrag->offset, @@ -339,107 +308,113 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) if (sk) atomic_add(tailen, &sk->sk_wmem_alloc); - skb_push(skb, -skb_network_offset(skb)); - - esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); - esph->spi = x->id.spi; - - tmp = esp_alloc_tmp(aead, nfrags + 2, extralen); - if (!tmp) { - spin_unlock_bh(&x->lock); - err = -ENOMEM; - goto error; - } - - extra = esp_tmp_extra(tmp); - iv = esp_tmp_iv(aead, tmp, extralen); - req = esp_tmp_req(aead, iv); - sg = esp_req_sg(aead, req); - dsg = &sg[nfrags]; - - esph = esp_output_set_extra(skb, esph, extra); - - sg_init_table(sg, nfrags); - skb_to_sgvec(skb, sg, - (unsigned char *)esph - skb->data, - assoclen + ivlen + clen + alen); - - allocsize = ALIGN(skb->data_len, L1_CACHE_BYTES); - - if (unlikely(!skb_page_frag_refill(allocsize, pfrag, GFP_ATOMIC))) { - spin_unlock_bh(&x->lock); - err = -ENOMEM; - goto error; - } - - skb_shinfo(skb)->nr_frags = 1; - - page = pfrag->page; - get_page(page); - /* replace page frags in skb with new page */ - __skb_fill_page_desc(skb, 0, page, pfrag->offset, skb->data_len); - pfrag->offset = pfrag->offset + allocsize; - - sg_init_table(dsg, skb_shinfo(skb)->nr_frags + 1); - skb_to_sgvec(skb, dsg, - (unsigned char *)esph - skb->data, - assoclen + ivlen + clen + alen); - - spin_unlock_bh(&x->lock); - - goto skip_cow2; + goto out; } } cow: - err = skb_cow_data(skb, tailen, &trailer); - if (err < 0) - goto error; - nfrags = err; + nfrags = skb_cow_data(skb, tailen, &trailer); + if (nfrags < 0) + goto out; tail = skb_tail_pointer(trailer); - esph = ip_esp_hdr(skb); + esp->esph = ip_esp_hdr(skb); skip_cow: - esp_output_fill_trailer(tail, tfclen, plen, proto); + esp_output_fill_trailer(tail, esp->tfclen, esp->plen, esp->proto); + pskb_put(skb, trailer, tailen); - pskb_put(skb, trailer, clen - skb->len + alen); - skb_push(skb, -skb_network_offset(skb)); - esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); - esph->spi = x->id.spi; +out: + return nfrags; +} +EXPORT_SYMBOL_GPL(esp_output_head); - tmp = esp_alloc_tmp(aead, nfrags, extralen); - if (!tmp) { - err = -ENOMEM; - goto error; +int esp_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp) +{ + u8 *iv; + int alen; + void *tmp; + int ivlen; + int assoclen; + int extralen; + struct page *page; + struct ip_esp_hdr *esph; + struct crypto_aead *aead; + struct aead_request *req; + struct scatterlist *sg, *dsg; + struct esp_output_extra *extra; + int err = -ENOMEM; + + assoclen = sizeof(struct ip_esp_hdr); + extralen = 0; + + if (x->props.flags & XFRM_STATE_ESN) { + extralen += sizeof(*extra); + assoclen += sizeof(__be32); } + aead = x->data; + alen = crypto_aead_authsize(aead); + ivlen = crypto_aead_ivsize(aead); + + tmp = esp_alloc_tmp(aead, esp->nfrags + 2, extralen); + if (!tmp) + goto error; + extra = esp_tmp_extra(tmp); iv = esp_tmp_iv(aead, tmp, extralen); req = esp_tmp_req(aead, iv); sg = esp_req_sg(aead, req); - dsg = sg; - esph = esp_output_set_extra(skb, esph, extra); + if (esp->inplace) + dsg = sg; + else + dsg = &sg[esp->nfrags]; - sg_init_table(sg, nfrags); + esph = esp_output_set_extra(skb, x, esp->esph, extra); + esp->esph = esph; + + sg_init_table(sg, esp->nfrags); skb_to_sgvec(skb, sg, (unsigned char *)esph - skb->data, - assoclen + ivlen + clen + alen); + assoclen + ivlen + esp->clen + alen); + + if (!esp->inplace) { + int allocsize; + struct page_frag *pfrag = &x->xfrag; + + allocsize = ALIGN(skb->data_len, L1_CACHE_BYTES); + + spin_lock_bh(&x->lock); + if (unlikely(!skb_page_frag_refill(allocsize, pfrag, GFP_ATOMIC))) { + spin_unlock_bh(&x->lock); + goto error; + } + + skb_shinfo(skb)->nr_frags = 1; + + page = pfrag->page; + get_page(page); + /* replace page frags in skb with new page */ + __skb_fill_page_desc(skb, 0, page, pfrag->offset, skb->data_len); + pfrag->offset = pfrag->offset + allocsize; + spin_unlock_bh(&x->lock); + + sg_init_table(dsg, skb_shinfo(skb)->nr_frags + 1); + skb_to_sgvec(skb, dsg, + (unsigned char *)esph - skb->data, + assoclen + ivlen + esp->clen + alen); + } -skip_cow2: if ((x->props.flags & XFRM_STATE_ESN)) aead_request_set_callback(req, 0, esp_output_done_esn, skb); else aead_request_set_callback(req, 0, esp_output_done, skb); - aead_request_set_crypt(req, sg, dsg, ivlen + clen, iv); + aead_request_set_crypt(req, sg, dsg, ivlen + esp->clen, iv); aead_request_set_ad(req, assoclen); - seqno = cpu_to_be64(XFRM_SKB_CB(skb)->seq.output.low + - ((u64)XFRM_SKB_CB(skb)->seq.output.hi << 32)); - memset(iv, 0, ivlen); - memcpy(iv + ivlen - min(ivlen, 8), (u8 *)&seqno + 8 - min(ivlen, 8), + memcpy(iv + ivlen - min(ivlen, 8), (u8 *)&esp->seqno + 8 - min(ivlen, 8), min(ivlen, 8)); ESP_SKB_CB(skb)->tmp = tmp; @@ -465,11 +440,63 @@ skip_cow2: error: return err; } +EXPORT_SYMBOL_GPL(esp_output_tail); -static int esp_input_done2(struct sk_buff *skb, int err) +static int esp_output(struct xfrm_state *x, struct sk_buff *skb) +{ + int alen; + int blksize; + struct ip_esp_hdr *esph; + struct crypto_aead *aead; + struct esp_info esp; + + esp.inplace = true; + + esp.proto = *skb_mac_header(skb); + *skb_mac_header(skb) = IPPROTO_ESP; + + /* skb is pure payload to encrypt */ + + aead = x->data; + alen = crypto_aead_authsize(aead); + + esp.tfclen = 0; + if (x->tfcpad) { + struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb); + u32 padto; + + padto = min(x->tfcpad, esp4_get_mtu(x, dst->child_mtu_cached)); + if (skb->len < padto) + esp.tfclen = padto - skb->len; + } + blksize = ALIGN(crypto_aead_blocksize(aead), 4); + esp.clen = ALIGN(skb->len + 2 + esp.tfclen, blksize); + esp.plen = esp.clen - skb->len - esp.tfclen; + esp.tailen = esp.tfclen + esp.plen + alen; + + esp.esph = ip_esp_hdr(skb); + + esp.nfrags = esp_output_head(x, skb, &esp); + if (esp.nfrags < 0) + return esp.nfrags; + + esph = esp.esph; + esph->spi = x->id.spi; + + esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); + esp.seqno = cpu_to_be64(XFRM_SKB_CB(skb)->seq.output.low + + ((u64)XFRM_SKB_CB(skb)->seq.output.hi << 32)); + + skb_push(skb, -skb_network_offset(skb)); + + return esp_output_tail(x, skb, &esp); +} + +int esp_input_done2(struct sk_buff *skb, int err) { const struct iphdr *iph; struct xfrm_state *x = xfrm_input_state(skb); + struct xfrm_offload *xo = xfrm_offload(skb); struct crypto_aead *aead = x->data; int alen = crypto_aead_authsize(aead); int hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead); @@ -478,7 +505,8 @@ static int esp_input_done2(struct sk_buff *skb, int err) u8 nexthdr[2]; int padlen; - kfree(ESP_SKB_CB(skb)->tmp); + if (!xo || (xo && !(xo->flags & CRYPTO_DONE))) + kfree(ESP_SKB_CB(skb)->tmp); if (unlikely(err)) goto out; @@ -549,6 +577,7 @@ static int esp_input_done2(struct sk_buff *skb, int err) out: return err; } +EXPORT_SYMBOL_GPL(esp_input_done2); static void esp_input_done(struct crypto_async_request *base, int err) { @@ -751,13 +780,17 @@ static int esp_init_aead(struct xfrm_state *x) char aead_name[CRYPTO_MAX_ALG_NAME]; struct crypto_aead *aead; int err; + u32 mask = 0; err = -ENAMETOOLONG; if (snprintf(aead_name, CRYPTO_MAX_ALG_NAME, "%s(%s)", x->geniv, x->aead->alg_name) >= CRYPTO_MAX_ALG_NAME) goto error; - aead = crypto_alloc_aead(aead_name, 0, 0); + if (x->xso.offload_handle) + mask |= CRYPTO_ALG_ASYNC; + + aead = crypto_alloc_aead(aead_name, 0, mask); err = PTR_ERR(aead); if (IS_ERR(aead)) goto error; @@ -787,6 +820,7 @@ static int esp_init_authenc(struct xfrm_state *x) char authenc_name[CRYPTO_MAX_ALG_NAME]; unsigned int keylen; int err; + u32 mask = 0; err = -EINVAL; if (!x->ealg) @@ -812,7 +846,10 @@ static int esp_init_authenc(struct xfrm_state *x) goto error; } - aead = crypto_alloc_aead(authenc_name, 0, 0); + if (x->xso.offload_handle) + mask |= CRYPTO_ALG_ASYNC; + + aead = crypto_alloc_aead(authenc_name, 0, mask); err = PTR_ERR(aead); if (IS_ERR(aead)) goto error; @@ -931,7 +968,7 @@ static const struct xfrm_type esp_type = .destructor = esp_destroy, .get_mtu = esp4_get_mtu, .input = esp_input, - .output = esp_output + .output = esp_output, }; static struct xfrm4_protocol esp4_protocol = { diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index 1de442632406..e0666016a764 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -43,27 +43,31 @@ static struct sk_buff **esp4_gro_receive(struct sk_buff **head, if ((err = xfrm_parse_spi(skb, IPPROTO_ESP, &spi, &seq)) != 0) goto out; - err = secpath_set(skb); - if (err) - goto out; + xo = xfrm_offload(skb); + if (!xo || !(xo->flags & CRYPTO_DONE)) { + err = secpath_set(skb); + if (err) + goto out; - if (skb->sp->len == XFRM_MAX_DEPTH) - goto out; + if (skb->sp->len == XFRM_MAX_DEPTH) + goto out; - x = xfrm_state_lookup(dev_net(skb->dev), skb->mark, - (xfrm_address_t *)&ip_hdr(skb)->daddr, - spi, IPPROTO_ESP, AF_INET); - if (!x) - goto out; + x = xfrm_state_lookup(dev_net(skb->dev), skb->mark, + (xfrm_address_t *)&ip_hdr(skb)->daddr, + spi, IPPROTO_ESP, AF_INET); + if (!x) + goto out; - skb->sp->xvec[skb->sp->len++] = x; - skb->sp->olen++; + skb->sp->xvec[skb->sp->len++] = x; + skb->sp->olen++; - xo = xfrm_offload(skb); - if (!xo) { - xfrm_state_put(x); - goto out; + xo = xfrm_offload(skb); + if (!xo) { + xfrm_state_put(x); + goto out; + } } + xo->flags |= XFRM_GRO; XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL; @@ -84,19 +88,214 @@ out: return NULL; } +static void esp4_gso_encap(struct xfrm_state *x, struct sk_buff *skb) +{ + struct ip_esp_hdr *esph; + struct iphdr *iph = ip_hdr(skb); + struct xfrm_offload *xo = xfrm_offload(skb); + int proto = iph->protocol; + + skb_push(skb, -skb_network_offset(skb)); + esph = ip_esp_hdr(skb); + *skb_mac_header(skb) = IPPROTO_ESP; + + esph->spi = x->id.spi; + esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); + + xo->proto = proto; +} + +static struct sk_buff *esp4_gso_segment(struct sk_buff *skb, + netdev_features_t features) +{ + __u32 seq; + int err = 0; + struct sk_buff *skb2; + struct xfrm_state *x; + struct ip_esp_hdr *esph; + struct crypto_aead *aead; + struct sk_buff *segs = ERR_PTR(-EINVAL); + netdev_features_t esp_features = features; + struct xfrm_offload *xo = xfrm_offload(skb); + + if (!xo) + goto out; + + seq = xo->seq.low; + + x = skb->sp->xvec[skb->sp->len - 1]; + aead = x->data; + esph = ip_esp_hdr(skb); + + if (esph->spi != x->id.spi) + goto out; + + if (!pskb_may_pull(skb, sizeof(*esph) + crypto_aead_ivsize(aead))) + goto out; + + __skb_pull(skb, sizeof(*esph) + crypto_aead_ivsize(aead)); + + skb->encap_hdr_csum = 1; + + if (!(features & NETIF_F_HW_ESP)) + esp_features = features & ~(NETIF_F_SG | NETIF_F_CSUM_MASK); + + segs = x->outer_mode->gso_segment(x, skb, esp_features); + if (IS_ERR_OR_NULL(segs)) + goto out; + + __skb_pull(skb, skb->data - skb_mac_header(skb)); + + skb2 = segs; + do { + struct sk_buff *nskb = skb2->next; + + xo = xfrm_offload(skb2); + xo->flags |= XFRM_GSO_SEGMENT; + xo->seq.low = seq; + xo->seq.hi = xfrm_replay_seqhi(x, seq); + + if(!(features & NETIF_F_HW_ESP)) + xo->flags |= CRYPTO_FALLBACK; + + x->outer_mode->xmit(x, skb2); + + err = x->type_offload->xmit(x, skb2, esp_features); + if (err) { + kfree_skb_list(segs); + return ERR_PTR(err); + } + + if (!skb_is_gso(skb2)) + seq++; + else + seq += skb_shinfo(skb2)->gso_segs; + + skb_push(skb2, skb2->mac_len); + skb2 = nskb; + } while (skb2); + +out: + return segs; +} + +static int esp_input_tail(struct xfrm_state *x, struct sk_buff *skb) +{ + struct crypto_aead *aead = x->data; + + if (!pskb_may_pull(skb, sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead))) + return -EINVAL; + + skb->ip_summed = CHECKSUM_NONE; + + return esp_input_done2(skb, 0); +} + +static int esp_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features) +{ + int err; + int alen; + int blksize; + struct xfrm_offload *xo; + struct ip_esp_hdr *esph; + struct crypto_aead *aead; + struct esp_info esp; + bool hw_offload = true; + + esp.inplace = true; + + xo = xfrm_offload(skb); + + if (!xo) + return -EINVAL; + + if (!(features & NETIF_F_HW_ESP) || !x->xso.offload_handle || + (x->xso.dev != skb->dev)) { + xo->flags |= CRYPTO_FALLBACK; + hw_offload = false; + } + + esp.proto = xo->proto; + + /* skb is pure payload to encrypt */ + + aead = x->data; + alen = crypto_aead_authsize(aead); + + esp.tfclen = 0; + /* XXX: Add support for tfc padding here. */ + + blksize = ALIGN(crypto_aead_blocksize(aead), 4); + esp.clen = ALIGN(skb->len + 2 + esp.tfclen, blksize); + esp.plen = esp.clen - skb->len - esp.tfclen; + esp.tailen = esp.tfclen + esp.plen + alen; + + esp.esph = ip_esp_hdr(skb); + + + if (!hw_offload || (hw_offload && !skb_is_gso(skb))) { + esp.nfrags = esp_output_head(x, skb, &esp); + if (esp.nfrags < 0) + return esp.nfrags; + } + + esph = esp.esph; + esph->spi = x->id.spi; + + skb_push(skb, -skb_network_offset(skb)); + + if (xo->flags & XFRM_GSO_SEGMENT) { + esph->seq_no = htonl(xo->seq.low); + } else { + ip_hdr(skb)->tot_len = htons(skb->len); + ip_send_check(ip_hdr(skb)); + } + + if (hw_offload) + return 0; + + esp.seqno = cpu_to_be64(xo->seq.low + ((u64)xo->seq.hi << 32)); + + err = esp_output_tail(x, skb, &esp); + if (err < 0) + return err; + + secpath_reset(skb); + + return 0; +} + static const struct net_offload esp4_offload = { .callbacks = { .gro_receive = esp4_gro_receive, + .gso_segment = esp4_gso_segment, }, }; +static const struct xfrm_type_offload esp_type_offload = { + .description = "ESP4 OFFLOAD", + .owner = THIS_MODULE, + .proto = IPPROTO_ESP, + .input_tail = esp_input_tail, + .xmit = esp_xmit, + .encap = esp4_gso_encap, +}; + static int __init esp4_offload_init(void) { + if (xfrm_register_type_offload(&esp_type_offload, AF_INET) < 0) { + pr_info("%s: can't add xfrm type offload\n", __func__); + return -EAGAIN; + } + return inet_add_offload(&esp4_offload, IPPROTO_ESP); } static void __exit esp4_offload_exit(void) { + if (xfrm_unregister_type_offload(&esp_type_offload, AF_INET) < 0) + pr_info("%s: can't remove xfrm type offload\n", __func__); + inet_del_offload(&esp4_offload, IPPROTO_ESP); } diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 8f2133ffc2ff..39bd1edee676 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -632,7 +632,8 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, int err, remaining; struct rtmsg *rtm; - err = nlmsg_validate(nlh, sizeof(*rtm), RTA_MAX, rtm_ipv4_policy); + err = nlmsg_validate(nlh, sizeof(*rtm), RTA_MAX, rtm_ipv4_policy, + NULL); if (err < 0) goto errout; @@ -709,7 +710,8 @@ errout: return err; } -static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh) +static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct fib_config cfg; @@ -731,7 +733,8 @@ errout: return err; } -static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh) +static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct fib_config cfg; @@ -1127,7 +1130,8 @@ static void fib_disable_ip(struct net_device *dev, unsigned long event, { if (fib_sync_down_dev(dev, event, force)) fib_flush(dev_net(dev)); - rt_cache_flush(dev_net(dev)); + else + rt_cache_flush(dev_net(dev)); arp_ifdown(dev); } diff --git a/net/ipv4/fib_notifier.c b/net/ipv4/fib_notifier.c new file mode 100644 index 000000000000..e0714d975947 --- /dev/null +++ b/net/ipv4/fib_notifier.c @@ -0,0 +1,86 @@ +#include <linux/rtnetlink.h> +#include <linux/notifier.h> +#include <linux/rcupdate.h> +#include <linux/kernel.h> +#include <net/net_namespace.h> +#include <net/netns/ipv4.h> +#include <net/ip_fib.h> + +static ATOMIC_NOTIFIER_HEAD(fib_chain); + +int call_fib_notifier(struct notifier_block *nb, struct net *net, + enum fib_event_type event_type, + struct fib_notifier_info *info) +{ + info->net = net; + return nb->notifier_call(nb, event_type, info); +} + +int call_fib_notifiers(struct net *net, enum fib_event_type event_type, + struct fib_notifier_info *info) +{ + net->ipv4.fib_seq++; + info->net = net; + return atomic_notifier_call_chain(&fib_chain, event_type, info); +} + +static unsigned int fib_seq_sum(void) +{ + unsigned int fib_seq = 0; + struct net *net; + + rtnl_lock(); + for_each_net(net) + fib_seq += net->ipv4.fib_seq; + rtnl_unlock(); + + return fib_seq; +} + +static bool fib_dump_is_consistent(struct notifier_block *nb, + void (*cb)(struct notifier_block *nb), + unsigned int fib_seq) +{ + atomic_notifier_chain_register(&fib_chain, nb); + if (fib_seq == fib_seq_sum()) + return true; + atomic_notifier_chain_unregister(&fib_chain, nb); + if (cb) + cb(nb); + return false; +} + +#define FIB_DUMP_MAX_RETRIES 5 +int register_fib_notifier(struct notifier_block *nb, + void (*cb)(struct notifier_block *nb)) +{ + int retries = 0; + + do { + unsigned int fib_seq = fib_seq_sum(); + struct net *net; + + /* Mutex semantics guarantee that every change done to + * FIB tries before we read the change sequence counter + * is now visible to us. + */ + rcu_read_lock(); + for_each_net_rcu(net) { + fib_rules_notify(net, nb); + fib_notify(net, nb); + } + rcu_read_unlock(); + + if (fib_dump_is_consistent(nb, cb, fib_seq)) + return 0; + } while (++retries < FIB_DUMP_MAX_RETRIES); + + return -EBUSY; +} +EXPORT_SYMBOL(register_fib_notifier); + +int unregister_fib_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_unregister(&fib_chain, nb); +} +EXPORT_SYMBOL(unregister_fib_notifier); diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 2e50062f642d..778ecf977eb2 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -47,6 +47,27 @@ struct fib4_rule { #endif }; +static bool fib4_rule_matchall(const struct fib_rule *rule) +{ + struct fib4_rule *r = container_of(rule, struct fib4_rule, common); + + if (r->dst_len || r->src_len || r->tos) + return false; + return fib_rule_matchall(rule); +} + +bool fib4_rule_default(const struct fib_rule *rule) +{ + if (!fib4_rule_matchall(rule) || rule->action != FR_ACT_TO_TBL || + rule->l3mdev) + return false; + if (rule->table != RT_TABLE_LOCAL && rule->table != RT_TABLE_MAIN && + rule->table != RT_TABLE_DEFAULT) + return false; + return true; +} +EXPORT_SYMBOL_GPL(fib4_rule_default); + int __fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res, unsigned int flags) { @@ -164,12 +185,36 @@ static struct fib_table *fib_empty_table(struct net *net) return NULL; } +static int call_fib_rule_notifier(struct notifier_block *nb, struct net *net, + enum fib_event_type event_type, + struct fib_rule *rule) +{ + struct fib_rule_notifier_info info = { + .rule = rule, + }; + + return call_fib_notifier(nb, net, event_type, &info.info); +} + static int call_fib_rule_notifiers(struct net *net, - enum fib_event_type event_type) + enum fib_event_type event_type, + struct fib_rule *rule) +{ + struct fib_rule_notifier_info info = { + .rule = rule, + }; + + return call_fib_notifiers(net, event_type, &info.info); +} + +/* Called with rcu_read_lock() */ +void fib_rules_notify(struct net *net, struct notifier_block *nb) { - struct fib_notifier_info info; + struct fib_rules_ops *ops = net->ipv4.rules_ops; + struct fib_rule *rule; - return call_fib_notifiers(net, event_type, &info); + list_for_each_entry_rcu(rule, &ops->rules_list, list) + call_fib_rule_notifier(nb, net, FIB_EVENT_RULE_ADD, rule); } static const struct nla_policy fib4_rule_policy[FRA_MAX+1] = { @@ -228,7 +273,7 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, rule4->tos = frh->tos; net->ipv4.fib_has_custom_rules = true; - call_fib_rule_notifiers(net, FIB_EVENT_RULE_ADD); + call_fib_rule_notifiers(net, FIB_EVENT_RULE_ADD, rule); err = 0; errout: @@ -250,7 +295,7 @@ static int fib4_rule_delete(struct fib_rule *rule) net->ipv4.fib_num_tclassid_users--; #endif net->ipv4.fib_has_custom_rules = true; - call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL); + call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL, rule); errout: return err; } diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 317026a39cfa..da449ddb8cc1 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -57,7 +57,6 @@ static unsigned int fib_info_cnt; static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE]; #ifdef CONFIG_IP_ROUTE_MULTIPATH -u32 fib_multipath_secret __read_mostly; #define for_nexthops(fi) { \ int nhsel; const struct fib_nh *nh; \ @@ -576,9 +575,6 @@ static void fib_rebalance(struct fib_info *fi) atomic_set(&nexthop_nh->nh_upper_bound, upper_bound); } endfor_nexthops(fi); - - net_get_random_once(&fib_multipath_secret, - sizeof(fib_multipath_secret)); } static inline void fib_add_weight(struct fib_info *fi, @@ -1641,7 +1637,7 @@ void fib_select_multipath(struct fib_result *res, int hash) #endif void fib_select_path(struct net *net, struct fib_result *res, - struct flowi4 *fl4, int mp_hash) + struct flowi4 *fl4, const struct sk_buff *skb) { bool oif_check; @@ -1650,10 +1646,9 @@ void fib_select_path(struct net *net, struct fib_result *res, #ifdef CONFIG_IP_ROUTE_MULTIPATH if (res->fi->fib_nhs > 1 && oif_check) { - if (mp_hash < 0) - mp_hash = get_hash_from_flowi4(fl4) >> 1; + int h = fib_multipath_hash(res->fi, fl4, skb); - fib_select_multipath(res, mp_hash); + fib_select_multipath(res, h); } else #endif diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 2f0d8233950f..1201409ba1dc 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -84,43 +84,6 @@ #include <trace/events/fib.h> #include "fib_lookup.h" -static unsigned int fib_seq_sum(void) -{ - unsigned int fib_seq = 0; - struct net *net; - - rtnl_lock(); - for_each_net(net) - fib_seq += net->ipv4.fib_seq; - rtnl_unlock(); - - return fib_seq; -} - -static ATOMIC_NOTIFIER_HEAD(fib_chain); - -static int call_fib_notifier(struct notifier_block *nb, struct net *net, - enum fib_event_type event_type, - struct fib_notifier_info *info) -{ - info->net = net; - return nb->notifier_call(nb, event_type, info); -} - -static void fib_rules_notify(struct net *net, struct notifier_block *nb, - enum fib_event_type event_type) -{ -#ifdef CONFIG_IP_MULTIPLE_TABLES - struct fib_notifier_info info; - - if (net->ipv4.fib_has_custom_rules) - call_fib_notifier(nb, net, event_type, &info); -#endif -} - -static void fib_notify(struct net *net, struct notifier_block *nb, - enum fib_event_type event_type); - static int call_fib_entry_notifier(struct notifier_block *nb, struct net *net, enum fib_event_type event_type, u32 dst, int dst_len, struct fib_info *fi, @@ -137,62 +100,6 @@ static int call_fib_entry_notifier(struct notifier_block *nb, struct net *net, return call_fib_notifier(nb, net, event_type, &info.info); } -static bool fib_dump_is_consistent(struct notifier_block *nb, - void (*cb)(struct notifier_block *nb), - unsigned int fib_seq) -{ - atomic_notifier_chain_register(&fib_chain, nb); - if (fib_seq == fib_seq_sum()) - return true; - atomic_notifier_chain_unregister(&fib_chain, nb); - if (cb) - cb(nb); - return false; -} - -#define FIB_DUMP_MAX_RETRIES 5 -int register_fib_notifier(struct notifier_block *nb, - void (*cb)(struct notifier_block *nb)) -{ - int retries = 0; - - do { - unsigned int fib_seq = fib_seq_sum(); - struct net *net; - - /* Mutex semantics guarantee that every change done to - * FIB tries before we read the change sequence counter - * is now visible to us. - */ - rcu_read_lock(); - for_each_net_rcu(net) { - fib_rules_notify(net, nb, FIB_EVENT_RULE_ADD); - fib_notify(net, nb, FIB_EVENT_ENTRY_ADD); - } - rcu_read_unlock(); - - if (fib_dump_is_consistent(nb, cb, fib_seq)) - return 0; - } while (++retries < FIB_DUMP_MAX_RETRIES); - - return -EBUSY; -} -EXPORT_SYMBOL(register_fib_notifier); - -int unregister_fib_notifier(struct notifier_block *nb) -{ - return atomic_notifier_chain_unregister(&fib_chain, nb); -} -EXPORT_SYMBOL(unregister_fib_notifier); - -int call_fib_notifiers(struct net *net, enum fib_event_type event_type, - struct fib_notifier_info *info) -{ - net->ipv4.fib_seq++; - info->net = net; - return atomic_notifier_call_chain(&fib_chain, event_type, info); -} - static int call_fib_entry_notifiers(struct net *net, enum fib_event_type event_type, u32 dst, int dst_len, struct fib_info *fi, @@ -1995,8 +1902,7 @@ int fib_table_flush(struct net *net, struct fib_table *tb) } static void fib_leaf_notify(struct net *net, struct key_vector *l, - struct fib_table *tb, struct notifier_block *nb, - enum fib_event_type event_type) + struct fib_table *tb, struct notifier_block *nb) { struct fib_alias *fa; @@ -2012,22 +1918,21 @@ static void fib_leaf_notify(struct net *net, struct key_vector *l, if (tb->tb_id != fa->tb_id) continue; - call_fib_entry_notifier(nb, net, event_type, l->key, + call_fib_entry_notifier(nb, net, FIB_EVENT_ENTRY_ADD, l->key, KEYLENGTH - fa->fa_slen, fi, fa->fa_tos, fa->fa_type, fa->tb_id); } } static void fib_table_notify(struct net *net, struct fib_table *tb, - struct notifier_block *nb, - enum fib_event_type event_type) + struct notifier_block *nb) { struct trie *t = (struct trie *)tb->tb_data; struct key_vector *l, *tp = t->kv; t_key key = 0; while ((l = leaf_walk_rcu(&tp, key)) != NULL) { - fib_leaf_notify(net, l, tb, nb, event_type); + fib_leaf_notify(net, l, tb, nb); key = l->key + 1; /* stop in case of wrap around */ @@ -2036,8 +1941,7 @@ static void fib_table_notify(struct net *net, struct fib_table *tb, } } -static void fib_notify(struct net *net, struct notifier_block *nb, - enum fib_event_type event_type) +void fib_notify(struct net *net, struct notifier_block *nb) { unsigned int h; @@ -2046,7 +1950,7 @@ static void fib_notify(struct net *net, struct notifier_block *nb, struct fib_table *tb; hlist_for_each_entry_rcu(tb, head, tb_hlist) - fib_table_notify(net, tb, nb, event_type); + fib_table_notify(net, tb, nb); } } diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index fc310db2708b..43318b5f5647 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -464,22 +464,6 @@ out_bh_enable: local_bh_enable(); } -#ifdef CONFIG_IP_ROUTE_MULTIPATH - -/* Source and destination is swapped. See ip_multipath_icmp_hash */ -static int icmp_multipath_hash_skb(const struct sk_buff *skb) -{ - const struct iphdr *iph = ip_hdr(skb); - - return fib_multipath_hash(iph->daddr, iph->saddr); -} - -#else - -#define icmp_multipath_hash_skb(skb) (-1) - -#endif - static struct rtable *icmp_route_lookup(struct net *net, struct flowi4 *fl4, struct sk_buff *skb_in, @@ -505,8 +489,7 @@ static struct rtable *icmp_route_lookup(struct net *net, fl4->flowi4_oif = l3mdev_master_ifindex(skb_dst(skb_in)->dev); security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4)); - rt = __ip_route_output_key_hash(net, fl4, - icmp_multipath_hash_skb(skb_in)); + rt = __ip_route_output_key_hash(net, fl4, skb_in); if (IS_ERR(rt)) return rt; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index c9c1cb635d9a..e90c80a548ad 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -829,7 +829,8 @@ out: static int ipgre_netlink_parms(struct net_device *dev, struct nlattr *data[], struct nlattr *tb[], - struct ip_tunnel_parm *parms) + struct ip_tunnel_parm *parms, + __u32 *fwmark) { struct ip_tunnel *t = netdev_priv(dev); @@ -886,6 +887,9 @@ static int ipgre_netlink_parms(struct net_device *dev, t->ignore_df = !!nla_get_u8(data[IFLA_GRE_IGNORE_DF]); } + if (data[IFLA_GRE_FWMARK]) + *fwmark = nla_get_u32(data[IFLA_GRE_FWMARK]); + return 0; } @@ -957,6 +961,7 @@ static int ipgre_newlink(struct net *src_net, struct net_device *dev, { struct ip_tunnel_parm p; struct ip_tunnel_encap ipencap; + __u32 fwmark = 0; int err; if (ipgre_netlink_encap_parms(data, &ipencap)) { @@ -967,31 +972,32 @@ static int ipgre_newlink(struct net *src_net, struct net_device *dev, return err; } - err = ipgre_netlink_parms(dev, data, tb, &p); + err = ipgre_netlink_parms(dev, data, tb, &p, &fwmark); if (err < 0) return err; - return ip_tunnel_newlink(dev, tb, &p); + return ip_tunnel_newlink(dev, tb, &p, fwmark); } static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[]) { + struct ip_tunnel *t = netdev_priv(dev); struct ip_tunnel_parm p; struct ip_tunnel_encap ipencap; + __u32 fwmark = t->fwmark; int err; if (ipgre_netlink_encap_parms(data, &ipencap)) { - struct ip_tunnel *t = netdev_priv(dev); err = ip_tunnel_encap_setup(t, &ipencap); if (err < 0) return err; } - err = ipgre_netlink_parms(dev, data, tb, &p); + err = ipgre_netlink_parms(dev, data, tb, &p, &fwmark); if (err < 0) return err; - return ip_tunnel_changelink(dev, tb, &p); + return ip_tunnel_changelink(dev, tb, &p, fwmark); } static size_t ipgre_get_size(const struct net_device *dev) @@ -1029,6 +1035,8 @@ static size_t ipgre_get_size(const struct net_device *dev) nla_total_size(0) + /* IFLA_GRE_IGNORE_DF */ nla_total_size(1) + + /* IFLA_GRE_FWMARK */ + nla_total_size(4) + 0; } @@ -1049,7 +1057,8 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev) nla_put_u8(skb, IFLA_GRE_TTL, p->iph.ttl) || nla_put_u8(skb, IFLA_GRE_TOS, p->iph.tos) || nla_put_u8(skb, IFLA_GRE_PMTUDISC, - !!(p->iph.frag_off & htons(IP_DF)))) + !!(p->iph.frag_off & htons(IP_DF))) || + nla_put_u32(skb, IFLA_GRE_FWMARK, t->fwmark)) goto nla_put_failure; if (nla_put_u16(skb, IFLA_GRE_ENCAP_TYPE, @@ -1093,6 +1102,7 @@ static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = { [IFLA_GRE_ENCAP_DPORT] = { .type = NLA_U16 }, [IFLA_GRE_COLLECT_METADATA] = { .type = NLA_FLAG }, [IFLA_GRE_IGNORE_DF] = { .type = NLA_U8 }, + [IFLA_GRE_FWMARK] = { .type = NLA_U32 }, }; static struct rtnl_link_ops ipgre_link_ops __read_mostly = { diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index d6feabb03516..fa2dc8f692c6 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -313,6 +313,7 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) const struct iphdr *iph = ip_hdr(skb); struct rtable *rt; struct net_device *dev = skb->dev; + void (*edemux)(struct sk_buff *skb); /* if ingress device is enslaved to an L3 master device pass the * skb to its handler for processing @@ -329,8 +330,8 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) int protocol = iph->protocol; ipprot = rcu_dereference(inet_protos[protocol]); - if (ipprot && ipprot->early_demux) { - ipprot->early_demux(skb); + if (ipprot && (edemux = READ_ONCE(ipprot->early_demux))) { + edemux(skb); /* must reload iph, skb->head might have changed */ iph = ip_hdr(skb); } diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 1d46d05efb0f..ec4fe3d4b5c9 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -330,7 +330,6 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc, sent to multicast group to reach destination designated router. */ struct ip_ra_chain __rcu *ip_ra_chain; -static DEFINE_SPINLOCK(ip_ra_lock); static void ip_ra_destroy_rcu(struct rcu_head *head) @@ -352,21 +351,17 @@ int ip_ra_control(struct sock *sk, unsigned char on, new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL; - spin_lock_bh(&ip_ra_lock); for (rap = &ip_ra_chain; - (ra = rcu_dereference_protected(*rap, - lockdep_is_held(&ip_ra_lock))) != NULL; + (ra = rtnl_dereference(*rap)) != NULL; rap = &ra->next) { if (ra->sk == sk) { if (on) { - spin_unlock_bh(&ip_ra_lock); kfree(new_ra); return -EADDRINUSE; } /* dont let ip_call_ra_chain() use sk again */ ra->sk = NULL; RCU_INIT_POINTER(*rap, ra->next); - spin_unlock_bh(&ip_ra_lock); if (ra->destructor) ra->destructor(sk); @@ -380,17 +375,14 @@ int ip_ra_control(struct sock *sk, unsigned char on, return 0; } } - if (!new_ra) { - spin_unlock_bh(&ip_ra_lock); + if (!new_ra) return -ENOBUFS; - } new_ra->sk = sk; new_ra->destructor = destructor; RCU_INIT_POINTER(new_ra->next, ra); rcu_assign_pointer(*rap, new_ra); sock_hold(sk); - spin_unlock_bh(&ip_ra_lock); return 0; } diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 823abaef006b..b878ecbc0608 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -293,7 +293,8 @@ failed: static inline void init_tunnel_flow(struct flowi4 *fl4, int proto, __be32 daddr, __be32 saddr, - __be32 key, __u8 tos, int oif) + __be32 key, __u8 tos, int oif, + __u32 mark) { memset(fl4, 0, sizeof(*fl4)); fl4->flowi4_oif = oif; @@ -302,6 +303,7 @@ static inline void init_tunnel_flow(struct flowi4 *fl4, fl4->flowi4_tos = tos; fl4->flowi4_proto = proto; fl4->fl4_gre_key = key; + fl4->flowi4_mark = mark; } static int ip_tunnel_bind_dev(struct net_device *dev) @@ -322,7 +324,8 @@ static int ip_tunnel_bind_dev(struct net_device *dev) init_tunnel_flow(&fl4, iph->protocol, iph->daddr, iph->saddr, tunnel->parms.o_key, - RT_TOS(iph->tos), tunnel->parms.link); + RT_TOS(iph->tos), tunnel->parms.link, + tunnel->fwmark); rt = ip_route_output_key(tunnel->net, &fl4); if (!IS_ERR(rt)) { @@ -578,7 +581,7 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto) tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph); } init_tunnel_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src, 0, - RT_TOS(tos), tunnel->parms.link); + RT_TOS(tos), tunnel->parms.link, tunnel->fwmark); if (tunnel->encap.type != TUNNEL_ENCAP_NONE) goto tx_error; rt = ip_route_output_key(tunnel->net, &fl4); @@ -707,7 +710,8 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, } init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr, - tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link); + tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link, + tunnel->fwmark); if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0) goto tx_error; @@ -795,7 +799,8 @@ static void ip_tunnel_update(struct ip_tunnel_net *itn, struct ip_tunnel *t, struct net_device *dev, struct ip_tunnel_parm *p, - bool set_mtu) + bool set_mtu, + __u32 fwmark) { ip_tunnel_del(itn, t); t->parms.iph.saddr = p->iph.saddr; @@ -812,10 +817,11 @@ static void ip_tunnel_update(struct ip_tunnel_net *itn, t->parms.iph.tos = p->iph.tos; t->parms.iph.frag_off = p->iph.frag_off; - if (t->parms.link != p->link) { + if (t->parms.link != p->link || t->fwmark != fwmark) { int mtu; t->parms.link = p->link; + t->fwmark = fwmark; mtu = ip_tunnel_bind_dev(dev); if (set_mtu) dev->mtu = mtu; @@ -893,7 +899,7 @@ int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) if (t) { err = 0; - ip_tunnel_update(itn, t, dev, p, true); + ip_tunnel_update(itn, t, dev, p, true, 0); } else { err = -ENOENT; } @@ -1066,7 +1072,7 @@ void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops) EXPORT_SYMBOL_GPL(ip_tunnel_delete_net); int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[], - struct ip_tunnel_parm *p) + struct ip_tunnel_parm *p, __u32 fwmark) { struct ip_tunnel *nt; struct net *net = dev_net(dev); @@ -1087,6 +1093,7 @@ int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[], nt->net = net; nt->parms = *p; + nt->fwmark = fwmark; err = register_netdevice(dev); if (err) goto out; @@ -1105,7 +1112,7 @@ out: EXPORT_SYMBOL_GPL(ip_tunnel_newlink); int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[], - struct ip_tunnel_parm *p) + struct ip_tunnel_parm *p, __u32 fwmark) { struct ip_tunnel *t; struct ip_tunnel *tunnel = netdev_priv(dev); @@ -1137,7 +1144,7 @@ int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[], } } - ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU]); + ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU], fwmark); return 0; } EXPORT_SYMBOL_GPL(ip_tunnel_changelink); diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index a31f47ccaad9..baf196eaf1d8 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -235,7 +235,7 @@ static int ip_tun_build_state(struct nlattr *attr, struct nlattr *tb[LWTUNNEL_IP_MAX + 1]; int err; - err = nla_parse_nested(tb, LWTUNNEL_IP_MAX, attr, ip_tun_policy); + err = nla_parse_nested(tb, LWTUNNEL_IP_MAX, attr, ip_tun_policy, NULL); if (err < 0) return err; @@ -332,7 +332,8 @@ static int ip6_tun_build_state(struct nlattr *attr, struct nlattr *tb[LWTUNNEL_IP6_MAX + 1]; int err; - err = nla_parse_nested(tb, LWTUNNEL_IP6_MAX, attr, ip6_tun_policy); + err = nla_parse_nested(tb, LWTUNNEL_IP6_MAX, attr, ip6_tun_policy, + NULL); if (err < 0) return err; diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 8b14f1404c8f..40977413fd48 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -471,7 +471,8 @@ static int vti_tunnel_validate(struct nlattr *tb[], struct nlattr *data[]) } static void vti_netlink_parms(struct nlattr *data[], - struct ip_tunnel_parm *parms) + struct ip_tunnel_parm *parms, + __u32 *fwmark) { memset(parms, 0, sizeof(*parms)); @@ -497,24 +498,29 @@ static void vti_netlink_parms(struct nlattr *data[], if (data[IFLA_VTI_REMOTE]) parms->iph.daddr = nla_get_in_addr(data[IFLA_VTI_REMOTE]); + if (data[IFLA_VTI_FWMARK]) + *fwmark = nla_get_u32(data[IFLA_VTI_FWMARK]); } static int vti_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[]) { struct ip_tunnel_parm parms; + __u32 fwmark = 0; - vti_netlink_parms(data, &parms); - return ip_tunnel_newlink(dev, tb, &parms); + vti_netlink_parms(data, &parms, &fwmark); + return ip_tunnel_newlink(dev, tb, &parms, fwmark); } static int vti_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[]) { + struct ip_tunnel *t = netdev_priv(dev); + __u32 fwmark = t->fwmark; struct ip_tunnel_parm p; - vti_netlink_parms(data, &p); - return ip_tunnel_changelink(dev, tb, &p); + vti_netlink_parms(data, &p, &fwmark); + return ip_tunnel_changelink(dev, tb, &p, fwmark); } static size_t vti_get_size(const struct net_device *dev) @@ -530,6 +536,8 @@ static size_t vti_get_size(const struct net_device *dev) nla_total_size(4) + /* IFLA_VTI_REMOTE */ nla_total_size(4) + + /* IFLA_VTI_FWMARK */ + nla_total_size(4) + 0; } @@ -543,6 +551,7 @@ static int vti_fill_info(struct sk_buff *skb, const struct net_device *dev) nla_put_be32(skb, IFLA_VTI_OKEY, p->o_key); nla_put_in_addr(skb, IFLA_VTI_LOCAL, p->iph.saddr); nla_put_in_addr(skb, IFLA_VTI_REMOTE, p->iph.daddr); + nla_put_u32(skb, IFLA_VTI_FWMARK, t->fwmark); return 0; } @@ -553,6 +562,7 @@ static const struct nla_policy vti_policy[IFLA_VTI_MAX + 1] = { [IFLA_VTI_OKEY] = { .type = NLA_U32 }, [IFLA_VTI_LOCAL] = { .len = FIELD_SIZEOF(struct iphdr, saddr) }, [IFLA_VTI_REMOTE] = { .len = FIELD_SIZEOF(struct iphdr, daddr) }, + [IFLA_VTI_FWMARK] = { .type = NLA_U32 }, }; static struct rtnl_link_ops vti_link_ops __read_mostly = { diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index dfb2ab2dd3c8..c3b12b1c7162 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -57,6 +57,7 @@ #include <linux/export.h> #include <net/net_namespace.h> #include <net/arp.h> +#include <net/dsa.h> #include <net/ip.h> #include <net/ipconfig.h> #include <net/route.h> diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 00d4229b6954..1e441c6f2160 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -390,7 +390,8 @@ static int ipip_tunnel_validate(struct nlattr *tb[], struct nlattr *data[]) } static void ipip_netlink_parms(struct nlattr *data[], - struct ip_tunnel_parm *parms, bool *collect_md) + struct ip_tunnel_parm *parms, bool *collect_md, + __u32 *fwmark) { memset(parms, 0, sizeof(*parms)); @@ -428,6 +429,9 @@ static void ipip_netlink_parms(struct nlattr *data[], if (data[IFLA_IPTUN_COLLECT_METADATA]) *collect_md = true; + + if (data[IFLA_IPTUN_FWMARK]) + *fwmark = nla_get_u32(data[IFLA_IPTUN_FWMARK]); } /* This function returns true when ENCAP attributes are present in the nl msg */ @@ -470,6 +474,7 @@ static int ipip_newlink(struct net *src_net, struct net_device *dev, struct ip_tunnel *t = netdev_priv(dev); struct ip_tunnel_parm p; struct ip_tunnel_encap ipencap; + __u32 fwmark = 0; if (ipip_netlink_encap_parms(data, &ipencap)) { int err = ip_tunnel_encap_setup(t, &ipencap); @@ -478,26 +483,27 @@ static int ipip_newlink(struct net *src_net, struct net_device *dev, return err; } - ipip_netlink_parms(data, &p, &t->collect_md); - return ip_tunnel_newlink(dev, tb, &p); + ipip_netlink_parms(data, &p, &t->collect_md, &fwmark); + return ip_tunnel_newlink(dev, tb, &p, fwmark); } static int ipip_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[]) { + struct ip_tunnel *t = netdev_priv(dev); struct ip_tunnel_parm p; struct ip_tunnel_encap ipencap; bool collect_md; + __u32 fwmark = t->fwmark; if (ipip_netlink_encap_parms(data, &ipencap)) { - struct ip_tunnel *t = netdev_priv(dev); int err = ip_tunnel_encap_setup(t, &ipencap); if (err < 0) return err; } - ipip_netlink_parms(data, &p, &collect_md); + ipip_netlink_parms(data, &p, &collect_md, &fwmark); if (collect_md) return -EINVAL; @@ -505,7 +511,7 @@ static int ipip_changelink(struct net_device *dev, struct nlattr *tb[], (!(dev->flags & IFF_POINTOPOINT) && p.iph.daddr)) return -EINVAL; - return ip_tunnel_changelink(dev, tb, &p); + return ip_tunnel_changelink(dev, tb, &p, fwmark); } static size_t ipip_get_size(const struct net_device *dev) @@ -535,6 +541,8 @@ static size_t ipip_get_size(const struct net_device *dev) nla_total_size(2) + /* IFLA_IPTUN_COLLECT_METADATA */ nla_total_size(0) + + /* IFLA_IPTUN_FWMARK */ + nla_total_size(4) + 0; } @@ -550,7 +558,8 @@ static int ipip_fill_info(struct sk_buff *skb, const struct net_device *dev) nla_put_u8(skb, IFLA_IPTUN_TOS, parm->iph.tos) || nla_put_u8(skb, IFLA_IPTUN_PROTO, parm->iph.protocol) || nla_put_u8(skb, IFLA_IPTUN_PMTUDISC, - !!(parm->iph.frag_off & htons(IP_DF)))) + !!(parm->iph.frag_off & htons(IP_DF))) || + nla_put_u32(skb, IFLA_IPTUN_FWMARK, tunnel->fwmark)) goto nla_put_failure; if (nla_put_u16(skb, IFLA_IPTUN_ENCAP_TYPE, @@ -585,6 +594,7 @@ static const struct nla_policy ipip_policy[IFLA_IPTUN_MAX + 1] = { [IFLA_IPTUN_ENCAP_SPORT] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_DPORT] = { .type = NLA_U16 }, [IFLA_IPTUN_COLLECT_METADATA] = { .type = NLA_FLAG }, + [IFLA_IPTUN_FWMARK] = { .type = NLA_U32 }, }; static struct rtnl_link_ops ipip_link_ops __read_mostly = { diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index b036e85e093b..3a02d52ed50e 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -631,7 +631,7 @@ static int vif_delete(struct mr_table *mrt, int vifi, int notify, in_dev = __in_dev_get_rtnl(dev); if (in_dev) { IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--; - inet_netconf_notify_devconf(dev_net(dev), + inet_netconf_notify_devconf(dev_net(dev), RTM_NEWNETCONF, NETCONFA_MC_FORWARDING, dev->ifindex, &in_dev->cnf); ip_rt_multicast_event(in_dev); @@ -820,8 +820,8 @@ static int vif_add(struct net *net, struct mr_table *mrt, return -EADDRNOTAVAIL; } IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++; - inet_netconf_notify_devconf(net, NETCONFA_MC_FORWARDING, dev->ifindex, - &in_dev->cnf); + inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_MC_FORWARDING, + dev->ifindex, &in_dev->cnf); ip_rt_multicast_event(in_dev); /* Fill in the VIF structures */ @@ -1282,7 +1282,8 @@ static void mrtsock_destruct(struct sock *sk) ipmr_for_each_table(mrt, net) { if (sk == rtnl_dereference(mrt->mroute_sk)) { IPV4_DEVCONF_ALL(net, MC_FORWARDING)--; - inet_netconf_notify_devconf(net, NETCONFA_MC_FORWARDING, + inet_netconf_notify_devconf(net, RTM_NEWNETCONF, + NETCONFA_MC_FORWARDING, NETCONFA_IFINDEX_ALL, net->ipv4.devconf_all); RCU_INIT_POINTER(mrt->mroute_sk, NULL); @@ -1343,7 +1344,8 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, if (ret == 0) { rcu_assign_pointer(mrt->mroute_sk, sk); IPV4_DEVCONF_ALL(net, MC_FORWARDING)++; - inet_netconf_notify_devconf(net, NETCONFA_MC_FORWARDING, + inet_netconf_notify_devconf(net, RTM_NEWNETCONF, + NETCONFA_MC_FORWARDING, NETCONFA_IFINDEX_ALL, net->ipv4.devconf_all); } @@ -2421,7 +2423,8 @@ static int ipmr_nla_get_ttls(const struct nlattr *nla, struct mfcctl *mfcc) /* returns < 0 on error, 0 for ADD_MFC and 1 for ADD_MFC_PROXY */ static int rtm_to_ipmr_mfcc(struct net *net, struct nlmsghdr *nlh, struct mfcctl *mfcc, int *mrtsock, - struct mr_table **mrtret) + struct mr_table **mrtret, + struct netlink_ext_ack *extack) { struct net_device *dev = NULL; u32 tblid = RT_TABLE_DEFAULT; @@ -2430,7 +2433,8 @@ static int rtm_to_ipmr_mfcc(struct net *net, struct nlmsghdr *nlh, struct rtmsg *rtm; int ret, rem; - ret = nlmsg_validate(nlh, sizeof(*rtm), RTA_MAX, rtm_ipmr_policy); + ret = nlmsg_validate(nlh, sizeof(*rtm), RTA_MAX, rtm_ipmr_policy, + extack); if (ret < 0) goto out; rtm = nlmsg_data(nlh); @@ -2489,7 +2493,8 @@ out: } /* takes care of both newroute and delroute */ -static int ipmr_rtm_route(struct sk_buff *skb, struct nlmsghdr *nlh) +static int ipmr_rtm_route(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); int ret, mrtsock, parent; @@ -2498,7 +2503,7 @@ static int ipmr_rtm_route(struct sk_buff *skb, struct nlmsghdr *nlh) mrtsock = 0; tbl = NULL; - ret = rtm_to_ipmr_mfcc(net, nlh, &mfcc, &mrtsock, &tbl); + ret = rtm_to_ipmr_mfcc(net, nlh, &mfcc, &mrtsock, &tbl, extack); if (ret < 0) return ret; diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 6241a81fd7f5..0bc3c3d73e61 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -309,8 +309,7 @@ static int mark_source_chains(const struct xt_table_info *newinfo, */ for (hook = 0; hook < NF_ARP_NUMHOOKS; hook++) { unsigned int pos = newinfo->hook_entry[hook]; - struct arpt_entry *e - = (struct arpt_entry *)(entry0 + pos); + struct arpt_entry *e = entry0 + pos; if (!(valid_hooks & (1 << hook))) continue; @@ -354,14 +353,12 @@ static int mark_source_chains(const struct xt_table_info *newinfo, if (pos == oldpos) goto next; - e = (struct arpt_entry *) - (entry0 + pos); + e = entry0 + pos; } while (oldpos == pos + e->next_offset); /* Move along one */ size = e->next_offset; - e = (struct arpt_entry *) - (entry0 + pos + size); + e = entry0 + pos + size; if (pos + size >= newinfo->size) return 0; e->counters.pcnt = pos; @@ -376,16 +373,14 @@ static int mark_source_chains(const struct xt_table_info *newinfo, if (!xt_find_jump_offset(offsets, newpos, newinfo->number)) return 0; - e = (struct arpt_entry *) - (entry0 + newpos); + e = entry0 + newpos; } else { /* ... this is a fallthru */ newpos = pos + e->next_offset; if (newpos >= newinfo->size) return 0; } - e = (struct arpt_entry *) - (entry0 + newpos); + e = entry0 + newpos; e->counters.pcnt = pos; pos = newpos; } @@ -562,8 +557,6 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0, XT_ERROR_TARGET) == 0) ++newinfo->stacksize; } - if (ret != 0) - goto out_free; ret = -EINVAL; if (i != repl->num_entries) @@ -683,7 +676,7 @@ static int copy_entries_to_user(unsigned int total_size, for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){ const struct xt_entry_target *t; - e = (struct arpt_entry *)(loc_cpu_entry + off); + e = loc_cpu_entry + off; if (copy_to_user(userptr + off, e, sizeof(*e))) { ret = -EFAULT; goto free_counters; @@ -1130,7 +1123,7 @@ compat_copy_entry_from_user(struct compat_arpt_entry *e, void **dstptr, int h; origsize = *size; - de = (struct arpt_entry *)*dstptr; + de = *dstptr; memcpy(de, e, sizeof(struct arpt_entry)); memcpy(&de->counters, &e->counters, sizeof(e->counters)); @@ -1324,7 +1317,7 @@ static int compat_copy_entry_to_user(struct arpt_entry *e, void __user **dstptr, int ret; origsize = *size; - ce = (struct compat_arpt_entry __user *)*dstptr; + ce = *dstptr; if (copy_to_user(ce, e, sizeof(struct arpt_entry)) != 0 || copy_to_user(&ce->counters, &counters[i], sizeof(counters[i])) != 0) diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 384b85713e06..2a55a40211cb 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -382,7 +382,7 @@ mark_source_chains(const struct xt_table_info *newinfo, to 0 as we leave), and comefrom to save source hook bitmask */ for (hook = 0; hook < NF_INET_NUMHOOKS; hook++) { unsigned int pos = newinfo->hook_entry[hook]; - struct ipt_entry *e = (struct ipt_entry *)(entry0 + pos); + struct ipt_entry *e = entry0 + pos; if (!(valid_hooks & (1 << hook))) continue; @@ -424,14 +424,12 @@ mark_source_chains(const struct xt_table_info *newinfo, if (pos == oldpos) goto next; - e = (struct ipt_entry *) - (entry0 + pos); + e = entry0 + pos; } while (oldpos == pos + e->next_offset); /* Move along one */ size = e->next_offset; - e = (struct ipt_entry *) - (entry0 + pos + size); + e = entry0 + pos + size; if (pos + size >= newinfo->size) return 0; e->counters.pcnt = pos; @@ -446,16 +444,14 @@ mark_source_chains(const struct xt_table_info *newinfo, if (!xt_find_jump_offset(offsets, newpos, newinfo->number)) return 0; - e = (struct ipt_entry *) - (entry0 + newpos); + e = entry0 + newpos; } else { /* ... this is a fallthru */ newpos = pos + e->next_offset; if (newpos >= newinfo->size) return 0; } - e = (struct ipt_entry *) - (entry0 + newpos); + e = entry0 + newpos; e->counters.pcnt = pos; pos = newpos; } @@ -834,7 +830,7 @@ copy_entries_to_user(unsigned int total_size, const struct xt_entry_match *m; const struct xt_entry_target *t; - e = (struct ipt_entry *)(loc_cpu_entry + off); + e = loc_cpu_entry + off; if (copy_to_user(userptr + off, e, sizeof(*e))) { ret = -EFAULT; goto free_counters; @@ -1229,7 +1225,7 @@ compat_copy_entry_to_user(struct ipt_entry *e, void __user **dstptr, int ret = 0; origsize = *size; - ce = (struct compat_ipt_entry __user *)*dstptr; + ce = *dstptr; if (copy_to_user(ce, e, sizeof(struct ipt_entry)) != 0 || copy_to_user(&ce->counters, &counters[i], sizeof(counters[i])) != 0) @@ -1366,7 +1362,7 @@ compat_copy_entry_from_user(struct compat_ipt_entry *e, void **dstptr, struct xt_entry_match *ematch; origsize = *size; - de = (struct ipt_entry *)*dstptr; + de = *dstptr; memcpy(de, e, sizeof(struct ipt_entry)); memcpy(&de->counters, &e->counters, sizeof(e->counters)); diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 9b8841316e7b..038f293c2376 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -22,6 +22,7 @@ #include <linux/icmp.h> #include <linux/if_arp.h> #include <linux/seq_file.h> +#include <linux/refcount.h> #include <linux/netfilter_arp.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_ipv4/ip_tables.h> @@ -40,8 +41,8 @@ MODULE_DESCRIPTION("Xtables: CLUSTERIP target"); struct clusterip_config { struct list_head list; /* list of all configs */ - atomic_t refcount; /* reference count */ - atomic_t entries; /* number of entries/rules + refcount_t refcount; /* reference count */ + refcount_t entries; /* number of entries/rules * referencing us */ __be32 clusterip; /* the IP address */ @@ -77,7 +78,7 @@ struct clusterip_net { static inline void clusterip_config_get(struct clusterip_config *c) { - atomic_inc(&c->refcount); + refcount_inc(&c->refcount); } @@ -89,7 +90,7 @@ static void clusterip_config_rcu_free(struct rcu_head *head) static inline void clusterip_config_put(struct clusterip_config *c) { - if (atomic_dec_and_test(&c->refcount)) + if (refcount_dec_and_test(&c->refcount)) call_rcu_bh(&c->rcu, clusterip_config_rcu_free); } @@ -103,7 +104,7 @@ clusterip_config_entry_put(struct clusterip_config *c) struct clusterip_net *cn = net_generic(net, clusterip_net_id); local_bh_disable(); - if (atomic_dec_and_lock(&c->entries, &cn->lock)) { + if (refcount_dec_and_lock(&c->entries, &cn->lock)) { list_del_rcu(&c->list); spin_unlock(&cn->lock); local_bh_enable(); @@ -149,10 +150,10 @@ clusterip_config_find_get(struct net *net, __be32 clusterip, int entry) c = NULL; else #endif - if (unlikely(!atomic_inc_not_zero(&c->refcount))) + if (unlikely(!refcount_inc_not_zero(&c->refcount))) c = NULL; else if (entry) - atomic_inc(&c->entries); + refcount_inc(&c->entries); } rcu_read_unlock_bh(); @@ -188,8 +189,8 @@ clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip, clusterip_config_init_nodelist(c, i); c->hash_mode = i->hash_mode; c->hash_initval = i->hash_initval; - atomic_set(&c->refcount, 1); - atomic_set(&c->entries, 1); + refcount_set(&c->refcount, 1); + refcount_set(&c->entries, 1); spin_lock_bh(&cn->lock); if (__clusterip_config_find(net, ip)) { diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c index 3240a2614e82..af2b69b6895f 100644 --- a/net/ipv4/netfilter/ipt_SYNPROXY.c +++ b/net/ipv4/netfilter/ipt_SYNPROXY.c @@ -293,12 +293,16 @@ synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par) XT_SYNPROXY_OPT_ECN); synproxy_send_client_synack(net, skb, th, &opts); - return NF_DROP; - + consume_skb(skb); + return NF_STOLEN; } else if (th->ack && !(th->fin || th->rst || th->syn)) { /* ACK from client */ - synproxy_recv_client_ack(net, skb, th, &opts, ntohl(th->seq)); - return NF_DROP; + if (synproxy_recv_client_ack(net, skb, th, &opts, ntohl(th->seq))) { + consume_skb(skb); + return NF_STOLEN; + } else { + return NF_DROP; + } } return XT_CONTINUE; @@ -367,10 +371,13 @@ static unsigned int ipv4_synproxy_hook(void *priv, * number match the one of first SYN. */ if (synproxy_recv_client_ack(net, skb, th, &opts, - ntohl(th->seq) + 1)) + ntohl(th->seq) + 1)) { this_cpu_inc(snet->stats->cookie_retrans); - - return NF_DROP; + consume_skb(skb); + return NF_STOLEN; + } else { + return NF_DROP; + } } synproxy->isn = ntohl(th->ack_seq); @@ -409,19 +416,56 @@ static unsigned int ipv4_synproxy_hook(void *priv, return NF_ACCEPT; } +static struct nf_hook_ops ipv4_synproxy_ops[] __read_mostly = { + { + .hook = ipv4_synproxy_hook, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_LOCAL_IN, + .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, + }, + { + .hook = ipv4_synproxy_hook, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_POST_ROUTING, + .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, + }, +}; + static int synproxy_tg4_check(const struct xt_tgchk_param *par) { + struct synproxy_net *snet = synproxy_pernet(par->net); const struct ipt_entry *e = par->entryinfo; + int err; if (e->ip.proto != IPPROTO_TCP || e->ip.invflags & XT_INV_PROTO) return -EINVAL; - return nf_ct_netns_get(par->net, par->family); + err = nf_ct_netns_get(par->net, par->family); + if (err) + return err; + + if (snet->hook_ref4 == 0) { + err = nf_register_net_hooks(par->net, ipv4_synproxy_ops, + ARRAY_SIZE(ipv4_synproxy_ops)); + if (err) { + nf_ct_netns_put(par->net, par->family); + return err; + } + } + + snet->hook_ref4++; + return err; } static void synproxy_tg4_destroy(const struct xt_tgdtor_param *par) { + struct synproxy_net *snet = synproxy_pernet(par->net); + + snet->hook_ref4--; + if (snet->hook_ref4 == 0) + nf_unregister_net_hooks(par->net, ipv4_synproxy_ops, + ARRAY_SIZE(ipv4_synproxy_ops)); nf_ct_netns_put(par->net, par->family); } @@ -436,46 +480,14 @@ static struct xt_target synproxy_tg4_reg __read_mostly = { .me = THIS_MODULE, }; -static struct nf_hook_ops ipv4_synproxy_ops[] __read_mostly = { - { - .hook = ipv4_synproxy_hook, - .pf = NFPROTO_IPV4, - .hooknum = NF_INET_LOCAL_IN, - .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, - }, - { - .hook = ipv4_synproxy_hook, - .pf = NFPROTO_IPV4, - .hooknum = NF_INET_POST_ROUTING, - .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, - }, -}; - static int __init synproxy_tg4_init(void) { - int err; - - err = nf_register_hooks(ipv4_synproxy_ops, - ARRAY_SIZE(ipv4_synproxy_ops)); - if (err < 0) - goto err1; - - err = xt_register_target(&synproxy_tg4_reg); - if (err < 0) - goto err2; - - return 0; - -err2: - nf_unregister_hooks(ipv4_synproxy_ops, ARRAY_SIZE(ipv4_synproxy_ops)); -err1: - return err; + return xt_register_target(&synproxy_tg4_reg); } static void __exit synproxy_tg4_exit(void) { xt_unregister_target(&synproxy_tg4_reg); - nf_unregister_hooks(ipv4_synproxy_ops, ARRAY_SIZE(ipv4_synproxy_ops)); } module_init(synproxy_tg4_init); diff --git a/net/ipv4/netfilter/nf_dup_ipv4.c b/net/ipv4/netfilter/nf_dup_ipv4.c index f0dbff05fc28..39895b9ddeb9 100644 --- a/net/ipv4/netfilter/nf_dup_ipv4.c +++ b/net/ipv4/netfilter/nf_dup_ipv4.c @@ -69,8 +69,7 @@ void nf_dup_ipv4(struct net *net, struct sk_buff *skb, unsigned int hooknum, #if IS_ENABLED(CONFIG_NF_CONNTRACK) /* Avoid counting cloned packets towards the original connection. */ nf_reset(skb); - nf_ct_set(skb, nf_ct_untracked_get(), IP_CT_NEW); - nf_conntrack_get(skb_nfct(skb)); + nf_ct_set(skb, NULL, IP_CT_UNTRACKED); #endif /* * If we are in PREROUTING/INPUT, decrease the TTL to mitigate potential diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c index 6f5e8d01b876..feedd759ca80 100644 --- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c @@ -264,13 +264,7 @@ nf_nat_ipv4_fn(void *priv, struct sk_buff *skb, if (!ct) return NF_ACCEPT; - /* Don't try to NAT if this packet is not conntracked */ - if (nf_ct_is_untracked(ct)) - return NF_ACCEPT; - - nat = nf_ct_nat_ext_add(ct); - if (nat == NULL) - return NF_ACCEPT; + nat = nfct_nat(ct); switch (ctinfo) { case IP_CT_RELATED: diff --git a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c index ea91058b5f6f..dc1dea15c1b4 100644 --- a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c +++ b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c @@ -37,7 +37,6 @@ nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum, NF_CT_ASSERT(hooknum == NF_INET_POST_ROUTING); ct = nf_ct_get(skb, &ctinfo); - nat = nfct_nat(ct); NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY)); @@ -56,7 +55,9 @@ nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum, return NF_DROP; } - nat->masq_index = out->ifindex; + nat = nf_ct_nat_ext_add(ct); + if (nat) + nat->masq_index = out->ifindex; /* Transfer from original range. */ memset(&newrange.min_addr, 0, sizeof(newrange.min_addr)); diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c index b3ca21b2ba9b..8a69363b4884 100644 --- a/net/ipv4/netfilter/nf_nat_pptp.c +++ b/net/ipv4/netfilter/nf_nat_pptp.c @@ -49,9 +49,14 @@ static void pptp_nat_expected(struct nf_conn *ct, const struct nf_ct_pptp_master *ct_pptp_info; const struct nf_nat_pptp *nat_pptp_info; struct nf_nat_range range; + struct nf_conn_nat *nat; + nat = nf_ct_nat_ext_add(ct); + if (WARN_ON_ONCE(!nat)) + return; + + nat_pptp_info = &nat->help.nat_pptp_info; ct_pptp_info = nfct_help_data(master); - nat_pptp_info = &nfct_nat(master)->help.nat_pptp_info; /* And here goes the grand finale of corrosion... */ if (exp->dir == IP_CT_DIR_ORIGINAL) { @@ -120,13 +125,17 @@ pptp_outbound_pkt(struct sk_buff *skb, { struct nf_ct_pptp_master *ct_pptp_info; + struct nf_conn_nat *nat = nfct_nat(ct); struct nf_nat_pptp *nat_pptp_info; u_int16_t msg; __be16 new_callid; unsigned int cid_off; + if (WARN_ON_ONCE(!nat)) + return NF_DROP; + + nat_pptp_info = &nat->help.nat_pptp_info; ct_pptp_info = nfct_help_data(ct); - nat_pptp_info = &nfct_nat(ct)->help.nat_pptp_info; new_callid = ct_pptp_info->pns_call_id; @@ -177,11 +186,11 @@ pptp_outbound_pkt(struct sk_buff *skb, ntohs(REQ_CID(pptpReq, cid_off)), ntohs(new_callid)); /* mangle packet */ - if (nf_nat_mangle_tcp_packet(skb, ct, ctinfo, protoff, - cid_off + sizeof(struct pptp_pkt_hdr) + - sizeof(struct PptpControlHeader), - sizeof(new_callid), (char *)&new_callid, - sizeof(new_callid)) == 0) + if (!nf_nat_mangle_tcp_packet(skb, ct, ctinfo, protoff, + cid_off + sizeof(struct pptp_pkt_hdr) + + sizeof(struct PptpControlHeader), + sizeof(new_callid), (char *)&new_callid, + sizeof(new_callid))) return NF_DROP; return NF_ACCEPT; } @@ -191,11 +200,15 @@ pptp_exp_gre(struct nf_conntrack_expect *expect_orig, struct nf_conntrack_expect *expect_reply) { const struct nf_conn *ct = expect_orig->master; + struct nf_conn_nat *nat = nfct_nat(ct); struct nf_ct_pptp_master *ct_pptp_info; struct nf_nat_pptp *nat_pptp_info; + if (WARN_ON_ONCE(!nat)) + return; + + nat_pptp_info = &nat->help.nat_pptp_info; ct_pptp_info = nfct_help_data(ct); - nat_pptp_info = &nfct_nat(ct)->help.nat_pptp_info; /* save original PAC call ID in nat_info */ nat_pptp_info->pac_call_id = ct_pptp_info->pac_call_id; @@ -223,11 +236,15 @@ pptp_inbound_pkt(struct sk_buff *skb, union pptp_ctrl_union *pptpReq) { const struct nf_nat_pptp *nat_pptp_info; + struct nf_conn_nat *nat = nfct_nat(ct); u_int16_t msg; __be16 new_pcid; unsigned int pcid_off; - nat_pptp_info = &nfct_nat(ct)->help.nat_pptp_info; + if (WARN_ON_ONCE(!nat)) + return NF_DROP; + + nat_pptp_info = &nat->help.nat_pptp_info; new_pcid = nat_pptp_info->pns_call_id; switch (msg = ntohs(ctlh->messageType)) { @@ -271,11 +288,11 @@ pptp_inbound_pkt(struct sk_buff *skb, pr_debug("altering peer call id from 0x%04x to 0x%04x\n", ntohs(REQ_CID(pptpReq, pcid_off)), ntohs(new_pcid)); - if (nf_nat_mangle_tcp_packet(skb, ct, ctinfo, protoff, - pcid_off + sizeof(struct pptp_pkt_hdr) + - sizeof(struct PptpControlHeader), - sizeof(new_pcid), (char *)&new_pcid, - sizeof(new_pcid)) == 0) + if (!nf_nat_mangle_tcp_packet(skb, ct, ctinfo, protoff, + pcid_off + sizeof(struct pptp_pkt_hdr) + + sizeof(struct PptpControlHeader), + sizeof(new_pcid), (char *)&new_pcid, + sizeof(new_pcid))) return NF_DROP; return NF_ACCEPT; } diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c index 53e49f5011d3..d5b1e0b3f687 100644 --- a/net/ipv4/netfilter/nf_nat_snmp_basic.c +++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c @@ -827,8 +827,8 @@ static unsigned char snmp_object_decode(struct asn1_ctx *ctx, return 1; } -static unsigned char snmp_request_decode(struct asn1_ctx *ctx, - struct snmp_request *request) +static unsigned char noinline_for_stack +snmp_request_decode(struct asn1_ctx *ctx, struct snmp_request *request) { unsigned int cls, con, tag; unsigned char *end; @@ -920,10 +920,10 @@ static inline void mangle_address(unsigned char *begin, } } -static unsigned char snmp_trap_decode(struct asn1_ctx *ctx, - struct snmp_v1_trap *trap, - const struct oct1_map *map, - __sum16 *check) +static unsigned char noinline_for_stack +snmp_trap_decode(struct asn1_ctx *ctx, struct snmp_v1_trap *trap, + const struct oct1_map *map, + __sum16 *check) { unsigned int cls, con, tag, len; unsigned char *end; @@ -998,18 +998,6 @@ err_id_free: * *****************************************************************************/ -static void hex_dump(const unsigned char *buf, size_t len) -{ - size_t i; - - for (i = 0; i < len; i++) { - if (i && !(i % 16)) - printk("\n"); - printk("%02x ", *(buf + i)); - } - printk("\n"); -} - /* * Parse and mangle SNMP message according to mapping. * (And this is the fucking 'basic' method). @@ -1026,7 +1014,8 @@ static int snmp_parse_mangle(unsigned char *msg, struct snmp_object *obj; if (debug > 1) - hex_dump(msg, len); + print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_NONE, 16, 1, + msg, len, 0); asn1_open(&ctx, msg, len); diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c index 146d86105183..7cd8d0d918f8 100644 --- a/net/ipv4/netfilter/nf_reject_ipv4.c +++ b/net/ipv4/netfilter/nf_reject_ipv4.c @@ -104,7 +104,6 @@ EXPORT_SYMBOL_GPL(nf_reject_ip_tcphdr_put); void nf_send_reset(struct net *net, struct sk_buff *oldskb, int hook) { struct sk_buff *nskb; - const struct iphdr *oiph; struct iphdr *niph; const struct tcphdr *oth; struct tcphdr _oth; @@ -116,8 +115,6 @@ void nf_send_reset(struct net *net, struct sk_buff *oldskb, int hook) if (skb_rtable(oldskb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) return; - oiph = ip_hdr(oldskb); - nskb = alloc_skb(sizeof(struct iphdr) + sizeof(struct tcphdr) + LL_MAX_HEADER, GFP_ATOMIC); if (!nskb) diff --git a/net/ipv4/netfilter/nf_socket_ipv4.c b/net/ipv4/netfilter/nf_socket_ipv4.c index a83d558e1aae..e9293bdebba0 100644 --- a/net/ipv4/netfilter/nf_socket_ipv4.c +++ b/net/ipv4/netfilter/nf_socket_ipv4.c @@ -139,7 +139,7 @@ struct sock *nf_sk_lookup_slow_v4(struct net *net, const struct sk_buff *skb, * SNAT-ted connection. */ ct = nf_ct_get(skb, &ctinfo); - if (ct && !nf_ct_is_untracked(ct) && + if (ct && ((iph->protocol != IPPROTO_ICMP && ctinfo == IP_CT_ESTABLISHED_REPLY) || (iph->protocol == IPPROTO_ICMP && diff --git a/net/ipv4/netfilter/nft_fib_ipv4.c b/net/ipv4/netfilter/nft_fib_ipv4.c index 2981291910dd..de3681df2ce7 100644 --- a/net/ipv4/netfilter/nft_fib_ipv4.c +++ b/net/ipv4/netfilter/nft_fib_ipv4.c @@ -90,7 +90,7 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs, if (nft_hook(pkt) == NF_INET_PRE_ROUTING && nft_fib_is_loopback(pkt->skb, nft_in(pkt))) { - nft_fib_store_result(dest, priv->result, pkt, + nft_fib_store_result(dest, priv, pkt, nft_in(pkt)->ifindex); return; } @@ -99,7 +99,7 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs, if (ipv4_is_zeronet(iph->saddr)) { if (ipv4_is_lbcast(iph->daddr) || ipv4_is_local_multicast(iph->daddr)) { - nft_fib_store_result(dest, priv->result, pkt, + nft_fib_store_result(dest, priv, pkt, get_ifindex(pkt->skb->dev)); return; } @@ -212,7 +212,7 @@ nft_fib4_select_ops(const struct nft_ctx *ctx, static struct nft_expr_type nft_fib4_type __read_mostly = { .name = "fib", - .select_ops = &nft_fib4_select_ops, + .select_ops = nft_fib4_select_ops, .policy = nft_fib_policy, .maxattr = NFTA_FIB_MAX, .family = NFPROTO_IPV4, diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 69cf49e8356d..fa44e752a9a3 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -199,7 +199,6 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TW", LINUX_MIB_TIMEWAITED), SNMP_MIB_ITEM("TWRecycled", LINUX_MIB_TIMEWAITRECYCLED), SNMP_MIB_ITEM("TWKilled", LINUX_MIB_TIMEWAITKILLED), - SNMP_MIB_ITEM("PAWSPassive", LINUX_MIB_PAWSPASSIVEREJECTED), SNMP_MIB_ITEM("PAWSActive", LINUX_MIB_PAWSACTIVEREJECTED), SNMP_MIB_ITEM("PAWSEstab", LINUX_MIB_PAWSESTABREJECTED), SNMP_MIB_ITEM("DelayedACKs", LINUX_MIB_DELAYEDACKS), @@ -282,6 +281,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPFastOpenPassiveFail", LINUX_MIB_TCPFASTOPENPASSIVEFAIL), SNMP_MIB_ITEM("TCPFastOpenListenOverflow", LINUX_MIB_TCPFASTOPENLISTENOVERFLOW), SNMP_MIB_ITEM("TCPFastOpenCookieReqd", LINUX_MIB_TCPFASTOPENCOOKIEREQD), + SNMP_MIB_ITEM("TCPFastOpenBlackhole", LINUX_MIB_TCPFASTOPENBLACKHOLE), SNMP_MIB_ITEM("TCPSpuriousRtxHostQueues", LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES), SNMP_MIB_ITEM("BusyPollRxPackets", LINUX_MIB_BUSYPOLLRXPACKETS), SNMP_MIB_ITEM("TCPAutoCorking", LINUX_MIB_TCPAUTOCORKING), diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c index 4b7c0ec65251..32a691b7ce2c 100644 --- a/net/ipv4/protocol.c +++ b/net/ipv4/protocol.c @@ -28,7 +28,7 @@ #include <linux/spinlock.h> #include <net/protocol.h> -const struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly; +struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly; const struct net_offload __rcu *inet_offloads[MAX_INET_PROTOS] __read_mostly; EXPORT_SYMBOL(inet_offloads); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index d9724889ff09..655d9eebe43e 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1250,15 +1250,11 @@ static void set_class_tag(struct rtable *rt, u32 tag) static unsigned int ipv4_default_advmss(const struct dst_entry *dst) { - unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS); + unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr); + unsigned int advmss = max_t(unsigned int, dst->dev->mtu - header_size, + ip_rt_min_advmss); - if (advmss == 0) { - advmss = max_t(unsigned int, dst->dev->mtu - 40, - ip_rt_min_advmss); - if (advmss > 65535 - 40) - advmss = 65535 - 40; - } - return advmss; + return min(advmss, IPV4_MAX_PMTU - header_size); } static unsigned int ipv4_mtu(const struct dst_entry *dst) @@ -1734,45 +1730,97 @@ out: } #ifdef CONFIG_IP_ROUTE_MULTIPATH - /* To make ICMP packets follow the right flow, the multipath hash is - * calculated from the inner IP addresses in reverse order. + * calculated from the inner IP addresses. */ -static int ip_multipath_icmp_hash(struct sk_buff *skb) +static void ip_multipath_l3_keys(const struct sk_buff *skb, + struct flow_keys *hash_keys) { const struct iphdr *outer_iph = ip_hdr(skb); - struct icmphdr _icmph; + const struct iphdr *inner_iph; const struct icmphdr *icmph; struct iphdr _inner_iph; - const struct iphdr *inner_iph; + struct icmphdr _icmph; + + hash_keys->addrs.v4addrs.src = outer_iph->saddr; + hash_keys->addrs.v4addrs.dst = outer_iph->daddr; + if (likely(outer_iph->protocol != IPPROTO_ICMP)) + return; if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0)) - goto standard_hash; + return; icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph), &_icmph); if (!icmph) - goto standard_hash; + return; if (icmph->type != ICMP_DEST_UNREACH && icmph->type != ICMP_REDIRECT && icmph->type != ICMP_TIME_EXCEEDED && - icmph->type != ICMP_PARAMETERPROB) { - goto standard_hash; - } + icmph->type != ICMP_PARAMETERPROB) + return; inner_iph = skb_header_pointer(skb, outer_iph->ihl * 4 + sizeof(_icmph), sizeof(_inner_iph), &_inner_iph); if (!inner_iph) - goto standard_hash; + return; + hash_keys->addrs.v4addrs.src = inner_iph->saddr; + hash_keys->addrs.v4addrs.dst = inner_iph->daddr; +} - return fib_multipath_hash(inner_iph->daddr, inner_iph->saddr); +/* if skb is set it will be used and fl4 can be NULL */ +int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4, + const struct sk_buff *skb) +{ + struct net *net = fi->fib_net; + struct flow_keys hash_keys; + u32 mhash; -standard_hash: - return fib_multipath_hash(outer_iph->saddr, outer_iph->daddr); -} + switch (net->ipv4.sysctl_fib_multipath_hash_policy) { + case 0: + memset(&hash_keys, 0, sizeof(hash_keys)); + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + if (skb) { + ip_multipath_l3_keys(skb, &hash_keys); + } else { + hash_keys.addrs.v4addrs.src = fl4->saddr; + hash_keys.addrs.v4addrs.dst = fl4->daddr; + } + break; + case 1: + /* skb is currently provided only when forwarding */ + if (skb) { + unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP; + struct flow_keys keys; + + /* short-circuit if we already have L4 hash present */ + if (skb->l4_hash) + return skb_get_hash_raw(skb) >> 1; + memset(&hash_keys, 0, sizeof(hash_keys)); + skb_flow_dissect_flow_keys(skb, &keys, flag); + hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src; + hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst; + hash_keys.ports.src = keys.ports.src; + hash_keys.ports.dst = keys.ports.dst; + hash_keys.basic.ip_proto = keys.basic.ip_proto; + } else { + memset(&hash_keys, 0, sizeof(hash_keys)); + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + hash_keys.addrs.v4addrs.src = fl4->saddr; + hash_keys.addrs.v4addrs.dst = fl4->daddr; + hash_keys.ports.src = fl4->fl4_sport; + hash_keys.ports.dst = fl4->fl4_dport; + hash_keys.basic.ip_proto = fl4->flowi4_proto; + } + break; + } + mhash = flow_hash_from_keys(&hash_keys); + return mhash >> 1; +} +EXPORT_SYMBOL_GPL(fib_multipath_hash); #endif /* CONFIG_IP_ROUTE_MULTIPATH */ static int ip_mkroute_input(struct sk_buff *skb, @@ -1782,12 +1830,8 @@ static int ip_mkroute_input(struct sk_buff *skb, { #ifdef CONFIG_IP_ROUTE_MULTIPATH if (res->fi && res->fi->fib_nhs > 1) { - int h; + int h = fib_multipath_hash(res->fi, NULL, skb); - if (unlikely(ip_hdr(skb)->protocol == IPPROTO_ICMP)) - h = ip_multipath_icmp_hash(skb); - else - h = fib_multipath_hash(saddr, daddr); fib_select_multipath(res, h); } #endif @@ -2203,7 +2247,7 @@ add: */ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, - int mp_hash) + const struct sk_buff *skb) { struct net_device *dev_out = NULL; __u8 tos = RT_FL_TOS(fl4); @@ -2366,7 +2410,7 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, goto make_route; } - fib_select_path(net, &res, fl4, mp_hash); + fib_select_path(net, &res, fl4, skb); dev_out = FIB_RES_DEV(res); fl4->flowi4_oif = dev_out->ifindex; @@ -2586,7 +2630,8 @@ nla_put_failure: return -EMSGSIZE; } -static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) +static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(in_skb->sk); struct rtmsg *rtm; @@ -2602,7 +2647,8 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) u32 table_id = RT_TABLE_MAIN; kuid_t uid; - err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy); + err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy, + extack); if (err < 0) goto errout; @@ -2620,10 +2666,6 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) skb_reset_mac_header(skb); skb_reset_network_header(skb); - /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */ - ip_hdr(skb)->protocol = IPPROTO_UDP; - skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr)); - src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0; dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0; iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0; @@ -2633,6 +2675,15 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) else uid = (iif ? INVALID_UID : current_uid()); + /* Bugfix: need to give ip_route_input enough of an IP header to + * not gag. + */ + ip_hdr(skb)->protocol = IPPROTO_UDP; + ip_hdr(skb)->saddr = src; + ip_hdr(skb)->daddr = dst; + + skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr)); + memset(&fl4, 0, sizeof(fl4)); fl4.daddr = dst; fl4.saddr = src; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index d6880a6149ee..86957e9cd6c6 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -24,6 +24,7 @@ #include <net/cipso_ipv4.h> #include <net/inet_frag.h> #include <net/ping.h> +#include <net/protocol.h> static int zero; static int one = 1; @@ -294,6 +295,74 @@ bad_key: return ret; } +static void proc_configure_early_demux(int enabled, int protocol) +{ + struct net_protocol *ipprot; +#if IS_ENABLED(CONFIG_IPV6) + struct inet6_protocol *ip6prot; +#endif + + rcu_read_lock(); + + ipprot = rcu_dereference(inet_protos[protocol]); + if (ipprot) + ipprot->early_demux = enabled ? ipprot->early_demux_handler : + NULL; + +#if IS_ENABLED(CONFIG_IPV6) + ip6prot = rcu_dereference(inet6_protos[protocol]); + if (ip6prot) + ip6prot->early_demux = enabled ? ip6prot->early_demux_handler : + NULL; +#endif + rcu_read_unlock(); +} + +static int proc_tcp_early_demux(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int ret = 0; + + ret = proc_dointvec(table, write, buffer, lenp, ppos); + + if (write && !ret) { + int enabled = init_net.ipv4.sysctl_tcp_early_demux; + + proc_configure_early_demux(enabled, IPPROTO_TCP); + } + + return ret; +} + +static int proc_udp_early_demux(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int ret = 0; + + ret = proc_dointvec(table, write, buffer, lenp, ppos); + + if (write && !ret) { + int enabled = init_net.ipv4.sysctl_udp_early_demux; + + proc_configure_early_demux(enabled, IPPROTO_UDP); + } + + return ret; +} + +static int proc_tfo_blackhole_detect_timeout(struct ctl_table *table, + int write, + void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + int ret; + + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + if (write && ret == 0) + tcp_fastopen_active_timeout_reset(); + return ret; +} + static struct ctl_table ipv4_table[] = { { .procname = "tcp_timestamps", @@ -344,6 +413,14 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_tcp_fastopen_key, }, { + .procname = "tcp_fastopen_blackhole_timeout_sec", + .data = &sysctl_tcp_fastopen_blackhole_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_tfo_blackhole_detect_timeout, + .extra1 = &zero, + }, + { .procname = "tcp_abort_on_overflow", .data = &sysctl_tcp_abort_on_overflow, .maxlen = sizeof(int), @@ -750,6 +827,20 @@ static struct ctl_table ipv4_net_table[] = { .proc_handler = proc_dointvec }, { + .procname = "udp_early_demux", + .data = &init_net.ipv4.sysctl_udp_early_demux, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_udp_early_demux + }, + { + .procname = "tcp_early_demux", + .data = &init_net.ipv4.sysctl_tcp_early_demux, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_tcp_early_demux + }, + { .procname = "ip_default_ttl", .data = &init_net.ipv4.sysctl_ip_default_ttl, .maxlen = sizeof(int), @@ -981,13 +1072,6 @@ static struct ctl_table ipv4_net_table[] = { .proc_handler = proc_dointvec }, { - .procname = "tcp_tw_recycle", - .data = &init_net.ipv4.tcp_death_row.sysctl_tw_recycle, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { .procname = "tcp_max_syn_backlog", .data = &init_net.ipv4.sysctl_max_syn_backlog, .maxlen = sizeof(int), @@ -1004,6 +1088,15 @@ static struct ctl_table ipv4_net_table[] = { .extra1 = &zero, .extra2 = &one, }, + { + .procname = "fib_multipath_hash_policy", + .data = &init_net.ipv4.sysctl_fib_multipath_hash_policy, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, #endif { .procname = "ip_unprivileged_port_start", diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 40ba4249a586..1e4c76d2b827 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -533,7 +533,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) if (tp->urg_data & TCP_URG_VALID) mask |= POLLPRI; - } else if (sk->sk_state == TCP_SYN_SENT && inet_sk(sk)->defer_connect) { + } else if (state == TCP_SYN_SENT && inet_sk(sk)->defer_connect) { /* Active TCP fastopen socket with defer_connect * Return POLLOUT so application can call write() * in order for kernel to generate SYN+data @@ -2296,6 +2296,7 @@ int tcp_disconnect(struct sock *sk, int flags) tcp_clear_xmit_timers(sk); __skb_queue_purge(&sk->sk_receive_queue); tcp_write_queue_purge(sk); + tcp_fastopen_active_disable_ofo_check(sk); skb_rbtree_purge(&tp->out_of_order_queue); inet->inet_dport = 0; @@ -2394,7 +2395,7 @@ static int tcp_repair_options_est(struct tcp_sock *tp, u16 snd_wscale = opt.opt_val & 0xFFFF; u16 rcv_wscale = opt.opt_val >> 16; - if (snd_wscale > 14 || rcv_wscale > 14) + if (snd_wscale > TCP_MAX_WSCALE || rcv_wscale > TCP_MAX_WSCALE) return -EFBIG; tp->rx_opt.snd_wscale = snd_wscale; @@ -2471,7 +2472,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, /* Values greater than interface MTU won't take effect. However * at the point when this call is done we typically don't yet * know which interface is going to be used */ - if (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW) { + if (val && (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW)) { err = -EINVAL; break; } @@ -2852,7 +2853,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_snd_ssthresh = tp->snd_ssthresh; info->tcpi_advmss = tp->advmss; - info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3; + info->tcpi_rcv_rtt = tp->rcv_rtt_est.rtt_us >> 3; info->tcpi_rcv_space = tp->rcvq_space.space; info->tcpi_total_retrans = tp->total_retrans; diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index c99230efcd52..0683ba447d77 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -72,7 +72,7 @@ MODULE_PARM_DESC(tcp_friendliness, "turn on/off tcp friendliness"); module_param(hystart, int, 0644); MODULE_PARM_DESC(hystart, "turn on/off hybrid slow start algorithm"); module_param(hystart_detect, int, 0644); -MODULE_PARM_DESC(hystart_detect, "hyrbrid slow start detection mechanisms" +MODULE_PARM_DESC(hystart_detect, "hybrid slow start detection mechanisms" " 1: packet-train 2: delay 3: both packet-train and delay"); module_param(hystart_low_window, int, 0644); MODULE_PARM_DESC(hystart_low_window, "lower bound cwnd for hybrid slow start"); diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 8ea4e9787f82..4af82b914dd4 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -341,6 +341,13 @@ bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss, cookie->len = -1; return false; } + + /* Firewall blackhole issue check */ + if (tcp_fastopen_active_should_disable(sk)) { + cookie->len = -1; + return false; + } + if (sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE) { cookie->len = -1; return true; @@ -380,3 +387,98 @@ bool tcp_fastopen_defer_connect(struct sock *sk, int *err) return false; } EXPORT_SYMBOL(tcp_fastopen_defer_connect); + +/* + * The following code block is to deal with middle box issues with TFO: + * Middlebox firewall issues can potentially cause server's data being + * blackholed after a successful 3WHS using TFO. + * The proposed solution is to disable active TFO globally under the + * following circumstances: + * 1. client side TFO socket receives out of order FIN + * 2. client side TFO socket receives out of order RST + * We disable active side TFO globally for 1hr at first. Then if it + * happens again, we disable it for 2h, then 4h, 8h, ... + * And we reset the timeout back to 1hr when we see a successful active + * TFO connection with data exchanges. + */ + +/* Default to 1hr */ +unsigned int sysctl_tcp_fastopen_blackhole_timeout __read_mostly = 60 * 60; +static atomic_t tfo_active_disable_times __read_mostly = ATOMIC_INIT(0); +static unsigned long tfo_active_disable_stamp __read_mostly; + +/* Disable active TFO and record current jiffies and + * tfo_active_disable_times + */ +void tcp_fastopen_active_disable(struct sock *sk) +{ + atomic_inc(&tfo_active_disable_times); + tfo_active_disable_stamp = jiffies; + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENBLACKHOLE); +} + +/* Reset tfo_active_disable_times to 0 */ +void tcp_fastopen_active_timeout_reset(void) +{ + atomic_set(&tfo_active_disable_times, 0); +} + +/* Calculate timeout for tfo active disable + * Return true if we are still in the active TFO disable period + * Return false if timeout already expired and we should use active TFO + */ +bool tcp_fastopen_active_should_disable(struct sock *sk) +{ + int tfo_da_times = atomic_read(&tfo_active_disable_times); + int multiplier; + unsigned long timeout; + + if (!tfo_da_times) + return false; + + /* Limit timout to max: 2^6 * initial timeout */ + multiplier = 1 << min(tfo_da_times - 1, 6); + timeout = multiplier * sysctl_tcp_fastopen_blackhole_timeout * HZ; + if (time_before(jiffies, tfo_active_disable_stamp + timeout)) + return true; + + /* Mark check bit so we can check for successful active TFO + * condition and reset tfo_active_disable_times + */ + tcp_sk(sk)->syn_fastopen_ch = 1; + return false; +} + +/* Disable active TFO if FIN is the only packet in the ofo queue + * and no data is received. + * Also check if we can reset tfo_active_disable_times if data is + * received successfully on a marked active TFO sockets opened on + * a non-loopback interface + */ +void tcp_fastopen_active_disable_ofo_check(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct rb_node *p; + struct sk_buff *skb; + struct dst_entry *dst; + + if (!tp->syn_fastopen) + return; + + if (!tp->data_segs_in) { + p = rb_first(&tp->out_of_order_queue); + if (p && !rb_next(p)) { + skb = rb_entry(p, struct sk_buff, rbnode); + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) { + tcp_fastopen_active_disable(sk); + return; + } + } + } else if (tp->syn_fastopen_ch && + atomic_read(&tfo_active_disable_times)) { + dst = sk_dst_get(sk); + if (!(dst && dst->dev && (dst->dev->flags & IFF_LOOPBACK))) + tcp_fastopen_active_timeout_reset(); + dst_release(dst); + } +} diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 659d1baefb2b..9739962bfb3f 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -442,7 +442,8 @@ void tcp_init_buffer_space(struct sock *sk) tcp_sndbuf_expand(sk); tp->rcvq_space.space = tp->rcv_wnd; - tp->rcvq_space.time = tcp_time_stamp; + skb_mstamp_get(&tp->tcp_mstamp); + tp->rcvq_space.time = tp->tcp_mstamp; tp->rcvq_space.seq = tp->copied_seq; maxwin = tcp_full_space(sk); @@ -518,7 +519,7 @@ EXPORT_SYMBOL(tcp_initialize_rcv_mss); */ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep) { - u32 new_sample = tp->rcv_rtt_est.rtt; + u32 new_sample = tp->rcv_rtt_est.rtt_us; long m = sample; if (m == 0) @@ -548,21 +549,23 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep) new_sample = m << 3; } - if (tp->rcv_rtt_est.rtt != new_sample) - tp->rcv_rtt_est.rtt = new_sample; + tp->rcv_rtt_est.rtt_us = new_sample; } static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp) { - if (tp->rcv_rtt_est.time == 0) + u32 delta_us; + + if (tp->rcv_rtt_est.time.v64 == 0) goto new_measure; if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq)) return; - tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rcv_rtt_est.time, 1); + delta_us = skb_mstamp_us_delta(&tp->tcp_mstamp, &tp->rcv_rtt_est.time); + tcp_rcv_rtt_update(tp, delta_us, 1); new_measure: tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd; - tp->rcv_rtt_est.time = tcp_time_stamp; + tp->rcv_rtt_est.time = tp->tcp_mstamp; } static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, @@ -572,7 +575,10 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, if (tp->rx_opt.rcv_tsecr && (TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss)) - tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rx_opt.rcv_tsecr, 0); + tcp_rcv_rtt_update(tp, + jiffies_to_usecs(tcp_time_stamp - + tp->rx_opt.rcv_tsecr), + 0); } /* @@ -585,8 +591,8 @@ void tcp_rcv_space_adjust(struct sock *sk) int time; int copied; - time = tcp_time_stamp - tp->rcvq_space.time; - if (time < (tp->rcv_rtt_est.rtt >> 3) || tp->rcv_rtt_est.rtt == 0) + time = skb_mstamp_us_delta(&tp->tcp_mstamp, &tp->rcvq_space.time); + if (time < (tp->rcv_rtt_est.rtt_us >> 3) || tp->rcv_rtt_est.rtt_us == 0) return; /* Number of bytes copied to user in last RTT */ @@ -642,7 +648,7 @@ void tcp_rcv_space_adjust(struct sock *sk) new_measure: tp->rcvq_space.seq = tp->copied_seq; - tp->rcvq_space.time = tcp_time_stamp; + tp->rcvq_space.time = tp->tcp_mstamp; } /* There is something which you must keep in mind when you analyze the @@ -1131,7 +1137,6 @@ struct tcp_sacktag_state { */ struct skb_mstamp first_sackt; struct skb_mstamp last_sackt; - struct skb_mstamp ack_time; /* Timestamp when the S/ACK was received */ struct rate_sample *rate; int flag; }; @@ -1214,8 +1219,7 @@ static u8 tcp_sacktag_one(struct sock *sk, return sacked; if (!(sacked & TCPCB_SACKED_ACKED)) { - tcp_rack_advance(tp, sacked, end_seq, - xmit_time, &state->ack_time); + tcp_rack_advance(tp, sacked, end_seq, xmit_time); if (sacked & TCPCB_SACKED_RETRANS) { /* If the segment is not tagged as lost, @@ -2760,8 +2764,7 @@ static bool tcp_try_undo_partial(struct sock *sk, const int acked) return false; } -static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag, - const struct skb_mstamp *ack_time) +static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag) { struct tcp_sock *tp = tcp_sk(sk); @@ -2769,7 +2772,7 @@ static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag, if (sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION) { u32 prior_retrans = tp->retrans_out; - tcp_rack_mark_lost(sk, ack_time); + tcp_rack_mark_lost(sk); if (prior_retrans > tp->retrans_out) *ack_flag |= FLAG_LOST_RETRANS; } @@ -2788,8 +2791,7 @@ static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag, * tcp_xmit_retransmit_queue(). */ static void tcp_fastretrans_alert(struct sock *sk, const int acked, - bool is_dupack, int *ack_flag, int *rexmit, - const struct skb_mstamp *ack_time) + bool is_dupack, int *ack_flag, int *rexmit) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); @@ -2857,11 +2859,11 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, tcp_try_keep_open(sk); return; } - tcp_rack_identify_loss(sk, ack_flag, ack_time); + tcp_rack_identify_loss(sk, ack_flag); break; case TCP_CA_Loss: tcp_process_loss(sk, flag, is_dupack, rexmit); - tcp_rack_identify_loss(sk, ack_flag, ack_time); + tcp_rack_identify_loss(sk, ack_flag); if (!(icsk->icsk_ca_state == TCP_CA_Open || (*ack_flag & FLAG_LOST_RETRANS))) return; @@ -2877,7 +2879,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, if (icsk->icsk_ca_state <= TCP_CA_Disorder) tcp_try_undo_dsack(sk); - tcp_rack_identify_loss(sk, ack_flag, ack_time); + tcp_rack_identify_loss(sk, ack_flag); if (!tcp_time_to_recover(sk, flag)) { tcp_try_to_open(sk, flag); return; @@ -3059,8 +3061,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, { const struct inet_connection_sock *icsk = inet_csk(sk); struct skb_mstamp first_ackt, last_ackt; - struct skb_mstamp *now = &sack->ack_time; struct tcp_sock *tp = tcp_sk(sk); + struct skb_mstamp *now = &tp->tcp_mstamp; u32 prior_sacked = tp->sacked_out; u32 reord = tp->packets_out; bool fully_acked = true; @@ -3120,8 +3122,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, tp->delivered += acked_pcount; if (!tcp_skb_spurious_retrans(tp, skb)) tcp_rack_advance(tp, sacked, scb->end_seq, - &skb->skb_mstamp, - &sack->ack_time); + &skb->skb_mstamp); } if (sacked & TCPCB_LOST) tp->lost_out -= acked_pcount; @@ -3576,8 +3577,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (after(ack, tp->snd_nxt)) goto invalid_ack; - skb_mstamp_get(&sack_state.ack_time); - if (icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) tcp_rearm_rto(sk); @@ -3647,8 +3646,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (tcp_ack_is_dubious(sk, flag)) { is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); - tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit, - &sack_state.ack_time); + tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); } if (tp->tlp_high_seq) tcp_process_tlp_ack(sk, ack, flag); @@ -3660,8 +3658,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) tcp_schedule_loss_probe(sk); delivered = tp->delivered - delivered; /* freshly ACKed or SACKed */ lost = tp->lost - lost; /* freshly marked lost */ - tcp_rate_gen(sk, delivered, lost, &sack_state.ack_time, - sack_state.rate); + tcp_rate_gen(sk, delivered, lost, sack_state.rate); tcp_cong_control(sk, ack, delivered, flag, sack_state.rate); tcp_xmit_recovery(sk, rexmit); return 1; @@ -3669,8 +3666,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) no_queue: /* If data was DSACKed, see if we can undo a cwnd reduction. */ if (flag & FLAG_DSACKING_ACK) - tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit, - &sack_state.ack_time); + tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); /* If this ack opens up a zero window, clear backoff. It was * being used to time the probes, and is probably far higher than * it needs to be for normal retransmission. @@ -3691,11 +3687,9 @@ old_ack: * If data was DSACKed, see if we can undo a cwnd reduction. */ if (TCP_SKB_CB(skb)->sacked) { - skb_mstamp_get(&sack_state.ack_time); flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, &sack_state); - tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit, - &sack_state.ack_time); + tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); tcp_xmit_recovery(sk, rexmit); } @@ -3768,11 +3762,12 @@ void tcp_parse_options(const struct sk_buff *skb, !estab && sysctl_tcp_window_scaling) { __u8 snd_wscale = *(__u8 *)ptr; opt_rx->wscale_ok = 1; - if (snd_wscale > 14) { - net_info_ratelimited("%s: Illegal window scaling value %d >14 received\n", + if (snd_wscale > TCP_MAX_WSCALE) { + net_info_ratelimited("%s: Illegal window scaling value %d > %u received\n", __func__, - snd_wscale); - snd_wscale = 14; + snd_wscale, + TCP_MAX_WSCALE); + snd_wscale = TCP_MAX_WSCALE; } opt_rx->snd_wscale = snd_wscale; } @@ -4007,10 +4002,10 @@ void tcp_reset(struct sock *sk) /* This barrier is coupled with smp_rmb() in tcp_poll() */ smp_wmb(); + tcp_done(sk); + if (!sock_flag(sk, SOCK_DEAD)) sk->sk_error_report(sk); - - tcp_done(sk); } /* @@ -5299,8 +5294,16 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, if (rst_seq_match) tcp_reset(sk); - else + else { + /* Disable TFO if RST is out-of-order + * and no data has been received + * for current active TFO socket + */ + if (tp->syn_fastopen && !tp->data_segs_in && + sk->sk_state == TCP_ESTABLISHED) + tcp_fastopen_active_disable(sk); tcp_send_challenge_ack(sk, skb); + } goto discard; } @@ -5353,6 +5356,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, { struct tcp_sock *tp = tcp_sk(sk); + skb_mstamp_get(&tp->tcp_mstamp); if (unlikely(!sk->sk_rx_dst)) inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb); /* @@ -5579,10 +5583,6 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) else tp->pred_flags = 0; - if (!sock_flag(sk, SOCK_DEAD)) { - sk->sk_state_change(sk); - sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT); - } } static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, @@ -5651,6 +5651,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, struct tcp_sock *tp = tcp_sk(sk); struct tcp_fastopen_cookie foc = { .len = -1 }; int saved_clamp = tp->rx_opt.mss_clamp; + bool fastopen_fail; tcp_parse_options(skb, &tp->rx_opt, 0, &foc); if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) @@ -5754,10 +5755,15 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tcp_finish_connect(sk, skb); - if ((tp->syn_fastopen || tp->syn_data) && - tcp_rcv_fastopen_synack(sk, skb, &foc)) - return -1; + fastopen_fail = (tp->syn_fastopen || tp->syn_data) && + tcp_rcv_fastopen_synack(sk, skb, &foc); + if (!sock_flag(sk, SOCK_DEAD)) { + sk->sk_state_change(sk); + sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT); + } + if (fastopen_fail) + return -1; if (sk->sk_write_pending || icsk->icsk_accept_queue.rskq_defer_accept || icsk->icsk_ack.pingpong) { @@ -5911,6 +5917,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) case TCP_SYN_SENT: tp->rx_opt.saw_tstamp = 0; + skb_mstamp_get(&tp->tcp_mstamp); queued = tcp_rcv_synsent_state_process(sk, skb, th); if (queued >= 0) return queued; @@ -5922,6 +5929,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) return 0; } + skb_mstamp_get(&tp->tcp_mstamp); tp->rx_opt.saw_tstamp = 0; req = tp->fastopen_rsk; if (req) { @@ -6041,9 +6049,16 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) break; } - if (tp->linger2 < 0 || - (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && - after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt))) { + if (tp->linger2 < 0) { + tcp_done(sk); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA); + return 1; + } + if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && + after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) { + /* Receive out of order FIN after close() */ + if (tp->syn_fastopen && th->fin) + tcp_fastopen_active_disable(sk); tcp_done(sk); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA); return 1; @@ -6333,36 +6348,14 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, goto drop_and_free; if (isn && tmp_opt.tstamp_ok) - af_ops->init_seq(skb, &tcp_rsk(req)->ts_off); + af_ops->init_seq_tsoff(skb, &tcp_rsk(req)->ts_off); if (!want_cookie && !isn) { - /* VJ's idea. We save last timestamp seen - * from the destination in peer table, when entering - * state TIME-WAIT, and check against it before - * accepting new connection request. - * - * If "isn" is not zero, this request hit alive - * timewait bucket, so that all the necessary checks - * are made in the function processing timewait state. - */ - if (net->ipv4.tcp_death_row.sysctl_tw_recycle) { - bool strict; - - dst = af_ops->route_req(sk, &fl, req, &strict); - - if (dst && strict && - !tcp_peer_is_proven(req, dst, true, - tmp_opt.saw_tstamp)) { - NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED); - goto drop_and_release; - } - } /* Kill the following clause, if you dislike this way. */ - else if (!net->ipv4.sysctl_tcp_syncookies && - (net->ipv4.sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < - (net->ipv4.sysctl_max_syn_backlog >> 2)) && - !tcp_peer_is_proven(req, dst, false, - tmp_opt.saw_tstamp)) { + if (!net->ipv4.sysctl_tcp_syncookies && + (net->ipv4.sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < + (net->ipv4.sysctl_max_syn_backlog >> 2)) && + !tcp_peer_is_proven(req, dst)) { /* Without syncookies last quarter of * backlog is filled with destinations, * proven to be alive. @@ -6375,10 +6368,10 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, goto drop_and_release; } - isn = af_ops->init_seq(skb, &tcp_rsk(req)->ts_off); + isn = af_ops->init_seq_tsoff(skb, &tcp_rsk(req)->ts_off); } if (!dst) { - dst = af_ops->route_req(sk, &fl, req, NULL); + dst = af_ops->route_req(sk, &fl, req); if (!dst) goto drop_and_free; } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 575e19dcc017..cbbafe546c0f 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -94,12 +94,12 @@ static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, struct inet_hashinfo tcp_hashinfo; EXPORT_SYMBOL(tcp_hashinfo); -static u32 tcp_v4_init_sequence(const struct sk_buff *skb, u32 *tsoff) +static u32 tcp_v4_init_seq_and_tsoff(const struct sk_buff *skb, u32 *tsoff) { - return secure_tcp_sequence_number(ip_hdr(skb)->daddr, - ip_hdr(skb)->saddr, - tcp_hdr(skb)->dest, - tcp_hdr(skb)->source, tsoff); + return secure_tcp_seq_and_tsoff(ip_hdr(skb)->daddr, + ip_hdr(skb)->saddr, + tcp_hdr(skb)->dest, + tcp_hdr(skb)->source, tsoff); } int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) @@ -198,10 +198,6 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) tp->write_seq = 0; } - if (tcp_death_row->sysctl_tw_recycle && - !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr) - tcp_fetch_timewait_stamp(sk, &rt->dst); - inet->inet_dport = usin->sin_port; sk_daddr_set(sk, daddr); @@ -236,11 +232,11 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) rt = NULL; if (likely(!tp->repair)) { - seq = secure_tcp_sequence_number(inet->inet_saddr, - inet->inet_daddr, - inet->inet_sport, - usin->sin_port, - &tp->tsoffset); + seq = secure_tcp_seq_and_tsoff(inet->inet_saddr, + inet->inet_daddr, + inet->inet_sport, + usin->sin_port, + &tp->tsoffset); if (!tp->write_seq) tp->write_seq = seq; } @@ -1217,19 +1213,9 @@ static void tcp_v4_init_req(struct request_sock *req, static struct dst_entry *tcp_v4_route_req(const struct sock *sk, struct flowi *fl, - const struct request_sock *req, - bool *strict) + const struct request_sock *req) { - struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req); - - if (strict) { - if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr) - *strict = true; - else - *strict = false; - } - - return dst; + return inet_csk_route_req(sk, &fl->u.ip4, req); } struct request_sock_ops tcp_request_sock_ops __read_mostly = { @@ -1253,7 +1239,7 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { .cookie_init_seq = cookie_v4_init_sequence, #endif .route_req = tcp_v4_route_req, - .init_seq = tcp_v4_init_sequence, + .init_seq_tsoff = tcp_v4_init_seq_and_tsoff, .send_synack = tcp_v4_send_synack, }; @@ -1423,8 +1409,6 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) if (!nsk) goto discard; if (nsk != sk) { - sock_rps_save_rxhash(nsk, skb); - sk_mark_napi_id(nsk, skb); if (tcp_child_process(sk, nsk, skb)) { rsk = nsk; goto reset; @@ -1871,6 +1855,9 @@ void tcp_v4_destroy_sock(struct sock *sk) /* Cleanup up the write buffer. */ tcp_write_queue_purge(sk); + /* Check if we want to disable active TFO */ + tcp_fastopen_active_disable_ofo_check(sk); + /* Cleans up our, hopefully empty, out_of_order_queue. */ skb_rbtree_purge(&tp->out_of_order_queue); @@ -2466,7 +2453,6 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_tw_reuse = 0; cnt = tcp_hashinfo.ehash_mask + 1; - net->ipv4.tcp_death_row.sysctl_tw_recycle = 0; net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2; net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo; diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c index 046fd3910873..d6fb6c067af4 100644 --- a/net/ipv4/tcp_lp.c +++ b/net/ipv4/tcp_lp.c @@ -264,13 +264,15 @@ static void tcp_lp_pkts_acked(struct sock *sk, const struct ack_sample *sample) { struct tcp_sock *tp = tcp_sk(sk); struct lp *lp = inet_csk_ca(sk); + u32 delta; if (sample->rtt_us > 0) tcp_lp_rtt_sample(sk, sample->rtt_us); /* calc inference */ - if (tcp_time_stamp > tp->rx_opt.rcv_tsecr) - lp->inference = 3 * (tcp_time_stamp - tp->rx_opt.rcv_tsecr); + delta = tcp_time_stamp - tp->rx_opt.rcv_tsecr; + if ((s32)delta > 0) + lp->inference = 3 * delta; /* test if within inference */ if (lp->last_drop && (tcp_time_stamp - lp->last_drop < lp->inference)) diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 0f46e5fe31ad..9d0d4f39e42b 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -45,8 +45,6 @@ struct tcp_metrics_block { struct inetpeer_addr tcpm_saddr; struct inetpeer_addr tcpm_daddr; unsigned long tcpm_stamp; - u32 tcpm_ts; - u32 tcpm_ts_stamp; u32 tcpm_lock; u32 tcpm_vals[TCP_METRIC_MAX_KERNEL + 1]; struct tcp_fastopen_metrics tcpm_fastopen; @@ -123,8 +121,6 @@ static void tcpm_suck_dst(struct tcp_metrics_block *tm, tm->tcpm_vals[TCP_METRIC_SSTHRESH] = dst_metric_raw(dst, RTAX_SSTHRESH); tm->tcpm_vals[TCP_METRIC_CWND] = dst_metric_raw(dst, RTAX_CWND); tm->tcpm_vals[TCP_METRIC_REORDERING] = dst_metric_raw(dst, RTAX_REORDERING); - tm->tcpm_ts = 0; - tm->tcpm_ts_stamp = 0; if (fastopen_clear) { tm->tcpm_fastopen.mss = 0; tm->tcpm_fastopen.syn_loss = 0; @@ -273,48 +269,6 @@ static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req, return tm; } -static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock *tw) -{ - struct tcp_metrics_block *tm; - struct inetpeer_addr saddr, daddr; - unsigned int hash; - struct net *net; - - if (tw->tw_family == AF_INET) { - inetpeer_set_addr_v4(&saddr, tw->tw_rcv_saddr); - inetpeer_set_addr_v4(&daddr, tw->tw_daddr); - hash = ipv4_addr_hash(tw->tw_daddr); - } -#if IS_ENABLED(CONFIG_IPV6) - else if (tw->tw_family == AF_INET6) { - if (ipv6_addr_v4mapped(&tw->tw_v6_daddr)) { - inetpeer_set_addr_v4(&saddr, tw->tw_rcv_saddr); - inetpeer_set_addr_v4(&daddr, tw->tw_daddr); - hash = ipv4_addr_hash(tw->tw_daddr); - } else { - inetpeer_set_addr_v6(&saddr, &tw->tw_v6_rcv_saddr); - inetpeer_set_addr_v6(&daddr, &tw->tw_v6_daddr); - hash = ipv6_addr_hash(&tw->tw_v6_daddr); - } - } -#endif - else - return NULL; - - net = twsk_net(tw); - hash ^= net_hash_mix(net); - hash = hash_32(hash, tcp_metrics_hash_log); - - for (tm = rcu_dereference(tcp_metrics_hash[hash].chain); tm; - tm = rcu_dereference(tm->tcpm_next)) { - if (addr_same(&tm->tcpm_saddr, &saddr) && - addr_same(&tm->tcpm_daddr, &daddr) && - net_eq(tm_net(tm), net)) - break; - } - return tm; -} - static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk, struct dst_entry *dst, bool create) @@ -573,8 +527,7 @@ reset: tp->snd_cwnd_stamp = tcp_time_stamp; } -bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst, - bool paws_check, bool timestamps) +bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst) { struct tcp_metrics_block *tm; bool ret; @@ -584,94 +537,10 @@ bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst, rcu_read_lock(); tm = __tcp_get_metrics_req(req, dst); - if (paws_check) { - if (tm && - (u32)get_seconds() - tm->tcpm_ts_stamp < TCP_PAWS_MSL && - ((s32)(tm->tcpm_ts - req->ts_recent) > TCP_PAWS_WINDOW || - !timestamps)) - ret = false; - else - ret = true; - } else { - if (tm && tcp_metric_get(tm, TCP_METRIC_RTT) && tm->tcpm_ts_stamp) - ret = true; - else - ret = false; - } - rcu_read_unlock(); - - return ret; -} - -void tcp_fetch_timewait_stamp(struct sock *sk, struct dst_entry *dst) -{ - struct tcp_metrics_block *tm; - - rcu_read_lock(); - tm = tcp_get_metrics(sk, dst, true); - if (tm) { - struct tcp_sock *tp = tcp_sk(sk); - - if ((u32)get_seconds() - tm->tcpm_ts_stamp <= TCP_PAWS_MSL) { - tp->rx_opt.ts_recent_stamp = tm->tcpm_ts_stamp; - tp->rx_opt.ts_recent = tm->tcpm_ts; - } - } - rcu_read_unlock(); -} -EXPORT_SYMBOL_GPL(tcp_fetch_timewait_stamp); - -/* VJ's idea. Save last timestamp seen from this destination and hold - * it at least for normal timewait interval to use for duplicate - * segment detection in subsequent connections, before they enter - * synchronized state. - */ -bool tcp_remember_stamp(struct sock *sk) -{ - struct dst_entry *dst = __sk_dst_get(sk); - bool ret = false; - - if (dst) { - struct tcp_metrics_block *tm; - - rcu_read_lock(); - tm = tcp_get_metrics(sk, dst, true); - if (tm) { - struct tcp_sock *tp = tcp_sk(sk); - - if ((s32)(tm->tcpm_ts - tp->rx_opt.ts_recent) <= 0 || - ((u32)get_seconds() - tm->tcpm_ts_stamp > TCP_PAWS_MSL && - tm->tcpm_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) { - tm->tcpm_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp; - tm->tcpm_ts = tp->rx_opt.ts_recent; - } - ret = true; - } - rcu_read_unlock(); - } - return ret; -} - -bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw) -{ - struct tcp_metrics_block *tm; - bool ret = false; - - rcu_read_lock(); - tm = __tcp_get_metrics_tw(tw); - if (tm) { - const struct tcp_timewait_sock *tcptw; - struct sock *sk = (struct sock *) tw; - - tcptw = tcp_twsk(sk); - if ((s32)(tm->tcpm_ts - tcptw->tw_ts_recent) <= 0 || - ((u32)get_seconds() - tm->tcpm_ts_stamp > TCP_PAWS_MSL && - tm->tcpm_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) { - tm->tcpm_ts_stamp = (u32)tcptw->tw_ts_recent_stamp; - tm->tcpm_ts = tcptw->tw_ts_recent; - } + if (tm && tcp_metric_get(tm, TCP_METRIC_RTT)) ret = true; - } + else + ret = false; rcu_read_unlock(); return ret; @@ -791,14 +660,6 @@ static int tcp_metrics_fill_info(struct sk_buff *msg, jiffies - tm->tcpm_stamp, TCP_METRICS_ATTR_PAD) < 0) goto nla_put_failure; - if (tm->tcpm_ts_stamp) { - if (nla_put_s32(msg, TCP_METRICS_ATTR_TW_TS_STAMP, - (s32) (get_seconds() - tm->tcpm_ts_stamp)) < 0) - goto nla_put_failure; - if (nla_put_u32(msg, TCP_METRICS_ATTR_TW_TSVAL, - tm->tcpm_ts) < 0) - goto nla_put_failure; - } { int n = 0; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 65c0f3d13eca..8f6373b0cd77 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -26,6 +26,7 @@ #include <net/tcp.h> #include <net/inet_common.h> #include <net/xfrm.h> +#include <net/busy_poll.h> int sysctl_tcp_abort_on_overflow __read_mostly; @@ -94,7 +95,6 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, struct tcp_options_received tmp_opt; struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); bool paws_reject = false; - struct inet_timewait_death_row *tcp_death_row = &sock_net((struct sock*)tw)->ipv4.tcp_death_row; tmp_opt.saw_tstamp = 0; if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) { @@ -149,12 +149,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, tcptw->tw_ts_recent = tmp_opt.rcv_tsval; } - if (tcp_death_row->sysctl_tw_recycle && - tcptw->tw_ts_recent_stamp && - tcp_tw_remember_stamp(tw)) - inet_twsk_reschedule(tw, tw->tw_timeout); - else - inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); + inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); return TCP_TW_ACK; } @@ -259,12 +254,8 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) const struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_sock *tp = tcp_sk(sk); struct inet_timewait_sock *tw; - bool recycle_ok = false; struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row; - if (tcp_death_row->sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp) - recycle_ok = tcp_remember_stamp(sk); - tw = inet_twsk_alloc(sk, tcp_death_row, state); if (tw) { @@ -317,13 +308,9 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) if (timeo < rto) timeo = rto; - if (recycle_ok) { - tw->tw_timeout = rto; - } else { - tw->tw_timeout = TCP_TIMEWAIT_LEN; - if (state == TCP_TIME_WAIT) - timeo = TCP_TIMEWAIT_LEN; - } + tw->tw_timeout = TCP_TIMEWAIT_LEN; + if (state == TCP_TIME_WAIT) + timeo = TCP_TIMEWAIT_LEN; inet_twsk_schedule(tw, timeo); /* Linkage updates. */ @@ -813,6 +800,9 @@ int tcp_child_process(struct sock *parent, struct sock *child, int ret = 0; int state = child->sk_state; + /* record NAPI ID of child */ + sk_mark_napi_id(child, skb); + tcp_segs_in(tcp_sk(child), skb); if (!sock_owned_by_user(child)) { ret = tcp_rcv_state_process(child, skb); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index a85d863c4419..60111a0fc201 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -212,12 +212,12 @@ void tcp_select_initial_window(int __space, __u32 mss, /* If no clamp set the clamp to the max possible scaled window */ if (*window_clamp == 0) - (*window_clamp) = (65535 << 14); + (*window_clamp) = (U16_MAX << TCP_MAX_WSCALE); space = min(*window_clamp, space); /* Quantize space offering to a multiple of mss if possible. */ if (space > mss) - space = (space / mss) * mss; + space = rounddown(space, mss); /* NOTE: offering an initial window larger than 32767 * will break some buggy TCP stacks. If the admin tells us @@ -234,13 +234,11 @@ void tcp_select_initial_window(int __space, __u32 mss, (*rcv_wscale) = 0; if (wscale_ok) { - /* Set window scaling on max possible window - * See RFC1323 for an explanation of the limit to 14 - */ + /* Set window scaling on max possible window */ space = max_t(u32, space, sysctl_tcp_rmem[2]); space = max_t(u32, space, sysctl_rmem_max); space = min_t(u32, space, *window_clamp); - while (space > 65535 && (*rcv_wscale) < 14) { + while (space > U16_MAX && (*rcv_wscale) < TCP_MAX_WSCALE) { space >>= 1; (*rcv_wscale)++; } @@ -253,7 +251,7 @@ void tcp_select_initial_window(int __space, __u32 mss, } /* Set the clamp no higher than max representable value */ - (*window_clamp) = min(65535U << (*rcv_wscale), *window_clamp); + (*window_clamp) = min_t(__u32, U16_MAX << (*rcv_wscale), *window_clamp); } EXPORT_SYMBOL(tcp_select_initial_window); @@ -2566,7 +2564,6 @@ u32 __tcp_select_window(struct sock *sk) /* Don't do rounding if we are using window scaling, since the * scaled window will not line up with the MSS boundary anyway. */ - window = tp->rcv_wnd; if (tp->rx_opt.rcv_wscale) { window = free_space; @@ -2574,10 +2571,9 @@ u32 __tcp_select_window(struct sock *sk) * Import case: prevent zero window announcement if * 1<<rcv_wscale > mss. */ - if (((window >> tp->rx_opt.rcv_wscale) << tp->rx_opt.rcv_wscale) != window) - window = (((window >> tp->rx_opt.rcv_wscale) + 1) - << tp->rx_opt.rcv_wscale); + window = ALIGN(window, (1 << tp->rx_opt.rcv_wscale)); } else { + window = tp->rcv_wnd; /* Get the largest window that is a nice multiple of mss. * Window clamp already applied above. * If our current window offering is within 1 mss of the @@ -2587,7 +2583,7 @@ u32 __tcp_select_window(struct sock *sk) * is too small. */ if (window <= free_space - mss || window > free_space) - window = (free_space / mss) * mss; + window = rounddown(free_space, mss); else if (mss == full_space && free_space > window + (full_space >> 1)) window = free_space; diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c index 9be1581a5a08..c6a9fa894646 100644 --- a/net/ipv4/tcp_rate.c +++ b/net/ipv4/tcp_rate.c @@ -106,7 +106,7 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, /* Update the connection delivery information and generate a rate sample. */ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, - struct skb_mstamp *now, struct rate_sample *rs) + struct rate_sample *rs) { struct tcp_sock *tp = tcp_sk(sk); u32 snd_us, ack_us; @@ -120,7 +120,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, * to carry current time, flags, stats like "tcp_sacktag_state". */ if (delivered) - tp->delivered_mstamp = *now; + tp->delivered_mstamp = tp->tcp_mstamp; rs->acked_sacked = delivered; /* freshly ACKed or SACKed */ rs->losses = lost; /* freshly marked lost */ @@ -138,7 +138,8 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, * longer phase. */ snd_us = rs->interval_us; /* send phase */ - ack_us = skb_mstamp_us_delta(now, &rs->prior_mstamp); /* ack phase */ + ack_us = skb_mstamp_us_delta(&tp->tcp_mstamp, + &rs->prior_mstamp); /* ack phase */ rs->interval_us = max(snd_us, ack_us); /* Normally we expect interval_us >= min-rtt. diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c index d8acbd9f477a..362b8c75bfab 100644 --- a/net/ipv4/tcp_recovery.c +++ b/net/ipv4/tcp_recovery.c @@ -45,8 +45,7 @@ static bool tcp_rack_sent_after(const struct skb_mstamp *t1, * or tcp_time_to_recover()'s "Trick#1: the loss is proven" code path will * make us enter the CA_Recovery state. */ -static void tcp_rack_detect_loss(struct sock *sk, const struct skb_mstamp *now, - u32 *reo_timeout) +static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; @@ -79,7 +78,7 @@ static void tcp_rack_detect_loss(struct sock *sk, const struct skb_mstamp *now, * A packet is lost if its elapsed time is beyond * the recent RTT plus the reordering window. */ - u32 elapsed = skb_mstamp_us_delta(now, + u32 elapsed = skb_mstamp_us_delta(&tp->tcp_mstamp, &skb->skb_mstamp); s32 remaining = tp->rack.rtt_us + reo_wnd - elapsed; @@ -105,7 +104,7 @@ static void tcp_rack_detect_loss(struct sock *sk, const struct skb_mstamp *now, } } -void tcp_rack_mark_lost(struct sock *sk, const struct skb_mstamp *now) +void tcp_rack_mark_lost(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); u32 timeout; @@ -115,7 +114,7 @@ void tcp_rack_mark_lost(struct sock *sk, const struct skb_mstamp *now) /* Reset the advanced flag to avoid unnecessary queue scanning */ tp->rack.advanced = 0; - tcp_rack_detect_loss(sk, now, &timeout); + tcp_rack_detect_loss(sk, &timeout); if (timeout) { timeout = usecs_to_jiffies(timeout + TCP_REO_TIMEOUT_MIN); inet_csk_reset_xmit_timer(sk, ICSK_TIME_REO_TIMEOUT, @@ -128,8 +127,7 @@ void tcp_rack_mark_lost(struct sock *sk, const struct skb_mstamp *now) * draft-cheng-tcpm-rack-00.txt */ void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq, - const struct skb_mstamp *xmit_time, - const struct skb_mstamp *ack_time) + const struct skb_mstamp *xmit_time) { u32 rtt_us; @@ -138,7 +136,7 @@ void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq, end_seq, tp->rack.end_seq)) return; - rtt_us = skb_mstamp_us_delta(ack_time, xmit_time); + rtt_us = skb_mstamp_us_delta(&tp->tcp_mstamp, xmit_time); if (sacked & TCPCB_RETRANS) { /* If the sacked packet was retransmitted, it's ambiguous * whether the retransmission or the original (or the prior @@ -165,12 +163,11 @@ void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq, void tcp_rack_reo_timeout(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - struct skb_mstamp now; u32 timeout, prior_inflight; - skb_mstamp_get(&now); prior_inflight = tcp_packets_in_flight(tp); - tcp_rack_detect_loss(sk, &now, &timeout); + skb_mstamp_get(&tp->tcp_mstamp); + tcp_rack_detect_loss(sk, &timeout); if (prior_inflight != tcp_packets_in_flight(tp)) { if (inet_csk(sk)->icsk_ca_state != TCP_CA_Recovery) { tcp_enter_recovery(sk, false); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index b2ab411c6d37..14672543cf0b 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -201,11 +201,10 @@ static int tcp_write_timeout(struct sock *sk) if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1, 0, 0)) { /* Some middle-boxes may black-hole Fast Open _after_ * the handshake. Therefore we conservatively disable - * Fast Open on this path on recurring timeouts with - * few or zero bytes acked after Fast Open. + * Fast Open on this path on recurring timeouts after + * successful Fast Open. */ - if (tp->syn_data_acked && - tp->bytes_acked <= tp->rx_opt.mss_clamp) { + if (tp->syn_data_acked) { tcp_fastopen_cache_set(sk, 0, NULL, true, 0); if (icsk->icsk_retransmits == net->ipv4.sysctl_tcp_retries1) NET_INC_STATS(sock_net(sk), diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index fed66dc0e0f5..9775453b8d17 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c @@ -265,8 +265,8 @@ static size_t tcp_westwood_info(struct sock *sk, u32 ext, int *attr, if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) { info->vegas.tcpv_enabled = 1; info->vegas.tcpv_rttcnt = 0; - info->vegas.tcpv_rtt = jiffies_to_usecs(ca->rtt), - info->vegas.tcpv_minrtt = jiffies_to_usecs(ca->rtt_min), + info->vegas.tcpv_rtt = jiffies_to_usecs(ca->rtt); + info->vegas.tcpv_minrtt = jiffies_to_usecs(ca->rtt_min); *attr = INET_DIAG_VEGASINFO; return sizeof(struct tcpvegas_info); diff --git a/net/ipv4/xfrm4_mode_transport.c b/net/ipv4/xfrm4_mode_transport.c index 4acc0508c5eb..3d36644890bb 100644 --- a/net/ipv4/xfrm4_mode_transport.c +++ b/net/ipv4/xfrm4_mode_transport.c @@ -12,6 +12,7 @@ #include <net/dst.h> #include <net/ip.h> #include <net/xfrm.h> +#include <net/protocol.h> /* Add encapsulation header. * @@ -23,6 +24,8 @@ static int xfrm4_transport_output(struct xfrm_state *x, struct sk_buff *skb) struct iphdr *iph = ip_hdr(skb); int ihl = iph->ihl * 4; + skb_set_inner_transport_header(skb, skb_transport_offset(skb)); + skb_set_network_header(skb, -x->props.header_len); skb->mac_header = skb->network_header + offsetof(struct iphdr, protocol); @@ -56,9 +59,40 @@ static int xfrm4_transport_input(struct xfrm_state *x, struct sk_buff *skb) return 0; } +static struct sk_buff *xfrm4_transport_gso_segment(struct xfrm_state *x, + struct sk_buff *skb, + netdev_features_t features) +{ + const struct net_offload *ops; + struct sk_buff *segs = ERR_PTR(-EINVAL); + struct xfrm_offload *xo = xfrm_offload(skb); + + skb->transport_header += x->props.header_len; + ops = rcu_dereference(inet_offloads[xo->proto]); + if (likely(ops && ops->callbacks.gso_segment)) + segs = ops->callbacks.gso_segment(skb, features); + + return segs; +} + +static void xfrm4_transport_xmit(struct xfrm_state *x, struct sk_buff *skb) +{ + struct xfrm_offload *xo = xfrm_offload(skb); + + skb_reset_mac_len(skb); + pskb_pull(skb, skb->mac_len + sizeof(struct iphdr) + x->props.header_len); + + if (xo->flags & XFRM_GSO_SEGMENT) { + skb_reset_transport_header(skb); + skb->transport_header -= x->props.header_len; + } +} + static struct xfrm_mode xfrm4_transport_mode = { .input = xfrm4_transport_input, .output = xfrm4_transport_output, + .gso_segment = xfrm4_transport_gso_segment, + .xmit = xfrm4_transport_xmit, .owner = THIS_MODULE, .encap = XFRM_MODE_TRANSPORT, }; diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c index 35feda676464..e6265e2c274e 100644 --- a/net/ipv4/xfrm4_mode_tunnel.c +++ b/net/ipv4/xfrm4_mode_tunnel.c @@ -33,6 +33,9 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) struct iphdr *top_iph; int flags; + skb_set_inner_network_header(skb, skb_network_offset(skb)); + skb_set_inner_transport_header(skb, skb_transport_offset(skb)); + skb_set_network_header(skb, -x->props.header_len); skb->mac_header = skb->network_header + offsetof(struct iphdr, protocol); @@ -96,11 +99,36 @@ out: return err; } +static struct sk_buff *xfrm4_mode_tunnel_gso_segment(struct xfrm_state *x, + struct sk_buff *skb, + netdev_features_t features) +{ + __skb_push(skb, skb->mac_len); + return skb_mac_gso_segment(skb, features); + +} + +static void xfrm4_mode_tunnel_xmit(struct xfrm_state *x, struct sk_buff *skb) +{ + struct xfrm_offload *xo = xfrm_offload(skb); + + if (xo->flags & XFRM_GSO_SEGMENT) { + skb->network_header = skb->network_header - x->props.header_len; + skb->transport_header = skb->network_header + + sizeof(struct iphdr); + } + + skb_reset_mac_len(skb); + pskb_pull(skb, skb->mac_len + x->props.header_len); +} + static struct xfrm_mode xfrm4_tunnel_mode = { .input2 = xfrm4_mode_tunnel_input, .input = xfrm_prepare_input, .output2 = xfrm4_mode_tunnel_output, .output = xfrm4_prepare_output, + .gso_segment = xfrm4_mode_tunnel_gso_segment, + .xmit = xfrm4_mode_tunnel_xmit, .owner = THIS_MODULE, .encap = XFRM_MODE_TUNNEL, .flags = XFRM_MODE_FLAG_TUNNEL, diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index 7ee6518afa86..94b8702603bc 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -29,7 +29,8 @@ static int xfrm4_tunnel_check_size(struct sk_buff *skb) goto out; mtu = dst_mtu(skb_dst(skb)); - if (skb->len > mtu) { + if ((!skb_is_gso(skb) && skb->len > mtu) || + (skb_is_gso(skb) && skb_gso_network_seglen(skb) > ip_skb_dst_mtu(skb->sk, skb))) { skb->protocol = htons(ETH_P_IP); if (skb->sk) |