Diffstat (limited to 'net/ipv4')
39 files changed, 711 insertions, 362 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 4307503a6f0b..b7260c8cef2e 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1017,6 +1017,7 @@ static int inet_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned lon const struct proto_ops inet_stream_ops = { .family = PF_INET, + .flags = PROTO_CMSG_DATA_ONLY, .owner = THIS_MODULE, .release = inet_release, .bind = inet_bind, diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index e3939f76b024..618954f82764 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -28,27 +28,6 @@ static u32 unsupported_ops[] = { static const struct btf_type *tcp_sock_type; static u32 tcp_sock_id, sock_id; -static int btf_sk_storage_get_ids[5]; -static struct bpf_func_proto btf_sk_storage_get_proto __read_mostly; - -static int btf_sk_storage_delete_ids[5]; -static struct bpf_func_proto btf_sk_storage_delete_proto __read_mostly; - -static void convert_sk_func_proto(struct bpf_func_proto *to, int *to_btf_ids, - const struct bpf_func_proto *from) -{ - int i; - - *to = *from; - to->btf_id = to_btf_ids; - for (i = 0; i < ARRAY_SIZE(to->arg_type); i++) { - if (to->arg_type[i] == ARG_PTR_TO_SOCKET) { - to->arg_type[i] = ARG_PTR_TO_BTF_ID; - to->btf_id[i] = tcp_sock_id; - } - } -} - static int bpf_tcp_ca_init(struct btf *btf) { s32 type_id; @@ -64,13 +43,6 @@ static int bpf_tcp_ca_init(struct btf *btf) tcp_sock_id = type_id; tcp_sock_type = btf_type_by_id(btf, tcp_sock_id); - convert_sk_func_proto(&btf_sk_storage_get_proto, - btf_sk_storage_get_ids, - &bpf_sk_storage_get_proto); - convert_sk_func_proto(&btf_sk_storage_delete_proto, - btf_sk_storage_delete_ids, - &bpf_sk_storage_delete_proto); - return 0; } @@ -185,8 +157,8 @@ static const struct bpf_func_proto bpf_tcp_send_ack_proto = { /* In case we want to report error later */ .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &tcp_sock_id, .arg2_type = ARG_ANYTHING, - .btf_id = &tcp_sock_id, }; static const struct bpf_func_proto * @@ -197,9 +169,9 @@ bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id, case BPF_FUNC_tcp_send_ack: return &bpf_tcp_send_ack_proto; case BPF_FUNC_sk_storage_get: - return &btf_sk_storage_get_proto; + return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: - return &btf_sk_storage_delete_proto; + return &bpf_sk_storage_delete_proto; default: return bpf_base_func_proto(func_id); } diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index 2eb71579f4d2..471d33a0d095 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -498,7 +498,7 @@ static void cipso_v4_doi_free_rcu(struct rcu_head *entry) /** * cipso_v4_doi_remove - Remove an existing DOI from the CIPSO protocol engine * @doi: the DOI value - * @audit_secid: the LSM secid to use in the audit message + * @audit_info: NetLabel audit information * * Description: * Removes a DOI definition from the CIPSO engine. The NetLabel routines will diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index abd083415f89..e5f69b0bf3df 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -237,7 +237,7 @@ static struct sk_buff *fou_gro_receive(struct sock *sk, /* We can clear the encap_mark for FOU as we are essentially doing * one of two possible things. We are either adding an L4 tunnel - * header to the outer L3 tunnel header, or we are are simply + * header to the outer L3 tunnel header, or we are simply * treating the GRE tunnel header as though it is a UDP protocol * specific header such as VXLAN or GENEVE. 
*/ @@ -429,7 +429,7 @@ next_proto: /* We can clear the encap_mark for GUE as we are essentially doing * one of two possible things. We are either adding an L4 tunnel - * header to the outer L3 tunnel header, or we are are simply + * header to the outer L3 tunnel header, or we are simply * treating the GRE tunnel header as though it is a UDP protocol * specific header such as VXLAN or GENEVE. */ @@ -911,7 +911,7 @@ static int fou_nl_dump(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } -static const struct genl_ops fou_nl_ops[] = { +static const struct genl_small_ops fou_nl_ops[] = { { .cmd = FOU_CMD_ADD, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, @@ -940,8 +940,8 @@ static struct genl_family fou_nl_family __ro_after_init = { .policy = fou_nl_policy, .netnsok = true, .module = THIS_MODULE, - .ops = fou_nl_ops, - .n_ops = ARRAY_SIZE(fou_nl_ops), + .small_ops = fou_nl_ops, + .n_small_ops = ARRAY_SIZE(fou_nl_ops), }; size_t fou_encap_hlen(struct ip_tunnel_encap *e) diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index bdaaee52c41b..07f67ced962a 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -457,6 +457,23 @@ out_bh_enable: local_bh_enable(); } +/* + * The device used for looking up which routing table to use for sending an ICMP + * error is preferably the source whenever it is set, which should ensure the + * icmp error can be sent to the source host, else lookup using the routing + * table of the destination device, else use the main routing table (index 0). + */ +static struct net_device *icmp_get_route_lookup_dev(struct sk_buff *skb) +{ + struct net_device *route_lookup_dev = NULL; + + if (skb->dev) + route_lookup_dev = skb->dev; + else if (skb_dst(skb)) + route_lookup_dev = skb_dst(skb)->dev; + return route_lookup_dev; +} + static struct rtable *icmp_route_lookup(struct net *net, struct flowi4 *fl4, struct sk_buff *skb_in, @@ -465,6 +482,7 @@ static struct rtable *icmp_route_lookup(struct net *net, int type, int code, struct icmp_bxm *param) { + struct net_device *route_lookup_dev; struct rtable *rt, *rt2; struct flowi4 fl4_dec; int err; @@ -479,7 +497,8 @@ static struct rtable *icmp_route_lookup(struct net *net, fl4->flowi4_proto = IPPROTO_ICMP; fl4->fl4_icmp_type = type; fl4->fl4_icmp_code = code; - fl4->flowi4_oif = l3mdev_master_ifindex(skb_dst(skb_in)->dev); + route_lookup_dev = icmp_get_route_lookup_dev(skb_in); + fl4->flowi4_oif = l3mdev_master_ifindex(route_lookup_dev); security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4)); rt = ip_route_output_key_hash(net, fl4, skb_in); @@ -503,7 +522,7 @@ static struct rtable *icmp_route_lookup(struct net *net, if (err) goto relookup_failed; - if (inet_addr_type_dev_table(net, skb_dst(skb_in)->dev, + if (inet_addr_type_dev_table(net, route_lookup_dev, fl4_dec.saddr) == RTN_LOCAL) { rt2 = __ip_route_output_key(net, &fl4_dec); if (IS_ERR(rt2)) @@ -690,9 +709,9 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, rcu_read_unlock(); } - tos = icmp_pointers[type].error ? ((iph->tos & IPTOS_TOS_MASK) | + tos = icmp_pointers[type].error ? 
(RT_TOS(iph->tos) | IPTOS_PREC_INTERNETCONTROL) : - iph->tos; + iph->tos; mark = IP4_REPLY_MARK(net, skb_in->mark); if (__ip_options_echo(net, &icmp_param.replyopts.opt.opt, skb_in, opt)) @@ -784,7 +803,7 @@ EXPORT_SYMBOL(icmp_ndo_send); static void icmp_socket_deliver(struct sk_buff *skb, u32 info) { - const struct iphdr *iph = (const struct iphdr *) skb->data; + const struct iphdr *iph = (const struct iphdr *)skb->data; const struct net_protocol *ipprot; int protocol = iph->protocol; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index b457dd2d6c75..4148f5f78f31 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -564,7 +564,7 @@ void inet_csk_clear_xmit_timers(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); - icsk->icsk_pending = icsk->icsk_ack.pending = icsk->icsk_ack.blocked = 0; + icsk->icsk_pending = icsk->icsk_ack.pending = 0; sk_stop_timer(sk, &icsk->icsk_retransmit_timer); sk_stop_timer(sk, &icsk->icsk_delack_timer); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index f1bd95f243b3..366a4507b5a3 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -125,6 +125,7 @@ int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, bool net_admin) { const struct inet_sock *inet = inet_sk(sk); + struct inet_diag_sockopt inet_sockopt; if (nla_put_u8(skb, INET_DIAG_SHUTDOWN, sk->sk_shutdown)) goto errout; @@ -180,6 +181,22 @@ int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk)); r->idiag_inode = sock_i_ino(sk); + memset(&inet_sockopt, 0, sizeof(inet_sockopt)); + inet_sockopt.recverr = inet->recverr; + inet_sockopt.is_icsk = inet->is_icsk; + inet_sockopt.freebind = inet->freebind; + inet_sockopt.hdrincl = inet->hdrincl; + inet_sockopt.mc_loop = inet->mc_loop; + inet_sockopt.transparent = inet->transparent; + inet_sockopt.mc_all = inet->mc_all; + inet_sockopt.nodefrag = inet->nodefrag; + inet_sockopt.bind_address_no_port = inet->bind_address_no_port; + inet_sockopt.recverr_rfc4884 = inet->recverr_rfc4884; + inet_sockopt.defer_connect = inet->defer_connect; + if (nla_put(skb, INET_DIAG_SOCKOPT, sizeof(inet_sockopt), + &inet_sockopt)) + goto errout; + return 0; errout: return 1; diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 239e54474b65..8cbe74313f38 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -228,7 +228,7 @@ static void inet_unhash2(struct inet_hashinfo *h, struct sock *sk) static inline int compute_score(struct sock *sk, struct net *net, const unsigned short hnum, const __be32 daddr, - const int dif, const int sdif, bool exact_dif) + const int dif, const int sdif) { int score = -1; @@ -277,15 +277,13 @@ static struct sock *inet_lhash2_lookup(struct net *net, const __be32 daddr, const unsigned short hnum, const int dif, const int sdif) { - bool exact_dif = inet_exact_dif_match(net, skb); struct inet_connection_sock *icsk; struct sock *sk, *result = NULL; int score, hiscore = 0; inet_lhash2_for_each_icsk_rcu(icsk, &ilb2->head) { sk = (struct sock *)icsk; - score = compute_score(sk, net, hnum, daddr, - dif, sdif, exact_dif); + score = compute_score(sk, net, hnum, daddr, dif, sdif); if (score > hiscore) { result = lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 4e31f23e4117..e70291748889 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -625,9 +625,7 @@ static netdev_tx_t 
ipgre_xmit(struct sk_buff *skb, } if (dev->header_ops) { - /* Need space for new headers */ - if (skb_cow_head(skb, dev->needed_headroom - - (tunnel->hlen + sizeof(struct iphdr)))) + if (skb_cow_head(skb, 0)) goto free_skb; tnl_params = (const struct iphdr *)skb->data; @@ -748,7 +746,11 @@ static void ipgre_link_update(struct net_device *dev, bool set_mtu) len = tunnel->tun_hlen - len; tunnel->hlen = tunnel->hlen + len; - dev->needed_headroom = dev->needed_headroom + len; + if (dev->header_ops) + dev->hard_header_len += len; + else + dev->needed_headroom += len; + if (set_mtu) dev->mtu = max_t(int, dev->mtu - len, 68); @@ -944,6 +946,7 @@ static void __gre_tunnel_init(struct net_device *dev) tunnel->parms.iph.protocol = IPPROTO_GRE; tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen; + dev->needed_headroom = tunnel->hlen + sizeof(tunnel->parms.iph); dev->features |= GRE_FEATURES; dev->hw_features |= GRE_FEATURES; @@ -987,10 +990,14 @@ static int ipgre_tunnel_init(struct net_device *dev) return -EINVAL; dev->flags = IFF_BROADCAST; dev->header_ops = &ipgre_header_ops; + dev->hard_header_len = tunnel->hlen + sizeof(*iph); + dev->needed_headroom = 0; } #endif } else if (!tunnel->collect_md) { dev->header_ops = &ipgre_header_ops; + dev->hard_header_len = tunnel->hlen + sizeof(*iph); + dev->needed_headroom = 0; } return ip_tunnel_init(dev); diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index 948747aac4e2..da1b5038bdfd 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -47,32 +47,32 @@ void ip_options_build(struct sk_buff *skb, struct ip_options *opt, unsigned char *iph = skb_network_header(skb); memcpy(&(IPCB(skb)->opt), opt, sizeof(struct ip_options)); - memcpy(iph+sizeof(struct iphdr), opt->__data, opt->optlen); + memcpy(iph + sizeof(struct iphdr), opt->__data, opt->optlen); opt = &(IPCB(skb)->opt); if (opt->srr) - memcpy(iph+opt->srr+iph[opt->srr+1]-4, &daddr, 4); + memcpy(iph + opt->srr + iph[opt->srr + 1] - 4, &daddr, 4); if (!is_frag) { if (opt->rr_needaddr) - ip_rt_get_source(iph+opt->rr+iph[opt->rr+2]-5, skb, rt); + ip_rt_get_source(iph + opt->rr + iph[opt->rr + 2] - 5, skb, rt); if (opt->ts_needaddr) - ip_rt_get_source(iph+opt->ts+iph[opt->ts+2]-9, skb, rt); + ip_rt_get_source(iph + opt->ts + iph[opt->ts + 2] - 9, skb, rt); if (opt->ts_needtime) { __be32 midtime; midtime = inet_current_timestamp(); - memcpy(iph+opt->ts+iph[opt->ts+2]-5, &midtime, 4); + memcpy(iph + opt->ts + iph[opt->ts + 2] - 5, &midtime, 4); } return; } if (opt->rr) { - memset(iph+opt->rr, IPOPT_NOP, iph[opt->rr+1]); + memset(iph + opt->rr, IPOPT_NOP, iph[opt->rr + 1]); opt->rr = 0; opt->rr_needaddr = 0; } if (opt->ts) { - memset(iph+opt->ts, IPOPT_NOP, iph[opt->ts+1]); + memset(iph + opt->ts, IPOPT_NOP, iph[opt->ts + 1]); opt->ts = 0; opt->ts_needaddr = opt->ts_needtime = 0; } @@ -495,26 +495,29 @@ EXPORT_SYMBOL(ip_options_compile); void ip_options_undo(struct ip_options *opt) { if (opt->srr) { - unsigned char *optptr = opt->__data+opt->srr-sizeof(struct iphdr); - memmove(optptr+7, optptr+3, optptr[1]-7); - memcpy(optptr+3, &opt->faddr, 4); + unsigned char *optptr = opt->__data + opt->srr - sizeof(struct iphdr); + + memmove(optptr + 7, optptr + 3, optptr[1] - 7); + memcpy(optptr + 3, &opt->faddr, 4); } if (opt->rr_needaddr) { - unsigned char *optptr = opt->__data+opt->rr-sizeof(struct iphdr); + unsigned char *optptr = opt->__data + opt->rr - sizeof(struct iphdr); + optptr[2] -= 4; - memset(&optptr[optptr[2]-1], 0, 4); + memset(&optptr[optptr[2] - 1], 0, 4); } if (opt->ts) { - unsigned 
char *optptr = opt->__data+opt->ts-sizeof(struct iphdr); + unsigned char *optptr = opt->__data + opt->ts - sizeof(struct iphdr); + if (opt->ts_needtime) { optptr[2] -= 4; - memset(&optptr[optptr[2]-1], 0, 4); - if ((optptr[3]&0xF) == IPOPT_TS_PRESPEC) + memset(&optptr[optptr[2] - 1], 0, 4); + if ((optptr[3] & 0xF) == IPOPT_TS_PRESPEC) optptr[2] -= 4; } if (opt->ts_needaddr) { optptr[2] -= 4; - memset(&optptr[optptr[2]-1], 0, 4); + memset(&optptr[optptr[2] - 1], 0, 4); } } } diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 5131cf70672a..879b76ae4435 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -143,7 +143,8 @@ static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst) * */ int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk, - __be32 saddr, __be32 daddr, struct ip_options_rcu *opt) + __be32 saddr, __be32 daddr, struct ip_options_rcu *opt, + u8 tos) { struct inet_sock *inet = inet_sk(sk); struct rtable *rt = skb_rtable(skb); @@ -156,7 +157,7 @@ int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk, iph = ip_hdr(skb); iph->version = 4; iph->ihl = 5; - iph->tos = inet->tos; + iph->tos = tos; iph->ttl = ip_select_ttl(inet, &rt->dst); iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr); iph->saddr = saddr; @@ -997,7 +998,7 @@ static int __ip_append_data(struct sock *sk, fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; - maxnonfragsize = ip_sk_ignore_df(sk) ? 0xFFFF : mtu; + maxnonfragsize = ip_sk_ignore_df(sk) ? IP_MAX_MTU : mtu; if (cork->length + length > maxnonfragsize - fragheaderlen) { ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, @@ -1352,7 +1353,7 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page, if (cork->flags & IPCORK_OPT) opt = cork->opt; - if (!(rt->dst.dev->features&NETIF_F_SG)) + if (!(rt->dst.dev->features & NETIF_F_SG)) return -EOPNOTSUPP; hh_len = LL_RESERVED_SPACE(rt->dst.dev); @@ -1537,7 +1538,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk, ip_select_ident(net, skb, sk); if (opt) { - iph->ihl += opt->optlen>>2; + iph->ihl += opt->optlen >> 2; ip_options_build(skb, opt, cork->addr, rt, 0); } diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index d2c223554ff7..ec6036713e2c 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -1124,8 +1124,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, int optname, dev_put(dev); err = -EINVAL; - if (sk->sk_bound_dev_if && - (!midx || midx != sk->sk_bound_dev_if)) + if (sk->sk_bound_dev_if && midx != sk->sk_bound_dev_if) break; inet->uc_index = ifindex; @@ -1189,7 +1188,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, int optname, err = -EINVAL; if (sk->sk_bound_dev_if && mreq.imr_ifindex != sk->sk_bound_dev_if && - (!midx || midx != sk->sk_bound_dev_if)) + midx != sk->sk_bound_dev_if) break; inet->mc_index = mreq.imr_ifindex; diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 0c1f36404471..8b04d1dcfec4 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -360,7 +360,6 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb, const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst, bool log_ecn_error) { - struct pcpu_sw_netstats *tstats; const struct iphdr *iph = ip_hdr(skb); int err; @@ -402,12 +401,7 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb, } } - tstats = this_cpu_ptr(tunnel->dev->tstats); - 
u64_stats_update_begin(&tstats->syncp); - tstats->rx_packets++; - tstats->rx_bytes += skb->len; - u64_stats_update_end(&tstats->syncp); - + dev_sw_netstats_rx_add(tunnel->dev, skb->len); skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev))); if (tunnel->dev->type == ARPHRD_ETHER) { diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index b2ea1a8c5fd6..25f1caf5abf9 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -433,29 +433,8 @@ EXPORT_SYMBOL(skb_tunnel_check_pmtu); void ip_tunnel_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *tot) { - int i; - netdev_stats_to_stats64(tot, &dev->stats); - - for_each_possible_cpu(i) { - const struct pcpu_sw_netstats *tstats = - per_cpu_ptr(dev->tstats, i); - u64 rx_packets, rx_bytes, tx_packets, tx_bytes; - unsigned int start; - - do { - start = u64_stats_fetch_begin_irq(&tstats->syncp); - rx_packets = tstats->rx_packets; - tx_packets = tstats->tx_packets; - rx_bytes = tstats->rx_bytes; - tx_bytes = tstats->tx_bytes; - } while (u64_stats_fetch_retry_irq(&tstats->syncp, start)); - - tot->rx_packets += rx_packets; - tot->tx_packets += tx_packets; - tot->rx_bytes += rx_bytes; - tot->tx_bytes += tx_bytes; - } + dev_fetch_sw_netstats(tot, dev->tstats); } EXPORT_SYMBOL_GPL(ip_tunnel_get_stats64); diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index f687abb069fa..b957cbee2cf7 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -95,7 +95,6 @@ static int vti_rcv_cb(struct sk_buff *skb, int err) { unsigned short family; struct net_device *dev; - struct pcpu_sw_netstats *tstats; struct xfrm_state *x; const struct xfrm_mode *inner_mode; struct ip_tunnel *tunnel = XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4; @@ -138,13 +137,7 @@ static int vti_rcv_cb(struct sk_buff *skb, int err) skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(skb->dev))); skb->dev = dev; - - tstats = this_cpu_ptr(dev->tstats); - - u64_stats_update_begin(&tstats->syncp); - tstats->rx_packets++; - tstats->rx_bytes += skb->len; - u64_stats_update_end(&tstats->syncp); + dev_sw_netstats_rx_add(dev, skb->len); return 0; } diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 876fd6ff1ff9..939792a38814 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1038,10 +1038,13 @@ static int ipmr_cache_report(struct mr_table *mrt, memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr)); msg->im_msgtype = assert; msg->im_mbz = 0; - if (assert == IGMPMSG_WRVIFWHOLE) + if (assert == IGMPMSG_WRVIFWHOLE) { msg->im_vif = vifi; - else + msg->im_vif_hi = vifi >> 8; + } else { msg->im_vif = mrt->mroute_reg_vif_num; + msg->im_vif_hi = mrt->mroute_reg_vif_num >> 8; + } ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2; ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) + sizeof(struct iphdr)); @@ -1054,6 +1057,7 @@ static int ipmr_cache_report(struct mr_table *mrt, ip_hdr(skb)->protocol = 0; msg = (struct igmpmsg *)skb_network_header(skb); msg->im_vif = vifi; + msg->im_vif_hi = vifi >> 8; skb_dst_set(skb, dst_clone(skb_dst(pkt))); /* Add our header */ igmp = skb_put(skb, sizeof(struct igmphdr)); @@ -2396,6 +2400,7 @@ static size_t igmpmsg_netlink_msgsize(size_t payloadlen) + nla_total_size(4) /* IPMRA_CREPORT_VIF_ID */ + nla_total_size(4) /* IPMRA_CREPORT_SRC_ADDR */ + nla_total_size(4) /* IPMRA_CREPORT_DST_ADDR */ + + nla_total_size(4) /* IPMRA_CREPORT_TABLE */ /* IPMRA_CREPORT_PKT */ + nla_total_size(payloadlen) ; @@ -2427,11 +2432,12 @@ static void igmpmsg_netlink_event(struct mr_table *mrt, struct sk_buff *pkt) rtgenm = nlmsg_data(nlh); 
rtgenm->rtgen_family = RTNL_FAMILY_IPMR; if (nla_put_u8(skb, IPMRA_CREPORT_MSGTYPE, msg->im_msgtype) || - nla_put_u32(skb, IPMRA_CREPORT_VIF_ID, msg->im_vif) || + nla_put_u32(skb, IPMRA_CREPORT_VIF_ID, msg->im_vif | (msg->im_vif_hi << 8)) || nla_put_in_addr(skb, IPMRA_CREPORT_SRC_ADDR, msg->im_src.s_addr) || nla_put_in_addr(skb, IPMRA_CREPORT_DST_ADDR, - msg->im_dst.s_addr)) + msg->im_dst.s_addr) || + nla_put_u32(skb, IPMRA_CREPORT_TABLE, mrt->id)) goto nla_put_failure; nla = nla_reserve(skb, IPMRA_CREPORT_PKT, payloadlen); diff --git a/net/ipv4/netfilter/nf_log_arp.c b/net/ipv4/netfilter/nf_log_arp.c index 7a83f881efa9..136030ad2e54 100644 --- a/net/ipv4/netfilter/nf_log_arp.c +++ b/net/ipv4/netfilter/nf_log_arp.c @@ -43,16 +43,31 @@ static void dump_arp_packet(struct nf_log_buf *m, const struct nf_loginfo *info, const struct sk_buff *skb, unsigned int nhoff) { - const struct arphdr *ah; - struct arphdr _arph; const struct arppayload *ap; struct arppayload _arpp; + const struct arphdr *ah; + unsigned int logflags; + struct arphdr _arph; ah = skb_header_pointer(skb, 0, sizeof(_arph), &_arph); if (ah == NULL) { nf_log_buf_add(m, "TRUNCATED"); return; } + + if (info->type == NF_LOG_TYPE_LOG) + logflags = info->u.log.logflags; + else + logflags = NF_LOG_DEFAULT_MASK; + + if (logflags & NF_LOG_MACDECODE) { + nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM ", + eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest); + nf_log_dump_vlan(m, skb); + nf_log_buf_add(m, "MACPROTO=%04x ", + ntohs(eth_hdr(skb)->h_proto)); + } + nf_log_buf_add(m, "ARP HTYPE=%d PTYPE=0x%04x OPCODE=%d", ntohs(ah->ar_hrd), ntohs(ah->ar_pro), ntohs(ah->ar_op)); diff --git a/net/ipv4/netfilter/nf_log_ipv4.c b/net/ipv4/netfilter/nf_log_ipv4.c index 0c72156130b6..d07583fac8f8 100644 --- a/net/ipv4/netfilter/nf_log_ipv4.c +++ b/net/ipv4/netfilter/nf_log_ipv4.c @@ -284,8 +284,10 @@ static void dump_ipv4_mac_header(struct nf_log_buf *m, switch (dev->type) { case ARPHRD_ETHER: - nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM MACPROTO=%04x ", - eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest, + nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM ", + eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest); + nf_log_dump_vlan(m, skb); + nf_log_buf_add(m, "MACPROTO=%04x ", ntohs(eth_hdr(skb)->h_proto)); return; default: diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 134e92382275..8c0f17c6863c 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -42,8 +42,8 @@ static int call_nexthop_notifiers(struct net *net, { int err; - err = atomic_notifier_call_chain(&net->nexthop.notifier_chain, - event_type, nh); + err = blocking_notifier_call_chain(&net->nexthop.notifier_chain, + event_type, nh); return notifier_to_errno(err); } @@ -133,12 +133,9 @@ static struct nexthop *nexthop_alloc(void) static struct nh_group *nexthop_grp_alloc(u16 num_nh) { - size_t sz = offsetof(struct nexthop, nh_grp) - + sizeof(struct nh_group) - + sizeof(struct nh_grp_entry) * num_nh; struct nh_group *nhg; - nhg = kzalloc(sz, GFP_KERNEL); + nhg = kzalloc(struct_size(nhg, nh_entries, num_nh), GFP_KERNEL); if (nhg) nhg->num_nh = num_nh; @@ -279,7 +276,7 @@ static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh, case AF_INET: fib_nh = &nhi->fib_nh; if (fib_nh->fib_nh_gw_family && - nla_put_u32(skb, NHA_GATEWAY, fib_nh->fib_nh_gw4)) + nla_put_be32(skb, NHA_GATEWAY, fib_nh->fib_nh_gw4)) goto nla_put_failure; break; @@ -800,7 +797,7 @@ static void remove_nh_grp_entry(struct net *net, struct nh_grp_entry *nhge, return; } - newg->has_v4 = nhg->has_v4; + newg->has_v4 = false; newg->mpath = 
nhg->mpath; newg->fdb_nh = nhg->fdb_nh; newg->num_nh = nhg->num_nh; @@ -809,12 +806,18 @@ static void remove_nh_grp_entry(struct net *net, struct nh_grp_entry *nhge, nhges = nhg->nh_entries; new_nhges = newg->nh_entries; for (i = 0, j = 0; i < nhg->num_nh; ++i) { + struct nh_info *nhi; + /* current nexthop getting removed */ if (nhg->nh_entries[i].nh == nh) { newg->num_nh--; continue; } + nhi = rtnl_dereference(nhges[i].nh->nh_info); + if (nhi->family == AF_INET) + newg->has_v4 = true; + list_del(&nhges[i].nh_list); new_nhges[j].nh_parent = nhges[i].nh_parent; new_nhges[j].nh = nhges[i].nh; @@ -867,8 +870,6 @@ static void __remove_nexthop_fib(struct net *net, struct nexthop *nh) bool do_flush = false; struct fib_info *fi; - call_nexthop_notifiers(net, NEXTHOP_EVENT_DEL, nh); - list_for_each_entry(fi, &nh->fi_list, nh_list) { fi->fib_flags |= RTNH_F_DEAD; do_flush = true; @@ -906,6 +907,8 @@ static void __remove_nexthop(struct net *net, struct nexthop *nh, static void remove_nexthop(struct net *net, struct nexthop *nh, struct nl_info *nlinfo) { + call_nexthop_notifiers(net, NEXTHOP_EVENT_DEL, nh); + /* remove from the tree */ rb_erase(&nh->rb_node, &net->nexthop.rb_root); @@ -961,6 +964,23 @@ static int replace_nexthop_grp(struct net *net, struct nexthop *old, return 0; } +static void nh_group_v4_update(struct nh_group *nhg) +{ + struct nh_grp_entry *nhges; + bool has_v4 = false; + int i; + + nhges = nhg->nh_entries; + for (i = 0; i < nhg->num_nh; i++) { + struct nh_info *nhi; + + nhi = rtnl_dereference(nhges[i].nh->nh_info); + if (nhi->family == AF_INET) + has_v4 = true; + } + nhg->has_v4 = has_v4; +} + static int replace_nexthop_single(struct net *net, struct nexthop *old, struct nexthop *new, struct netlink_ext_ack *extack) @@ -984,6 +1004,21 @@ static int replace_nexthop_single(struct net *net, struct nexthop *old, rcu_assign_pointer(old->nh_info, newi); rcu_assign_pointer(new->nh_info, oldi); + /* When replacing an IPv4 nexthop with an IPv6 nexthop, potentially + * update IPv4 indication in all the groups using the nexthop. 
+ */ + if (oldi->family == AF_INET && newi->family == AF_INET6) { + struct nh_grp_entry *nhge; + + list_for_each_entry(nhge, &old->grp_list, nh_list) { + struct nexthop *nhp = nhge->nh_parent; + struct nh_group *nhg; + + nhg = rtnl_dereference(nhp->nh_grp); + nh_group_v4_update(nhg); + } + } + return 0; } @@ -1101,7 +1136,7 @@ static int insert_nexthop(struct net *net, struct nexthop *new_nh, while (1) { struct nexthop *nh; - next = rtnl_dereference(*pp); + next = *pp; if (!next) break; @@ -1924,14 +1959,15 @@ static struct notifier_block nh_netdev_notifier = { int register_nexthop_notifier(struct net *net, struct notifier_block *nb) { - return atomic_notifier_chain_register(&net->nexthop.notifier_chain, nb); + return blocking_notifier_chain_register(&net->nexthop.notifier_chain, + nb); } EXPORT_SYMBOL(register_nexthop_notifier); int unregister_nexthop_notifier(struct net *net, struct notifier_block *nb) { - return atomic_notifier_chain_unregister(&net->nexthop.notifier_chain, - nb); + return blocking_notifier_chain_unregister(&net->nexthop.notifier_chain, + nb); } EXPORT_SYMBOL(unregister_nexthop_notifier); @@ -1951,7 +1987,7 @@ static int __net_init nexthop_net_init(struct net *net) net->nexthop.devhash = kzalloc(sz, GFP_KERNEL); if (!net->nexthop.devhash) return -ENOMEM; - ATOMIC_INIT_NOTIFIER_HEAD(&net->nexthop.notifier_chain); + BLOCKING_INIT_NOTIFIER_HEAD(&net->nexthop.notifier_chain); return 0; } diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index df6fbefe44d4..248856b301c4 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -293,7 +293,8 @@ EXPORT_SYMBOL_GPL(ping_close); /* Checks the bind address and possibly modifies sk->sk_bound_dev_if. */ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk, - struct sockaddr *uaddr, int addr_len) { + struct sockaddr *uaddr, int addr_len) +{ struct net *net = sock_net(sk); if (sk->sk_family == AF_INET) { struct sockaddr_in *addr = (struct sockaddr_in *) uaddr; @@ -310,10 +311,10 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk, pr_debug("ping_check_bind_addr(sk=%p,addr=%pI4,port=%d)\n", sk, &addr->sin_addr.s_addr, ntohs(addr->sin_port)); - chk_addr_ret = inet_addr_type(net, addr->sin_addr.s_addr); - if (addr->sin_addr.s_addr == htonl(INADDR_ANY)) chk_addr_ret = RTN_LOCAL; + else + chk_addr_ret = inet_addr_type(net, addr->sin_addr.s_addr); if ((!inet_can_nonlocal_bind(net, isk) && chk_addr_ret != RTN_LOCAL) || @@ -383,20 +384,6 @@ static void ping_set_saddr(struct sock *sk, struct sockaddr *saddr) } } -static void ping_clear_saddr(struct sock *sk, int dif) -{ - sk->sk_bound_dev_if = dif; - if (sk->sk_family == AF_INET) { - struct inet_sock *isk = inet_sk(sk); - isk->inet_rcv_saddr = isk->inet_saddr = 0; -#if IS_ENABLED(CONFIG_IPV6) - } else if (sk->sk_family == AF_INET6) { - struct ipv6_pinfo *np = inet6_sk(sk); - memset(&sk->sk_v6_rcv_saddr, 0, sizeof(sk->sk_v6_rcv_saddr)); - memset(&np->saddr, 0, sizeof(np->saddr)); -#endif - } -} /* * We need our own bind because there are no privileged id's == local ports. * Moreover, we don't allow binding to multi- and broadcast addresses. @@ -420,12 +407,13 @@ int ping_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) goto out; err = -EADDRINUSE; - ping_set_saddr(sk, uaddr); snum = ntohs(((struct sockaddr_in *)uaddr)->sin_port); if (ping_get_port(sk, snum) != 0) { - ping_clear_saddr(sk, dif); + /* Restore possibly modified sk->sk_bound_dev_if by ping_check_bind_addr(). 
*/ + sk->sk_bound_dev_if = dif; goto out; } + ping_set_saddr(sk, uaddr); pr_debug("after bind(): num = %hu, dif = %d\n", isk->inet_num, @@ -647,7 +635,8 @@ static int ping_v4_push_pending_frames(struct sock *sk, struct pingfakehdr *pfh, } int ping_common_sendmsg(int family, struct msghdr *msg, size_t len, - void *user_icmph, size_t icmph_len) { + void *user_icmph, size_t icmph_len) +{ u8 type, code; if (len > 0xFFFF) diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 355f3ca868af..7d26e0f8bdae 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -260,11 +260,12 @@ static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info) err = EHOSTUNREACH; if (code > NR_ICMP_UNREACH) break; - err = icmp_err_convert[code].errno; - harderr = icmp_err_convert[code].fatal; if (code == ICMP_FRAG_NEEDED) { harderr = inet->pmtudisc != IP_PMTUDISC_DONT; err = EMSGSIZE; + } else { + err = icmp_err_convert[code].errno; + harderr = icmp_err_convert[code].fatal; } } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 58642b29a499..dc2a399cd9f4 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -623,7 +623,7 @@ static inline u32 fnhe_hashfun(__be32 daddr) u32 hval; net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd)); - hval = jhash_1word((__force u32) daddr, fnhe_hashrnd); + hval = jhash_1word((__force u32)daddr, fnhe_hashrnd); return hash_32(hval, FNHE_HASH_SHIFT); } @@ -1016,13 +1016,14 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) { struct dst_entry *dst = &rt->dst; struct net *net = dev_net(dst->dev); - u32 old_mtu = ipv4_mtu(dst); struct fib_result res; bool lock = false; + u32 old_mtu; if (ip_mtu_locked(dst)) return; + old_mtu = ipv4_mtu(dst); if (old_mtu < mtu) return; @@ -1066,7 +1067,7 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, int oif, u8 protocol) { - const struct iphdr *iph = (const struct iphdr *) skb->data; + const struct iphdr *iph = (const struct iphdr *)skb->data; struct flowi4 fl4; struct rtable *rt; u32 mark = IP4_REPLY_MARK(net, skb->mark); @@ -1083,7 +1084,7 @@ EXPORT_SYMBOL_GPL(ipv4_update_pmtu); static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) { - const struct iphdr *iph = (const struct iphdr *) skb->data; + const struct iphdr *iph = (const struct iphdr *)skb->data; struct flowi4 fl4; struct rtable *rt; @@ -1101,7 +1102,7 @@ static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) { - const struct iphdr *iph = (const struct iphdr *) skb->data; + const struct iphdr *iph = (const struct iphdr *)skb->data; struct flowi4 fl4; struct rtable *rt; struct dst_entry *odst = NULL; @@ -1131,7 +1132,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) new = true; } - __ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu); + __ip_rt_update_pmtu((struct rtable *)xfrm_dst_path(&rt->dst), &fl4, mtu); if (!dst_check(&rt->dst, 0)) { if (new) @@ -1156,7 +1157,7 @@ EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu); void ipv4_redirect(struct sk_buff *skb, struct net *net, int oif, u8 protocol) { - const struct iphdr *iph = (const struct iphdr *) skb->data; + const struct iphdr *iph = (const struct iphdr *)skb->data; struct flowi4 fl4; struct rtable *rt; @@ -1172,7 +1173,7 @@ EXPORT_SYMBOL_GPL(ipv4_redirect); void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk) { - const struct iphdr *iph = (const 
struct iphdr *) skb->data; + const struct iphdr *iph = (const struct iphdr *)skb->data; struct flowi4 fl4; struct rtable *rt; struct net *net = sock_net(sk); @@ -1312,7 +1313,7 @@ static unsigned int ipv4_default_advmss(const struct dst_entry *dst) static unsigned int ipv4_mtu(const struct dst_entry *dst) { - const struct rtable *rt = (const struct rtable *) dst; + const struct rtable *rt = (const struct rtable *)dst; unsigned int mtu = rt->rt_pmtu; if (!mtu || time_after_eq(jiffies, rt->dst.expires)) @@ -2769,10 +2770,12 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4, if (IS_ERR(rt)) return rt; - if (flp4->flowi4_proto) + if (flp4->flowi4_proto) { + flp4->flowi4_oif = rt->dst.dev->ifindex; rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst, flowi4_to_flowi(flp4), sk, 0); + } return rt; } diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index e03756631541..6ac473b47f30 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -286,11 +286,10 @@ struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk, struct sk_buff *skb) { + struct tcp_request_sock *treq; struct request_sock *req; #ifdef CONFIG_MPTCP - struct tcp_request_sock *treq; - if (sk_is_mptcp(sk)) ops = &mptcp_subflow_request_sock_ops; #endif @@ -299,8 +298,9 @@ struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops, if (!req) return NULL; -#if IS_ENABLED(CONFIG_MPTCP) treq = tcp_rsk(req); + treq->syn_tos = TCP_SKB_CB(skb)->ip_dsfield; +#if IS_ENABLED(CONFIG_MPTCP) treq->is_mptcp = sk_is_mptcp(sk); if (treq->is_mptcp) { int err = mptcp_subflow_init_cookie_req(req, sk, skb); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 54023a46db04..3e5f4f2e705e 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -1330,6 +1330,15 @@ static struct ctl_table ipv4_net_table[] = { .extra2 = &comp_sack_nr_max, }, { + .procname = "tcp_reflect_tos", + .data = &init_net.ipv4.sysctl_tcp_reflect_tos, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { .procname = "udp_rmem_min", .data = &init_net.ipv4.sysctl_udp_rmem_min, .maxlen = sizeof(init_net.ipv4.sysctl_udp_rmem_min), diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 2135ee7c806d..bae4284bf542 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -418,6 +418,8 @@ void tcp_init_sock(struct sock *sk) INIT_LIST_HEAD(&tp->tsorted_sent_queue); icsk->icsk_rto = TCP_TIMEOUT_INIT; + icsk->icsk_rto_min = TCP_RTO_MIN; + icsk->icsk_delack_max = TCP_DELACK_MAX; tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); minmax_reset(&tp->rtt_min, tcp_jiffies32, ~0U); @@ -562,7 +564,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) mask |= EPOLLIN | EPOLLRDNORM; if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { - if (sk_stream_is_writeable(sk)) { + if (__sk_stream_is_writeable(sk, 1)) { mask |= EPOLLOUT | EPOLLWRNORM; } else { /* send SIGIO later */ sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); @@ -574,7 +576,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) * pairs with the input side. 
*/ smp_mb__after_atomic(); - if (sk_stream_is_writeable(sk)) + if (__sk_stream_is_writeable(sk, 1)) mask |= EPOLLOUT | EPOLLWRNORM; } } else @@ -1003,12 +1005,12 @@ ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset, !tcp_skb_can_collapse_to(skb)) { new_segment: if (!sk_stream_memory_free(sk)) - goto wait_for_sndbuf; + goto wait_for_space; skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, tcp_rtx_and_write_queues_empty(sk)); if (!skb) - goto wait_for_memory; + goto wait_for_space; #ifdef CONFIG_TLS_DEVICE skb->decrypted = !!(flags & MSG_SENDPAGE_DECRYPTED); @@ -1027,7 +1029,7 @@ new_segment: goto new_segment; } if (!sk_wmem_schedule(sk, copy)) - goto wait_for_memory; + goto wait_for_space; if (can_coalesce) { skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); @@ -1068,9 +1070,8 @@ new_segment: tcp_push_one(sk, mss_now); continue; -wait_for_sndbuf: +wait_for_space: set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); -wait_for_memory: tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH, size_goal); @@ -1281,7 +1282,7 @@ restart: new_segment: if (!sk_stream_memory_free(sk)) - goto wait_for_sndbuf; + goto wait_for_space; if (unlikely(process_backlog >= 16)) { process_backlog = 0; @@ -1292,7 +1293,7 @@ new_segment: skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, first_skb); if (!skb) - goto wait_for_memory; + goto wait_for_space; process_backlog++; skb->ip_summed = CHECKSUM_PARTIAL; @@ -1325,7 +1326,7 @@ new_segment: struct page_frag *pfrag = sk_page_frag(sk); if (!sk_page_frag_refill(sk, pfrag)) - goto wait_for_memory; + goto wait_for_space; if (!skb_can_coalesce(skb, i, pfrag->page, pfrag->offset)) { @@ -1339,7 +1340,7 @@ new_segment: copy = min_t(int, copy, pfrag->size - pfrag->offset); if (!sk_wmem_schedule(sk, copy)) - goto wait_for_memory; + goto wait_for_space; err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb, pfrag->page, @@ -1392,9 +1393,8 @@ new_segment: tcp_push_one(sk, mss_now); continue; -wait_for_sndbuf: +wait_for_space: set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); -wait_for_memory: if (copied) tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH, size_goal); @@ -1526,7 +1526,7 @@ static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len) * calculation of whether or not we must ACK for the sake of * a window update. */ -static void tcp_cleanup_rbuf(struct sock *sk, int copied) +void tcp_cleanup_rbuf(struct sock *sk, int copied) { struct tcp_sock *tp = tcp_sk(sk); bool time_to_ack = false; @@ -1539,10 +1539,8 @@ static void tcp_cleanup_rbuf(struct sock *sk, int copied) if (inet_csk_ack_scheduled(sk)) { const struct inet_connection_sock *icsk = inet_csk(sk); - /* Delayed ACKs frequently hit locked sockets during bulk - * receive. 
*/ - if (icsk->icsk_ack.blocked || - /* Once-per-two-segments ACK was not sent by tcp_input.c */ + + if (/* Once-per-two-segments ACK was not sent by tcp_input.c */ tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss || /* * If this read emptied read buffer, we send ACK, if @@ -2686,6 +2684,8 @@ int tcp_disconnect(struct sock *sk, int flags) icsk->icsk_backoff = 0; icsk->icsk_probes_out = 0; icsk->icsk_rto = TCP_TIMEOUT_INIT; + icsk->icsk_rto_min = TCP_RTO_MIN; + icsk->icsk_delack_max = TCP_DELACK_MAX; tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; tp->snd_cwnd = TCP_INIT_CWND; tp->snd_cwnd_cnt = 0; @@ -2695,6 +2695,7 @@ int tcp_disconnect(struct sock *sk, int flags) if (icsk->icsk_ca_ops->release) icsk->icsk_ca_ops->release(sk); memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); + icsk->icsk_ca_initialized = 0; tcp_set_ca_state(sk, TCP_CA_Open); tp->is_sack_reneg = 0; tcp_clear_retrans(tp); @@ -3046,7 +3047,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, int optname, name[val] = 0; lock_sock(sk); - err = tcp_set_congestion_control(sk, name, true, true, + err = tcp_set_congestion_control(sk, name, true, ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)); release_sock(sk); @@ -3208,7 +3209,8 @@ static int do_tcp_setsockopt(struct sock *sk, int level, int optname, break; case TCP_SAVE_SYN: - if (val < 0 || val > 1) + /* 0: disable, 1: enable, 2: start from ether_header */ + if (val < 0 || val > 2) err = -EINVAL; else tp->save_syn = val; @@ -3789,20 +3791,21 @@ static int do_tcp_getsockopt(struct sock *sk, int level, lock_sock(sk); if (tp->saved_syn) { - if (len < tp->saved_syn[0]) { - if (put_user(tp->saved_syn[0], optlen)) { + if (len < tcp_saved_syn_len(tp->saved_syn)) { + if (put_user(tcp_saved_syn_len(tp->saved_syn), + optlen)) { release_sock(sk); return -EFAULT; } release_sock(sk); return -EINVAL; } - len = tp->saved_syn[0]; + len = tcp_saved_syn_len(tp->saved_syn); if (put_user(len, optlen)) { release_sock(sk); return -EFAULT; } - if (copy_to_user(optval, tp->saved_syn + 1, len)) { + if (copy_to_user(optval, tp->saved_syn->data, len)) { release_sock(sk); return -EFAULT; } diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 7aa68f4aae6c..37f4cb2bba5c 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -567,10 +567,9 @@ static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS], prot[TCP_BPF_TX].sendpage = tcp_bpf_sendpage; } -static void tcp_bpf_check_v6_needs_rebuild(struct sock *sk, struct proto *ops) +static void tcp_bpf_check_v6_needs_rebuild(struct proto *ops) { - if (sk->sk_family == AF_INET6 && - unlikely(ops != smp_load_acquire(&tcpv6_prot_saved))) { + if (unlikely(ops != smp_load_acquire(&tcpv6_prot_saved))) { spin_lock_bh(&tcpv6_prot_lock); if (likely(ops != tcpv6_prot_saved)) { tcp_bpf_rebuild_protos(tcp_bpf_prots[TCP_BPF_IPV6], ops); @@ -603,13 +602,11 @@ struct proto *tcp_bpf_get_proto(struct sock *sk, struct sk_psock *psock) int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4; int config = psock->progs.msg_parser ? 
TCP_BPF_TX : TCP_BPF_BASE; - if (!psock->sk_proto) { - struct proto *ops = READ_ONCE(sk->sk_prot); - - if (tcp_bpf_assert_proto_ops(ops)) + if (sk->sk_family == AF_INET6) { + if (tcp_bpf_assert_proto_ops(psock->sk_proto)) return ERR_PTR(-EINVAL); - tcp_bpf_check_v6_needs_rebuild(sk, ops); + tcp_bpf_check_v6_needs_rebuild(psock->sk_proto); } return &tcp_bpf_prots[family][config]; diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 62878cf26d9c..db47ac24d057 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -176,7 +176,7 @@ void tcp_assign_congestion_control(struct sock *sk) void tcp_init_congestion_control(struct sock *sk) { - const struct inet_connection_sock *icsk = inet_csk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); tcp_sk(sk)->prior_ssthresh = 0; if (icsk->icsk_ca_ops->init) @@ -185,6 +185,7 @@ void tcp_init_congestion_control(struct sock *sk) INET_ECN_xmit(sk); else INET_ECN_dontxmit(sk); + icsk->icsk_ca_initialized = 1; } static void tcp_reinit_congestion_control(struct sock *sk, @@ -340,7 +341,7 @@ out: * already initialized. */ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, - bool reinit, bool cap_net_admin) + bool cap_net_admin) { struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_congestion_ops *ca; @@ -361,28 +362,14 @@ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, goto out; } - if (!ca) { + if (!ca) err = -ENOENT; - } else if (!load) { - const struct tcp_congestion_ops *old_ca = icsk->icsk_ca_ops; - - if (bpf_try_module_get(ca, ca->owner)) { - if (reinit) { - tcp_reinit_congestion_control(sk, ca); - } else { - icsk->icsk_ca_ops = ca; - bpf_module_put(old_ca, old_ca->owner); - } - } else { - err = -EBUSY; - } - } else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || cap_net_admin)) { + else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || cap_net_admin)) err = -EPERM; - } else if (!bpf_try_module_get(ca, ca->owner)) { + else if (!bpf_try_module_get(ca, ca->owner)) err = -EBUSY; - } else { + else tcp_reinit_congestion_control(sk, ca); - } out: rcu_read_unlock(); return err; diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 09b62de04eea..af2814c9342a 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -295,7 +295,7 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk, refcount_set(&req->rsk_refcnt, 2); /* Now finish processing the fastopen child socket. */ - tcp_init_transfer(child, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); + tcp_init_transfer(child, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB, skb); tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index b1ce2054291d..67f10d3ec240 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -138,6 +138,69 @@ void clean_acked_data_flush(void) EXPORT_SYMBOL_GPL(clean_acked_data_flush); #endif +#ifdef CONFIG_CGROUP_BPF +static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb) +{ + bool unknown_opt = tcp_sk(sk)->rx_opt.saw_unknown && + BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), + BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG); + bool parse_all_opt = BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), + BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG); + struct bpf_sock_ops_kern sock_ops; + + if (likely(!unknown_opt && !parse_all_opt)) + return; + + /* The skb will be handled in the + * bpf_skops_established() or + * bpf_skops_write_hdr_opt(). 
+ */ + switch (sk->sk_state) { + case TCP_SYN_RECV: + case TCP_SYN_SENT: + case TCP_LISTEN: + return; + } + + sock_owned_by_me(sk); + + memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); + sock_ops.op = BPF_SOCK_OPS_PARSE_HDR_OPT_CB; + sock_ops.is_fullsock = 1; + sock_ops.sk = sk; + bpf_skops_init_skb(&sock_ops, skb, tcp_hdrlen(skb)); + + BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops); +} + +static void bpf_skops_established(struct sock *sk, int bpf_op, + struct sk_buff *skb) +{ + struct bpf_sock_ops_kern sock_ops; + + sock_owned_by_me(sk); + + memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); + sock_ops.op = bpf_op; + sock_ops.is_fullsock = 1; + sock_ops.sk = sk; + /* sk with TCP_REPAIR_ON does not have skb in tcp_finish_connect */ + if (skb) + bpf_skops_init_skb(&sock_ops, skb, tcp_hdrlen(skb)); + + BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops); +} +#else +static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb) +{ +} + +static void bpf_skops_established(struct sock *sk, int bpf_op, + struct sk_buff *skb) +{ +} +#endif + static void tcp_gro_dev_warn(struct sock *sk, const struct sk_buff *skb, unsigned int len) { @@ -956,7 +1019,11 @@ static void tcp_check_sack_reordering(struct sock *sk, const u32 low_seq, ts ? LINUX_MIB_TCPTSREORDER : LINUX_MIB_TCPSACKREORDER); } -/* This must be called before lost_out is incremented */ + /* This must be called before lost_out or retrans_out are updated + * on a new loss, because we want to know if all skbs previously + * known to be lost have already been retransmitted, indicating + * that this newly lost skb is our next skb to retransmit. + */ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) { if ((!tp->retransmit_skb_hint && tp->retrans_out >= tp->lost_out) || @@ -966,41 +1033,36 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) tp->retransmit_skb_hint = skb; } -/* Sum the number of packets on the wire we have marked as lost. - * There are two cases we care about here: - * a) Packet hasn't been marked lost (nor retransmitted), - * and this is the first loss. - * b) Packet has been marked both lost and retransmitted, - * and this means we think it was lost again. +/* Sum the number of packets on the wire we have marked as lost, and + * notify the congestion control module that the given skb was marked lost. 
*/ -static void tcp_sum_lost(struct tcp_sock *tp, struct sk_buff *skb) +static void tcp_notify_skb_loss_event(struct tcp_sock *tp, const struct sk_buff *skb) { - __u8 sacked = TCP_SKB_CB(skb)->sacked; - - if (!(sacked & TCPCB_LOST) || - ((sacked & TCPCB_LOST) && (sacked & TCPCB_SACKED_RETRANS))) - tp->lost += tcp_skb_pcount(skb); + tp->lost += tcp_skb_pcount(skb); } -static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb) +void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb) { - if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) { - tcp_verify_retransmit_hint(tp, skb); + __u8 sacked = TCP_SKB_CB(skb)->sacked; + struct tcp_sock *tp = tcp_sk(sk); - tp->lost_out += tcp_skb_pcount(skb); - tcp_sum_lost(tp, skb); - TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; - } -} + if (sacked & TCPCB_SACKED_ACKED) + return; -void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb) -{ tcp_verify_retransmit_hint(tp, skb); - - tcp_sum_lost(tp, skb); - if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) { + if (sacked & TCPCB_LOST) { + if (sacked & TCPCB_SACKED_RETRANS) { + /* Account for retransmits that are lost again */ + TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; + tp->retrans_out -= tcp_skb_pcount(skb); + NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT, + tcp_skb_pcount(skb)); + tcp_notify_skb_loss_event(tp, skb); + } + } else { tp->lost_out += tcp_skb_pcount(skb); TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; + tcp_notify_skb_loss_event(tp, skb); } } @@ -2263,7 +2325,8 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) if (cnt > packets) break; - tcp_skb_mark_lost(tp, skb); + if (!(TCP_SKB_CB(skb)->sacked & TCPCB_LOST)) + tcp_mark_skb_lost(sk, skb); if (mark_head) break; @@ -2629,14 +2692,8 @@ void tcp_simple_retransmit(struct sock *sk) unsigned int mss = tcp_current_mss(sk); skb_rbtree_walk(skb, &sk->tcp_rtx_queue) { - if (tcp_skb_seglen(skb) > mss && - !(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) { - if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) { - TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; - tp->retrans_out -= tcp_skb_pcount(skb); - } - tcp_skb_mark_lost_uncond_verify(tp, skb); - } + if (tcp_skb_seglen(skb) > mss) + tcp_mark_skb_lost(sk, skb); } tcp_clear_retrans_hints_partial(tp); @@ -3819,7 +3876,7 @@ static void tcp_parse_fastopen_option(int len, const unsigned char *cookie, foc->exp = exp_opt; } -static void smc_parse_options(const struct tcphdr *th, +static bool smc_parse_options(const struct tcphdr *th, struct tcp_options_received *opt_rx, const unsigned char *ptr, int opsize) @@ -3828,10 +3885,13 @@ static void smc_parse_options(const struct tcphdr *th, if (static_branch_unlikely(&tcp_have_smc)) { if (th->syn && !(opsize & 1) && opsize >= TCPOLEN_EXP_SMC_BASE && - get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC) + get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC) { opt_rx->smc_ok = 1; + return true; + } } #endif + return false; } /* Try to parse the MSS option from the TCP header. 
Return 0 on failure, clamped @@ -3892,6 +3952,7 @@ void tcp_parse_options(const struct net *net, ptr = (const unsigned char *)(th + 1); opt_rx->saw_tstamp = 0; + opt_rx->saw_unknown = 0; while (length > 0) { int opcode = *ptr++; @@ -3982,15 +4043,21 @@ void tcp_parse_options(const struct net *net, */ if (opsize >= TCPOLEN_EXP_FASTOPEN_BASE && get_unaligned_be16(ptr) == - TCPOPT_FASTOPEN_MAGIC) + TCPOPT_FASTOPEN_MAGIC) { tcp_parse_fastopen_option(opsize - TCPOLEN_EXP_FASTOPEN_BASE, ptr + 2, th->syn, foc, true); - else - smc_parse_options(th, opt_rx, ptr, - opsize); + break; + } + + if (smc_parse_options(th, opt_rx, ptr, opsize)) + break; + + opt_rx->saw_unknown = 1; break; + default: + opt_rx->saw_unknown = 1; } ptr += opsize-2; length -= opsize; @@ -4363,7 +4430,8 @@ static void tcp_sack_maybe_coalesce(struct tcp_sock *tp) sp[i] = sp[i + 1]; continue; } - this_sack++, swalk++; + this_sack++; + swalk++; } } @@ -4853,7 +4921,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) int eaten; if (sk_is_mptcp(sk)) - mptcp_incoming_options(sk, skb, &tp->rx_opt); + mptcp_incoming_options(sk, skb); if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) { __kfree_skb(skb); @@ -5277,12 +5345,6 @@ static bool tcp_should_expand_sndbuf(const struct sock *sk) return true; } -/* When incoming ACK allowed to free some skb from write_queue, - * we remember this event in flag SOCK_QUEUE_SHRUNK and wake up socket - * on the exit from tcp input handler. - * - * PROBLEM: sndbuf expansion does not work well with largesend. - */ static void tcp_new_space(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); @@ -5297,16 +5359,13 @@ static void tcp_new_space(struct sock *sk) static void tcp_check_space(struct sock *sk) { - if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) { - sock_reset_flag(sk, SOCK_QUEUE_SHRUNK); - /* pairs with tcp_poll() */ - smp_mb(); - if (sk->sk_socket && - test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { - tcp_new_space(sk); - if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) - tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED); - } + /* pairs with tcp_poll() */ + smp_mb(); + if (sk->sk_socket && + test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { + tcp_new_space(sk); + if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) + tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED); } } @@ -5608,6 +5667,8 @@ syn_challenge: goto discard; } + bpf_skops_parse_hdr(sk, skb); + return true; discard: @@ -5816,7 +5877,7 @@ discard: } EXPORT_SYMBOL(tcp_rcv_established); -void tcp_init_transfer(struct sock *sk, int bpf_op) +void tcp_init_transfer(struct sock *sk, int bpf_op, struct sk_buff *skb) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); @@ -5837,8 +5898,10 @@ void tcp_init_transfer(struct sock *sk, int bpf_op) tp->snd_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk)); tp->snd_cwnd_stamp = tcp_jiffies32; - tcp_call_bpf(sk, bpf_op, 0, NULL); - tcp_init_congestion_control(sk); + icsk->icsk_ca_initialized = 0; + bpf_skops_established(sk, bpf_op, skb); + if (!icsk->icsk_ca_initialized) + tcp_init_congestion_control(sk); tcp_init_buffer_space(sk); } @@ -5856,7 +5919,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) sk_mark_napi_id(sk, skb); } - tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB); + tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB, skb); /* Prevent spurious tcp_cwnd_restart() on first data * packet. 
@@ -6328,7 +6391,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) } else { tcp_try_undo_spurious_syn(sk); tp->retrans_stamp = 0; - tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); + tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB, + skb); WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); } smp_mb(); @@ -6438,7 +6502,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) case TCP_LAST_ACK: if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { if (sk_is_mptcp(sk)) - mptcp_incoming_options(sk, skb, &tp->rx_opt); + mptcp_incoming_options(sk, skb); break; } fallthrough; @@ -6617,13 +6681,27 @@ static void tcp_reqsk_record_syn(const struct sock *sk, { if (tcp_sk(sk)->save_syn) { u32 len = skb_network_header_len(skb) + tcp_hdrlen(skb); - u32 *copy; + struct saved_syn *saved_syn; + u32 mac_hdrlen; + void *base; + + if (tcp_sk(sk)->save_syn == 2) { /* Save full header. */ + base = skb_mac_header(skb); + mac_hdrlen = skb_mac_header_len(skb); + len += mac_hdrlen; + } else { + base = skb_network_header(skb); + mac_hdrlen = 0; + } - copy = kmalloc(len + sizeof(u32), GFP_ATOMIC); - if (copy) { - copy[0] = len; - memcpy(©[1], skb_network_header(skb), len); - req->saved_syn = copy; + saved_syn = kmalloc(struct_size(saved_syn, data, len), + GFP_ATOMIC); + if (saved_syn) { + saved_syn->mac_hdrlen = mac_hdrlen; + saved_syn->network_hdrlen = skb_network_header_len(skb); + saved_syn->tcp_hdrlen = tcp_hdrlen(skb); + memcpy(saved_syn->data, base, len); + req->saved_syn = saved_syn; } } } @@ -6762,6 +6840,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, tcp_rsk(req)->snt_isn = isn; tcp_rsk(req)->txhash = net_tx_rndhash(); + tcp_rsk(req)->syn_tos = TCP_SKB_CB(skb)->ip_dsfield; tcp_openreq_init_rwin(req, sk, dst); sk_rx_queue_set(req_to_sk(req), skb); if (!want_cookie) { @@ -6770,7 +6849,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, } if (fastopen_sk) { af_ops->send_synack(fastopen_sk, dst, &fl, req, - &foc, TCP_SYNACK_FASTOPEN); + &foc, TCP_SYNACK_FASTOPEN, skb); /* Add the child socket directly into the accept queue */ if (!inet_csk_reqsk_queue_add(sk, req, fastopen_sk)) { reqsk_fastopen_remove(fastopen_sk, req, false); @@ -6788,7 +6867,8 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, tcp_timeout_init((struct sock *)req)); af_ops->send_synack(sk, dst, &fl, req, &foc, !want_cookie ? TCP_SYNACK_NORMAL : - TCP_SYNACK_COOKIE); + TCP_SYNACK_COOKIE, + skb); if (want_cookie) { reqsk_free(req); return 0; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 592c73962723..7352c097ae48 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -575,7 +575,7 @@ int tcp_v4_err(struct sk_buff *skb, u32 info) case TCP_SYN_SENT: case TCP_SYN_RECV: /* Only in fast or simultaneous open. If a fast open socket is - * is already accepted it is treated as a connected one below. + * already accepted it is treated as a connected one below. */ if (fastopen && !fastopen->sk) break; @@ -965,18 +965,23 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, struct flowi *fl, struct request_sock *req, struct tcp_fastopen_cookie *foc, - enum tcp_synack_type synack_type) + enum tcp_synack_type synack_type, + struct sk_buff *syn_skb) { const struct inet_request_sock *ireq = inet_rsk(req); struct flowi4 fl4; int err = -1; struct sk_buff *skb; + u8 tos; /* First, grab a route. 
*/ if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) return -1; - skb = tcp_make_synack(sk, dst, req, foc, synack_type); + skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb); + + tos = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ? + tcp_rsk(req)->syn_tos : inet_sk(sk)->tos; if (skb) { __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); @@ -984,7 +989,8 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, rcu_read_lock(); err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, ireq->ir_rmt_addr, - rcu_dereference(ireq->ireq_opt)); + rcu_dereference(ireq->ireq_opt), + tos & ~INET_ECN_MASK); rcu_read_unlock(); err = net_xmit_eval(err); } @@ -1529,6 +1535,10 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; newinet->inet_id = prandom_u32(); + /* Set ToS of the new socket based upon the value of incoming SYN. */ + if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) + newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK; + if (!dst) { dst = inet_csk_route_child_sock(sk, newsk, req); if (!dst) diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 279db8822439..6b27c481fe18 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -943,7 +943,7 @@ static int tcp_metrics_nl_cmd_del(struct sk_buff *skb, struct genl_info *info) return 0; } -static const struct genl_ops tcp_metrics_nl_ops[] = { +static const struct genl_small_ops tcp_metrics_nl_ops[] = { { .cmd = TCP_METRICS_CMD_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, @@ -966,8 +966,8 @@ static struct genl_family tcp_metrics_nl_family __ro_after_init = { .policy = tcp_metrics_nl_policy, .netnsok = true, .module = THIS_MODULE, - .ops = tcp_metrics_nl_ops, - .n_ops = ARRAY_SIZE(tcp_metrics_nl_ops), + .small_ops = tcp_metrics_nl_ops, + .n_small_ops = ARRAY_SIZE(tcp_metrics_nl_ops), }; static unsigned int tcpmhash_entries; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 85ff417bda7f..bf48cd73e967 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -438,6 +438,7 @@ struct tcp_out_options { u8 ws; /* window scale, 0 to disable */ u8 num_sack_blocks; /* number of SACK blocks to include */ u8 hash_size; /* bytes in hash_location */ + u8 bpf_opt_len; /* length of BPF hdr option */ __u8 *hash_location; /* temporary pointer, overloaded */ __u32 tsval, tsecr; /* need to include OPTION_TS */ struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */ @@ -452,6 +453,145 @@ static void mptcp_options_write(__be32 *ptr, struct tcp_out_options *opts) #endif } +#ifdef CONFIG_CGROUP_BPF +static int bpf_skops_write_hdr_opt_arg0(struct sk_buff *skb, + enum tcp_synack_type synack_type) +{ + if (unlikely(!skb)) + return BPF_WRITE_HDR_TCP_CURRENT_MSS; + + if (unlikely(synack_type == TCP_SYNACK_COOKIE)) + return BPF_WRITE_HDR_TCP_SYNACK_COOKIE; + + return 0; +} + +/* req, syn_skb and synack_type are used when writing synack */ +static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct sk_buff *syn_skb, + enum tcp_synack_type synack_type, + struct tcp_out_options *opts, + unsigned int *remaining) +{ + struct bpf_sock_ops_kern sock_ops; + int err; + + if (likely(!BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), + BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG)) || + !*remaining) + return; + + /* *remaining has already been aligned to 4 bytes, so *remaining >= 4 */ + + /* init sock_ops */ + memset(&sock_ops, 0, 
offsetof(struct bpf_sock_ops_kern, temp)); + + sock_ops.op = BPF_SOCK_OPS_HDR_OPT_LEN_CB; + + if (req) { + /* The listen "sk" cannot be passed here because + * it is not locked. It would not make too much + * sense to do bpf_setsockopt(listen_sk) based + * on individual connection request also. + * + * Thus, "req" is passed here and the cgroup-bpf-progs + * of the listen "sk" will be run. + * + * "req" is also used here for fastopen even the "sk" here is + * a fullsock "child" sk. It is to keep the behavior + * consistent between fastopen and non-fastopen on + * the bpf programming side. + */ + sock_ops.sk = (struct sock *)req; + sock_ops.syn_skb = syn_skb; + } else { + sock_owned_by_me(sk); + + sock_ops.is_fullsock = 1; + sock_ops.sk = sk; + } + + sock_ops.args[0] = bpf_skops_write_hdr_opt_arg0(skb, synack_type); + sock_ops.remaining_opt_len = *remaining; + /* tcp_current_mss() does not pass a skb */ + if (skb) + bpf_skops_init_skb(&sock_ops, skb, 0); + + err = BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(&sock_ops, sk); + + if (err || sock_ops.remaining_opt_len == *remaining) + return; + + opts->bpf_opt_len = *remaining - sock_ops.remaining_opt_len; + /* round up to 4 bytes */ + opts->bpf_opt_len = (opts->bpf_opt_len + 3) & ~3; + + *remaining -= opts->bpf_opt_len; +} + +static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct sk_buff *syn_skb, + enum tcp_synack_type synack_type, + struct tcp_out_options *opts) +{ + u8 first_opt_off, nr_written, max_opt_len = opts->bpf_opt_len; + struct bpf_sock_ops_kern sock_ops; + int err; + + if (likely(!max_opt_len)) + return; + + memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); + + sock_ops.op = BPF_SOCK_OPS_WRITE_HDR_OPT_CB; + + if (req) { + sock_ops.sk = (struct sock *)req; + sock_ops.syn_skb = syn_skb; + } else { + sock_owned_by_me(sk); + + sock_ops.is_fullsock = 1; + sock_ops.sk = sk; + } + + sock_ops.args[0] = bpf_skops_write_hdr_opt_arg0(skb, synack_type); + sock_ops.remaining_opt_len = max_opt_len; + first_opt_off = tcp_hdrlen(skb) - max_opt_len; + bpf_skops_init_skb(&sock_ops, skb, first_opt_off); + + err = BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(&sock_ops, sk); + + if (err) + nr_written = 0; + else + nr_written = max_opt_len - sock_ops.remaining_opt_len; + + if (nr_written < max_opt_len) + memset(skb->data + first_opt_off + nr_written, TCPOPT_NOP, + max_opt_len - nr_written); +} +#else +static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct sk_buff *syn_skb, + enum tcp_synack_type synack_type, + struct tcp_out_options *opts, + unsigned int *remaining) +{ +} + +static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct sk_buff *syn_skb, + enum tcp_synack_type synack_type, + struct tcp_out_options *opts) +{ +} +#endif + /* Write previously computed TCP options to the packet. 
* * Beware: Something in the Internet is very sensitive to the ordering of @@ -691,6 +831,8 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, } } + bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining); + return MAX_TCP_OPTION_SPACE - remaining; } @@ -701,7 +843,8 @@ static unsigned int tcp_synack_options(const struct sock *sk, struct tcp_out_options *opts, const struct tcp_md5sig_key *md5, struct tcp_fastopen_cookie *foc, - enum tcp_synack_type synack_type) + enum tcp_synack_type synack_type, + struct sk_buff *syn_skb) { struct inet_request_sock *ireq = inet_rsk(req); unsigned int remaining = MAX_TCP_OPTION_SPACE; @@ -758,6 +901,9 @@ static unsigned int tcp_synack_options(const struct sock *sk, smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining); + bpf_skops_hdr_opt_len((struct sock *)sk, skb, req, syn_skb, + synack_type, opts, &remaining); + return MAX_TCP_OPTION_SPACE - remaining; } @@ -826,6 +972,15 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK; } + if (unlikely(BPF_SOCK_OPS_TEST_FLAG(tp, + BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG))) { + unsigned int remaining = MAX_TCP_OPTION_SPACE - size; + + bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining); + + size = MAX_TCP_OPTION_SPACE - remaining; + } + return size; } @@ -1213,6 +1368,9 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, } #endif + /* BPF prog is the last one writing header option */ + bpf_skops_write_hdr_opt(sk, skb, NULL, NULL, 0, &opts); + INDIRECT_CALL_INET(icsk->icsk_af_ops->send_check, tcp_v6_send_check, tcp_v4_send_check, sk, skb); @@ -1524,7 +1682,6 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) skb->truesize -= delta_truesize; sk_wmem_queued_add(sk, -delta_truesize); sk_mem_uncharge(sk, delta_truesize); - sock_set_flag(sk, SOCK_QUEUE_SHRUNK); } /* Any change of skb->len requires recalculation of tso factor. */ @@ -3336,20 +3493,20 @@ int tcp_send_synack(struct sock *sk) } /** - * tcp_make_synack - Prepare a SYN-ACK. - * sk: listener socket - * dst: dst entry attached to the SYNACK - * req: request_sock pointer - * foc: cookie for tcp fast open - * synack_type: Type of synback to prepare - * - * Allocate one skb and build a SYNACK packet. - * @dst is consumed : Caller should not use it again. + * tcp_make_synack - Allocate one skb and build a SYNACK packet. + * @sk: listener socket + * @dst: dst entry attached to the SYNACK. It is consumed and caller + * should not use it again. + * @req: request_sock pointer + * @foc: cookie for tcp fast open + * @synack_type: Type of synack to prepare + * @syn_skb: SYN packet just received. It could be NULL for rtx case. 
*/ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, struct request_sock *req, struct tcp_fastopen_cookie *foc, - enum tcp_synack_type synack_type) + enum tcp_synack_type synack_type, + struct sk_buff *syn_skb) { struct inet_request_sock *ireq = inet_rsk(req); const struct tcp_sock *tp = tcp_sk(sk); @@ -3408,8 +3565,11 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req)); #endif skb_set_hash(skb, tcp_rsk(req)->txhash, PKT_HASH_TYPE_L4); + /* bpf program will be interested in the tcp_flags */ + TCP_SKB_CB(skb)->tcp_flags = TCPHDR_SYN | TCPHDR_ACK; tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, md5, - foc, synack_type) + sizeof(*th); + foc, synack_type, + syn_skb) + sizeof(*th); skb_push(skb, tcp_header_size); skb_reset_transport_header(skb); @@ -3441,6 +3601,9 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, rcu_read_unlock(); #endif + bpf_skops_write_hdr_opt((struct sock *)sk, skb, req, syn_skb, + synack_type, &opts); + skb->skb_mstamp_ns = now; tcp_add_tx_delay(skb, tp); @@ -3741,16 +3904,15 @@ void tcp_send_delayed_ack(struct sock *sk) ato = min(ato, max_ato); } + ato = min_t(u32, ato, inet_csk(sk)->icsk_delack_max); + /* Stay within the limit we were given */ timeout = jiffies + ato; /* Use new timeout only if there wasn't a older one earlier. */ if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) { - /* If delack timer was blocked or is about to expire, - * send ACK now. - */ - if (icsk->icsk_ack.blocked || - time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) { + /* If delack timer is about to expire, send ACK now. */ + if (time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) { tcp_send_ack(sk); return; } @@ -3779,10 +3941,15 @@ void __tcp_send_ack(struct sock *sk, u32 rcv_nxt) buff = alloc_skb(MAX_TCP_HEADER, sk_gfp_mask(sk, GFP_ATOMIC | __GFP_NOWARN)); if (unlikely(!buff)) { + struct inet_connection_sock *icsk = inet_csk(sk); + unsigned long delay; + + delay = TCP_DELACK_MAX << icsk->icsk_ack.retry; + if (delay < TCP_RTO_MAX) + icsk->icsk_ack.retry++; inet_csk_schedule_ack(sk); - inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN; - inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, - TCP_DELACK_MAX, TCP_RTO_MAX); + icsk->icsk_ack.ato = TCP_ATO_MIN; + inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, delay, TCP_RTO_MAX); return; } @@ -3934,7 +4101,8 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req) int res; tcp_rsk(req)->txhash = net_tx_rndhash(); - res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL); + res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL, + NULL); if (!res) { __TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS); __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c index fdb715bdd2d1..f65a3ddd0d58 100644 --- a/net/ipv4/tcp_recovery.c +++ b/net/ipv4/tcp_recovery.c @@ -2,20 +2,6 @@ #include <linux/tcp.h> #include <net/tcp.h> -void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb) -{ - struct tcp_sock *tp = tcp_sk(sk); - - tcp_skb_mark_lost_uncond_verify(tp, skb); - if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) { - /* Account for retransmits that are lost again */ - TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; - tp->retrans_out -= tcp_skb_pcount(skb); - NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT, - tcp_skb_pcount(skb)); - } -} - static bool 
tcp_rack_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2) { return t1 > t2 || (t1 == t2 && after(seq1, seq2)); @@ -246,6 +232,6 @@ void tcp_newreno_mark_lost(struct sock *sk, bool snd_una_advanced) tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, mss, mss, GFP_ATOMIC); - tcp_skb_mark_lost_uncond_verify(tp, skb); + tcp_mark_skb_lost(sk, skb); } } diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c index 6cebf412d590..5842081bc8a2 100644 --- a/net/ipv4/tcp_scalable.c +++ b/net/ipv4/tcp_scalable.c @@ -10,7 +10,7 @@ #include <net/tcp.h> /* These factors derived from the recommended values in the aer: - * .01 and and 7/8. + * .01 and 7/8. */ #define TCP_SCALABLE_AI_CNT 100U #define TCP_SCALABLE_MD_SCALE 3 diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 0c08c420fbc2..6c62b9ea1320 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -331,7 +331,6 @@ static void tcp_delack_timer(struct timer_list *t) if (!sock_owned_by_user(sk)) { tcp_delack_timer_handler(sk); } else { - icsk->icsk_ack.blocked = 1; __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED); /* deleguate our work to tcp_release_cb() */ if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &sk->sk_tsq_flags)) diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index 3f51e781562a..c8003c8aad2c 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c @@ -293,10 +293,10 @@ size_t tcp_vegas_get_info(struct sock *sk, u32 ext, int *attr, const struct vegas *ca = inet_csk_ca(sk); if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) { - info->vegas.tcpv_enabled = ca->doing_vegas_now, - info->vegas.tcpv_rttcnt = ca->cntRTT, - info->vegas.tcpv_rtt = ca->baseRTT, - info->vegas.tcpv_minrtt = ca->minRTT, + info->vegas.tcpv_enabled = ca->doing_vegas_now; + info->vegas.tcpv_rttcnt = ca->cntRTT; + info->vegas.tcpv_rtt = ca->baseRTT; + info->vegas.tcpv_minrtt = ca->minRTT; *attr = INET_DIAG_VEGASINFO; return sizeof(struct tcpvegas_info); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index e88efba07551..09f0a23d1a01 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1170,7 +1170,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) ipc.oif = inet->uc_index; } else if (ipv4_is_lbcast(daddr) && inet->uc_index) { /* oif is set, packet is to local broadcast and - * and uc_index is set. oif is most likely set + * uc_index is set. oif is most likely set * by sk_bound_dev_if. If uc_index != oif check if the * oif is an L3 master and uc_index is an L3 slave. * If so, we want to allow the send using the uc_index. diff --git a/net/ipv4/udp_bpf.c b/net/ipv4/udp_bpf.c index eddd973e6575..7a94791efc1a 100644 --- a/net/ipv4/udp_bpf.c +++ b/net/ipv4/udp_bpf.c @@ -22,10 +22,9 @@ static void udp_bpf_rebuild_protos(struct proto *prot, const struct proto *base) prot->close = sock_map_close; } -static void udp_bpf_check_v6_needs_rebuild(struct sock *sk, struct proto *ops) +static void udp_bpf_check_v6_needs_rebuild(struct proto *ops) { - if (sk->sk_family == AF_INET6 && - unlikely(ops != smp_load_acquire(&udpv6_prot_saved))) { + if (unlikely(ops != smp_load_acquire(&udpv6_prot_saved))) { spin_lock_bh(&udpv6_prot_lock); if (likely(ops != udpv6_prot_saved)) { udp_bpf_rebuild_protos(&udp_bpf_prots[UDP_BPF_IPV6], ops); @@ -46,8 +45,8 @@ struct proto *udp_bpf_get_proto(struct sock *sk, struct sk_psock *psock) { int family = sk->sk_family == AF_INET ? 
UDP_BPF_IPV4 : UDP_BPF_IPV6; - if (!psock->sk_proto) - udp_bpf_check_v6_needs_rebuild(sk, READ_ONCE(sk->sk_prot)); + if (sk->sk_family == AF_INET6) + udp_bpf_check_v6_needs_rebuild(psock->sk_proto); return &udp_bpf_prots[family]; } diff --git a/net/ipv4/udp_tunnel_nic.c b/net/ipv4/udp_tunnel_nic.c index 69962165c0e8..0d122edc368d 100644 --- a/net/ipv4/udp_tunnel_nic.c +++ b/net/ipv4/udp_tunnel_nic.c @@ -19,8 +19,9 @@ enum udp_tunnel_nic_table_entry_flags { struct udp_tunnel_nic_table_entry { __be16 port; u8 type; - u8 use_cnt; u8 flags; + u16 use_cnt; +#define UDP_TUNNEL_NIC_USE_CNT_MAX U16_MAX u8 hw_priv; }; @@ -370,6 +371,8 @@ udp_tunnel_nic_entry_adj(struct udp_tunnel_nic *utn, bool dodgy = entry->flags & UDP_TUNNEL_NIC_ENTRY_OP_FAIL; unsigned int from, to; + WARN_ON(entry->use_cnt + (u32)use_cnt_adj > U16_MAX); + /* If not going from used to unused or vice versa - all done. * For dodgy entries make sure we try to sync again (queue the entry). */ @@ -675,6 +678,7 @@ static void udp_tunnel_nic_replay(struct net_device *dev, struct udp_tunnel_nic *utn) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; + struct udp_tunnel_nic_shared_node *node; unsigned int i, j; /* Freeze all the ports we are already tracking so that the replay @@ -686,7 +690,12 @@ udp_tunnel_nic_replay(struct net_device *dev, struct udp_tunnel_nic *utn) utn->missed = 0; utn->need_replay = 0; - udp_tunnel_get_rx_info(dev); + if (!info->shared) { + udp_tunnel_get_rx_info(dev); + } else { + list_for_each_entry(node, &info->shared->devices, list) + udp_tunnel_get_rx_info(node->dev); + } for (i = 0; i < utn->n_tables; i++) for (j = 0; j < info->tables[i].n_entries; j++) @@ -742,20 +751,39 @@ err_free_utn: return NULL; } +static void udp_tunnel_nic_free(struct udp_tunnel_nic *utn) +{ + unsigned int i; + + for (i = 0; i < utn->n_tables; i++) + kfree(utn->entries[i]); + kfree(utn->entries); + kfree(utn); +} + static int udp_tunnel_nic_register(struct net_device *dev) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; + struct udp_tunnel_nic_shared_node *node = NULL; struct udp_tunnel_nic *utn; unsigned int n_tables, i; BUILD_BUG_ON(sizeof(utn->missed) * BITS_PER_BYTE < UDP_TUNNEL_NIC_MAX_TABLES); + /* Expect use count of at most 2 (IPv4, IPv6) per device */ + BUILD_BUG_ON(UDP_TUNNEL_NIC_USE_CNT_MAX < + UDP_TUNNEL_NIC_MAX_SHARING_DEVICES * 2); + /* Check that the driver info is sane */ if (WARN_ON(!info->set_port != !info->unset_port) || WARN_ON(!info->set_port == !info->sync_table) || WARN_ON(!info->tables[0].n_entries)) return -EINVAL; + if (WARN_ON(info->shared && + info->flags & UDP_TUNNEL_NIC_INFO_OPEN_ONLY)) + return -EINVAL; + n_tables = 1; for (i = 1; i < UDP_TUNNEL_NIC_MAX_TABLES; i++) { if (!info->tables[i].n_entries) @@ -766,9 +794,33 @@ static int udp_tunnel_nic_register(struct net_device *dev) return -EINVAL; } - utn = udp_tunnel_nic_alloc(info, n_tables); - if (!utn) - return -ENOMEM; + /* Create UDP tunnel state structures */ + if (info->shared) { + node = kzalloc(sizeof(*node), GFP_KERNEL); + if (!node) + return -ENOMEM; + + node->dev = dev; + } + + if (info->shared && info->shared->udp_tunnel_nic_info) { + utn = info->shared->udp_tunnel_nic_info; + } else { + utn = udp_tunnel_nic_alloc(info, n_tables); + if (!utn) { + kfree(node); + return -ENOMEM; + } + } + + if (info->shared) { + if (!info->shared->udp_tunnel_nic_info) { + INIT_LIST_HEAD(&info->shared->devices); + info->shared->udp_tunnel_nic_info = utn; + } + + list_add_tail(&node->list, &info->shared->devices); + } 
utn->dev = dev; dev_hold(dev); @@ -783,7 +835,33 @@ static int udp_tunnel_nic_register(struct net_device *dev) static void udp_tunnel_nic_unregister(struct net_device *dev, struct udp_tunnel_nic *utn) { - unsigned int i; + const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; + + /* For a shared table remove this dev from the list of sharing devices + * and if there are other devices just detach. + */ + if (info->shared) { + struct udp_tunnel_nic_shared_node *node, *first; + + list_for_each_entry(node, &info->shared->devices, list) + if (node->dev == dev) + break; + if (node->dev != dev) + return; + + list_del(&node->list); + kfree(node); + + first = list_first_entry_or_null(&info->shared->devices, + typeof(*first), list); + if (first) { + udp_tunnel_drop_rx_info(dev); + utn->dev = first->dev; + goto release_dev; + } + + info->shared->udp_tunnel_nic_info = NULL; + } /* Flush before we check work, so we don't waste time adding entries * from the work which we will boot immediately. @@ -796,10 +874,8 @@ udp_tunnel_nic_unregister(struct net_device *dev, struct udp_tunnel_nic *utn) if (utn->work_pending) return; - for (i = 0; i < utn->n_tables; i++) - kfree(utn->entries[i]); - kfree(utn->entries); - kfree(utn); + udp_tunnel_nic_free(utn); +release_dev: dev->udp_tunnel_nic = NULL; dev_put(dev); } |
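
The tcp_output.c and tcp_input.c hunks above add only the kernel-side plumbing: bpf_skops_hdr_opt_len() lets a cgroup sockops program reserve TCP option space, bpf_skops_write_hdr_opt() lets it fill that space last, and bpf_skops_parse_hdr() hands received options the stack did not recognize (opt_rx->saw_unknown) to the same program. Below is a minimal sockops sketch of how such a program might drive those callbacks. The option kind 0xde, its payload and the program name are invented for illustration, and the helper usage follows the bpf_reserve_hdr_opt()/bpf_store_hdr_opt()/bpf_load_hdr_opt() interface exposed alongside this series, so treat it as a sketch rather than a reference implementation:

// SPDX-License-Identifier: GPL-2.0
/* Illustrative only: option kind 0xde and its payload are made-up values. */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct my_opt {
	__u8 kind;	/* TCP option kind (unassigned, hypothetical) */
	__u8 len;	/* total option length including kind/len bytes */
	__u16 data;	/* example payload */
} __attribute__((packed));

SEC("sockops")
int hdr_opt_demo(struct bpf_sock_ops *skops)
{
	struct my_opt opt = { .kind = 0xde, .len = sizeof(opt), .data = 0x1234 };

	switch (skops->op) {
	case BPF_SOCK_OPS_TCP_LISTEN_CB:
	case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
	case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
		/* Opt in to the header-option callbacks for this socket. */
		bpf_sock_ops_cb_flags_set(skops,
					  skops->bpf_sock_ops_cb_flags |
					  BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG |
					  BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG);
		break;
	case BPF_SOCK_OPS_HDR_OPT_LEN_CB:
		/* Called from bpf_skops_hdr_opt_len(): reserve room in the
		 * remaining TCP option space for our option.
		 */
		bpf_reserve_hdr_opt(skops, sizeof(opt), 0);
		break;
	case BPF_SOCK_OPS_WRITE_HDR_OPT_CB:
		/* Called from bpf_skops_write_hdr_opt(): store the complete
		 * option (kind, length, payload) into the reserved space.
		 */
		bpf_store_hdr_opt(skops, &opt, sizeof(opt), 0);
		break;
	case BPF_SOCK_OPS_PARSE_HDR_OPT_CB:
		/* Called from bpf_skops_parse_hdr() for options the stack
		 * itself did not recognize; look our option up by kind.
		 */
		opt.data = 0;
		bpf_load_hdr_opt(skops, &opt, sizeof(opt), 0);
		break;
	}
	return 1;
}

char _license[] SEC("license") = "GPL";

Such a program would be attached to the relevant cgroup as a BPF_CGROUP_SOCK_OPS program; for SYN-ACKs the cgroup-bpf programs of the listening socket run, which is why the request_sock rather than the (unlocked) listener is passed in bpf_skops_hdr_opt_len() above.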
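
Similarly, the tcp_reqsk_record_syn() rewrite above replaces the old length-prefixed u32 array behind req->saved_syn with a struct saved_syn that records the mac, network and tcp header lengths, and a save_syn value of 2 additionally keeps the link-layer header. A small userspace sketch of how that mode would be requested and read back follows; the buffer size is arbitrary, error handling is omitted, and the assumption that TCP_SAVE_SYN now accepts the value 2 and that TCP_SAVED_SYN returns the extra bytes is mine, based on this series:

/* Sketch: retain the full SYN headers (mode 2) on a listening socket and
 * read them back on the accepted socket.
 */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

static void dump_saved_syn(int listen_fd)
{
	int save_mode = 2;		/* 0 = off, 1 = from network header, 2 = also MAC header */
	unsigned char syn[512];		/* arbitrary size for the sketch */
	socklen_t len = sizeof(syn);
	int conn_fd;

	setsockopt(listen_fd, IPPROTO_TCP, TCP_SAVE_SYN,
		   &save_mode, sizeof(save_mode));

	conn_fd = accept(listen_fd, NULL, NULL);
	if (conn_fd < 0)
		return;

	/* Headers of the SYN that created this connection. */
	if (getsockopt(conn_fd, IPPROTO_TCP, TCP_SAVED_SYN, syn, &len) == 0)
		printf("saved %u bytes of SYN headers\n", (unsigned int)len);

	close(conn_fd);
}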