diff options
Diffstat (limited to 'net/ipv4')
-rw-r--r-- | net/ipv4/af_inet.c | 13 | ||||
-rw-r--r-- | net/ipv4/arp.c | 25 | ||||
-rw-r--r-- | net/ipv4/bpf_tcp_ca.c | 57 | ||||
-rw-r--r-- | net/ipv4/devinet.c | 4 | ||||
-rw-r--r-- | net/ipv4/esp4.c | 4 | ||||
-rw-r--r-- | net/ipv4/fib_semantics.c | 11 | ||||
-rw-r--r-- | net/ipv4/ip_output.c | 10 | ||||
-rw-r--r-- | net/ipv4/ip_tunnel.c | 21 | ||||
-rw-r--r-- | net/ipv4/ipconfig.c | 8 | ||||
-rw-r--r-- | net/ipv4/ipmr.c | 217 | ||||
-rw-r--r-- | net/ipv4/ipmr_base.c | 53 | ||||
-rw-r--r-- | net/ipv4/netfilter/nf_nat_h323.c | 42 | ||||
-rw-r--r-- | net/ipv4/ping.c | 36 | ||||
-rw-r--r-- | net/ipv4/raw.c | 172 | ||||
-rw-r--r-- | net/ipv4/raw_diag.c | 57 | ||||
-rw-r--r-- | net/ipv4/route.c | 65 | ||||
-rw-r--r-- | net/ipv4/tcp.c | 155 | ||||
-rw-r--r-- | net/ipv4/tcp_bbr.c | 24 | ||||
-rw-r--r-- | net/ipv4/tcp_bpf.c | 1 | ||||
-rw-r--r-- | net/ipv4/tcp_cubic.c | 20 | ||||
-rw-r--r-- | net/ipv4/tcp_dctcp.c | 20 | ||||
-rw-r--r-- | net/ipv4/tcp_input.c | 9 | ||||
-rw-r--r-- | net/ipv4/tcp_ipv4.c | 5 | ||||
-rw-r--r-- | net/ipv4/tcp_output.c | 32 | ||||
-rw-r--r-- | net/ipv4/tcp_timer.c | 19 | ||||
-rw-r--r-- | net/ipv4/udp.c | 33 | ||||
-rw-r--r-- | net/ipv4/udplite.c | 3 | ||||
-rw-r--r-- | net/ipv4/xfrm4_policy.c | 2 |
28 files changed, 593 insertions, 525 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 252c8bceaba4..3ca0cc467886 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -148,10 +148,10 @@ void inet_sock_destruct(struct sock *sk) return; } - WARN_ON(atomic_read(&sk->sk_rmem_alloc)); - WARN_ON(refcount_read(&sk->sk_wmem_alloc)); - WARN_ON(sk->sk_wmem_queued); - WARN_ON(sk_forward_alloc_get(sk)); + WARN_ON_ONCE(atomic_read(&sk->sk_rmem_alloc)); + WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc)); + WARN_ON_ONCE(sk->sk_wmem_queued); + WARN_ON_ONCE(sk_forward_alloc_get(sk)); kfree(rcu_dereference_protected(inet->inet_opt, 1)); dst_release(rcu_dereference_protected(sk->sk_dst_cache, 1)); @@ -1040,6 +1040,7 @@ const struct proto_ops inet_stream_ops = { .sendpage = inet_sendpage, .splice_read = tcp_splice_read, .read_sock = tcp_read_sock, + .read_skb = tcp_read_skb, .sendmsg_locked = tcp_sendmsg_locked, .sendpage_locked = tcp_sendpage_locked, .peek_len = tcp_peek_len, @@ -1067,7 +1068,7 @@ const struct proto_ops inet_dgram_ops = { .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, .sendmsg = inet_sendmsg, - .read_sock = udp_read_sock, + .read_skb = udp_read_skb, .recvmsg = inet_recvmsg, .mmap = sock_no_mmap, .sendpage = inet_sendpage, @@ -1919,6 +1920,8 @@ static int __init inet_init(void) sock_skb_cb_check_size(sizeof(struct inet_skb_parm)); + raw_hashinfo_init(&raw_v4_hashinfo); + rc = proto_register(&tcp_prot, 1); if (rc) goto out; diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index ab4a5601c82a..87c7e3fc5197 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -168,6 +168,7 @@ struct neigh_table arp_tbl = { [NEIGH_VAR_RETRANS_TIME] = 1 * HZ, [NEIGH_VAR_BASE_REACHABLE_TIME] = 30 * HZ, [NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ, + [NEIGH_VAR_INTERVAL_PROBE_TIME_MS] = 5 * HZ, [NEIGH_VAR_GC_STALETIME] = 60 * HZ, [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_MAX, [NEIGH_VAR_PROXY_QLEN] = 64, @@ -428,6 +429,26 @@ static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip) return !inet_confirm_addr(net, in_dev, sip, tip, scope); } +static int arp_accept(struct in_device *in_dev, __be32 sip) +{ + struct net *net = dev_net(in_dev->dev); + int scope = RT_SCOPE_LINK; + + switch (IN_DEV_ARP_ACCEPT(in_dev)) { + case 0: /* Don't create new entries from garp */ + return 0; + case 1: /* Create new entries from garp */ + return 1; + case 2: /* Create a neighbor in the arp table only if sip + * is in the same subnet as an address configured + * on the interface that received the garp message + */ + return !!inet_confirm_addr(net, in_dev, sip, 0, scope); + default: + return 0; + } +} + static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev) { struct rtable *rt; @@ -867,12 +888,12 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) n = __neigh_lookup(&arp_tbl, &sip, dev, 0); addr_type = -1; - if (n || IN_DEV_ARP_ACCEPT(in_dev)) { + if (n || arp_accept(in_dev, sip)) { is_garp = arp_is_garp(net, dev, &addr_type, arp->ar_op, sip, tip, sha, tha); } - if (IN_DEV_ARP_ACCEPT(in_dev)) { + if (arp_accept(in_dev, sip)) { /* Unsolicited ARP is not accepted by default. It is possible, that this option should be enabled for some devices (strip is candidate) diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index f79ab942f03b..85a9e500c42d 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -14,18 +14,6 @@ /* "extern" is to avoid sparse warning. It is only used in bpf_struct_ops.c. */ extern struct bpf_struct_ops bpf_tcp_congestion_ops; -static u32 optional_ops[] = { - offsetof(struct tcp_congestion_ops, init), - offsetof(struct tcp_congestion_ops, release), - offsetof(struct tcp_congestion_ops, set_state), - offsetof(struct tcp_congestion_ops, cwnd_event), - offsetof(struct tcp_congestion_ops, in_ack_event), - offsetof(struct tcp_congestion_ops, pkts_acked), - offsetof(struct tcp_congestion_ops, min_tso_segs), - offsetof(struct tcp_congestion_ops, sndbuf_expand), - offsetof(struct tcp_congestion_ops, cong_control), -}; - static u32 unsupported_ops[] = { offsetof(struct tcp_congestion_ops, get_info), }; @@ -51,18 +39,6 @@ static int bpf_tcp_ca_init(struct btf *btf) return 0; } -static bool is_optional(u32 member_offset) -{ - unsigned int i; - - for (i = 0; i < ARRAY_SIZE(optional_ops); i++) { - if (member_offset == optional_ops[i]) - return true; - } - - return false; -} - static bool is_unsupported(u32 member_offset) { unsigned int i; @@ -111,6 +87,12 @@ static int bpf_tcp_ca_btf_struct_access(struct bpf_verifier_log *log, } switch (off) { + case offsetof(struct sock, sk_pacing_rate): + end = offsetofend(struct sock, sk_pacing_rate); + break; + case offsetof(struct sock, sk_pacing_status): + end = offsetofend(struct sock, sk_pacing_status); + break; case bpf_ctx_range(struct inet_connection_sock, icsk_ca_priv): end = offsetofend(struct inet_connection_sock, icsk_ca_priv); break; @@ -215,17 +197,17 @@ bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id, } } -BTF_SET_START(bpf_tcp_ca_check_kfunc_ids) -BTF_ID(func, tcp_reno_ssthresh) -BTF_ID(func, tcp_reno_cong_avoid) -BTF_ID(func, tcp_reno_undo_cwnd) -BTF_ID(func, tcp_slow_start) -BTF_ID(func, tcp_cong_avoid_ai) -BTF_SET_END(bpf_tcp_ca_check_kfunc_ids) +BTF_SET8_START(bpf_tcp_ca_check_kfunc_ids) +BTF_ID_FLAGS(func, tcp_reno_ssthresh) +BTF_ID_FLAGS(func, tcp_reno_cong_avoid) +BTF_ID_FLAGS(func, tcp_reno_undo_cwnd) +BTF_ID_FLAGS(func, tcp_slow_start) +BTF_ID_FLAGS(func, tcp_cong_avoid_ai) +BTF_SET8_END(bpf_tcp_ca_check_kfunc_ids) static const struct btf_kfunc_id_set bpf_tcp_ca_kfunc_set = { - .owner = THIS_MODULE, - .check_set = &bpf_tcp_ca_check_kfunc_ids, + .owner = THIS_MODULE, + .set = &bpf_tcp_ca_check_kfunc_ids, }; static const struct bpf_verifier_ops bpf_tcp_ca_verifier_ops = { @@ -240,7 +222,6 @@ static int bpf_tcp_ca_init_member(const struct btf_type *t, { const struct tcp_congestion_ops *utcp_ca; struct tcp_congestion_ops *tcp_ca; - int prog_fd; u32 moff; utcp_ca = (const struct tcp_congestion_ops *)udata; @@ -262,14 +243,6 @@ static int bpf_tcp_ca_init_member(const struct btf_type *t, return 1; } - if (!btf_type_resolve_func_ptr(btf_vmlinux, member->type, NULL)) - return 0; - - /* Ensure bpf_prog is provided for compulsory func ptr */ - prog_fd = (int)(*(unsigned long *)(udata + moff)); - if (!prog_fd && !is_optional(moff) && !is_unsupported(moff)) - return -EINVAL; - return 0; } diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index b2366ad540e6..92b778e423df 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -244,7 +244,7 @@ void in_dev_finish_destroy(struct in_device *idev) #ifdef NET_REFCNT_DEBUG pr_debug("%s: %p=%s\n", __func__, idev, dev ? dev->name : "NIL"); #endif - dev_put_track(dev, &idev->dev_tracker); + netdev_put(dev, &idev->dev_tracker); if (!idev->dead) pr_err("Freeing alive in_device %p\n", idev); else @@ -272,7 +272,7 @@ static struct in_device *inetdev_init(struct net_device *dev) if (IPV4_DEVCONF(in_dev->cnf, FORWARDING)) dev_disable_lro(dev); /* Reference in_dev->dev */ - dev_hold_track(dev, &in_dev->dev_tracker, GFP_KERNEL); + netdev_hold(dev, &in_dev->dev_tracker, GFP_KERNEL); /* Account for reference dev->ip_ptr (below) */ refcount_set(&in_dev->refcnt, 1); diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index b694f352ce7a..5c03eba787e5 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -502,9 +502,7 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * nfrags++; - skb->len += tailen; - skb->data_len += tailen; - skb->truesize += tailen; + skb_len_add(skb, tailen); if (sk && sk_fullsock(sk)) refcount_add(tailen, &sk->sk_wmem_alloc); diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index db7b2503f068..2dc97583d279 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -211,7 +211,7 @@ static void rt_fibinfo_free_cpus(struct rtable __rcu * __percpu *rtp) void fib_nh_common_release(struct fib_nh_common *nhc) { - dev_put_track(nhc->nhc_dev, &nhc->nhc_dev_tracker); + netdev_put(nhc->nhc_dev, &nhc->nhc_dev_tracker); lwtstate_put(nhc->nhc_lwtstate); rt_fibinfo_free_cpus(nhc->nhc_pcpu_rth_output); rt_fibinfo_free(&nhc->nhc_rth_input); @@ -1057,7 +1057,8 @@ static int fib_check_nh_v6_gw(struct net *net, struct fib_nh *nh, err = ipv6_stub->fib6_nh_init(net, &fib6_nh, &cfg, GFP_KERNEL, extack); if (!err) { nh->fib_nh_dev = fib6_nh.fib_nh_dev; - dev_hold_track(nh->fib_nh_dev, &nh->fib_nh_dev_tracker, GFP_KERNEL); + netdev_hold(nh->fib_nh_dev, &nh->fib_nh_dev_tracker, + GFP_KERNEL); nh->fib_nh_oif = nh->fib_nh_dev->ifindex; nh->fib_nh_scope = RT_SCOPE_LINK; @@ -1141,7 +1142,7 @@ static int fib_check_nh_v4_gw(struct net *net, struct fib_nh *nh, u32 table, if (!netif_carrier_ok(dev)) nh->fib_nh_flags |= RTNH_F_LINKDOWN; nh->fib_nh_dev = dev; - dev_hold_track(dev, &nh->fib_nh_dev_tracker, GFP_ATOMIC); + netdev_hold(dev, &nh->fib_nh_dev_tracker, GFP_ATOMIC); nh->fib_nh_scope = RT_SCOPE_LINK; return 0; } @@ -1195,7 +1196,7 @@ static int fib_check_nh_v4_gw(struct net *net, struct fib_nh *nh, u32 table, "No egress device for nexthop gateway"); goto out; } - dev_hold_track(dev, &nh->fib_nh_dev_tracker, GFP_ATOMIC); + netdev_hold(dev, &nh->fib_nh_dev_tracker, GFP_ATOMIC); if (!netif_carrier_ok(dev)) nh->fib_nh_flags |= RTNH_F_LINKDOWN; err = (dev->flags & IFF_UP) ? 0 : -ENETDOWN; @@ -1229,7 +1230,7 @@ static int fib_check_nh_nongw(struct net *net, struct fib_nh *nh, } nh->fib_nh_dev = in_dev->dev; - dev_hold_track(nh->fib_nh_dev, &nh->fib_nh_dev_tracker, GFP_ATOMIC); + netdev_hold(nh->fib_nh_dev, &nh->fib_nh_dev_tracker, GFP_ATOMIC); nh->fib_nh_scope = RT_SCOPE_LINK; if (!netif_carrier_ok(nh->fib_nh_dev)) nh->fib_nh_flags |= RTNH_F_LINKDOWN; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index df7f9dfbe8be..d7bd1daf022b 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1236,9 +1236,7 @@ alloc_new_skb: pfrag->offset += copy; skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); - skb->len += copy; - skb->data_len += copy; - skb->truesize += copy; + skb_len_add(skb, copy); wmem_alloc_delta += copy; } else { err = skb_zerocopy_iter_dgram(skb, from, copy); @@ -1465,9 +1463,7 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page, skb->csum = csum_block_add(skb->csum, csum, skb->len); } - skb->len += len; - skb->data_len += len; - skb->truesize += len; + skb_len_add(skb, len); refcount_add(len, &sk->sk_wmem_alloc); offset += len; size -= len; @@ -1726,7 +1722,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, tcp_hdr(skb)->source, tcp_hdr(skb)->dest, arg->uid); security_skb_classify_flow(skb, flowi4_to_flowi_common(&fl4)); - rt = ip_route_output_key(net, &fl4); + rt = ip_route_output_flow(net, &fl4, sk); if (IS_ERR(rt)) return; diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 94017a8c3994..e65e948cab9f 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -242,7 +242,7 @@ static struct net_device *__ip_tunnel_create(struct net *net, if (parms->name[0]) { if (!dev_valid_name(parms->name)) goto failed; - strlcpy(name, parms->name, IFNAMSIZ); + strscpy(name, parms->name, IFNAMSIZ); } else { if (strlen(ops->kind) > (IFNAMSIZ - 3)) goto failed; @@ -641,6 +641,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, const struct iphdr *inner_iph; unsigned int max_headroom; /* The extra header space needed */ struct rtable *rt = NULL; /* Route to the other host */ + __be16 payload_protocol; bool use_cache = false; struct flowi4 fl4; bool md = false; @@ -651,6 +652,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, inner_iph = (const struct iphdr *)skb_inner_network_header(skb); connected = (tunnel->parms.iph.daddr != 0); + payload_protocol = skb_protocol(skb, true); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); @@ -670,13 +672,12 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, dst = tun_info->key.u.ipv4.dst; md = true; connected = true; - } - else if (skb->protocol == htons(ETH_P_IP)) { + } else if (payload_protocol == htons(ETH_P_IP)) { rt = skb_rtable(skb); dst = rt_nexthop(rt, inner_iph->daddr); } #if IS_ENABLED(CONFIG_IPV6) - else if (skb->protocol == htons(ETH_P_IPV6)) { + else if (payload_protocol == htons(ETH_P_IPV6)) { const struct in6_addr *addr6; struct neighbour *neigh; bool do_tx_error_icmp; @@ -716,10 +717,10 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, tos = tnl_params->tos; if (tos & 0x1) { tos &= ~0x1; - if (skb->protocol == htons(ETH_P_IP)) { + if (payload_protocol == htons(ETH_P_IP)) { tos = inner_iph->tos; connected = false; - } else if (skb->protocol == htons(ETH_P_IPV6)) { + } else if (payload_protocol == htons(ETH_P_IPV6)) { tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph); connected = false; } @@ -765,7 +766,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, } df = tnl_params->frag_off; - if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df) + if (payload_protocol == htons(ETH_P_IP) && !tunnel->ignore_df) df |= (inner_iph->frag_off & htons(IP_DF)); if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, 0, 0, false)) { @@ -786,10 +787,10 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, tos = ip_tunnel_ecn_encap(tos, inner_iph, skb); ttl = tnl_params->ttl; if (ttl == 0) { - if (skb->protocol == htons(ETH_P_IP)) + if (payload_protocol == htons(ETH_P_IP)) ttl = inner_iph->ttl; #if IS_ENABLED(CONFIG_IPV6) - else if (skb->protocol == htons(ETH_P_IPV6)) + else if (payload_protocol == htons(ETH_P_IPV6)) ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit; #endif else @@ -1065,7 +1066,7 @@ int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id, memset(&parms, 0, sizeof(parms)); if (devname) - strlcpy(parms.name, devname, IFNAMSIZ); + strscpy(parms.name, devname, IFNAMSIZ); rtnl_lock(); itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms); diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index 9d41d5d5cd1e..f53a0f2453af 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -1759,15 +1759,15 @@ static int __init ip_auto_config_setup(char *addrs) case 4: if ((dp = strchr(ip, '.'))) { *dp++ = '\0'; - strlcpy(utsname()->domainname, dp, + strscpy(utsname()->domainname, dp, sizeof(utsname()->domainname)); } - strlcpy(utsname()->nodename, ip, + strscpy(utsname()->nodename, ip, sizeof(utsname()->nodename)); ic_host_name_set = 1; break; case 5: - strlcpy(user_dev_name, ip, sizeof(user_dev_name)); + strscpy(user_dev_name, ip, sizeof(user_dev_name)); break; case 6: if (ic_proto_name(ip) == 0 && @@ -1814,7 +1814,7 @@ __setup("nfsaddrs=", nfsaddrs_config_setup); static int __init vendor_class_identifier_setup(char *addrs) { - if (strlcpy(vendor_class_identifier, addrs, + if (strscpy(vendor_class_identifier, addrs, sizeof(vendor_class_identifier)) >= sizeof(vendor_class_identifier)) pr_warn("DHCP: vendorclass too long, truncated to \"%s\"\n", diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 13e6329784fb..73651d17e51f 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -77,7 +77,12 @@ struct ipmr_result { * Note that the changes are semaphored via rtnl_lock. */ -static DEFINE_RWLOCK(mrt_lock); +static DEFINE_SPINLOCK(mrt_lock); + +static struct net_device *vif_dev_read(const struct vif_device *vif) +{ + return rcu_dereference(vif->dev); +} /* Multicast router control variables */ @@ -100,11 +105,11 @@ static void ipmr_free_table(struct mr_table *mrt); static void ip_mr_forward(struct net *net, struct mr_table *mrt, struct net_device *dev, struct sk_buff *skb, struct mfc_cache *cache, int local); -static int ipmr_cache_report(struct mr_table *mrt, +static int ipmr_cache_report(const struct mr_table *mrt, struct sk_buff *pkt, vifi_t vifi, int assert); static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc, int cmd); -static void igmpmsg_netlink_event(struct mr_table *mrt, struct sk_buff *pkt); +static void igmpmsg_netlink_event(const struct mr_table *mrt, struct sk_buff *pkt); static void mroute_clean_tables(struct mr_table *mrt, int flags); static void ipmr_expire_process(struct timer_list *t); @@ -501,11 +506,15 @@ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev) return err; } - read_lock(&mrt_lock); dev->stats.tx_bytes += skb->len; dev->stats.tx_packets++; - ipmr_cache_report(mrt, skb, mrt->mroute_reg_vif_num, IGMPMSG_WHOLEPKT); - read_unlock(&mrt_lock); + rcu_read_lock(); + + /* Pairs with WRITE_ONCE() in vif_add() and vif_delete() */ + ipmr_cache_report(mrt, skb, READ_ONCE(mrt->mroute_reg_vif_num), + IGMPMSG_WHOLEPKT); + + rcu_read_unlock(); kfree_skb(skb); return NETDEV_TX_OK; } @@ -572,6 +581,7 @@ static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb, { struct net_device *reg_dev = NULL; struct iphdr *encap; + int vif_num; encap = (struct iphdr *)(skb_transport_header(skb) + pimlen); /* Check that: @@ -584,11 +594,10 @@ static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb, ntohs(encap->tot_len) + pimlen > skb->len) return 1; - read_lock(&mrt_lock); - if (mrt->mroute_reg_vif_num >= 0) - reg_dev = mrt->vif_table[mrt->mroute_reg_vif_num].dev; - read_unlock(&mrt_lock); - + /* Pairs with WRITE_ONCE() in vif_add()/vid_delete() */ + vif_num = READ_ONCE(mrt->mroute_reg_vif_num); + if (vif_num >= 0) + reg_dev = vif_dev_read(&mrt->vif_table[vif_num]); if (!reg_dev) return 1; @@ -614,10 +623,11 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt) static int call_ipmr_vif_entry_notifiers(struct net *net, enum fib_event_type event_type, struct vif_device *vif, + struct net_device *vif_dev, vifi_t vif_index, u32 tb_id) { return mr_call_vif_notifiers(net, RTNL_FAMILY_IPMR, event_type, - vif, vif_index, tb_id, + vif, vif_dev, vif_index, tb_id, &net->ipv4.ipmr_seq); } @@ -649,22 +659,19 @@ static int vif_delete(struct mr_table *mrt, int vifi, int notify, v = &mrt->vif_table[vifi]; - if (VIF_EXISTS(mrt, vifi)) - call_ipmr_vif_entry_notifiers(net, FIB_EVENT_VIF_DEL, v, vifi, - mrt->id); - - write_lock_bh(&mrt_lock); - dev = v->dev; - v->dev = NULL; - - if (!dev) { - write_unlock_bh(&mrt_lock); + dev = rtnl_dereference(v->dev); + if (!dev) return -EADDRNOTAVAIL; - } - if (vifi == mrt->mroute_reg_vif_num) - mrt->mroute_reg_vif_num = -1; + spin_lock(&mrt_lock); + call_ipmr_vif_entry_notifiers(net, FIB_EVENT_VIF_DEL, v, dev, + vifi, mrt->id); + RCU_INIT_POINTER(v->dev, NULL); + if (vifi == mrt->mroute_reg_vif_num) { + /* Pairs with READ_ONCE() in ipmr_cache_report() and reg_vif_xmit() */ + WRITE_ONCE(mrt->mroute_reg_vif_num, -1); + } if (vifi + 1 == mrt->maxvif) { int tmp; @@ -672,10 +679,10 @@ static int vif_delete(struct mr_table *mrt, int vifi, int notify, if (VIF_EXISTS(mrt, tmp)) break; } - mrt->maxvif = tmp+1; + WRITE_ONCE(mrt->maxvif, tmp + 1); } - write_unlock_bh(&mrt_lock); + spin_unlock(&mrt_lock); dev_set_allmulti(dev, -1); @@ -691,7 +698,7 @@ static int vif_delete(struct mr_table *mrt, int vifi, int notify, if (v->flags & (VIFF_TUNNEL | VIFF_REGISTER) && !notify) unregister_netdevice_queue(dev, head); - dev_put_track(dev, &v->dev_tracker); + netdev_put(dev, &v->dev_tracker); return 0; } @@ -777,7 +784,7 @@ out: spin_unlock(&mfc_unres_lock); } -/* Fill oifs list. It is called under write locked mrt_lock. */ +/* Fill oifs list. It is called under locked mrt_lock. */ static void ipmr_update_thresholds(struct mr_table *mrt, struct mr_mfc *cache, unsigned char *ttls) { @@ -889,15 +896,18 @@ static int vif_add(struct net *net, struct mr_table *mrt, v->remote = vifc->vifc_rmt_addr.s_addr; /* And finish update writing critical data */ - write_lock_bh(&mrt_lock); - v->dev = dev; + spin_lock(&mrt_lock); + rcu_assign_pointer(v->dev, dev); netdev_tracker_alloc(dev, &v->dev_tracker, GFP_ATOMIC); - if (v->flags & VIFF_REGISTER) - mrt->mroute_reg_vif_num = vifi; + if (v->flags & VIFF_REGISTER) { + /* Pairs with READ_ONCE() in ipmr_cache_report() and reg_vif_xmit() */ + WRITE_ONCE(mrt->mroute_reg_vif_num, vifi); + } if (vifi+1 > mrt->maxvif) - mrt->maxvif = vifi+1; - write_unlock_bh(&mrt_lock); - call_ipmr_vif_entry_notifiers(net, FIB_EVENT_VIF_ADD, v, vifi, mrt->id); + WRITE_ONCE(mrt->maxvif, vifi + 1); + spin_unlock(&mrt_lock); + call_ipmr_vif_entry_notifiers(net, FIB_EVENT_VIF_ADD, v, dev, + vifi, mrt->id); return 0; } @@ -1001,9 +1011,9 @@ static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt, /* Bounce a cache query up to mrouted and netlink. * - * Called under mrt_lock. + * Called under rcu_read_lock(). */ -static int ipmr_cache_report(struct mr_table *mrt, +static int ipmr_cache_report(const struct mr_table *mrt, struct sk_buff *pkt, vifi_t vifi, int assert) { const int ihl = ip_hdrlen(pkt); @@ -1038,8 +1048,11 @@ static int ipmr_cache_report(struct mr_table *mrt, msg->im_vif = vifi; msg->im_vif_hi = vifi >> 8; } else { - msg->im_vif = mrt->mroute_reg_vif_num; - msg->im_vif_hi = mrt->mroute_reg_vif_num >> 8; + /* Pairs with WRITE_ONCE() in vif_add() and vif_delete() */ + int vif_num = READ_ONCE(mrt->mroute_reg_vif_num); + + msg->im_vif = vif_num; + msg->im_vif_hi = vif_num >> 8; } ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2; ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) + @@ -1064,10 +1077,8 @@ static int ipmr_cache_report(struct mr_table *mrt, skb->transport_header = skb->network_header; } - rcu_read_lock(); mroute_sk = rcu_dereference(mrt->mroute_sk); if (!mroute_sk) { - rcu_read_unlock(); kfree_skb(skb); return -EINVAL; } @@ -1076,7 +1087,7 @@ static int ipmr_cache_report(struct mr_table *mrt, /* Deliver to mrouted */ ret = sock_queue_rcv_skb(mroute_sk, skb); - rcu_read_unlock(); + if (ret < 0) { net_warn_ratelimited("mroute: pending queue full, dropping entries\n"); kfree_skb(skb); @@ -1086,6 +1097,7 @@ static int ipmr_cache_report(struct mr_table *mrt, } /* Queue a packet for resolution. It gets locked cache entry! */ +/* Called under rcu_read_lock() */ static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb, struct net_device *dev) { @@ -1198,12 +1210,12 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, mfc->mfcc_mcastgrp.s_addr, parent); rcu_read_unlock(); if (c) { - write_lock_bh(&mrt_lock); + spin_lock(&mrt_lock); c->_c.mfc_parent = mfc->mfcc_parent; ipmr_update_thresholds(mrt, &c->_c, mfc->mfcc_ttls); if (!mrtsock) c->_c.mfc_flags |= MFC_STATIC; - write_unlock_bh(&mrt_lock); + spin_unlock(&mrt_lock); call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, c, mrt->id); mroute_netlink_event(mrt, c, RTM_NEWROUTE); @@ -1598,20 +1610,20 @@ int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg) if (vr.vifi >= mrt->maxvif) return -EINVAL; vr.vifi = array_index_nospec(vr.vifi, mrt->maxvif); - read_lock(&mrt_lock); + rcu_read_lock(); vif = &mrt->vif_table[vr.vifi]; if (VIF_EXISTS(mrt, vr.vifi)) { - vr.icount = vif->pkt_in; - vr.ocount = vif->pkt_out; - vr.ibytes = vif->bytes_in; - vr.obytes = vif->bytes_out; - read_unlock(&mrt_lock); + vr.icount = READ_ONCE(vif->pkt_in); + vr.ocount = READ_ONCE(vif->pkt_out); + vr.ibytes = READ_ONCE(vif->bytes_in); + vr.obytes = READ_ONCE(vif->bytes_out); + rcu_read_unlock(); if (copy_to_user(arg, &vr, sizeof(vr))) return -EFAULT; return 0; } - read_unlock(&mrt_lock); + rcu_read_unlock(); return -EADDRNOTAVAIL; case SIOCGETSGCNT: if (copy_from_user(&sr, arg, sizeof(sr))) @@ -1673,20 +1685,20 @@ int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) if (vr.vifi >= mrt->maxvif) return -EINVAL; vr.vifi = array_index_nospec(vr.vifi, mrt->maxvif); - read_lock(&mrt_lock); + rcu_read_lock(); vif = &mrt->vif_table[vr.vifi]; if (VIF_EXISTS(mrt, vr.vifi)) { - vr.icount = vif->pkt_in; - vr.ocount = vif->pkt_out; - vr.ibytes = vif->bytes_in; - vr.obytes = vif->bytes_out; - read_unlock(&mrt_lock); + vr.icount = READ_ONCE(vif->pkt_in); + vr.ocount = READ_ONCE(vif->pkt_out); + vr.ibytes = READ_ONCE(vif->bytes_in); + vr.obytes = READ_ONCE(vif->bytes_out); + rcu_read_unlock(); if (copy_to_user(arg, &vr, sizeof(vr))) return -EFAULT; return 0; } - read_unlock(&mrt_lock); + rcu_read_unlock(); return -EADDRNOTAVAIL; case SIOCGETSGCNT: if (copy_from_user(&sr, arg, sizeof(sr))) @@ -1726,7 +1738,7 @@ static int ipmr_device_event(struct notifier_block *this, unsigned long event, v ipmr_for_each_table(mrt, net) { v = &mrt->vif_table[0]; for (ct = 0; ct < mrt->maxvif; ct++, v++) { - if (v->dev == dev) + if (rcu_access_pointer(v->dev) == dev) vif_delete(mrt, ct, 1, NULL); } } @@ -1804,26 +1816,28 @@ static bool ipmr_forward_offloaded(struct sk_buff *skb, struct mr_table *mrt, } #endif -/* Processing handlers for ipmr_forward */ +/* Processing handlers for ipmr_forward, under rcu_read_lock() */ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, int in_vifi, struct sk_buff *skb, int vifi) { const struct iphdr *iph = ip_hdr(skb); struct vif_device *vif = &mrt->vif_table[vifi]; + struct net_device *vif_dev; struct net_device *dev; struct rtable *rt; struct flowi4 fl4; int encap = 0; - if (!vif->dev) + vif_dev = vif_dev_read(vif); + if (!vif_dev) goto out_free; if (vif->flags & VIFF_REGISTER) { - vif->pkt_out++; - vif->bytes_out += skb->len; - vif->dev->stats.tx_bytes += skb->len; - vif->dev->stats.tx_packets++; + WRITE_ONCE(vif->pkt_out, vif->pkt_out + 1); + WRITE_ONCE(vif->bytes_out, vif->bytes_out + skb->len); + vif_dev->stats.tx_bytes += skb->len; + vif_dev->stats.tx_packets++; ipmr_cache_report(mrt, skb, vifi, IGMPMSG_WHOLEPKT); goto out_free; } @@ -1868,8 +1882,8 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, goto out_free; } - vif->pkt_out++; - vif->bytes_out += skb->len; + WRITE_ONCE(vif->pkt_out, vif->pkt_out + 1); + WRITE_ONCE(vif->bytes_out, vif->bytes_out + skb->len); skb_dst_drop(skb); skb_dst_set(skb, &rt->dst); @@ -1881,8 +1895,8 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, if (vif->flags & VIFF_TUNNEL) { ip_encap(net, skb, vif->local, vif->remote); /* FIXME: extra output firewall step used to be here. --RR */ - vif->dev->stats.tx_packets++; - vif->dev->stats.tx_bytes += skb->len; + vif_dev->stats.tx_packets++; + vif_dev->stats.tx_bytes += skb->len; } IPCB(skb)->flags |= IPSKB_FORWARDED; @@ -1906,18 +1920,20 @@ out_free: kfree_skb(skb); } -static int ipmr_find_vif(struct mr_table *mrt, struct net_device *dev) +/* Called with mrt_lock or rcu_read_lock() */ +static int ipmr_find_vif(const struct mr_table *mrt, struct net_device *dev) { int ct; - - for (ct = mrt->maxvif-1; ct >= 0; ct--) { - if (mrt->vif_table[ct].dev == dev) + /* Pairs with WRITE_ONCE() in vif_delete()/vif_add() */ + for (ct = READ_ONCE(mrt->maxvif) - 1; ct >= 0; ct--) { + if (rcu_access_pointer(mrt->vif_table[ct].dev) == dev) break; } return ct; } /* "local" means that we should preserve one skb (for local delivery) */ +/* Called uner rcu_read_lock() */ static void ip_mr_forward(struct net *net, struct mr_table *mrt, struct net_device *dev, struct sk_buff *skb, struct mfc_cache *c, int local) @@ -1944,7 +1960,7 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt, } /* Wrong interface: drop packet and (maybe) send PIM assert. */ - if (mrt->vif_table[vif].dev != dev) { + if (rcu_access_pointer(mrt->vif_table[vif].dev) != dev) { if (rt_is_output_route(skb_rtable(skb))) { /* It is our own packet, looped back. * Very complicated situation... @@ -1983,8 +1999,10 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt, } forward: - mrt->vif_table[vif].pkt_in++; - mrt->vif_table[vif].bytes_in += skb->len; + WRITE_ONCE(mrt->vif_table[vif].pkt_in, + mrt->vif_table[vif].pkt_in + 1); + WRITE_ONCE(mrt->vif_table[vif].bytes_in, + mrt->vif_table[vif].bytes_in + skb->len); /* Forward the frame */ if (c->mfc_origin == htonl(INADDR_ANY) && @@ -2140,22 +2158,14 @@ int ip_mr_input(struct sk_buff *skb) skb = skb2; } - read_lock(&mrt_lock); vif = ipmr_find_vif(mrt, dev); - if (vif >= 0) { - int err2 = ipmr_cache_unresolved(mrt, vif, skb, dev); - read_unlock(&mrt_lock); - - return err2; - } - read_unlock(&mrt_lock); + if (vif >= 0) + return ipmr_cache_unresolved(mrt, vif, skb, dev); kfree_skb(skb); return -ENODEV; } - read_lock(&mrt_lock); ip_mr_forward(net, mrt, dev, skb, cache, local); - read_unlock(&mrt_lock); if (local) return ip_local_deliver(skb); @@ -2252,18 +2262,15 @@ int ipmr_get_route(struct net *net, struct sk_buff *skb, int vif = -1; dev = skb->dev; - read_lock(&mrt_lock); if (dev) vif = ipmr_find_vif(mrt, dev); if (vif < 0) { - read_unlock(&mrt_lock); rcu_read_unlock(); return -ENODEV; } skb2 = skb_realloc_headroom(skb, sizeof(struct iphdr)); if (!skb2) { - read_unlock(&mrt_lock); rcu_read_unlock(); return -ENOMEM; } @@ -2277,14 +2284,11 @@ int ipmr_get_route(struct net *net, struct sk_buff *skb, iph->daddr = daddr; iph->version = 0; err = ipmr_cache_unresolved(mrt, vif, skb2, dev); - read_unlock(&mrt_lock); rcu_read_unlock(); return err; } - read_lock(&mrt_lock); err = mr_fill_mroute(mrt, skb, &cache->_c, rtm); - read_unlock(&mrt_lock); rcu_read_unlock(); return err; } @@ -2404,7 +2408,7 @@ static size_t igmpmsg_netlink_msgsize(size_t payloadlen) return len; } -static void igmpmsg_netlink_event(struct mr_table *mrt, struct sk_buff *pkt) +static void igmpmsg_netlink_event(const struct mr_table *mrt, struct sk_buff *pkt) { struct net *net = read_pnet(&mrt->net); struct nlmsghdr *nlh; @@ -2744,18 +2748,21 @@ static bool ipmr_fill_table(struct mr_table *mrt, struct sk_buff *skb) static bool ipmr_fill_vif(struct mr_table *mrt, u32 vifid, struct sk_buff *skb) { + struct net_device *vif_dev; struct nlattr *vif_nest; struct vif_device *vif; + vif = &mrt->vif_table[vifid]; + vif_dev = rtnl_dereference(vif->dev); /* if the VIF doesn't exist just continue */ - if (!VIF_EXISTS(mrt, vifid)) + if (!vif_dev) return true; - vif = &mrt->vif_table[vifid]; vif_nest = nla_nest_start_noflag(skb, IPMRA_VIF); if (!vif_nest) return false; - if (nla_put_u32(skb, IPMRA_VIFA_IFINDEX, vif->dev->ifindex) || + + if (nla_put_u32(skb, IPMRA_VIFA_IFINDEX, vif_dev->ifindex) || nla_put_u32(skb, IPMRA_VIFA_VIF_ID, vifid) || nla_put_u16(skb, IPMRA_VIFA_FLAGS, vif->flags) || nla_put_u64_64bit(skb, IPMRA_VIFA_BYTES_IN, vif->bytes_in, @@ -2887,7 +2894,7 @@ out: */ static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos) - __acquires(mrt_lock) + __acquires(RCU) { struct mr_vif_iter *iter = seq->private; struct net *net = seq_file_net(seq); @@ -2899,14 +2906,14 @@ static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos) iter->mrt = mrt; - read_lock(&mrt_lock); + rcu_read_lock(); return mr_vif_seq_start(seq, pos); } static void ipmr_vif_seq_stop(struct seq_file *seq, void *v) - __releases(mrt_lock) + __releases(RCU) { - read_unlock(&mrt_lock); + rcu_read_unlock(); } static int ipmr_vif_seq_show(struct seq_file *seq, void *v) @@ -2919,9 +2926,11 @@ static int ipmr_vif_seq_show(struct seq_file *seq, void *v) "Interface BytesIn PktsIn BytesOut PktsOut Flags Local Remote\n"); } else { const struct vif_device *vif = v; - const char *name = vif->dev ? - vif->dev->name : "none"; + const struct net_device *vif_dev; + const char *name; + vif_dev = vif_dev_read(vif); + name = vif_dev ? vif_dev->name : "none"; seq_printf(seq, "%2td %-10s %8ld %7ld %8ld %7ld %05X %08X %08X\n", vif - mrt->vif_table, @@ -3017,7 +3026,7 @@ static int ipmr_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { return mr_dump(net, nb, RTNL_FAMILY_IPMR, ipmr_rules_dump, - ipmr_mr_table_iter, &mrt_lock, extack); + ipmr_mr_table_iter, extack); } static const struct fib_notifier_ops ipmr_notifier_ops_template = { diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c index aa8738a91210..271dc03fc6db 100644 --- a/net/ipv4/ipmr_base.c +++ b/net/ipv4/ipmr_base.c @@ -13,7 +13,7 @@ void vif_device_init(struct vif_device *v, unsigned short flags, unsigned short get_iflink_mask) { - v->dev = NULL; + RCU_INIT_POINTER(v->dev, NULL); v->bytes_in = 0; v->bytes_out = 0; v->pkt_in = 0; @@ -208,6 +208,7 @@ EXPORT_SYMBOL(mr_mfc_seq_next); int mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, struct mr_mfc *c, struct rtmsg *rtm) { + struct net_device *vif_dev; struct rta_mfc_stats mfcs; struct nlattr *mp_attr; struct rtnexthop *nhp; @@ -220,10 +221,13 @@ int mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, return -ENOENT; } - if (VIF_EXISTS(mrt, c->mfc_parent) && - nla_put_u32(skb, RTA_IIF, - mrt->vif_table[c->mfc_parent].dev->ifindex) < 0) + rcu_read_lock(); + vif_dev = rcu_dereference(mrt->vif_table[c->mfc_parent].dev); + if (vif_dev && nla_put_u32(skb, RTA_IIF, vif_dev->ifindex) < 0) { + rcu_read_unlock(); return -EMSGSIZE; + } + rcu_read_unlock(); if (c->mfc_flags & MFC_OFFLOAD) rtm->rtm_flags |= RTNH_F_OFFLOAD; @@ -232,23 +236,27 @@ int mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, if (!mp_attr) return -EMSGSIZE; + rcu_read_lock(); for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) { - if (VIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) { - struct vif_device *vif; + struct vif_device *vif = &mrt->vif_table[ct]; + + vif_dev = rcu_dereference(vif->dev); + if (vif_dev && c->mfc_un.res.ttls[ct] < 255) { nhp = nla_reserve_nohdr(skb, sizeof(*nhp)); if (!nhp) { + rcu_read_unlock(); nla_nest_cancel(skb, mp_attr); return -EMSGSIZE; } nhp->rtnh_flags = 0; nhp->rtnh_hops = c->mfc_un.res.ttls[ct]; - vif = &mrt->vif_table[ct]; - nhp->rtnh_ifindex = vif->dev->ifindex; + nhp->rtnh_ifindex = vif_dev->ifindex; nhp->rtnh_len = sizeof(*nhp); } } + rcu_read_unlock(); nla_nest_end(skb, mp_attr); @@ -275,13 +283,14 @@ static bool mr_mfc_uses_dev(const struct mr_table *mrt, int ct; for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) { - if (VIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) { - const struct vif_device *vif; - - vif = &mrt->vif_table[ct]; - if (vif->dev == dev) - return true; - } + const struct net_device *vif_dev; + const struct vif_device *vif; + + vif = &mrt->vif_table[ct]; + vif_dev = rcu_access_pointer(vif->dev); + if (vif_dev && c->mfc_un.res.ttls[ct] < 255 && + vif_dev == dev) + return true; } return false; } @@ -390,7 +399,6 @@ int mr_dump(struct net *net, struct notifier_block *nb, unsigned short family, struct netlink_ext_ack *extack), struct mr_table *(*mr_iter)(struct net *net, struct mr_table *mrt), - rwlock_t *mrt_lock, struct netlink_ext_ack *extack) { struct mr_table *mrt; @@ -402,22 +410,25 @@ int mr_dump(struct net *net, struct notifier_block *nb, unsigned short family, for (mrt = mr_iter(net, NULL); mrt; mrt = mr_iter(net, mrt)) { struct vif_device *v = &mrt->vif_table[0]; + struct net_device *vif_dev; struct mr_mfc *mfc; int vifi; /* Notifiy on table VIF entries */ - read_lock(mrt_lock); + rcu_read_lock(); for (vifi = 0; vifi < mrt->maxvif; vifi++, v++) { - if (!v->dev) + vif_dev = rcu_dereference(v->dev); + if (!vif_dev) continue; err = mr_call_vif_notifier(nb, family, - FIB_EVENT_VIF_ADD, - v, vifi, mrt->id, extack); + FIB_EVENT_VIF_ADD, v, + vif_dev, vifi, + mrt->id, extack); if (err) break; } - read_unlock(mrt_lock); + rcu_read_unlock(); if (err) return err; diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c index 76a411ae9fe6..a334f0dcc2d0 100644 --- a/net/ipv4/netfilter/nf_nat_h323.c +++ b/net/ipv4/netfilter/nf_nat_h323.c @@ -579,28 +579,22 @@ static struct nf_ct_helper_expectfn callforwarding_nat = { .expectfn = ip_nat_callforwarding_expect, }; +static const struct nfct_h323_nat_hooks nathooks = { + .set_h245_addr = set_h245_addr, + .set_h225_addr = set_h225_addr, + .set_sig_addr = set_sig_addr, + .set_ras_addr = set_ras_addr, + .nat_rtp_rtcp = nat_rtp_rtcp, + .nat_t120 = nat_t120, + .nat_h245 = nat_h245, + .nat_callforwarding = nat_callforwarding, + .nat_q931 = nat_q931, +}; + /****************************************************************************/ static int __init nf_nat_h323_init(void) { - BUG_ON(set_h245_addr_hook != NULL); - BUG_ON(set_h225_addr_hook != NULL); - BUG_ON(set_sig_addr_hook != NULL); - BUG_ON(set_ras_addr_hook != NULL); - BUG_ON(nat_rtp_rtcp_hook != NULL); - BUG_ON(nat_t120_hook != NULL); - BUG_ON(nat_h245_hook != NULL); - BUG_ON(nat_callforwarding_hook != NULL); - BUG_ON(nat_q931_hook != NULL); - - RCU_INIT_POINTER(set_h245_addr_hook, set_h245_addr); - RCU_INIT_POINTER(set_h225_addr_hook, set_h225_addr); - RCU_INIT_POINTER(set_sig_addr_hook, set_sig_addr); - RCU_INIT_POINTER(set_ras_addr_hook, set_ras_addr); - RCU_INIT_POINTER(nat_rtp_rtcp_hook, nat_rtp_rtcp); - RCU_INIT_POINTER(nat_t120_hook, nat_t120); - RCU_INIT_POINTER(nat_h245_hook, nat_h245); - RCU_INIT_POINTER(nat_callforwarding_hook, nat_callforwarding); - RCU_INIT_POINTER(nat_q931_hook, nat_q931); + RCU_INIT_POINTER(nfct_h323_nat_hook, &nathooks); nf_ct_helper_expectfn_register(&q931_nat); nf_ct_helper_expectfn_register(&callforwarding_nat); return 0; @@ -609,15 +603,7 @@ static int __init nf_nat_h323_init(void) /****************************************************************************/ static void __exit nf_nat_h323_fini(void) { - RCU_INIT_POINTER(set_h245_addr_hook, NULL); - RCU_INIT_POINTER(set_h225_addr_hook, NULL); - RCU_INIT_POINTER(set_sig_addr_hook, NULL); - RCU_INIT_POINTER(set_ras_addr_hook, NULL); - RCU_INIT_POINTER(nat_rtp_rtcp_hook, NULL); - RCU_INIT_POINTER(nat_t120_hook, NULL); - RCU_INIT_POINTER(nat_h245_hook, NULL); - RCU_INIT_POINTER(nat_callforwarding_hook, NULL); - RCU_INIT_POINTER(nat_q931_hook, NULL); + RCU_INIT_POINTER(nfct_h323_nat_hook, NULL); nf_ct_helper_expectfn_unregister(&q931_nat); nf_ct_helper_expectfn_unregister(&callforwarding_nat); synchronize_rcu(); diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 3c6101def7d6..b83c2bd9d722 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -50,7 +50,7 @@ struct ping_table { struct hlist_nulls_head hash[PING_HTABLE_SIZE]; - rwlock_t lock; + spinlock_t lock; }; static struct ping_table ping_table; @@ -82,7 +82,7 @@ int ping_get_port(struct sock *sk, unsigned short ident) struct sock *sk2 = NULL; isk = inet_sk(sk); - write_lock_bh(&ping_table.lock); + spin_lock(&ping_table.lock); if (ident == 0) { u32 i; u16 result = ping_port_rover + 1; @@ -128,14 +128,15 @@ next_port: if (sk_unhashed(sk)) { pr_debug("was not hashed\n"); sock_hold(sk); - hlist_nulls_add_head(&sk->sk_nulls_node, hlist); + sock_set_flag(sk, SOCK_RCU_FREE); + hlist_nulls_add_head_rcu(&sk->sk_nulls_node, hlist); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); } - write_unlock_bh(&ping_table.lock); + spin_unlock(&ping_table.lock); return 0; fail: - write_unlock_bh(&ping_table.lock); + spin_unlock(&ping_table.lock); return 1; } EXPORT_SYMBOL_GPL(ping_get_port); @@ -153,19 +154,19 @@ void ping_unhash(struct sock *sk) struct inet_sock *isk = inet_sk(sk); pr_debug("ping_unhash(isk=%p,isk->num=%u)\n", isk, isk->inet_num); - write_lock_bh(&ping_table.lock); + spin_lock(&ping_table.lock); if (sk_hashed(sk)) { - hlist_nulls_del(&sk->sk_nulls_node); - sk_nulls_node_init(&sk->sk_nulls_node); + hlist_nulls_del_init_rcu(&sk->sk_nulls_node); sock_put(sk); isk->inet_num = 0; isk->inet_sport = 0; sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); } - write_unlock_bh(&ping_table.lock); + spin_unlock(&ping_table.lock); } EXPORT_SYMBOL_GPL(ping_unhash); +/* Called under rcu_read_lock() */ static struct sock *ping_lookup(struct net *net, struct sk_buff *skb, u16 ident) { struct hlist_nulls_head *hslot = ping_hashslot(&ping_table, net, ident); @@ -190,8 +191,6 @@ static struct sock *ping_lookup(struct net *net, struct sk_buff *skb, u16 ident) return NULL; } - read_lock_bh(&ping_table.lock); - ping_portaddr_for_each_entry(sk, hnode, hslot) { isk = inet_sk(sk); @@ -230,13 +229,11 @@ static struct sock *ping_lookup(struct net *net, struct sk_buff *skb, u16 ident) sk->sk_bound_dev_if != sdif) continue; - sock_hold(sk); goto exit; } sk = NULL; exit: - read_unlock_bh(&ping_table.lock); return sk; } @@ -592,7 +589,7 @@ void ping_err(struct sk_buff *skb, int offset, u32 info) sk->sk_err = err; sk_error_report(sk); out: - sock_put(sk); + return; } EXPORT_SYMBOL_GPL(ping_err); @@ -998,7 +995,6 @@ enum skb_drop_reason ping_rcv(struct sk_buff *skb) reason = __ping_queue_rcv_skb(sk, skb2); else reason = SKB_DROP_REASON_NOMEM; - sock_put(sk); } if (reason) @@ -1084,13 +1080,13 @@ static struct sock *ping_get_idx(struct seq_file *seq, loff_t pos) } void *ping_seq_start(struct seq_file *seq, loff_t *pos, sa_family_t family) - __acquires(ping_table.lock) + __acquires(RCU) { struct ping_iter_state *state = seq->private; state->bucket = 0; state->family = family; - read_lock_bh(&ping_table.lock); + rcu_read_lock(); return *pos ? ping_get_idx(seq, *pos-1) : SEQ_START_TOKEN; } @@ -1116,9 +1112,9 @@ void *ping_seq_next(struct seq_file *seq, void *v, loff_t *pos) EXPORT_SYMBOL_GPL(ping_seq_next); void ping_seq_stop(struct seq_file *seq, void *v) - __releases(ping_table.lock) + __releases(RCU) { - read_unlock_bh(&ping_table.lock); + rcu_read_unlock(); } EXPORT_SYMBOL_GPL(ping_seq_stop); @@ -1202,5 +1198,5 @@ void __init ping_init(void) for (i = 0; i < PING_HTABLE_SIZE; i++) INIT_HLIST_NULLS_HEAD(&ping_table.hash[i], i); - rwlock_init(&ping_table.lock); + spin_lock_init(&ping_table.lock); } diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index bbd717805b10..006c1f0ed8b4 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -85,21 +85,20 @@ struct raw_frag_vec { int hlen; }; -struct raw_hashinfo raw_v4_hashinfo = { - .lock = __RW_LOCK_UNLOCKED(raw_v4_hashinfo.lock), -}; +struct raw_hashinfo raw_v4_hashinfo; EXPORT_SYMBOL_GPL(raw_v4_hashinfo); int raw_hash_sk(struct sock *sk) { struct raw_hashinfo *h = sk->sk_prot->h.raw_hash; - struct hlist_head *head; + struct hlist_nulls_head *hlist; - head = &h->ht[inet_sk(sk)->inet_num & (RAW_HTABLE_SIZE - 1)]; + hlist = &h->ht[inet_sk(sk)->inet_num & (RAW_HTABLE_SIZE - 1)]; - write_lock_bh(&h->lock); - sk_add_node(sk, head); - write_unlock_bh(&h->lock); + spin_lock(&h->lock); + __sk_nulls_add_node_rcu(sk, hlist); + sock_set_flag(sk, SOCK_RCU_FREE); + spin_unlock(&h->lock); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); return 0; @@ -110,31 +109,26 @@ void raw_unhash_sk(struct sock *sk) { struct raw_hashinfo *h = sk->sk_prot->h.raw_hash; - write_lock_bh(&h->lock); - if (sk_del_node_init(sk)) + spin_lock(&h->lock); + if (__sk_nulls_del_node_init_rcu(sk)) sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); - write_unlock_bh(&h->lock); + spin_unlock(&h->lock); } EXPORT_SYMBOL_GPL(raw_unhash_sk); -struct sock *__raw_v4_lookup(struct net *net, struct sock *sk, - unsigned short num, __be32 raddr, __be32 laddr, - int dif, int sdif) +bool raw_v4_match(struct net *net, struct sock *sk, unsigned short num, + __be32 raddr, __be32 laddr, int dif, int sdif) { - sk_for_each_from(sk) { - struct inet_sock *inet = inet_sk(sk); - - if (net_eq(sock_net(sk), net) && inet->inet_num == num && - !(inet->inet_daddr && inet->inet_daddr != raddr) && - !(inet->inet_rcv_saddr && inet->inet_rcv_saddr != laddr) && - raw_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif)) - goto found; /* gotcha */ - } - sk = NULL; -found: - return sk; + struct inet_sock *inet = inet_sk(sk); + + if (net_eq(sock_net(sk), net) && inet->inet_num == num && + !(inet->inet_daddr && inet->inet_daddr != raddr) && + !(inet->inet_rcv_saddr && inet->inet_rcv_saddr != laddr) && + raw_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif)) + return true; + return false; } -EXPORT_SYMBOL_GPL(__raw_v4_lookup); +EXPORT_SYMBOL_GPL(raw_v4_match); /* * 0 - deliver @@ -168,23 +162,20 @@ static int icmp_filter(const struct sock *sk, const struct sk_buff *skb) */ static int raw_v4_input(struct sk_buff *skb, const struct iphdr *iph, int hash) { + struct net *net = dev_net(skb->dev); + struct hlist_nulls_head *hlist; + struct hlist_nulls_node *hnode; int sdif = inet_sdif(skb); int dif = inet_iif(skb); - struct sock *sk; - struct hlist_head *head; int delivered = 0; - struct net *net; - - read_lock(&raw_v4_hashinfo.lock); - head = &raw_v4_hashinfo.ht[hash]; - if (hlist_empty(head)) - goto out; - - net = dev_net(skb->dev); - sk = __raw_v4_lookup(net, __sk_head(head), iph->protocol, - iph->saddr, iph->daddr, dif, sdif); + struct sock *sk; - while (sk) { + hlist = &raw_v4_hashinfo.ht[hash]; + rcu_read_lock(); + sk_nulls_for_each(sk, hnode, hlist) { + if (!raw_v4_match(net, sk, iph->protocol, + iph->saddr, iph->daddr, dif, sdif)) + continue; delivered = 1; if ((iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) && ip_mc_sf_allow(sk, iph->daddr, iph->saddr, @@ -195,31 +186,16 @@ static int raw_v4_input(struct sk_buff *skb, const struct iphdr *iph, int hash) if (clone) raw_rcv(sk, clone); } - sk = __raw_v4_lookup(net, sk_next(sk), iph->protocol, - iph->saddr, iph->daddr, - dif, sdif); } -out: - read_unlock(&raw_v4_hashinfo.lock); + rcu_read_unlock(); return delivered; } int raw_local_deliver(struct sk_buff *skb, int protocol) { - int hash; - struct sock *raw_sk; - - hash = protocol & (RAW_HTABLE_SIZE - 1); - raw_sk = sk_head(&raw_v4_hashinfo.ht[hash]); - - /* If there maybe a raw socket we must check - if not we - * don't care less - */ - if (raw_sk && !raw_v4_input(skb, ip_hdr(skb), hash)) - raw_sk = NULL; - - return raw_sk != NULL; + int hash = protocol & (RAW_HTABLE_SIZE - 1); + return raw_v4_input(skb, ip_hdr(skb), hash); } static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info) @@ -286,31 +262,27 @@ static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info) void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info) { - int hash; - struct sock *raw_sk; + struct net *net = dev_net(skb->dev); + struct hlist_nulls_head *hlist; + struct hlist_nulls_node *hnode; + int dif = skb->dev->ifindex; + int sdif = inet_sdif(skb); const struct iphdr *iph; - struct net *net; + struct sock *sk; + int hash; hash = protocol & (RAW_HTABLE_SIZE - 1); + hlist = &raw_v4_hashinfo.ht[hash]; - read_lock(&raw_v4_hashinfo.lock); - raw_sk = sk_head(&raw_v4_hashinfo.ht[hash]); - if (raw_sk) { - int dif = skb->dev->ifindex; - int sdif = inet_sdif(skb); - + rcu_read_lock(); + sk_nulls_for_each(sk, hnode, hlist) { iph = (const struct iphdr *)skb->data; - net = dev_net(skb->dev); - - while ((raw_sk = __raw_v4_lookup(net, raw_sk, protocol, - iph->daddr, iph->saddr, - dif, sdif)) != NULL) { - raw_err(raw_sk, skb, info); - raw_sk = sk_next(raw_sk); - iph = (const struct iphdr *)skb->data; - } + if (!raw_v4_match(net, sk, iph->protocol, + iph->daddr, iph->saddr, dif, sdif)) + continue; + raw_err(sk, skb, info); } - read_unlock(&raw_v4_hashinfo.lock); + rcu_read_unlock(); } static int raw_rcv_skb(struct sock *sk, struct sk_buff *skb) @@ -971,44 +943,41 @@ struct proto raw_prot = { }; #ifdef CONFIG_PROC_FS -static struct sock *raw_get_first(struct seq_file *seq) +static struct sock *raw_get_first(struct seq_file *seq, int bucket) { - struct sock *sk; struct raw_hashinfo *h = pde_data(file_inode(seq->file)); struct raw_iter_state *state = raw_seq_private(seq); + struct hlist_nulls_head *hlist; + struct hlist_nulls_node *hnode; + struct sock *sk; - for (state->bucket = 0; state->bucket < RAW_HTABLE_SIZE; + for (state->bucket = bucket; state->bucket < RAW_HTABLE_SIZE; ++state->bucket) { - sk_for_each(sk, &h->ht[state->bucket]) + hlist = &h->ht[state->bucket]; + sk_nulls_for_each(sk, hnode, hlist) { if (sock_net(sk) == seq_file_net(seq)) - goto found; + return sk; + } } - sk = NULL; -found: - return sk; + return NULL; } static struct sock *raw_get_next(struct seq_file *seq, struct sock *sk) { - struct raw_hashinfo *h = pde_data(file_inode(seq->file)); struct raw_iter_state *state = raw_seq_private(seq); do { - sk = sk_next(sk); -try_again: - ; + sk = sk_nulls_next(sk); } while (sk && sock_net(sk) != seq_file_net(seq)); - if (!sk && ++state->bucket < RAW_HTABLE_SIZE) { - sk = sk_head(&h->ht[state->bucket]); - goto try_again; - } + if (!sk) + return raw_get_first(seq, state->bucket + 1); return sk; } static struct sock *raw_get_idx(struct seq_file *seq, loff_t pos) { - struct sock *sk = raw_get_first(seq); + struct sock *sk = raw_get_first(seq, 0); if (sk) while (pos && (sk = raw_get_next(seq, sk)) != NULL) @@ -1017,11 +986,9 @@ static struct sock *raw_get_idx(struct seq_file *seq, loff_t pos) } void *raw_seq_start(struct seq_file *seq, loff_t *pos) - __acquires(&h->lock) + __acquires(RCU) { - struct raw_hashinfo *h = pde_data(file_inode(seq->file)); - - read_lock(&h->lock); + rcu_read_lock(); return *pos ? raw_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; } EXPORT_SYMBOL_GPL(raw_seq_start); @@ -1031,7 +998,7 @@ void *raw_seq_next(struct seq_file *seq, void *v, loff_t *pos) struct sock *sk; if (v == SEQ_START_TOKEN) - sk = raw_get_first(seq); + sk = raw_get_first(seq, 0); else sk = raw_get_next(seq, v); ++*pos; @@ -1040,11 +1007,9 @@ void *raw_seq_next(struct seq_file *seq, void *v, loff_t *pos) EXPORT_SYMBOL_GPL(raw_seq_next); void raw_seq_stop(struct seq_file *seq, void *v) - __releases(&h->lock) + __releases(RCU) { - struct raw_hashinfo *h = pde_data(file_inode(seq->file)); - - read_unlock(&h->lock); + rcu_read_unlock(); } EXPORT_SYMBOL_GPL(raw_seq_stop); @@ -1106,6 +1071,7 @@ static __net_initdata struct pernet_operations raw_net_ops = { int __init raw_proc_init(void) { + return register_pernet_subsys(&raw_net_ops); } diff --git a/net/ipv4/raw_diag.c b/net/ipv4/raw_diag.c index ccacbde30a2c..999321834b94 100644 --- a/net/ipv4/raw_diag.c +++ b/net/ipv4/raw_diag.c @@ -34,57 +34,57 @@ raw_get_hashinfo(const struct inet_diag_req_v2 *r) * use helper to figure it out. */ -static struct sock *raw_lookup(struct net *net, struct sock *from, - const struct inet_diag_req_v2 *req) +static bool raw_lookup(struct net *net, struct sock *sk, + const struct inet_diag_req_v2 *req) { struct inet_diag_req_raw *r = (void *)req; - struct sock *sk = NULL; if (r->sdiag_family == AF_INET) - sk = __raw_v4_lookup(net, from, r->sdiag_raw_protocol, - r->id.idiag_dst[0], - r->id.idiag_src[0], - r->id.idiag_if, 0); + return raw_v4_match(net, sk, r->sdiag_raw_protocol, + r->id.idiag_dst[0], + r->id.idiag_src[0], + r->id.idiag_if, 0); #if IS_ENABLED(CONFIG_IPV6) else - sk = __raw_v6_lookup(net, from, r->sdiag_raw_protocol, - (const struct in6_addr *)r->id.idiag_src, - (const struct in6_addr *)r->id.idiag_dst, - r->id.idiag_if, 0); + return raw_v6_match(net, sk, r->sdiag_raw_protocol, + (const struct in6_addr *)r->id.idiag_src, + (const struct in6_addr *)r->id.idiag_dst, + r->id.idiag_if, 0); #endif - return sk; + return false; } static struct sock *raw_sock_get(struct net *net, const struct inet_diag_req_v2 *r) { struct raw_hashinfo *hashinfo = raw_get_hashinfo(r); - struct sock *sk = NULL, *s; + struct hlist_nulls_head *hlist; + struct hlist_nulls_node *hnode; + struct sock *sk; int slot; if (IS_ERR(hashinfo)) return ERR_CAST(hashinfo); - read_lock(&hashinfo->lock); + rcu_read_lock(); for (slot = 0; slot < RAW_HTABLE_SIZE; slot++) { - sk_for_each(s, &hashinfo->ht[slot]) { - sk = raw_lookup(net, s, r); - if (sk) { + hlist = &hashinfo->ht[slot]; + sk_nulls_for_each(sk, hnode, hlist) { + if (raw_lookup(net, sk, r)) { /* * Grab it and keep until we fill - * diag meaage to be reported, so + * diag message to be reported, so * caller should call sock_put then. - * We can do that because we're keeping - * hashinfo->lock here. */ - sock_hold(sk); - goto out_unlock; + if (refcount_inc_not_zero(&sk->sk_refcnt)) + goto out_unlock; } } } + sk = ERR_PTR(-ENOENT); out_unlock: - read_unlock(&hashinfo->lock); + rcu_read_unlock(); - return sk ? sk : ERR_PTR(-ENOENT); + return sk; } static int raw_diag_dump_one(struct netlink_callback *cb, @@ -142,6 +142,8 @@ static void raw_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, struct raw_hashinfo *hashinfo = raw_get_hashinfo(r); struct net *net = sock_net(skb->sk); struct inet_diag_dump_data *cb_data; + struct hlist_nulls_head *hlist; + struct hlist_nulls_node *hnode; int num, s_num, slot, s_slot; struct sock *sk = NULL; struct nlattr *bc; @@ -154,11 +156,12 @@ static void raw_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, s_slot = cb->args[0]; num = s_num = cb->args[1]; - read_lock(&hashinfo->lock); + rcu_read_lock(); for (slot = s_slot; slot < RAW_HTABLE_SIZE; s_num = 0, slot++) { num = 0; - sk_for_each(sk, &hashinfo->ht[slot]) { + hlist = &hashinfo->ht[slot]; + sk_nulls_for_each(sk, hnode, hlist) { struct inet_sock *inet = inet_sk(sk); if (!net_eq(sock_net(sk), net)) @@ -181,7 +184,7 @@ next: } out_unlock: - read_unlock(&hashinfo->lock); + rcu_read_unlock(); cb->args[0] = slot; cb->args[1] = num; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 4702c61207a8..795cbe1de912 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1550,9 +1550,8 @@ void rt_flush_dev(struct net_device *dev) if (rt->dst.dev != dev) continue; rt->dst.dev = blackhole_netdev; - dev_replace_track(dev, blackhole_netdev, - &rt->dst.dev_tracker, - GFP_ATOMIC); + netdev_ref_replace(dev, blackhole_netdev, + &rt->dst.dev_tracker, GFP_ATOMIC); list_move(&rt->rt_uncached, &ul->quarantine); } spin_unlock_bh(&ul->lock); @@ -1627,12 +1626,11 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, struct rtable *rt_dst_alloc(struct net_device *dev, unsigned int flags, u16 type, - bool nopolicy, bool noxfrm) + bool noxfrm) { struct rtable *rt; rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK, - (nopolicy ? DST_NOPOLICY : 0) | (noxfrm ? DST_NOXFRM : 0)); if (rt) { @@ -1727,7 +1725,6 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, struct in_device *in_dev = __in_dev_get_rcu(dev); unsigned int flags = RTCF_MULTICAST; struct rtable *rth; - bool no_policy; u32 itag = 0; int err; @@ -1738,12 +1735,11 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, if (our) flags |= RTCF_LOCAL; - no_policy = IN_DEV_ORCONF(in_dev, NOPOLICY); - if (no_policy) + if (IN_DEV_ORCONF(in_dev, NOPOLICY)) IPCB(skb)->flags |= IPSKB_NOPOLICY; rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST, - no_policy, false); + false); if (!rth) return -ENOBUFS; @@ -1802,7 +1798,7 @@ static int __mkroute_input(struct sk_buff *skb, struct rtable *rth; int err; struct in_device *out_dev; - bool do_cache, no_policy; + bool do_cache; u32 itag = 0; /* get a working reference to the output device */ @@ -1847,8 +1843,7 @@ static int __mkroute_input(struct sk_buff *skb, } } - no_policy = IN_DEV_ORCONF(in_dev, NOPOLICY); - if (no_policy) + if (IN_DEV_ORCONF(in_dev, NOPOLICY)) IPCB(skb)->flags |= IPSKB_NOPOLICY; fnhe = find_exception(nhc, daddr); @@ -1863,7 +1858,7 @@ static int __mkroute_input(struct sk_buff *skb, } } - rth = rt_dst_alloc(out_dev->dev, 0, res->type, no_policy, + rth = rt_dst_alloc(out_dev->dev, 0, res->type, IN_DEV_ORCONF(out_dev, NOXFRM)); if (!rth) { err = -ENOBUFS; @@ -2238,7 +2233,6 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, struct rtable *rth; struct flowi4 fl4; bool do_cache = true; - bool no_policy; /* IP on this device is disabled. */ @@ -2357,8 +2351,7 @@ brd_input: RT_CACHE_STAT_INC(in_brd); local_input: - no_policy = IN_DEV_ORCONF(in_dev, NOPOLICY); - if (no_policy) + if (IN_DEV_ORCONF(in_dev, NOPOLICY)) IPCB(skb)->flags |= IPSKB_NOPOLICY; do_cache &= res->fi && !itag; @@ -2374,8 +2367,7 @@ local_input: } rth = rt_dst_alloc(ip_rt_get_dev(net, res), - flags | RTCF_LOCAL, res->type, - no_policy, false); + flags | RTCF_LOCAL, res->type, false); if (!rth) goto e_nobufs; @@ -2440,24 +2432,9 @@ martian_source: goto out; } -int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, - u8 tos, struct net_device *dev) -{ - struct fib_result res; - int err; - - tos &= IPTOS_RT_MASK; - rcu_read_lock(); - err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res); - rcu_read_unlock(); - - return err; -} -EXPORT_SYMBOL(ip_route_input_noref); - /* called with rcu_read_lock held */ -int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr, - u8 tos, struct net_device *dev, struct fib_result *res) +static int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr, + u8 tos, struct net_device *dev, struct fib_result *res) { /* Multicast recognition logic is moved from route cache to here. * The problem was that too many Ethernet cards have broken/missing @@ -2506,6 +2483,21 @@ int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr, return ip_route_input_slow(skb, daddr, saddr, tos, dev, res); } +int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, + u8 tos, struct net_device *dev) +{ + struct fib_result res; + int err; + + tos &= IPTOS_RT_MASK; + rcu_read_lock(); + err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res); + rcu_read_unlock(); + + return err; +} +EXPORT_SYMBOL(ip_route_input_noref); + /* called with rcu_read_lock() */ static struct rtable *__mkroute_output(const struct fib_result *res, const struct flowi4 *fl4, int orig_oif, @@ -2598,7 +2590,6 @@ static struct rtable *__mkroute_output(const struct fib_result *res, add: rth = rt_dst_alloc(dev_out, flags, type, - IN_DEV_ORCONF(in_dev, NOPOLICY), IN_DEV_ORCONF(in_dev, NOXFRM)); if (!rth) return ERR_PTR(-ENOBUFS); @@ -2851,7 +2842,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or new->output = dst_discard_out; new->dev = net->loopback_dev; - dev_hold_track(new->dev, &new->dev_tracker, GFP_ATOMIC); + netdev_hold(new->dev, &new->dev_tracker, GFP_ATOMIC); rt->rt_is_input = ort->rt_is_input; rt->rt_iif = ort->rt_iif; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index a628daa51113..970e9a2cca4a 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -294,6 +294,8 @@ EXPORT_SYMBOL(sysctl_tcp_mem); atomic_long_t tcp_memory_allocated ____cacheline_aligned_in_smp; /* Current allocated memory. */ EXPORT_SYMBOL(tcp_memory_allocated); +DEFINE_PER_CPU(int, tcp_memory_per_cpu_fw_alloc); +EXPORT_PER_CPU_SYMBOL_GPL(tcp_memory_per_cpu_fw_alloc); #if IS_ENABLED(CONFIG_SMC) DEFINE_STATIC_KEY_FALSE(tcp_have_smc); @@ -856,9 +858,6 @@ struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp, { struct sk_buff *skb; - if (unlikely(tcp_under_memory_pressure(sk))) - sk_mem_reclaim_partial(sk); - skb = alloc_skb_fclone(size + MAX_TCP_HEADER, gfp); if (likely(skb)) { bool mem_scheduled; @@ -952,6 +951,24 @@ static int tcp_downgrade_zcopy_pure(struct sock *sk, struct sk_buff *skb) return 0; } + +static int tcp_wmem_schedule(struct sock *sk, int copy) +{ + int left; + + if (likely(sk_wmem_schedule(sk, copy))) + return copy; + + /* We could be in trouble if we have nothing queued. + * Use whatever is left in sk->sk_forward_alloc and tcp_wmem[0] + * to guarantee some progress. + */ + left = sock_net(sk)->ipv4.sysctl_tcp_wmem[0] - sk->sk_wmem_queued; + if (left > 0) + sk_forced_mem_schedule(sk, min(left, copy)); + return min(copy, sk->sk_forward_alloc); +} + static struct sk_buff *tcp_build_frag(struct sock *sk, int size_goal, int flags, struct page *page, int offset, size_t *size) { @@ -987,7 +1004,11 @@ new_segment: tcp_mark_push(tp, skb); goto new_segment; } - if (tcp_downgrade_zcopy_pure(sk, skb) || !sk_wmem_schedule(sk, copy)) + if (tcp_downgrade_zcopy_pure(sk, skb)) + return NULL; + + copy = tcp_wmem_schedule(sk, copy); + if (!copy) return NULL; if (can_coalesce) { @@ -1348,7 +1369,8 @@ new_segment: skb_zcopy_downgrade_managed(skb); } - if (!sk_wmem_schedule(sk, copy)) + copy = tcp_wmem_schedule(sk, copy); + if (!copy) goto wait_for_space; err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb, @@ -1375,7 +1397,8 @@ new_segment: skb_shinfo(skb)->flags |= SKBFL_PURE_ZEROCOPY; if (!skb_zcopy_pure(skb)) { - if (!sk_wmem_schedule(sk, copy)) + copy = tcp_wmem_schedule(sk, copy); + if (!copy) goto wait_for_space; } @@ -1612,7 +1635,7 @@ static void tcp_eat_recv_skb(struct sock *sk, struct sk_buff *skb) __kfree_skb(skb); } -static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) +struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) { struct sk_buff *skb; u32 offset; @@ -1635,6 +1658,7 @@ static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) } return NULL; } +EXPORT_SYMBOL(tcp_recv_skb); /* * This routine provides an alternative to tcp_recvmsg() for routines @@ -1721,6 +1745,89 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, } EXPORT_SYMBOL(tcp_read_sock); +int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor) +{ + struct tcp_sock *tp = tcp_sk(sk); + u32 seq = tp->copied_seq; + struct sk_buff *skb; + int copied = 0; + u32 offset; + + if (sk->sk_state == TCP_LISTEN) + return -ENOTCONN; + + while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) { + int used; + + __skb_unlink(skb, &sk->sk_receive_queue); + used = recv_actor(sk, skb); + if (used <= 0) { + if (!copied) + copied = used; + break; + } + seq += used; + copied += used; + + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) { + consume_skb(skb); + ++seq; + break; + } + consume_skb(skb); + break; + } + WRITE_ONCE(tp->copied_seq, seq); + + tcp_rcv_space_adjust(sk); + + /* Clean up data we have read: This will do ACK frames. */ + if (copied > 0) + tcp_cleanup_rbuf(sk, copied); + + return copied; +} +EXPORT_SYMBOL(tcp_read_skb); + +void tcp_read_done(struct sock *sk, size_t len) +{ + struct tcp_sock *tp = tcp_sk(sk); + u32 seq = tp->copied_seq; + struct sk_buff *skb; + size_t left; + u32 offset; + + if (sk->sk_state == TCP_LISTEN) + return; + + left = len; + while (left && (skb = tcp_recv_skb(sk, seq, &offset)) != NULL) { + int used; + + used = min_t(size_t, skb->len - offset, left); + seq += used; + left -= used; + + if (skb->len > offset + used) + break; + + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) { + tcp_eat_recv_skb(sk, skb); + ++seq; + break; + } + tcp_eat_recv_skb(sk, skb); + } + WRITE_ONCE(tp->copied_seq, seq); + + tcp_rcv_space_adjust(sk); + + /* Clean up data we have read: This will do ACK frames. */ + if (left != len) + tcp_cleanup_rbuf(sk, len - left); +} +EXPORT_SYMBOL(tcp_read_done); + int tcp_peek_len(struct socket *sock) { return tcp_inq(sock->sk); @@ -2775,8 +2882,6 @@ void __tcp_close(struct sock *sk, long timeout) __kfree_skb(skb); } - sk_mem_reclaim(sk); - /* If socket has been already reset (e.g. in tcp_reset()) - kill it. */ if (sk->sk_state == TCP_CLOSE) goto adjudge_to_death; @@ -2884,7 +2989,6 @@ adjudge_to_death: } } if (sk->sk_state != TCP_CLOSE) { - sk_mem_reclaim(sk); if (tcp_check_oom(sk, 0)) { tcp_set_state(sk, TCP_CLOSE); tcp_send_active_reset(sk, GFP_ATOMIC); @@ -2962,7 +3066,6 @@ void tcp_write_queue_purge(struct sock *sk) } tcp_rtx_queue_purge(sk); INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue); - sk_mem_reclaim(sk); tcp_clear_all_retrans_hints(tcp_sk(sk)); tcp_sk(sk)->packets_out = 0; inet_csk(sk)->icsk_backoff = 0; @@ -4534,16 +4637,24 @@ EXPORT_SYMBOL_GPL(tcp_done); int tcp_abort(struct sock *sk, int err) { - if (!sk_fullsock(sk)) { - if (sk->sk_state == TCP_NEW_SYN_RECV) { - struct request_sock *req = inet_reqsk(sk); + int state = inet_sk_state_load(sk); - local_bh_disable(); - inet_csk_reqsk_queue_drop(req->rsk_listener, req); - local_bh_enable(); - return 0; - } - return -EOPNOTSUPP; + if (state == TCP_NEW_SYN_RECV) { + struct request_sock *req = inet_reqsk(sk); + + local_bh_disable(); + inet_csk_reqsk_queue_drop(req->rsk_listener, req); + local_bh_enable(); + return 0; + } + if (state == TCP_TIME_WAIT) { + struct inet_timewait_sock *tw = inet_twsk(sk); + + refcount_inc(&tw->tw_refcnt); + local_bh_disable(); + inet_twsk_deschedule_put(tw); + local_bh_enable(); + return 0; } /* Don't race with userspace socket closes such as tcp_close. */ @@ -4675,11 +4786,11 @@ void __init tcp_init(void) max_wshare = min(4UL*1024*1024, limit); max_rshare = min(6UL*1024*1024, limit); - init_net.ipv4.sysctl_tcp_wmem[0] = SK_MEM_QUANTUM; + init_net.ipv4.sysctl_tcp_wmem[0] = PAGE_SIZE; init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024; init_net.ipv4.sysctl_tcp_wmem[2] = max(64*1024, max_wshare); - init_net.ipv4.sysctl_tcp_rmem[0] = SK_MEM_QUANTUM; + init_net.ipv4.sysctl_tcp_rmem[0] = PAGE_SIZE; init_net.ipv4.sysctl_tcp_rmem[1] = 131072; init_net.ipv4.sysctl_tcp_rmem[2] = max(131072, max_rshare); diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 075e744bfb48..54eec33c6e1c 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -1154,24 +1154,24 @@ static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = { .set_state = bbr_set_state, }; -BTF_SET_START(tcp_bbr_check_kfunc_ids) +BTF_SET8_START(tcp_bbr_check_kfunc_ids) #ifdef CONFIG_X86 #ifdef CONFIG_DYNAMIC_FTRACE -BTF_ID(func, bbr_init) -BTF_ID(func, bbr_main) -BTF_ID(func, bbr_sndbuf_expand) -BTF_ID(func, bbr_undo_cwnd) -BTF_ID(func, bbr_cwnd_event) -BTF_ID(func, bbr_ssthresh) -BTF_ID(func, bbr_min_tso_segs) -BTF_ID(func, bbr_set_state) +BTF_ID_FLAGS(func, bbr_init) +BTF_ID_FLAGS(func, bbr_main) +BTF_ID_FLAGS(func, bbr_sndbuf_expand) +BTF_ID_FLAGS(func, bbr_undo_cwnd) +BTF_ID_FLAGS(func, bbr_cwnd_event) +BTF_ID_FLAGS(func, bbr_ssthresh) +BTF_ID_FLAGS(func, bbr_min_tso_segs) +BTF_ID_FLAGS(func, bbr_set_state) #endif #endif -BTF_SET_END(tcp_bbr_check_kfunc_ids) +BTF_SET8_END(tcp_bbr_check_kfunc_ids) static const struct btf_kfunc_id_set tcp_bbr_kfunc_set = { - .owner = THIS_MODULE, - .check_set = &tcp_bbr_check_kfunc_ids, + .owner = THIS_MODULE, + .set = &tcp_bbr_check_kfunc_ids, }; static int __init bbr_register(void) diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 0d3f68bb51c0..a1626afe87a1 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -540,6 +540,7 @@ static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS], struct proto *base) { prot[TCP_BPF_BASE] = *base; + prot[TCP_BPF_BASE].destroy = sock_map_destroy; prot[TCP_BPF_BASE].close = sock_map_close; prot[TCP_BPF_BASE].recvmsg = tcp_bpf_recvmsg; prot[TCP_BPF_BASE].sock_is_readable = sk_msg_is_readable; diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 68178e7280ce..768c10c1f649 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -485,22 +485,22 @@ static struct tcp_congestion_ops cubictcp __read_mostly = { .name = "cubic", }; -BTF_SET_START(tcp_cubic_check_kfunc_ids) +BTF_SET8_START(tcp_cubic_check_kfunc_ids) #ifdef CONFIG_X86 #ifdef CONFIG_DYNAMIC_FTRACE -BTF_ID(func, cubictcp_init) -BTF_ID(func, cubictcp_recalc_ssthresh) -BTF_ID(func, cubictcp_cong_avoid) -BTF_ID(func, cubictcp_state) -BTF_ID(func, cubictcp_cwnd_event) -BTF_ID(func, cubictcp_acked) +BTF_ID_FLAGS(func, cubictcp_init) +BTF_ID_FLAGS(func, cubictcp_recalc_ssthresh) +BTF_ID_FLAGS(func, cubictcp_cong_avoid) +BTF_ID_FLAGS(func, cubictcp_state) +BTF_ID_FLAGS(func, cubictcp_cwnd_event) +BTF_ID_FLAGS(func, cubictcp_acked) #endif #endif -BTF_SET_END(tcp_cubic_check_kfunc_ids) +BTF_SET8_END(tcp_cubic_check_kfunc_ids) static const struct btf_kfunc_id_set tcp_cubic_kfunc_set = { - .owner = THIS_MODULE, - .check_set = &tcp_cubic_check_kfunc_ids, + .owner = THIS_MODULE, + .set = &tcp_cubic_check_kfunc_ids, }; static int __init cubictcp_register(void) diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c index ab034a4e9324..2a6c0dd665a4 100644 --- a/net/ipv4/tcp_dctcp.c +++ b/net/ipv4/tcp_dctcp.c @@ -239,22 +239,22 @@ static struct tcp_congestion_ops dctcp_reno __read_mostly = { .name = "dctcp-reno", }; -BTF_SET_START(tcp_dctcp_check_kfunc_ids) +BTF_SET8_START(tcp_dctcp_check_kfunc_ids) #ifdef CONFIG_X86 #ifdef CONFIG_DYNAMIC_FTRACE -BTF_ID(func, dctcp_init) -BTF_ID(func, dctcp_update_alpha) -BTF_ID(func, dctcp_cwnd_event) -BTF_ID(func, dctcp_ssthresh) -BTF_ID(func, dctcp_cwnd_undo) -BTF_ID(func, dctcp_state) +BTF_ID_FLAGS(func, dctcp_init) +BTF_ID_FLAGS(func, dctcp_update_alpha) +BTF_ID_FLAGS(func, dctcp_cwnd_event) +BTF_ID_FLAGS(func, dctcp_ssthresh) +BTF_ID_FLAGS(func, dctcp_cwnd_undo) +BTF_ID_FLAGS(func, dctcp_state) #endif #endif -BTF_SET_END(tcp_dctcp_check_kfunc_ids) +BTF_SET8_END(tcp_dctcp_check_kfunc_ids) static const struct btf_kfunc_id_set tcp_dctcp_kfunc_set = { - .owner = THIS_MODULE, - .check_set = &tcp_dctcp_check_kfunc_ids, + .owner = THIS_MODULE, + .set = &tcp_dctcp_check_kfunc_ids, }; static int __init dctcp_register(void) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index b1637990d570..ab5f0ea166f1 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -806,7 +806,6 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb) * restart window, so that we send ACKs quickly. */ tcp_incr_quickack(sk, TCP_MAX_QUICKACKS); - sk_mem_reclaim(sk); } } icsk->icsk_ack.lrcvtime = now; @@ -3974,7 +3973,7 @@ static bool smc_parse_options(const struct tcphdr *th, /* Try to parse the MSS option from the TCP header. Return 0 on failure, clamped * value on success. */ -static u16 tcp_parse_mss_option(const struct tcphdr *th, u16 user_mss) +u16 tcp_parse_mss_option(const struct tcphdr *th, u16 user_mss) { const unsigned char *ptr = (const unsigned char *)(th + 1); int length = (th->doff * 4) - sizeof(struct tcphdr); @@ -4013,6 +4012,7 @@ static u16 tcp_parse_mss_option(const struct tcphdr *th, u16 user_mss) } return mss; } +EXPORT_SYMBOL_GPL(tcp_parse_mss_option); /* Look for tcp options. Normally only called on SYN and SYNACK packets. * But, this can also be called on packets in the established flow when @@ -4397,7 +4397,6 @@ void tcp_fin(struct sock *sk) skb_rbtree_purge(&tp->out_of_order_queue); if (tcp_is_sack(tp)) tcp_sack_reset(&tp->rx_opt); - sk_mem_reclaim(sk); if (!sock_flag(sk, SOCK_DEAD)) { sk->sk_state_change(sk); @@ -5294,7 +5293,7 @@ new_range: before(TCP_SKB_CB(skb)->end_seq, start)) { /* Do not attempt collapsing tiny skbs */ if (range_truesize != head->truesize || - end - start >= SKB_WITH_OVERHEAD(SK_MEM_QUANTUM)) { + end - start >= SKB_WITH_OVERHEAD(PAGE_SIZE)) { tcp_collapse(sk, NULL, &tp->out_of_order_queue, head, skb, start, end); } else { @@ -5343,7 +5342,6 @@ static bool tcp_prune_ofo_queue(struct sock *sk) tcp_drop_reason(sk, rb_to_skb(node), SKB_DROP_REASON_TCP_OFO_QUEUE_PRUNE); if (!prev || goal <= 0) { - sk_mem_reclaim(sk); if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && !tcp_under_memory_pressure(sk)) break; @@ -5390,7 +5388,6 @@ static int tcp_prune_queue(struct sock *sk) skb_peek(&sk->sk_receive_queue), NULL, tp->copied_seq, tp->rcv_nxt); - sk_mem_reclaim(sk); if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) return 0; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 586c102ce152..0c83780dc9bf 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -819,6 +819,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? inet_twsk(sk)->tw_priority : sk->sk_priority; transmit_time = tcp_transmit_time(sk); + xfrm_sk_clone_policy(ctl_sk, sk); } ip_send_unicast_reply(ctl_sk, skb, &TCP_SKB_CB(skb)->header.h4.opt, @@ -827,6 +828,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) transmit_time); ctl_sk->sk_mark = 0; + xfrm_sk_free_policy(ctl_sk); sock_net_set(ctl_sk, &init_net); __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); __TCP_INC_STATS(net, TCP_MIB_OUTRSTS); @@ -3049,7 +3051,10 @@ struct proto tcp_prot = { .stream_memory_free = tcp_stream_memory_free, .sockets_allocated = &tcp_sockets_allocated, .orphan_count = &tcp_orphan_count, + .memory_allocated = &tcp_memory_allocated, + .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc, + .memory_pressure = &tcp_memory_pressure, .sysctl_mem = sysctl_tcp_mem, .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 4c376b6d8764..78b654ff421b 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3142,7 +3142,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) struct tcp_sock *tp = tcp_sk(sk); unsigned int cur_mss; int diff, len, err; - + int avail_wnd; /* Inconclusive MTU probe */ if (icsk->icsk_mtup.probe_size) @@ -3164,17 +3164,25 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) return -EHOSTUNREACH; /* Routing failure or similar. */ cur_mss = tcp_current_mss(sk); + avail_wnd = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; /* If receiver has shrunk his window, and skb is out of * new window, do not retransmit it. The exception is the * case, when window is shrunk to zero. In this case - * our retransmit serves as a zero window probe. + * our retransmit of one segment serves as a zero window probe. */ - if (!before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp)) && - TCP_SKB_CB(skb)->seq != tp->snd_una) - return -EAGAIN; + if (avail_wnd <= 0) { + if (TCP_SKB_CB(skb)->seq != tp->snd_una) + return -EAGAIN; + avail_wnd = cur_mss; + } len = cur_mss * segs; + if (len > avail_wnd) { + len = rounddown(avail_wnd, cur_mss); + if (!len) + len = avail_wnd; + } if (skb->len > len) { if (tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, len, cur_mss, GFP_ATOMIC)) @@ -3188,8 +3196,9 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) diff -= tcp_skb_pcount(skb); if (diff) tcp_adjust_pcount(sk, skb, diff); - if (skb->len < cur_mss) - tcp_retrans_try_collapse(sk, skb, cur_mss); + avail_wnd = min_t(int, avail_wnd, cur_mss); + if (skb->len < avail_wnd) + tcp_retrans_try_collapse(sk, skb, avail_wnd); } /* RFC3168, section 6.1.1.1. ECN fallback */ @@ -3360,12 +3369,13 @@ void tcp_xmit_retransmit_queue(struct sock *sk) */ void sk_forced_mem_schedule(struct sock *sk, int size) { - int amt; + int delta, amt; - if (size <= sk->sk_forward_alloc) + delta = size - sk->sk_forward_alloc; + if (delta <= 0) return; - amt = sk_mem_pages(size); - sk->sk_forward_alloc += amt * SK_MEM_QUANTUM; + amt = sk_mem_pages(delta); + sk->sk_forward_alloc += amt << PAGE_SHIFT; sk_memory_allocated_add(sk, amt); if (mem_cgroup_sockets_enabled && sk->sk_memcg) diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 50bba370486e..b4dfb82d6ecb 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -291,15 +291,13 @@ void tcp_delack_timer_handler(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); - sk_mem_reclaim_partial(sk); - if (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) || !(icsk->icsk_ack.pending & ICSK_ACK_TIMER)) - goto out; + return; if (time_after(icsk->icsk_ack.timeout, jiffies)) { sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout); - goto out; + return; } icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER; @@ -318,10 +316,6 @@ void tcp_delack_timer_handler(struct sock *sk) tcp_send_ack(sk); __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKS); } - -out: - if (tcp_under_memory_pressure(sk)) - sk_mem_reclaim(sk); } @@ -604,11 +598,11 @@ void tcp_write_timer_handler(struct sock *sk) if (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) || !icsk->icsk_pending) - goto out; + return; if (time_after(icsk->icsk_timeout, jiffies)) { sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout); - goto out; + return; } tcp_mstamp_refresh(tcp_sk(sk)); @@ -630,9 +624,6 @@ void tcp_write_timer_handler(struct sock *sk) tcp_probe_timer(sk); break; } - -out: - sk_mem_reclaim(sk); } static void tcp_write_timer(struct timer_list *t) @@ -747,8 +738,6 @@ static void tcp_keepalive_timer (struct timer_list *t) elapsed = keepalive_time_when(tp) - elapsed; } - sk_mem_reclaim(sk); - resched: inet_csk_reset_keepalive_timer (sk, elapsed); goto out; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index aa9f2ec3dc46..34eda973bbf1 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -125,6 +125,8 @@ EXPORT_SYMBOL(sysctl_udp_mem); atomic_long_t udp_memory_allocated ____cacheline_aligned_in_smp; EXPORT_SYMBOL(udp_memory_allocated); +DEFINE_PER_CPU(int, udp_memory_per_cpu_fw_alloc); +EXPORT_PER_CPU_SYMBOL_GPL(udp_memory_per_cpu_fw_alloc); #define MAX_UDP_PORTS 65536 #define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN) @@ -1461,11 +1463,11 @@ static void udp_rmem_release(struct sock *sk, int size, int partial, sk->sk_forward_alloc += size; - amt = (sk->sk_forward_alloc - partial) & ~(SK_MEM_QUANTUM - 1); + amt = (sk->sk_forward_alloc - partial) & ~(PAGE_SIZE - 1); sk->sk_forward_alloc -= amt; if (amt) - __sk_mem_reduce_allocated(sk, amt >> SK_MEM_QUANTUM_SHIFT); + __sk_mem_reduce_allocated(sk, amt >> PAGE_SHIFT); atomic_sub(size, &sk->sk_rmem_alloc); @@ -1558,7 +1560,7 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) spin_lock(&list->lock); if (size >= sk->sk_forward_alloc) { amt = sk_mem_pages(size); - delta = amt << SK_MEM_QUANTUM_SHIFT; + delta = amt << PAGE_SHIFT; if (!__sk_mem_raise_allocated(sk, delta, amt, SK_MEM_RECV)) { err = -ENOBUFS; spin_unlock(&list->lock); @@ -1795,8 +1797,7 @@ busy_check: } EXPORT_SYMBOL(__skb_recv_udp); -int udp_read_sock(struct sock *sk, read_descriptor_t *desc, - sk_read_actor_t recv_actor) +int udp_read_skb(struct sock *sk, skb_read_actor_t recv_actor) { int copied = 0; @@ -1818,7 +1819,8 @@ int udp_read_sock(struct sock *sk, read_descriptor_t *desc, continue; } - used = recv_actor(desc, skb, 0, skb->len); + WARN_ON(!skb_set_owner_sk_safe(skb, sk)); + used = recv_actor(sk, skb); if (used <= 0) { if (!copied) copied = used; @@ -1829,13 +1831,12 @@ int udp_read_sock(struct sock *sk, read_descriptor_t *desc, } kfree_skb(skb); - if (!desc->count) - break; + break; } return copied; } -EXPORT_SYMBOL(udp_read_sock); +EXPORT_SYMBOL(udp_read_skb); /* * This should be easy, if there is something there we @@ -2946,6 +2947,8 @@ struct proto udp_prot = { .psock_update_sk_prot = udp_bpf_update_proto, #endif .memory_allocated = &udp_memory_allocated, + .per_cpu_fw_alloc = &udp_memory_per_cpu_fw_alloc, + .sysctl_mem = sysctl_udp_mem, .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min), .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_udp_rmem_min), @@ -3261,19 +3264,15 @@ u32 udp_flow_hashrnd(void) } EXPORT_SYMBOL(udp_flow_hashrnd); -static void __udp_sysctl_init(struct net *net) +static int __net_init udp_sysctl_init(struct net *net) { - net->ipv4.sysctl_udp_rmem_min = SK_MEM_QUANTUM; - net->ipv4.sysctl_udp_wmem_min = SK_MEM_QUANTUM; + net->ipv4.sysctl_udp_rmem_min = PAGE_SIZE; + net->ipv4.sysctl_udp_wmem_min = PAGE_SIZE; #ifdef CONFIG_NET_L3_MASTER_DEV net->ipv4.sysctl_udp_l3mdev_accept = 0; #endif -} -static int __net_init udp_sysctl_init(struct net *net) -{ - __udp_sysctl_init(net); return 0; } @@ -3349,8 +3348,6 @@ void __init udp_init(void) sysctl_udp_mem[1] = limit; sysctl_udp_mem[2] = sysctl_udp_mem[0] * 2; - __udp_sysctl_init(&init_net); - /* 16 spinlocks per cpu */ udp_busylocks_log = ilog2(nr_cpu_ids) + 4; udp_busylocks = kmalloc(sizeof(spinlock_t) << udp_busylocks_log, diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c index cd1cd68adeec..6e08a76ae1e7 100644 --- a/net/ipv4/udplite.c +++ b/net/ipv4/udplite.c @@ -51,7 +51,10 @@ struct proto udplite_prot = { .unhash = udp_lib_unhash, .rehash = udp_v4_rehash, .get_port = udp_v4_get_port, + .memory_allocated = &udp_memory_allocated, + .per_cpu_fw_alloc = &udp_memory_per_cpu_fw_alloc, + .sysctl_mem = sysctl_udp_mem, .obj_size = sizeof(struct udp_sock), .h.udp_table = &udplite_table, diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 6fde0b184791..3d0dfa6cf9f9 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -75,7 +75,7 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, xdst->u.rt.rt_iif = fl4->flowi4_iif; xdst->u.dst.dev = dev; - dev_hold_track(dev, &xdst->u.dst.dev_tracker, GFP_ATOMIC); + netdev_hold(dev, &xdst->u.dst.dev_tracker, GFP_ATOMIC); /* Sheit... I remember I did this right. Apparently, * it was magically lost, so this code needs audit */ |