diff options
Diffstat (limited to 'net/ipv4')
-rw-r--r-- | net/ipv4/af_inet.c | 11 | ||||
-rw-r--r-- | net/ipv4/fib_frontend.c | 20 | ||||
-rw-r--r-- | net/ipv4/fib_trie.c | 90 | ||||
-rw-r--r-- | net/ipv4/fou.c | 4 | ||||
-rw-r--r-- | net/ipv4/gre_offload.c | 2 | ||||
-rw-r--r-- | net/ipv4/icmp.c | 4 | ||||
-rw-r--r-- | net/ipv4/igmp.c | 50 | ||||
-rw-r--r-- | net/ipv4/inet_hashtables.c | 8 | ||||
-rw-r--r-- | net/ipv4/ip_forward.c | 2 | ||||
-rw-r--r-- | net/ipv4/ip_output.c | 28 | ||||
-rw-r--r-- | net/ipv4/ip_sockglue.c | 11 | ||||
-rw-r--r-- | net/ipv4/ip_tunnel_core.c | 11 | ||||
-rw-r--r-- | net/ipv4/ipmr.c | 2 | ||||
-rw-r--r-- | net/ipv4/netfilter/nft_dup_ipv4.c | 6 | ||||
-rw-r--r-- | net/ipv4/ping.c | 2 | ||||
-rw-r--r-- | net/ipv4/raw.c | 2 | ||||
-rw-r--r-- | net/ipv4/route.c | 4 | ||||
-rw-r--r-- | net/ipv4/sysctl_net_ipv4.c | 8 | ||||
-rw-r--r-- | net/ipv4/tcp.c | 4 | ||||
-rw-r--r-- | net/ipv4/tcp_cong.c | 4 | ||||
-rw-r--r-- | net/ipv4/tcp_dctcp.c | 13 | ||||
-rw-r--r-- | net/ipv4/tcp_ipv4.c | 22 | ||||
-rw-r--r-- | net/ipv4/udp.c | 23 | ||||
-rw-r--r-- | net/ipv4/udp_impl.h | 2 | ||||
-rw-r--r-- | net/ipv4/udp_offload.c | 2 | ||||
-rw-r--r-- | net/ipv4/udplite.c | 2 |
26 files changed, 233 insertions, 104 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 1effc986739e..5ddf5cda07f4 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -533,9 +533,9 @@ EXPORT_SYMBOL(inet_dgram_connect); static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias) { - DEFINE_WAIT(wait); + DEFINE_WAIT_FUNC(wait, woken_wake_function); - prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + add_wait_queue(sk_sleep(sk), &wait); sk->sk_write_pending += writebias; /* Basic assumption: if someone sets sk->sk_err, he _must_ @@ -545,13 +545,12 @@ static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias) */ while ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { release_sock(sk); - timeo = schedule_timeout(timeo); + timeo = wait_woken(&wait, TASK_INTERRUPTIBLE, timeo); lock_sock(sk); if (signal_pending(current) || !timeo) break; - prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); } - finish_wait(sk_sleep(sk), &wait); + remove_wait_queue(sk_sleep(sk), &wait); sk->sk_write_pending -= writebias; return timeo; } @@ -1391,7 +1390,7 @@ struct sk_buff **inet_gro_receive(struct sk_buff **head, struct sk_buff *skb) skb_gro_pull(skb, sizeof(*iph)); skb_set_transport_header(skb, skb_gro_offset(skb)); - pp = ops->callbacks.gro_receive(head, skb); + pp = call_gro_receive(ops->callbacks.gro_receive, head, skb); out_unlock: rcu_read_unlock(); diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index c3b80478226e..161fc0f0d752 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -151,7 +151,7 @@ static void fib_replace_table(struct net *net, struct fib_table *old, int fib_unmerge(struct net *net) { - struct fib_table *old, *new; + struct fib_table *old, *new, *main_table; /* attempt to fetch local table if it has been allocated */ old = fib_get_table(net, RT_TABLE_LOCAL); @@ -162,11 +162,21 @@ int fib_unmerge(struct net *net) if (!new) return -ENOMEM; + /* table is already unmerged */ + if (new == old) + return 0; + /* replace merged table with clean table */ - if (new != old) { - fib_replace_table(net, old, new); - fib_free_table(old); - } + fib_replace_table(net, old, new); + fib_free_table(old); + + /* attempt to fetch main table if it has been allocated */ + main_table = fib_get_table(net, RT_TABLE_MAIN); + if (!main_table) + return 0; + + /* flush local entries from main table */ + fib_table_flush_external(main_table); return 0; } diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 31cef3602585..026f309c51e9 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1743,8 +1743,10 @@ struct fib_table *fib_trie_unmerge(struct fib_table *oldtb) local_l = fib_find_node(lt, &local_tp, l->key); if (fib_insert_alias(lt, local_tp, local_l, new_fa, - NULL, l->key)) + NULL, l->key)) { + kmem_cache_free(fn_alias_kmem, new_fa); goto out; + } } /* stop loop if key wrapped back to 0 */ @@ -1760,6 +1762,71 @@ out: return NULL; } +/* Caller must hold RTNL */ +void fib_table_flush_external(struct fib_table *tb) +{ + struct trie *t = (struct trie *)tb->tb_data; + struct key_vector *pn = t->kv; + unsigned long cindex = 1; + struct hlist_node *tmp; + struct fib_alias *fa; + + /* walk trie in reverse order */ + for (;;) { + unsigned char slen = 0; + struct key_vector *n; + + if (!(cindex--)) { + t_key pkey = pn->key; + + /* cannot resize the trie vector */ + if (IS_TRIE(pn)) + break; + + /* resize completed node */ + pn = resize(t, pn); + cindex = get_index(pkey, pn); + + continue; + } + + /* grab the next available node */ + n = get_child(pn, cindex); + if (!n) + continue; + + if (IS_TNODE(n)) { + /* record pn and cindex for leaf walking */ + pn = n; + cindex = 1ul << n->bits; + + continue; + } + + hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) { + /* if alias was cloned to local then we just + * need to remove the local copy from main + */ + if (tb->tb_id != fa->tb_id) { + hlist_del_rcu(&fa->fa_list); + alias_free_mem_rcu(fa); + continue; + } + + /* record local slen */ + slen = fa->fa_slen; + } + + /* update leaf slen */ + n->slen = slen; + + if (hlist_empty(&n->leaf)) { + put_child_root(pn, n->key, NULL); + node_free(n); + } + } +} + /* Caller must hold RTNL. */ int fib_table_flush(struct net *net, struct fib_table *tb) { @@ -2413,22 +2480,19 @@ static struct key_vector *fib_route_get_idx(struct fib_route_iter *iter, struct key_vector *l, **tp = &iter->tnode; t_key key; - /* use cache location of next-to-find key */ + /* use cached location of previously found key */ if (iter->pos > 0 && pos >= iter->pos) { - pos -= iter->pos; key = iter->key; } else { - iter->pos = 0; + iter->pos = 1; key = 0; } - while ((l = leaf_walk_rcu(tp, key)) != NULL) { + pos -= iter->pos; + + while ((l = leaf_walk_rcu(tp, key)) && (pos-- > 0)) { key = l->key + 1; iter->pos++; - - if (--pos <= 0) - break; - l = NULL; /* handle unlikely case of a key wrap */ @@ -2437,7 +2501,7 @@ static struct key_vector *fib_route_get_idx(struct fib_route_iter *iter, } if (l) - iter->key = key; /* remember it */ + iter->key = l->key; /* remember it */ else iter->pos = 0; /* forget it */ @@ -2465,7 +2529,7 @@ static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos) return fib_route_get_idx(iter, *pos); iter->pos = 0; - iter->key = 0; + iter->key = KEY_MAX; return SEQ_START_TOKEN; } @@ -2474,7 +2538,7 @@ static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct fib_route_iter *iter = seq->private; struct key_vector *l = NULL; - t_key key = iter->key; + t_key key = iter->key + 1; ++*pos; @@ -2483,7 +2547,7 @@ static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos) l = leaf_walk_rcu(&iter->tnode, key); if (l) { - iter->key = l->key + 1; + iter->key = l->key; iter->pos++; } else { iter->pos = 0; diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index cf50f7e2b012..030d1531e897 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -249,7 +249,7 @@ static struct sk_buff **fou_gro_receive(struct sock *sk, if (!ops || !ops->callbacks.gro_receive) goto out_unlock; - pp = ops->callbacks.gro_receive(head, skb); + pp = call_gro_receive(ops->callbacks.gro_receive, head, skb); out_unlock: rcu_read_unlock(); @@ -441,7 +441,7 @@ next_proto: if (WARN_ON_ONCE(!ops || !ops->callbacks.gro_receive)) goto out_unlock; - pp = ops->callbacks.gro_receive(head, skb); + pp = call_gro_receive(ops->callbacks.gro_receive, head, skb); flush = 0; out_unlock: diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index 96e0efecefa6..d5cac99170b1 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -229,7 +229,7 @@ static struct sk_buff **gre_gro_receive(struct sk_buff **head, /* Adjusted NAPI_GRO_CB(skb)->csum after skb_gro_pull()*/ skb_gro_postpull_rcsum(skb, greh, grehlen); - pp = ptype->callbacks.gro_receive(head, skb); + pp = call_gro_receive(ptype->callbacks.gro_receive, head, skb); flush = 0; out_unlock: diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 38abe70e595f..48734ee6293f 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -477,7 +477,7 @@ static struct rtable *icmp_route_lookup(struct net *net, fl4->flowi4_proto = IPPROTO_ICMP; fl4->fl4_icmp_type = type; fl4->fl4_icmp_code = code; - fl4->flowi4_oif = l3mdev_master_ifindex(skb_in->dev); + fl4->flowi4_oif = l3mdev_master_ifindex(skb_dst(skb_in)->dev); security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4)); rt = __ip_route_output_key_hash(net, fl4, @@ -502,7 +502,7 @@ static struct rtable *icmp_route_lookup(struct net *net, if (err) goto relookup_failed; - if (inet_addr_type_dev_table(net, skb_in->dev, + if (inet_addr_type_dev_table(net, skb_dst(skb_in)->dev, fl4_dec.saddr) == RTN_LOCAL) { rt2 = __ip_route_output_key(net, &fl4_dec); if (IS_ERR(rt2)) diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 606cc3e85d2b..15db786d50ed 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -162,7 +162,7 @@ static int unsolicited_report_interval(struct in_device *in_dev) } static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im); -static void igmpv3_del_delrec(struct in_device *in_dev, __be32 multiaddr); +static void igmpv3_del_delrec(struct in_device *in_dev, struct ip_mc_list *im); static void igmpv3_clear_delrec(struct in_device *in_dev); static int sf_setstate(struct ip_mc_list *pmc); static void sf_markstate(struct ip_mc_list *pmc); @@ -1130,10 +1130,15 @@ static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im) spin_unlock_bh(&in_dev->mc_tomb_lock); } -static void igmpv3_del_delrec(struct in_device *in_dev, __be32 multiaddr) +/* + * restore ip_mc_list deleted records + */ +static void igmpv3_del_delrec(struct in_device *in_dev, struct ip_mc_list *im) { struct ip_mc_list *pmc, *pmc_prev; - struct ip_sf_list *psf, *psf_next; + struct ip_sf_list *psf; + struct net *net = dev_net(in_dev->dev); + __be32 multiaddr = im->multiaddr; spin_lock_bh(&in_dev->mc_tomb_lock); pmc_prev = NULL; @@ -1149,16 +1154,26 @@ static void igmpv3_del_delrec(struct in_device *in_dev, __be32 multiaddr) in_dev->mc_tomb = pmc->next; } spin_unlock_bh(&in_dev->mc_tomb_lock); + + spin_lock_bh(&im->lock); if (pmc) { - for (psf = pmc->tomb; psf; psf = psf_next) { - psf_next = psf->sf_next; - kfree(psf); + im->interface = pmc->interface; + im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; + im->sfmode = pmc->sfmode; + if (pmc->sfmode == MCAST_INCLUDE) { + im->tomb = pmc->tomb; + im->sources = pmc->sources; + for (psf = im->sources; psf; psf = psf->sf_next) + psf->sf_crcount = im->crcount; } in_dev_put(pmc->interface); - kfree(pmc); } + spin_unlock_bh(&im->lock); } +/* + * flush ip_mc_list deleted records + */ static void igmpv3_clear_delrec(struct in_device *in_dev) { struct ip_mc_list *pmc, *nextpmc; @@ -1366,7 +1381,7 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr) ip_mc_hash_add(in_dev, im); #ifdef CONFIG_IP_MULTICAST - igmpv3_del_delrec(in_dev, im->multiaddr); + igmpv3_del_delrec(in_dev, im); #endif igmp_group_added(im); if (!in_dev->dead) @@ -1626,8 +1641,12 @@ void ip_mc_remap(struct in_device *in_dev) ASSERT_RTNL(); - for_each_pmc_rtnl(in_dev, pmc) + for_each_pmc_rtnl(in_dev, pmc) { +#ifdef CONFIG_IP_MULTICAST + igmpv3_del_delrec(in_dev, pmc); +#endif igmp_group_added(pmc); + } } /* Device going down */ @@ -1648,7 +1667,6 @@ void ip_mc_down(struct in_device *in_dev) in_dev->mr_gq_running = 0; if (del_timer(&in_dev->mr_gq_timer)) __in_dev_put(in_dev); - igmpv3_clear_delrec(in_dev); #endif ip_mc_dec_group(in_dev, IGMP_ALL_HOSTS); @@ -1688,8 +1706,12 @@ void ip_mc_up(struct in_device *in_dev) #endif ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS); - for_each_pmc_rtnl(in_dev, pmc) + for_each_pmc_rtnl(in_dev, pmc) { +#ifdef CONFIG_IP_MULTICAST + igmpv3_del_delrec(in_dev, pmc); +#endif igmp_group_added(pmc); + } } /* @@ -1704,13 +1726,13 @@ void ip_mc_destroy_dev(struct in_device *in_dev) /* Deactivate timers */ ip_mc_down(in_dev); +#ifdef CONFIG_IP_MULTICAST + igmpv3_clear_delrec(in_dev); +#endif while ((i = rtnl_dereference(in_dev->mc_list)) != NULL) { in_dev->mc_list = i->next_rcu; in_dev->mc_count--; - - /* We've dropped the groups in ip_mc_down already */ - ip_mc_clear_src(i); ip_ma_put(i); } } diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 77c20a489218..ca97835bfec4 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -25,6 +25,7 @@ #include <net/inet_hashtables.h> #include <net/secure_seq.h> #include <net/ip.h> +#include <net/tcp.h> #include <net/sock_reuseport.h> static u32 inet_ehashfn(const struct net *net, const __be32 laddr, @@ -172,7 +173,7 @@ EXPORT_SYMBOL_GPL(__inet_inherit_port); static inline int compute_score(struct sock *sk, struct net *net, const unsigned short hnum, const __be32 daddr, - const int dif) + const int dif, bool exact_dif) { int score = -1; struct inet_sock *inet = inet_sk(sk); @@ -186,7 +187,7 @@ static inline int compute_score(struct sock *sk, struct net *net, return -1; score += 4; } - if (sk->sk_bound_dev_if) { + if (sk->sk_bound_dev_if || exact_dif) { if (sk->sk_bound_dev_if != dif) return -1; score += 4; @@ -215,11 +216,12 @@ struct sock *__inet_lookup_listener(struct net *net, unsigned int hash = inet_lhashfn(net, hnum); struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash]; int score, hiscore = 0, matches = 0, reuseport = 0; + bool exact_dif = inet_exact_dif_match(net, skb); struct sock *sk, *result = NULL; u32 phash = 0; sk_for_each_rcu(sk, &ilb->head) { - score = compute_score(sk, net, hnum, daddr, dif); + score = compute_score(sk, net, hnum, daddr, dif, exact_dif); if (score > hiscore) { reuseport = sk->sk_reuseport; if (reuseport) { diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 8b4ffd216839..9f0a7b96646f 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -117,7 +117,7 @@ int ip_forward(struct sk_buff *skb) if (opt->is_strictroute && rt->rt_uses_gateway) goto sr_failed; - IPCB(skb)->flags |= IPSKB_FORWARDED | IPSKB_FRAG_SEGS; + IPCB(skb)->flags |= IPSKB_FORWARDED; mtu = ip_dst_mtu_maybe_forward(&rt->dst, true); if (ip_exceeds_mtu(skb, mtu)) { IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 05d105832bdb..105908d841a3 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -239,19 +239,23 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk, struct sk_buff *segs; int ret = 0; - /* common case: fragmentation of segments is not allowed, - * or seglen is <= mtu + /* common case: seglen is <= mtu */ - if (((IPCB(skb)->flags & IPSKB_FRAG_SEGS) == 0) || - skb_gso_validate_mtu(skb, mtu)) + if (skb_gso_validate_mtu(skb, mtu)) return ip_finish_output2(net, sk, skb); - /* Slowpath - GSO segment length is exceeding the dst MTU. + /* Slowpath - GSO segment length exceeds the egress MTU. * - * This can happen in two cases: - * 1) TCP GRO packet, DF bit not set - * 2) skb arrived via virtio-net, we thus get TSO/GSO skbs directly - * from host network stack. + * This can happen in several cases: + * - Forwarding of a TCP GRO skb, when DF flag is not set. + * - Forwarding of an skb that arrived on a virtualization interface + * (virtio-net/vhost/tap) with TSO/GSO size set by other network + * stack. + * - Local GSO skb transmitted on an NETIF_F_TSO tunnel stacked over an + * interface with a smaller MTU. + * - Arriving GRO skb (or GSO skb in a virtualized environment) that is + * bridged to a NETIF_F_TSO tunnel stacked over an interface with an + * insufficent MTU. */ features = netif_skb_features(skb); BUILD_BUG_ON(sizeof(*IPCB(skb)) > SKB_SGO_CB_OFFSET); @@ -538,7 +542,6 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, { struct iphdr *iph; int ptr; - struct net_device *dev; struct sk_buff *skb2; unsigned int mtu, hlen, left, len, ll_rs; int offset; @@ -546,8 +549,6 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, struct rtable *rt = skb_rtable(skb); int err = 0; - dev = rt->dst.dev; - /* for offloaded checksums cleanup checksum before fragmentation */ if (skb->ip_summed == CHECKSUM_PARTIAL && (err = skb_checksum_help(skb))) @@ -1582,7 +1583,8 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, } oif = arg->bound_dev_if; - oif = oif ? : skb->skb_iif; + if (!oif && netif_index_is_l3_master(net, skb->skb_iif)) + oif = skb->skb_iif; flowi4_init_output(&fl4, oif, IP4_REPLY_MARK(net, skb->mark), diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index af4919792b6a..b8a2d63d1fb8 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -98,7 +98,7 @@ static void ip_cmsg_recv_retopts(struct msghdr *msg, struct sk_buff *skb) } static void ip_cmsg_recv_checksum(struct msghdr *msg, struct sk_buff *skb, - int offset) + int tlen, int offset) { __wsum csum = skb->csum; @@ -106,8 +106,9 @@ static void ip_cmsg_recv_checksum(struct msghdr *msg, struct sk_buff *skb, return; if (offset != 0) - csum = csum_sub(csum, csum_partial(skb_transport_header(skb), - offset, 0)); + csum = csum_sub(csum, + csum_partial(skb_transport_header(skb) + tlen, + offset, 0)); put_cmsg(msg, SOL_IP, IP_CHECKSUM, sizeof(__wsum), &csum); } @@ -153,7 +154,7 @@ static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb) } void ip_cmsg_recv_offset(struct msghdr *msg, struct sk_buff *skb, - int offset) + int tlen, int offset) { struct inet_sock *inet = inet_sk(skb->sk); unsigned int flags = inet->cmsg_flags; @@ -216,7 +217,7 @@ void ip_cmsg_recv_offset(struct msghdr *msg, struct sk_buff *skb, } if (flags & IP_CMSG_CHECKSUM) - ip_cmsg_recv_checksum(msg, skb, offset); + ip_cmsg_recv_checksum(msg, skb, tlen, offset); } EXPORT_SYMBOL(ip_cmsg_recv_offset); diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 777bc1883870..fed3d29f9eb3 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -63,7 +63,6 @@ void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, int pkt_len = skb->len - skb_inner_network_offset(skb); struct net *net = dev_net(rt->dst.dev); struct net_device *dev = skb->dev; - int skb_iif = skb->skb_iif; struct iphdr *iph; int err; @@ -73,16 +72,6 @@ void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, skb_dst_set(skb, &rt->dst); memset(IPCB(skb), 0, sizeof(*IPCB(skb))); - if (skb_iif && !(df & htons(IP_DF))) { - /* Arrived from an ingress interface, got encapsulated, with - * fragmentation of encapulating frames allowed. - * If skb is gso, the resulting encapsulated network segments - * may exceed dst mtu. - * Allow IP Fragmentation of segments. - */ - IPCB(skb)->flags |= IPSKB_FRAG_SEGS; - } - /* Push down and install the IP header. */ skb_push(skb, sizeof(struct iphdr)); skb_reset_network_header(skb); diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 5f006e13de56..27089f5ebbb1 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1749,7 +1749,7 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, vif->dev->stats.tx_bytes += skb->len; } - IPCB(skb)->flags |= IPSKB_FORWARDED | IPSKB_FRAG_SEGS; + IPCB(skb)->flags |= IPSKB_FORWARDED; /* RFC1584 teaches, that DVMRP/PIM router must deliver packets locally * not only before forwarding, but after forwarding on all output diff --git a/net/ipv4/netfilter/nft_dup_ipv4.c b/net/ipv4/netfilter/nft_dup_ipv4.c index bf855e64fc45..0c01a270bf9f 100644 --- a/net/ipv4/netfilter/nft_dup_ipv4.c +++ b/net/ipv4/netfilter/nft_dup_ipv4.c @@ -28,7 +28,7 @@ static void nft_dup_ipv4_eval(const struct nft_expr *expr, struct in_addr gw = { .s_addr = (__force __be32)regs->data[priv->sreg_addr], }; - int oif = regs->data[priv->sreg_dev]; + int oif = priv->sreg_dev ? regs->data[priv->sreg_dev] : -1; nf_dup_ipv4(pkt->net, pkt->skb, pkt->hook, &gw, oif); } @@ -59,7 +59,9 @@ static int nft_dup_ipv4_dump(struct sk_buff *skb, const struct nft_expr *expr) { struct nft_dup_ipv4 *priv = nft_expr_priv(expr); - if (nft_dump_register(skb, NFTA_DUP_SREG_ADDR, priv->sreg_addr) || + if (nft_dump_register(skb, NFTA_DUP_SREG_ADDR, priv->sreg_addr)) + goto nla_put_failure; + if (priv->sreg_dev && nft_dump_register(skb, NFTA_DUP_SREG_DEV, priv->sreg_dev)) goto nla_put_failure; diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 7cf7d6e380c2..205e2000d395 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -994,7 +994,7 @@ struct proto ping_prot = { .init = ping_init_sock, .close = ping_close, .connect = ip4_datagram_connect, - .disconnect = udp_disconnect, + .disconnect = __udp_disconnect, .setsockopt = ip_setsockopt, .getsockopt = ip_getsockopt, .sendmsg = ping_v4_sendmsg, diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 90a85c955872..ecbe5a7c2d6d 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -918,7 +918,7 @@ struct proto raw_prot = { .close = raw_close, .destroy = raw_destroy, .connect = ip4_datagram_connect, - .disconnect = udp_disconnect, + .disconnect = __udp_disconnect, .ioctl = raw_ioctl, .init = raw_init, .setsockopt = raw_setsockopt, diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 62d4d90c1389..2a57566e6e91 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -753,7 +753,9 @@ static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flow goto reject_redirect; } - n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw); + n = __ipv4_neigh_lookup(rt->dst.dev, new_gw); + if (!n) + n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev); if (!IS_ERR(n)) { if (!(n->nud_state & NUD_VALID)) { neigh_event_send(n, NULL); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 1cb67de106fe..80bc36b25de2 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -96,11 +96,11 @@ static void inet_get_ping_group_range_table(struct ctl_table *table, kgid_t *low container_of(table->data, struct net, ipv4.ping_group_range.range); unsigned int seq; do { - seq = read_seqbegin(&net->ipv4.ip_local_ports.lock); + seq = read_seqbegin(&net->ipv4.ping_group_range.lock); *low = data[0]; *high = data[1]; - } while (read_seqretry(&net->ipv4.ip_local_ports.lock, seq)); + } while (read_seqretry(&net->ipv4.ping_group_range.lock, seq)); } /* Update system visible IP port range */ @@ -109,10 +109,10 @@ static void set_ping_group_range(struct ctl_table *table, kgid_t low, kgid_t hig kgid_t *data = table->data; struct net *net = container_of(table->data, struct net, ipv4.ping_group_range.range); - write_seqlock(&net->ipv4.ip_local_ports.lock); + write_seqlock(&net->ipv4.ping_group_range.lock); data[0] = low; data[1] = high; - write_sequnlock(&net->ipv4.ip_local_ports.lock); + write_sequnlock(&net->ipv4.ping_group_range.lock); } /* Validate changes from /proc interface. */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 3251fe71f39f..814af89c1bd3 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1164,7 +1164,7 @@ restart: err = -EPIPE; if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) - goto out_err; + goto do_error; sg = !!(sk->sk_route_caps & NETIF_F_SG); @@ -1241,7 +1241,7 @@ new_segment: if (!skb_can_coalesce(skb, i, pfrag->page, pfrag->offset)) { - if (i == sysctl_max_skb_frags || !sg) { + if (i >= sysctl_max_skb_frags || !sg) { tcp_mark_push(tp, skb); goto new_segment; } diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 1294af4e0127..f9038d6b109e 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -200,8 +200,10 @@ static void tcp_reinit_congestion_control(struct sock *sk, icsk->icsk_ca_ops = ca; icsk->icsk_ca_setsockopt = 1; - if (sk->sk_state != TCP_CLOSE) + if (sk->sk_state != TCP_CLOSE) { + memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); tcp_init_congestion_control(sk); + } } /* Manage refcounts on socket close. */ diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c index 10d728b6804c..ab37c6775630 100644 --- a/net/ipv4/tcp_dctcp.c +++ b/net/ipv4/tcp_dctcp.c @@ -56,6 +56,7 @@ struct dctcp { u32 next_seq; u32 ce_state; u32 delayed_ack_reserved; + u32 loss_cwnd; }; static unsigned int dctcp_shift_g __read_mostly = 4; /* g = 1/2^4 */ @@ -96,6 +97,7 @@ static void dctcp_init(struct sock *sk) ca->dctcp_alpha = min(dctcp_alpha_on_init, DCTCP_MAX_ALPHA); ca->delayed_ack_reserved = 0; + ca->loss_cwnd = 0; ca->ce_state = 0; dctcp_reset(tp, ca); @@ -111,9 +113,10 @@ static void dctcp_init(struct sock *sk) static u32 dctcp_ssthresh(struct sock *sk) { - const struct dctcp *ca = inet_csk_ca(sk); + struct dctcp *ca = inet_csk_ca(sk); struct tcp_sock *tp = tcp_sk(sk); + ca->loss_cwnd = tp->snd_cwnd; return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->dctcp_alpha) >> 11U), 2U); } @@ -308,12 +311,20 @@ static size_t dctcp_get_info(struct sock *sk, u32 ext, int *attr, return 0; } +static u32 dctcp_cwnd_undo(struct sock *sk) +{ + const struct dctcp *ca = inet_csk_ca(sk); + + return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd); +} + static struct tcp_congestion_ops dctcp __read_mostly = { .init = dctcp_init, .in_ack_event = dctcp_update_alpha, .cwnd_event = dctcp_cwnd_event, .ssthresh = dctcp_ssthresh, .cong_avoid = tcp_reno_cong_avoid, + .undo_cwnd = dctcp_cwnd_undo, .set_state = dctcp_state, .get_info = dctcp_get_info, .flags = TCP_CONG_NEEDS_ECN, diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index bd5e8d10893f..2259114c7242 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -86,7 +86,6 @@ int sysctl_tcp_tw_reuse __read_mostly; int sysctl_tcp_low_latency __read_mostly; -EXPORT_SYMBOL(sysctl_tcp_low_latency); #ifdef CONFIG_TCP_MD5SIG static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, @@ -1565,6 +1564,21 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) } EXPORT_SYMBOL(tcp_add_backlog); +int tcp_filter(struct sock *sk, struct sk_buff *skb) +{ + struct tcphdr *th = (struct tcphdr *)skb->data; + unsigned int eaten = skb->len; + int err; + + err = sk_filter_trim_cap(sk, skb, th->doff * 4); + if (!err) { + eaten -= skb->len; + TCP_SKB_CB(skb)->end_seq -= eaten; + } + return err; +} +EXPORT_SYMBOL(tcp_filter); + /* * From tcp_input.c */ @@ -1677,8 +1691,10 @@ process: nf_reset(skb); - if (sk_filter(sk, skb)) + if (tcp_filter(sk, skb)) goto discard_and_relse; + th = (const struct tcphdr *)skb->data; + iph = ip_hdr(skb); skb->dev = NULL; @@ -1887,7 +1903,6 @@ static void *listening_get_next(struct seq_file *seq, void *cur) struct tcp_iter_state *st = seq->private; struct net *net = seq_file_net(seq); struct inet_listen_hashbucket *ilb; - struct inet_connection_sock *icsk; struct sock *sk = cur; if (!sk) { @@ -1909,7 +1924,6 @@ get_sk: continue; if (sk->sk_family == st->family) return sk; - icsk = inet_csk(sk); } spin_unlock_bh(&ilb->lock); st->offset = 0; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 7d96dc2d3d08..5bab6c3f7a2f 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1322,7 +1322,7 @@ try_again: *addr_len = sizeof(*sin); } if (inet->cmsg_flags) - ip_cmsg_recv_offset(msg, skb, sizeof(struct udphdr) + off); + ip_cmsg_recv_offset(msg, skb, sizeof(struct udphdr), off); err = copied; if (flags & MSG_TRUNC) @@ -1345,7 +1345,7 @@ csum_copy_err: goto try_again; } -int udp_disconnect(struct sock *sk, int flags) +int __udp_disconnect(struct sock *sk, int flags) { struct inet_sock *inet = inet_sk(sk); /* @@ -1367,6 +1367,15 @@ int udp_disconnect(struct sock *sk, int flags) sk_dst_reset(sk); return 0; } +EXPORT_SYMBOL(__udp_disconnect); + +int udp_disconnect(struct sock *sk, int flags) +{ + lock_sock(sk); + __udp_disconnect(sk, flags); + release_sock(sk); + return 0; +} EXPORT_SYMBOL(udp_disconnect); void udp_lib_unhash(struct sock *sk) @@ -1446,7 +1455,7 @@ static void udp_v4_rehash(struct sock *sk) udp_lib_rehash(sk, new_hash); } -static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) +int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { int rc; @@ -1643,10 +1652,10 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, if (use_hash2) { hash2_any = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum) & - udp_table.mask; - hash2 = udp4_portaddr_hash(net, daddr, hnum) & udp_table.mask; + udptable->mask; + hash2 = udp4_portaddr_hash(net, daddr, hnum) & udptable->mask; start_lookup: - hslot = &udp_table.hash2[hash2]; + hslot = &udptable->hash2[hash2]; offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node); } @@ -2193,7 +2202,7 @@ int udp_abort(struct sock *sk, int err) sk->sk_err = err; sk->sk_error_report(sk); - udp_disconnect(sk, 0); + __udp_disconnect(sk, 0); release_sock(sk); diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h index 7e0fe4bdd967..feb50a16398d 100644 --- a/net/ipv4/udp_impl.h +++ b/net/ipv4/udp_impl.h @@ -25,7 +25,7 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock, int flags, int *addr_len); int udp_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags); -int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); +int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); void udp_destroy_sock(struct sock *sk); #ifdef CONFIG_PROC_FS diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index f9333c963607..b2be1d9757ef 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -295,7 +295,7 @@ unflush: skb_gro_pull(skb, sizeof(struct udphdr)); /* pull encapsulating udp header */ skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr)); - pp = udp_sk(sk)->gro_receive(sk, head, skb); + pp = call_gro_receive_sk(udp_sk(sk)->gro_receive, sk, head, skb); out_unlock: rcu_read_unlock(); diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c index af817158d830..ff450c2aad9b 100644 --- a/net/ipv4/udplite.c +++ b/net/ipv4/udplite.c @@ -50,7 +50,7 @@ struct proto udplite_prot = { .sendmsg = udp_sendmsg, .recvmsg = udp_recvmsg, .sendpage = udp_sendpage, - .backlog_rcv = udp_queue_rcv_skb, + .backlog_rcv = __udp_queue_rcv_skb, .hash = udp_lib_hash, .unhash = udp_lib_unhash, .get_port = udp_v4_get_port, |