diff options
Diffstat (limited to 'net/ipv4')
47 files changed, 1016 insertions, 793 deletions
| diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 1fbe2f815474..0dfb72c46671 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -208,6 +208,7 @@ int inet_listen(struct socket *sock, int backlog)  	if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN)))  		goto out; +	sk->sk_max_ack_backlog = backlog;  	/* Really, if the socket is already in listen state  	 * we can only allow the backlog to be adjusted.  	 */ @@ -231,7 +232,6 @@ int inet_listen(struct socket *sock, int backlog)  			goto out;  		tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_LISTEN_CB, 0, NULL);  	} -	sk->sk_max_ack_backlog = backlog;  	err = 0;  out: @@ -1385,6 +1385,10 @@ out:  }  EXPORT_SYMBOL(inet_gso_segment); +INDIRECT_CALLABLE_DECLARE(struct sk_buff *tcp4_gro_receive(struct list_head *, +							   struct sk_buff *)); +INDIRECT_CALLABLE_DECLARE(struct sk_buff *udp4_gro_receive(struct list_head *, +							   struct sk_buff *));  struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb)  {  	const struct net_offload *ops; @@ -1494,7 +1498,8 @@ struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb)  	skb_gro_pull(skb, sizeof(*iph));  	skb_set_transport_header(skb, skb_gro_offset(skb)); -	pp = call_gro_receive(ops->callbacks.gro_receive, head, skb); +	pp = indirect_call_gro_receive(tcp4_gro_receive, udp4_gro_receive, +				       ops->callbacks.gro_receive, head, skb);  out_unlock:  	rcu_read_unlock(); @@ -1556,6 +1561,8 @@ int inet_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)  	return -EINVAL;  } +INDIRECT_CALLABLE_DECLARE(int tcp4_gro_complete(struct sk_buff *, int)); +INDIRECT_CALLABLE_DECLARE(int udp4_gro_complete(struct sk_buff *, int));  int inet_gro_complete(struct sk_buff *skb, int nhoff)  {  	__be16 newlen = htons(skb->len - nhoff); @@ -1581,7 +1588,9 @@ int inet_gro_complete(struct sk_buff *skb, int nhoff)  	 * because any hdr with option will have been flushed in  	 * inet_gro_receive().  	 */ -	err = ops->callbacks.gro_complete(skb, nhoff + sizeof(*iph)); +	err = INDIRECT_CALL_2(ops->callbacks.gro_complete, +			      tcp4_gro_complete, udp4_gro_complete, +			      skb, nhoff + sizeof(*iph));  out_unlock:  	rcu_read_unlock(); @@ -1964,6 +1973,8 @@ static int __init inet_init(void)  	/* Add UDP-Lite (RFC 3828) */  	udplite4_register(); +	raw_init(); +  	ping_init();  	/* diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 608a6f4223fb..04ba321ae5ce 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1101,7 +1101,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr)  				inet_del_ifa(in_dev, ifap, 1);  			break;  		} -		ret = dev_change_flags(dev, ifr->ifr_flags); +		ret = dev_change_flags(dev, ifr->ifr_flags, NULL);  		break;  	case SIOCSIFADDR:	/* Set interface address (and family) */ diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 9e1c840596c5..5459f41fc26f 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -125,10 +125,13 @@ static void esp_output_done(struct crypto_async_request *base, int err)  	void *tmp;  	struct xfrm_state *x; -	if (xo && (xo->flags & XFRM_DEV_RESUME)) -		x = skb->sp->xvec[skb->sp->len - 1]; -	else +	if (xo && (xo->flags & XFRM_DEV_RESUME)) { +		struct sec_path *sp = skb_sec_path(skb); + +		x = sp->xvec[sp->len - 1]; +	} else {  		x = skb_dst(skb)->xfrm; +	}  	tmp = ESP_SKB_CB(skb)->tmp;  	esp_ssg_unref(x, tmp); diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index 58834a10c0be..8756e0e790d2 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -46,11 +46,12 @@ static struct sk_buff *esp4_gro_receive(struct list_head *head,  	xo = xfrm_offload(skb);  	if (!xo || !(xo->flags & CRYPTO_DONE)) { -		err = secpath_set(skb); -		if (err) +		struct sec_path *sp = secpath_set(skb); + +		if (!sp)  			goto out; -		if (skb->sp->len == XFRM_MAX_DEPTH) +		if (sp->len == XFRM_MAX_DEPTH)  			goto out;  		x = xfrm_state_lookup(dev_net(skb->dev), skb->mark, @@ -59,8 +60,8 @@ static struct sk_buff *esp4_gro_receive(struct list_head *head,  		if (!x)  			goto out; -		skb->sp->xvec[skb->sp->len++] = x; -		skb->sp->olen++; +		sp->xvec[sp->len++] = x; +		sp->olen++;  		xo = xfrm_offload(skb);  		if (!xo) { @@ -114,6 +115,7 @@ static struct sk_buff *esp4_gso_segment(struct sk_buff *skb,  	struct crypto_aead *aead;  	netdev_features_t esp_features = features;  	struct xfrm_offload *xo = xfrm_offload(skb); +	struct sec_path *sp;  	if (!xo)  		return ERR_PTR(-EINVAL); @@ -121,7 +123,8 @@ static struct sk_buff *esp4_gso_segment(struct sk_buff *skb,  	if (!(skb_shinfo(skb)->gso_type & SKB_GSO_ESP))  		return ERR_PTR(-EINVAL); -	x = skb->sp->xvec[skb->sp->len - 1]; +	sp = skb_sec_path(skb); +	x = sp->xvec[sp->len - 1];  	aead = x->data;  	esph = ip_esp_hdr(skb); diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index b5c3937ca6ec..5022bc63863a 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -1076,7 +1076,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg,  	if (!fi)  		goto failure;  	fi->fib_metrics = ip_fib_metrics_init(fi->fib_net, cfg->fc_mx, -					      cfg->fc_mx_len); +					      cfg->fc_mx_len, extack);  	if (unlikely(IS_ERR(fi->fib_metrics))) {  		err = PTR_ERR(fi->fib_metrics);  		kfree(fi); diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 500a59906b87..0c9f171fb085 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -3,6 +3,7 @@  #include <linux/socket.h>  #include <linux/skbuff.h>  #include <linux/ip.h> +#include <linux/icmp.h>  #include <linux/udp.h>  #include <linux/types.h>  #include <linux/kernel.h> @@ -1003,15 +1004,89 @@ static int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,  	return 0;  } +static int gue_err_proto_handler(int proto, struct sk_buff *skb, u32 info) +{ +	const struct net_protocol *ipprot = rcu_dereference(inet_protos[proto]); + +	if (ipprot && ipprot->err_handler) { +		if (!ipprot->err_handler(skb, info)) +			return 0; +	} + +	return -ENOENT; +} + +static int gue_err(struct sk_buff *skb, u32 info) +{ +	int transport_offset = skb_transport_offset(skb); +	struct guehdr *guehdr; +	size_t optlen; +	int ret; + +	if (skb->len < sizeof(struct udphdr) + sizeof(struct guehdr)) +		return -EINVAL; + +	guehdr = (struct guehdr *)&udp_hdr(skb)[1]; + +	switch (guehdr->version) { +	case 0: /* Full GUE header present */ +		break; +	case 1: { +		/* Direct encasulation of IPv4 or IPv6 */ +		skb_set_transport_header(skb, -(int)sizeof(struct icmphdr)); + +		switch (((struct iphdr *)guehdr)->version) { +		case 4: +			ret = gue_err_proto_handler(IPPROTO_IPIP, skb, info); +			goto out; +#if IS_ENABLED(CONFIG_IPV6) +		case 6: +			ret = gue_err_proto_handler(IPPROTO_IPV6, skb, info); +			goto out; +#endif +		default: +			ret = -EOPNOTSUPP; +			goto out; +		} +	} +	default: /* Undefined version */ +		return -EOPNOTSUPP; +	} + +	if (guehdr->control) +		return -ENOENT; + +	optlen = guehdr->hlen << 2; + +	if (validate_gue_flags(guehdr, optlen)) +		return -EINVAL; + +	/* Handling exceptions for direct UDP encapsulation in GUE would lead to +	 * recursion. Besides, this kind of encapsulation can't even be +	 * configured currently. Discard this. +	 */ +	if (guehdr->proto_ctype == IPPROTO_UDP) +		return -EOPNOTSUPP; + +	skb_set_transport_header(skb, -(int)sizeof(struct icmphdr)); +	ret = gue_err_proto_handler(guehdr->proto_ctype, skb, info); + +out: +	skb_set_transport_header(skb, transport_offset); +	return ret; +} +  static const struct ip_tunnel_encap_ops fou_iptun_ops = {  	.encap_hlen = fou_encap_hlen,  	.build_header = fou_build_header, +	.err_handler = gue_err,  };  static const struct ip_tunnel_encap_ops gue_iptun_ops = {  	.encap_hlen = gue_encap_hlen,  	.build_header = gue_build_header, +	.err_handler = gue_err,  };  static int ip_tunnel_encap_add_fou_ops(void) diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c index 7efe740c06eb..a4bf22ee3aed 100644 --- a/net/ipv4/gre_demux.c +++ b/net/ipv4/gre_demux.c @@ -151,20 +151,25 @@ drop:  	return NET_RX_DROP;  } -static void gre_err(struct sk_buff *skb, u32 info) +static int gre_err(struct sk_buff *skb, u32 info)  {  	const struct gre_protocol *proto;  	const struct iphdr *iph = (const struct iphdr *)skb->data;  	u8 ver = skb->data[(iph->ihl<<2) + 1]&0x7f; +	int err = 0;  	if (ver >= GREPROTO_MAX) -		return; +		return -EINVAL;  	rcu_read_lock();  	proto = rcu_dereference(gre_proto[ver]);  	if (proto && proto->err_handler)  		proto->err_handler(skb, info); +	else +		err = -EPROTONOSUPPORT;  	rcu_read_unlock(); + +	return err;  }  static const struct net_protocol net_gre_protocol = { diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index d832beed6e3a..065997f414e6 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -1079,7 +1079,7 @@ error:  	goto drop;  } -void icmp_err(struct sk_buff *skb, u32 info) +int icmp_err(struct sk_buff *skb, u32 info)  {  	struct iphdr *iph = (struct iphdr *)skb->data;  	int offset = iph->ihl<<2; @@ -1094,13 +1094,15 @@ void icmp_err(struct sk_buff *skb, u32 info)  	 */  	if (icmph->type != ICMP_ECHOREPLY) {  		ping_err(skb, offset, info); -		return; +		return 0;  	}  	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)  		ipv4_update_pmtu(skb, net, info, 0, IPPROTO_ICMP);  	else if (type == ICMP_REDIRECT)  		ipv4_redirect(skb, net, 0, IPPROTO_ICMP); + +	return 0;  }  /* diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 15e7f7915a21..6ea523d71947 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -183,7 +183,9 @@ inet_csk_find_open_port(struct sock *sk, struct inet_bind_bucket **tb_ret, int *  	int i, low, high, attempt_half;  	struct inet_bind_bucket *tb;  	u32 remaining, offset; +	int l3mdev; +	l3mdev = inet_sk_bound_l3mdev(sk);  	attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0;  other_half_scan:  	inet_get_local_port_range(net, &low, &high); @@ -219,7 +221,8 @@ other_parity_scan:  						  hinfo->bhash_size)];  		spin_lock_bh(&head->lock);  		inet_bind_bucket_for_each(tb, &head->chain) -			if (net_eq(ib_net(tb), net) && tb->port == port) { +			if (net_eq(ib_net(tb), net) && tb->l3mdev == l3mdev && +			    tb->port == port) {  				if (!inet_csk_bind_conflict(sk, tb, false, false))  					goto success;  				goto next_port; @@ -293,6 +296,9 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)  	struct net *net = sock_net(sk);  	struct inet_bind_bucket *tb = NULL;  	kuid_t uid = sock_i_uid(sk); +	int l3mdev; + +	l3mdev = inet_sk_bound_l3mdev(sk);  	if (!port) {  		head = inet_csk_find_open_port(sk, &tb, &port); @@ -306,11 +312,12 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)  					  hinfo->bhash_size)];  	spin_lock_bh(&head->lock);  	inet_bind_bucket_for_each(tb, &head->chain) -		if (net_eq(ib_net(tb), net) && tb->port == port) +		if (net_eq(ib_net(tb), net) && tb->l3mdev == l3mdev && +		    tb->port == port)  			goto tb_found;  tb_not_found:  	tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, -				     net, head, port); +				     net, head, port, l3mdev);  	if (!tb)  		goto fail_unlock;  tb_found: @@ -874,7 +881,6 @@ int inet_csk_listen_start(struct sock *sk, int backlog)  	reqsk_queue_alloc(&icsk->icsk_accept_queue); -	sk->sk_max_ack_backlog = backlog;  	sk->sk_ack_backlog = 0;  	inet_csk_delack_init(sk); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 411dd7a90046..942265d65eb3 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -65,12 +65,14 @@ static u32 sk_ehashfn(const struct sock *sk)  struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,  						 struct net *net,  						 struct inet_bind_hashbucket *head, -						 const unsigned short snum) +						 const unsigned short snum, +						 int l3mdev)  {  	struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC);  	if (tb) {  		write_pnet(&tb->ib_net, net); +		tb->l3mdev    = l3mdev;  		tb->port      = snum;  		tb->fastreuse = 0;  		tb->fastreuseport = 0; @@ -135,6 +137,7 @@ int __inet_inherit_port(const struct sock *sk, struct sock *child)  			table->bhash_size);  	struct inet_bind_hashbucket *head = &table->bhash[bhash];  	struct inet_bind_bucket *tb; +	int l3mdev;  	spin_lock(&head->lock);  	tb = inet_csk(sk)->icsk_bind_hash; @@ -143,6 +146,8 @@ int __inet_inherit_port(const struct sock *sk, struct sock *child)  		return -ENOENT;  	}  	if (tb->port != port) { +		l3mdev = inet_sk_bound_l3mdev(sk); +  		/* NOTE: using tproxy and redirecting skbs to a proxy  		 * on a different listener port breaks the assumption  		 * that the listener socket's icsk_bind_hash is the same @@ -150,12 +155,13 @@ int __inet_inherit_port(const struct sock *sk, struct sock *child)  		 * create a new bind bucket for the child here. */  		inet_bind_bucket_for_each(tb, &head->chain) {  			if (net_eq(ib_net(tb), sock_net(sk)) && -			    tb->port == port) +			    tb->l3mdev == l3mdev && tb->port == port)  				break;  		}  		if (!tb) {  			tb = inet_bind_bucket_create(table->bind_bucket_cachep, -						     sock_net(sk), head, port); +						     sock_net(sk), head, port, +						     l3mdev);  			if (!tb) {  				spin_unlock(&head->lock);  				return -ENOMEM; @@ -228,26 +234,16 @@ static inline int compute_score(struct sock *sk, struct net *net,  				const int dif, const int sdif, bool exact_dif)  {  	int score = -1; -	struct inet_sock *inet = inet_sk(sk); -	if (net_eq(sock_net(sk), net) && inet->inet_num == hnum && +	if (net_eq(sock_net(sk), net) && sk->sk_num == hnum &&  			!ipv6_only_sock(sk)) { -		__be32 rcv_saddr = inet->inet_rcv_saddr; -		score = sk->sk_family == PF_INET ? 2 : 1; -		if (rcv_saddr) { -			if (rcv_saddr != daddr) -				return -1; -			score += 4; -		} -		if (sk->sk_bound_dev_if || exact_dif) { -			bool dev_match = (sk->sk_bound_dev_if == dif || -					  sk->sk_bound_dev_if == sdif); +		if (sk->sk_rcv_saddr != daddr) +			return -1; -			if (!dev_match) -				return -1; -			if (sk->sk_bound_dev_if) -				score += 4; -		} +		if (!inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif)) +			return -1; + +		score = sk->sk_family == PF_INET ? 2 : 1;  		if (sk->sk_incoming_cpu == raw_smp_processor_id())  			score++;  	} @@ -303,26 +299,12 @@ struct sock *__inet_lookup_listener(struct net *net,  				    const __be32 daddr, const unsigned short hnum,  				    const int dif, const int sdif)  { -	unsigned int hash = inet_lhashfn(net, hnum); -	struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash]; -	bool exact_dif = inet_exact_dif_match(net, skb);  	struct inet_listen_hashbucket *ilb2; -	struct sock *sk, *result = NULL; -	int score, hiscore = 0; +	struct sock *result = NULL;  	unsigned int hash2; -	u32 phash = 0; - -	if (ilb->count <= 10 || !hashinfo->lhash2) -		goto port_lookup; - -	/* Too many sk in the ilb bucket (which is hashed by port alone). -	 * Try lhash2 (which is hashed by port and addr) instead. -	 */  	hash2 = ipv4_portaddr_hash(net, daddr, hnum);  	ilb2 = inet_lhash2_bucket(hashinfo, hash2); -	if (ilb2->count > ilb->count) -		goto port_lookup;  	result = inet_lhash2_lookup(net, ilb2, skb, doff,  				    saddr, sport, daddr, hnum, @@ -331,34 +313,12 @@ struct sock *__inet_lookup_listener(struct net *net,  		goto done;  	/* Lookup lhash2 with INADDR_ANY */ -  	hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum);  	ilb2 = inet_lhash2_bucket(hashinfo, hash2); -	if (ilb2->count > ilb->count) -		goto port_lookup;  	result = inet_lhash2_lookup(net, ilb2, skb, doff, -				    saddr, sport, daddr, hnum, +				    saddr, sport, htonl(INADDR_ANY), hnum,  				    dif, sdif); -	goto done; - -port_lookup: -	sk_for_each_rcu(sk, &ilb->head) { -		score = compute_score(sk, net, hnum, daddr, -				      dif, sdif, exact_dif); -		if (score > hiscore) { -			if (sk->sk_reuseport) { -				phash = inet_ehashfn(net, daddr, hnum, -						     saddr, sport); -				result = reuseport_select_sock(sk, phash, -							       skb, doff); -				if (result) -					goto done; -			} -			result = sk; -			hiscore = score; -		} -	}  done:  	if (unlikely(IS_ERR(result)))  		return NULL; @@ -675,6 +635,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,  	u32 remaining, offset;  	int ret, i, low, high;  	static u32 hint; +	int l3mdev;  	if (port) {  		head = &hinfo->bhash[inet_bhashfn(net, port, @@ -693,6 +654,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,  		return ret;  	} +	l3mdev = inet_sk_bound_l3mdev(sk); +  	inet_get_local_port_range(net, &low, &high);  	high++; /* [32768, 60999] -> [32768, 61000[ */  	remaining = high - low; @@ -719,7 +682,8 @@ other_parity_scan:  		 * the established check is already unique enough.  		 */  		inet_bind_bucket_for_each(tb, &head->chain) { -			if (net_eq(ib_net(tb), net) && tb->port == port) { +			if (net_eq(ib_net(tb), net) && tb->l3mdev == l3mdev && +			    tb->port == port) {  				if (tb->fastreuse >= 0 ||  				    tb->fastreuseport >= 0)  					goto next_port; @@ -732,7 +696,7 @@ other_parity_scan:  		}  		tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, -					     net, head, port); +					     net, head, port, l3mdev);  		if (!tb) {  			spin_unlock_bh(&head->lock);  			return -ENOMEM; @@ -798,13 +762,22 @@ void inet_hashinfo_init(struct inet_hashinfo *h)  }  EXPORT_SYMBOL_GPL(inet_hashinfo_init); +static void init_hashinfo_lhash2(struct inet_hashinfo *h) +{ +	int i; + +	for (i = 0; i <= h->lhash2_mask; i++) { +		spin_lock_init(&h->lhash2[i].lock); +		INIT_HLIST_HEAD(&h->lhash2[i].head); +		h->lhash2[i].count = 0; +	} +} +  void __init inet_hashinfo2_init(struct inet_hashinfo *h, const char *name,  				unsigned long numentries, int scale,  				unsigned long low_limit,  				unsigned long high_limit)  { -	unsigned int i; -  	h->lhash2 = alloc_large_system_hash(name,  					    sizeof(*h->lhash2),  					    numentries, @@ -814,13 +787,23 @@ void __init inet_hashinfo2_init(struct inet_hashinfo *h, const char *name,  					    &h->lhash2_mask,  					    low_limit,  					    high_limit); +	init_hashinfo_lhash2(h); +} -	for (i = 0; i <= h->lhash2_mask; i++) { -		spin_lock_init(&h->lhash2[i].lock); -		INIT_HLIST_HEAD(&h->lhash2[i].head); -		h->lhash2[i].count = 0; -	} +int inet_hashinfo2_init_mod(struct inet_hashinfo *h) +{ +	h->lhash2 = kmalloc_array(INET_LHTABLE_SIZE, sizeof(*h->lhash2), GFP_KERNEL); +	if (!h->lhash2) +		return -ENOMEM; + +	h->lhash2_mask = INET_LHTABLE_SIZE - 1; +	/* INET_LHTABLE_SIZE must be a power of 2 */ +	BUG_ON(INET_LHTABLE_SIZE & h->lhash2_mask); + +	init_hashinfo_lhash2(h); +	return 0;  } +EXPORT_SYMBOL_GPL(inet_hashinfo2_init_mod);  int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo)  { diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index d5984d31ab93..00ec819f949b 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -69,6 +69,13 @@ static int ip_forward_finish(struct net *net, struct sock *sk, struct sk_buff *s  	__IP_INC_STATS(net, IPSTATS_MIB_OUTFORWDATAGRAMS);  	__IP_ADD_STATS(net, IPSTATS_MIB_OUTOCTETS, skb->len); +#ifdef CONFIG_NET_SWITCHDEV +	if (skb->offload_l3_fwd_mark) { +		consume_skb(skb); +		return 0; +	} +#endif +  	if (unlikely(opt->optlen))  		ip_forward_options(skb); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 38befe829caf..c7a7bd58a23c 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -121,8 +121,8 @@ static unsigned int ipgre_net_id __read_mostly;  static unsigned int gre_tap_net_id __read_mostly;  static unsigned int erspan_net_id __read_mostly; -static void ipgre_err(struct sk_buff *skb, u32 info, -		      const struct tnl_ptk_info *tpi) +static int ipgre_err(struct sk_buff *skb, u32 info, +		     const struct tnl_ptk_info *tpi)  {  	/* All the routers (except for Linux) return only @@ -146,17 +146,32 @@ static void ipgre_err(struct sk_buff *skb, u32 info,  	unsigned int data_len = 0;  	struct ip_tunnel *t; +	if (tpi->proto == htons(ETH_P_TEB)) +		itn = net_generic(net, gre_tap_net_id); +	else if (tpi->proto == htons(ETH_P_ERSPAN) || +		 tpi->proto == htons(ETH_P_ERSPAN2)) +		itn = net_generic(net, erspan_net_id); +	else +		itn = net_generic(net, ipgre_net_id); + +	iph = (const struct iphdr *)(icmp_hdr(skb) + 1); +	t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags, +			     iph->daddr, iph->saddr, tpi->key); + +	if (!t) +		return -ENOENT; +  	switch (type) {  	default:  	case ICMP_PARAMETERPROB: -		return; +		return 0;  	case ICMP_DEST_UNREACH:  		switch (code) {  		case ICMP_SR_FAILED:  		case ICMP_PORT_UNREACH:  			/* Impossible event. */ -			return; +			return 0;  		default:  			/* All others are translated to HOST_UNREACH.  			   rfc2003 contains "deep thoughts" about NET_UNREACH, @@ -168,7 +183,7 @@ static void ipgre_err(struct sk_buff *skb, u32 info,  	case ICMP_TIME_EXCEEDED:  		if (code != ICMP_EXC_TTL) -			return; +			return 0;  		data_len = icmp_hdr(skb)->un.reserved[1] * 4; /* RFC 4884 4.1 */  		break; @@ -176,40 +191,27 @@ static void ipgre_err(struct sk_buff *skb, u32 info,  		break;  	} -	if (tpi->proto == htons(ETH_P_TEB)) -		itn = net_generic(net, gre_tap_net_id); -	else if (tpi->proto == htons(ETH_P_ERSPAN) || -		 tpi->proto == htons(ETH_P_ERSPAN2)) -		itn = net_generic(net, erspan_net_id); -	else -		itn = net_generic(net, ipgre_net_id); - -	iph = (const struct iphdr *)(icmp_hdr(skb) + 1); -	t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags, -			     iph->daddr, iph->saddr, tpi->key); - -	if (!t) -		return; -  #if IS_ENABLED(CONFIG_IPV6)         if (tpi->proto == htons(ETH_P_IPV6) &&             !ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4 + tpi->hdr_len,  				       type, data_len)) -               return; +               return 0;  #endif  	if (t->parms.iph.daddr == 0 ||  	    ipv4_is_multicast(t->parms.iph.daddr)) -		return; +		return 0;  	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) -		return; +		return 0;  	if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))  		t->err_count++;  	else  		t->err_count = 1;  	t->err_time = jiffies; + +	return 0;  }  static void gre_err(struct sk_buff *skb, u32 info) @@ -1339,12 +1341,6 @@ static void ipgre_tap_setup(struct net_device *dev)  	ip_tunnel_setup(dev, gre_tap_net_id);  } -bool is_gretap_dev(const struct net_device *dev) -{ -	return dev->netdev_ops == &gre_tap_netdev_ops; -} -EXPORT_SYMBOL_GPL(is_gretap_dev); -  static int ipgre_newlink(struct net *src_net, struct net_device *dev,  			 struct nlattr *tb[], struct nlattr *data[],  			 struct netlink_ext_ack *extack) @@ -1601,7 +1597,7 @@ struct net_device *gretap_fb_dev_create(struct net *net, const char *name,  	memset(&tb, 0, sizeof(tb));  	dev = rtnl_create_link(net, name, name_assign_type, -			       &ipgre_tap_ops, tb); +			       &ipgre_tap_ops, tb, NULL);  	if (IS_ERR(dev))  		return dev; diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index e609b08c9df4..26921f6b3b92 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -188,51 +188,50 @@ bool ip_call_ra_chain(struct sk_buff *skb)  	return false;  } -static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_buff *skb) +void ip_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int protocol)  { -	__skb_pull(skb, skb_network_header_len(skb)); - -	rcu_read_lock(); -	{ -		int protocol = ip_hdr(skb)->protocol; -		const struct net_protocol *ipprot; -		int raw; +	const struct net_protocol *ipprot; +	int raw, ret; -	resubmit: -		raw = raw_local_deliver(skb, protocol); +resubmit: +	raw = raw_local_deliver(skb, protocol); -		ipprot = rcu_dereference(inet_protos[protocol]); -		if (ipprot) { -			int ret; - -			if (!ipprot->no_policy) { -				if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { -					kfree_skb(skb); -					goto out; -				} -				nf_reset(skb); +	ipprot = rcu_dereference(inet_protos[protocol]); +	if (ipprot) { +		if (!ipprot->no_policy) { +			if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { +				kfree_skb(skb); +				return;  			} -			ret = ipprot->handler(skb); -			if (ret < 0) { -				protocol = -ret; -				goto resubmit; +			nf_reset(skb); +		} +		ret = ipprot->handler(skb); +		if (ret < 0) { +			protocol = -ret; +			goto resubmit; +		} +		__IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS); +	} else { +		if (!raw) { +			if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { +				__IP_INC_STATS(net, IPSTATS_MIB_INUNKNOWNPROTOS); +				icmp_send(skb, ICMP_DEST_UNREACH, +					  ICMP_PROT_UNREACH, 0);  			} -			__IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS); +			kfree_skb(skb);  		} else { -			if (!raw) { -				if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { -					__IP_INC_STATS(net, IPSTATS_MIB_INUNKNOWNPROTOS); -					icmp_send(skb, ICMP_DEST_UNREACH, -						  ICMP_PROT_UNREACH, 0); -				} -				kfree_skb(skb); -			} else { -				__IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS); -				consume_skb(skb); -			} +			__IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS); +			consume_skb(skb);  		}  	} - out: +} + +static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_buff *skb) +{ +	__skb_pull(skb, skb_network_header_len(skb)); + +	rcu_read_lock(); +	ip_protocol_deliver_rcu(net, skb, ip_hdr(skb)->protocol);  	rcu_read_unlock();  	return 0; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 5dbec21856f4..c80188875f39 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -533,6 +533,7 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)  	to->tc_index = from->tc_index;  #endif  	nf_copy(to, from); +	skb_ext_copy(to, from);  #if IS_ENABLED(CONFIG_IP_VS)  	to->ipvs_property = from->ipvs_property;  #endif @@ -867,6 +868,7 @@ static int __ip_append_data(struct sock *sk,  			    unsigned int flags)  {  	struct inet_sock *inet = inet_sk(sk); +	struct ubuf_info *uarg = NULL;  	struct sk_buff *skb;  	struct ip_options *opt = cork->opt; @@ -880,8 +882,8 @@ static int __ip_append_data(struct sock *sk,  	int csummode = CHECKSUM_NONE;  	struct rtable *rt = (struct rtable *)cork->dst;  	unsigned int wmem_alloc_delta = 0; +	bool paged, extra_uref;  	u32 tskey = 0; -	bool paged;  	skb = skb_peek_tail(queue); @@ -916,6 +918,20 @@ static int __ip_append_data(struct sock *sk,  	    (!exthdrlen || (rt->dst.dev->features & NETIF_F_HW_ESP_TX_CSUM)))  		csummode = CHECKSUM_PARTIAL; +	if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) { +		uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb)); +		if (!uarg) +			return -ENOBUFS; +		extra_uref = true; +		if (rt->dst.dev->features & NETIF_F_SG && +		    csummode == CHECKSUM_PARTIAL) { +			paged = true; +		} else { +			uarg->zerocopy = 0; +			skb_zcopy_set(skb, uarg, &extra_uref); +		} +	} +  	cork->length += length;  	/* So, what's going on in the loop below? @@ -1001,12 +1017,6 @@ alloc_new_skb:  			skb->csum = 0;  			skb_reserve(skb, hh_len); -			/* only the initial fragment is time stamped */ -			skb_shinfo(skb)->tx_flags = cork->tx_flags; -			cork->tx_flags = 0; -			skb_shinfo(skb)->tskey = tskey; -			tskey = 0; -  			/*  			 *	Find where to start putting bytes.  			 */ @@ -1039,6 +1049,13 @@ alloc_new_skb:  			exthdrlen = 0;  			csummode = CHECKSUM_NONE; +			/* only the initial fragment is time stamped */ +			skb_shinfo(skb)->tx_flags = cork->tx_flags; +			cork->tx_flags = 0; +			skb_shinfo(skb)->tskey = tskey; +			tskey = 0; +			skb_zcopy_set(skb, uarg, &extra_uref); +  			if ((flags & MSG_CONFIRM) && !skb_prev)  				skb_set_dst_pending_confirm(skb, 1); @@ -1068,7 +1085,7 @@ alloc_new_skb:  				err = -EFAULT;  				goto error;  			} -		} else { +		} else if (!uarg || !uarg->zerocopy) {  			int i = skb_shinfo(skb)->nr_frags;  			err = -ENOMEM; @@ -1098,6 +1115,10 @@ alloc_new_skb:  			skb->data_len += copy;  			skb->truesize += copy;  			wmem_alloc_delta += copy; +		} else { +			err = skb_zerocopy_iter_dgram(skb, from, copy); +			if (err < 0) +				goto error;  		}  		offset += copy;  		length -= copy; @@ -1110,6 +1131,8 @@ alloc_new_skb:  error_efault:  	err = -EFAULT;  error: +	if (uarg) +		sock_zerocopy_put_abort(uarg, extra_uref);  	cork->length -= length;  	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);  	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index c248e0dccbe1..9a0e67b52a4e 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -120,7 +120,7 @@ int __iptunnel_pull_header(struct sk_buff *skb, int hdr_len,  	}  	skb_clear_hash_if_not_l4(skb); -	skb->vlan_tci = 0; +	__vlan_hwaccel_clear_tag(skb);  	skb_set_queue_mapping(skb, 0);  	skb_scrub_packet(skb, xnet); @@ -151,6 +151,7 @@ struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md,  		       sizeof(struct in6_addr));  	else  		dst->key.u.ipv4.dst = src->key.u.ipv4.src; +	dst->key.tun_flags = src->key.tun_flags;  	dst->mode = src->mode | IP_TUNNEL_INFO_TX;  	return res; diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index 2393e5c106bf..b9a9873c25c6 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -220,7 +220,7 @@ static int __init ic_open_devs(void)  	for_each_netdev(&init_net, dev) {  		if (!(dev->flags & IFF_LOOPBACK) && !netdev_uses_dsa(dev))  			continue; -		if (dev_change_flags(dev, dev->flags | IFF_UP) < 0) +		if (dev_change_flags(dev, dev->flags | IFF_UP, NULL) < 0)  			pr_err("IP-Config: Failed to open %s\n", dev->name);  	} @@ -238,7 +238,7 @@ static int __init ic_open_devs(void)  			if (ic_proto_enabled && !able)  				continue;  			oflags = dev->flags; -			if (dev_change_flags(dev, oflags | IFF_UP) < 0) { +			if (dev_change_flags(dev, oflags | IFF_UP, NULL) < 0) {  				pr_err("IP-Config: Failed to open %s\n",  				       dev->name);  				continue; @@ -315,7 +315,7 @@ static void __init ic_close_devs(void)  		dev = d->dev;  		if (d != ic_dev && !netdev_uses_dsa(dev)) {  			pr_debug("IP-Config: Downing %s\n", dev->name); -			dev_change_flags(dev, d->flags); +			dev_change_flags(dev, d->flags, NULL);  		}  		kfree(d);  	} @@ -1363,18 +1363,7 @@ static int ntp_servers_seq_show(struct seq_file *seq, void *v)  	}  	return 0;  } - -static int ntp_servers_seq_open(struct inode *inode, struct file *file) -{ -	return single_open(file, ntp_servers_seq_show, NULL); -} - -static const struct file_operations ntp_servers_seq_fops = { -	.open		= ntp_servers_seq_open, -	.read		= seq_read, -	.llseek		= seq_lseek, -	.release	= single_release, -}; +DEFINE_SHOW_ATTRIBUTE(ntp_servers_seq);  #endif /* CONFIG_PROC_FS */  /* diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index e65287c27e3d..57c5dd283a2c 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -140,6 +140,13 @@ static int ipip_err(struct sk_buff *skb, u32 info)  	struct ip_tunnel *t;  	int err = 0; +	t = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, +			     iph->daddr, iph->saddr, 0); +	if (!t) { +		err = -ENOENT; +		goto out; +	} +  	switch (type) {  	case ICMP_DEST_UNREACH:  		switch (code) { @@ -167,13 +174,6 @@ static int ipip_err(struct sk_buff *skb, u32 info)  		goto out;  	} -	t = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, -			     iph->daddr, iph->saddr, 0); -	if (!t) { -		err = -ENOENT; -		goto out; -	} -  	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {  		ipv4_update_pmtu(skb, net, info, t->parms.link, iph->protocol);  		goto out; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index e7a3879cedd0..ddbf8c9a1abb 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -508,7 +508,7 @@ static struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v)  			dev->flags |= IFF_MULTICAST;  			if (!ipmr_init_vif_indev(dev))  				goto failure; -			if (dev_open(dev)) +			if (dev_open(dev, NULL))  				goto failure;  			dev_hold(dev);  		} @@ -591,7 +591,7 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)  	if (!ipmr_init_vif_indev(dev))  		goto failure; -	if (dev_open(dev)) +	if (dev_open(dev, NULL))  		goto failure;  	dev_hold(dev); @@ -1806,7 +1806,7 @@ static bool ipmr_forward_offloaded(struct sk_buff *skb, struct mr_table *mrt,  	struct vif_device *out_vif = &mrt->vif_table[out_vifi];  	struct vif_device *in_vif = &mrt->vif_table[in_vifi]; -	if (!skb->offload_mr_fwd_mark) +	if (!skb->offload_l3_fwd_mark)  		return false;  	if (!out_vif->dev_parent_id.id_len || !in_vif->dev_parent_id.id_len)  		return false; @@ -1824,8 +1824,7 @@ static bool ipmr_forward_offloaded(struct sk_buff *skb, struct mr_table *mrt,  /* Processing handlers for ipmr_forward */  static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, -			    int in_vifi, struct sk_buff *skb, -			    struct mfc_cache *c, int vifi) +			    int in_vifi, struct sk_buff *skb, int vifi)  {  	const struct iphdr *iph = ip_hdr(skb);  	struct vif_device *vif = &mrt->vif_table[vifi]; @@ -2031,7 +2030,7 @@ forward:  				if (skb2)  					ipmr_queue_xmit(net, mrt, true_vifi, -							skb2, c, psend); +							skb2, psend);  			}  			psend = ct;  		} @@ -2043,9 +2042,9 @@ last_forward:  			if (skb2)  				ipmr_queue_xmit(net, mrt, true_vifi, skb2, -						c, psend); +						psend);  		} else { -			ipmr_queue_xmit(net, mrt, true_vifi, skb, c, psend); +			ipmr_queue_xmit(net, mrt, true_vifi, skb, psend);  			return;  		}  	} diff --git a/net/ipv4/metrics.c b/net/ipv4/metrics.c index 6d218f5a2e71..ca9a5fefdefa 100644 --- a/net/ipv4/metrics.c +++ b/net/ipv4/metrics.c @@ -6,7 +6,8 @@  #include <net/tcp.h>  static int ip_metrics_convert(struct net *net, struct nlattr *fc_mx, -			      int fc_mx_len, u32 *metrics) +			      int fc_mx_len, u32 *metrics, +			      struct netlink_ext_ack *extack)  {  	bool ecn_ca = false;  	struct nlattr *nla; @@ -21,19 +22,26 @@ static int ip_metrics_convert(struct net *net, struct nlattr *fc_mx,  		if (!type)  			continue; -		if (type > RTAX_MAX) +		if (type > RTAX_MAX) { +			NL_SET_ERR_MSG(extack, "Invalid metric type");  			return -EINVAL; +		}  		if (type == RTAX_CC_ALGO) {  			char tmp[TCP_CA_NAME_MAX];  			nla_strlcpy(tmp, nla, sizeof(tmp));  			val = tcp_ca_get_key_by_name(net, tmp, &ecn_ca); -			if (val == TCP_CA_UNSPEC) +			if (val == TCP_CA_UNSPEC) { +				NL_SET_ERR_MSG(extack, "Unknown tcp congestion algorithm");  				return -EINVAL; +			}  		} else { -			if (nla_len(nla) != sizeof(u32)) +			if (nla_len(nla) != sizeof(u32)) { +				NL_SET_ERR_MSG_ATTR(extack, nla, +						    "Invalid attribute in metrics");  				return -EINVAL; +			}  			val = nla_get_u32(nla);  		}  		if (type == RTAX_ADVMSS && val > 65535 - 40) @@ -42,8 +50,10 @@ static int ip_metrics_convert(struct net *net, struct nlattr *fc_mx,  			val = 65535 - 15;  		if (type == RTAX_HOPLIMIT && val > 255)  			val = 255; -		if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK)) +		if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK)) { +			NL_SET_ERR_MSG(extack, "Unknown flag set in feature mask in metrics attribute");  			return -EINVAL; +		}  		metrics[type - 1] = val;  	} @@ -54,7 +64,8 @@ static int ip_metrics_convert(struct net *net, struct nlattr *fc_mx,  }  struct dst_metrics *ip_fib_metrics_init(struct net *net, struct nlattr *fc_mx, -					int fc_mx_len) +					int fc_mx_len, +					struct netlink_ext_ack *extack)  {  	struct dst_metrics *fib_metrics;  	int err; @@ -66,7 +77,8 @@ struct dst_metrics *ip_fib_metrics_init(struct net *net, struct nlattr *fc_mx,  	if (unlikely(!fib_metrics))  		return ERR_PTR(-ENOMEM); -	err = ip_metrics_convert(net, fc_mx, fc_mx_len, fib_metrics->metrics); +	err = ip_metrics_convert(net, fc_mx, fc_mx_len, fib_metrics->metrics, +				 extack);  	if (!err) {  		refcount_set(&fib_metrics->refcnt, 1);  	} else { diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 184bf2e0a1ed..80f72cc5ca8d 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -156,15 +156,10 @@ config NF_NAT_SNMP_BASIC  	  To compile it as a module, choose M here.  If unsure, say N. -config NF_NAT_PROTO_GRE -	tristate -	depends on NF_CT_PROTO_GRE -  config NF_NAT_PPTP  	tristate  	depends on NF_CONNTRACK  	default NF_CONNTRACK_PPTP -	select NF_NAT_PROTO_GRE  config NF_NAT_H323  	tristate diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 367993adf4d3..fd7122e0e2c9 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -3,7 +3,7 @@  # Makefile for the netfilter modules on top of IPv4.  # -nf_nat_ipv4-y		:= nf_nat_l3proto_ipv4.o nf_nat_proto_icmp.o +nf_nat_ipv4-y		:= nf_nat_l3proto_ipv4.o  nf_nat_ipv4-$(CONFIG_NF_NAT_MASQUERADE_IPV4) += nf_nat_masquerade_ipv4.o  obj-$(CONFIG_NF_NAT_IPV4) += nf_nat_ipv4.o @@ -28,9 +28,6 @@ nf_nat_snmp_basic-y := nf_nat_snmp_basic.asn1.o nf_nat_snmp_basic_main.o  $(obj)/nf_nat_snmp_basic_main.o: $(obj)/nf_nat_snmp_basic.asn1.h  obj-$(CONFIG_NF_NAT_SNMP_BASIC) += nf_nat_snmp_basic.o -# NAT protocols (nf_nat) -obj-$(CONFIG_NF_NAT_PROTO_GRE) += nf_nat_proto_gre.o -  obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV4) += nft_chain_route_ipv4.o  obj-$(CONFIG_NFT_CHAIN_NAT_IPV4) += nft_chain_nat_ipv4.o  obj-$(CONFIG_NFT_REJECT_IPV4) += nft_reject_ipv4.o diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 2c8d313ae216..b61977db9b7f 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -56,18 +56,15 @@ struct clusterip_config {  #endif  	enum clusterip_hashmode hash_mode;	/* which hashing mode */  	u_int32_t hash_initval;			/* hash initialization */ -	struct rcu_head rcu; - +	struct rcu_head rcu;			/* for call_rcu_bh */ +	struct net *net;			/* netns for pernet list */  	char ifname[IFNAMSIZ];			/* device ifname */ -	struct notifier_block notifier;		/* refresh c->ifindex in it */  };  #ifdef CONFIG_PROC_FS  static const struct file_operations clusterip_proc_fops;  #endif -static unsigned int clusterip_net_id __read_mostly; -  struct clusterip_net {  	struct list_head configs;  	/* lock protects the configs list */ @@ -75,51 +72,66 @@ struct clusterip_net {  #ifdef CONFIG_PROC_FS  	struct proc_dir_entry *procdir; +	/* mutex protects the config->pde*/ +	struct mutex mutex;  #endif  }; +static unsigned int clusterip_net_id __read_mostly; +static inline struct clusterip_net *clusterip_pernet(struct net *net) +{ +	return net_generic(net, clusterip_net_id); +} +  static inline void  clusterip_config_get(struct clusterip_config *c)  {  	refcount_inc(&c->refcount);  } -  static void clusterip_config_rcu_free(struct rcu_head *head)  { -	kfree(container_of(head, struct clusterip_config, rcu)); +	struct clusterip_config *config; +	struct net_device *dev; + +	config = container_of(head, struct clusterip_config, rcu); +	dev = dev_get_by_name(config->net, config->ifname); +	if (dev) { +		dev_mc_del(dev, config->clustermac); +		dev_put(dev); +	} +	kfree(config);  }  static inline void  clusterip_config_put(struct clusterip_config *c)  {  	if (refcount_dec_and_test(&c->refcount)) -		call_rcu_bh(&c->rcu, clusterip_config_rcu_free); +		call_rcu(&c->rcu, clusterip_config_rcu_free);  }  /* decrease the count of entries using/referencing this config.  If last   * entry(rule) is removed, remove the config from lists, but don't free it   * yet, since proc-files could still be holding references */  static inline void -clusterip_config_entry_put(struct net *net, struct clusterip_config *c) +clusterip_config_entry_put(struct clusterip_config *c)  { -	struct clusterip_net *cn = net_generic(net, clusterip_net_id); +	struct clusterip_net *cn = clusterip_pernet(c->net);  	local_bh_disable();  	if (refcount_dec_and_lock(&c->entries, &cn->lock)) { +		list_del_rcu(&c->list); +		spin_unlock(&cn->lock); +		local_bh_enable();  		/* In case anyone still accesses the file, the open/close  		 * functions are also incrementing the refcount on their own,  		 * so it's safe to remove the entry even if it's in use. */  #ifdef CONFIG_PROC_FS +		mutex_lock(&cn->mutex);  		if (cn->procdir)  			proc_remove(c->pde); +		mutex_unlock(&cn->mutex);  #endif -		list_del_rcu(&c->list); -		spin_unlock(&cn->lock); -		local_bh_enable(); - -		unregister_netdevice_notifier(&c->notifier); -  		return;  	}  	local_bh_enable(); @@ -129,7 +141,7 @@ static struct clusterip_config *  __clusterip_config_find(struct net *net, __be32 clusterip)  {  	struct clusterip_config *c; -	struct clusterip_net *cn = net_generic(net, clusterip_net_id); +	struct clusterip_net *cn = clusterip_pernet(net);  	list_for_each_entry_rcu(c, &cn->configs, list) {  		if (c->clusterip == clusterip) @@ -181,32 +193,37 @@ clusterip_netdev_event(struct notifier_block *this, unsigned long event,  		       void *ptr)  {  	struct net_device *dev = netdev_notifier_info_to_dev(ptr); +	struct net *net = dev_net(dev); +	struct clusterip_net *cn = clusterip_pernet(net);  	struct clusterip_config *c; -	c = container_of(this, struct clusterip_config, notifier); -	switch (event) { -	case NETDEV_REGISTER: -		if (!strcmp(dev->name, c->ifname)) { -			c->ifindex = dev->ifindex; -			dev_mc_add(dev, c->clustermac); -		} -		break; -	case NETDEV_UNREGISTER: -		if (dev->ifindex == c->ifindex) { -			dev_mc_del(dev, c->clustermac); -			c->ifindex = -1; -		} -		break; -	case NETDEV_CHANGENAME: -		if (!strcmp(dev->name, c->ifname)) { -			c->ifindex = dev->ifindex; -			dev_mc_add(dev, c->clustermac); -		} else if (dev->ifindex == c->ifindex) { -			dev_mc_del(dev, c->clustermac); -			c->ifindex = -1; +	spin_lock_bh(&cn->lock); +	list_for_each_entry_rcu(c, &cn->configs, list) { +		switch (event) { +		case NETDEV_REGISTER: +			if (!strcmp(dev->name, c->ifname)) { +				c->ifindex = dev->ifindex; +				dev_mc_add(dev, c->clustermac); +			} +			break; +		case NETDEV_UNREGISTER: +			if (dev->ifindex == c->ifindex) { +				dev_mc_del(dev, c->clustermac); +				c->ifindex = -1; +			} +			break; +		case NETDEV_CHANGENAME: +			if (!strcmp(dev->name, c->ifname)) { +				c->ifindex = dev->ifindex; +				dev_mc_add(dev, c->clustermac); +			} else if (dev->ifindex == c->ifindex) { +				dev_mc_del(dev, c->clustermac); +				c->ifindex = -1; +			} +			break;  		} -		break;  	} +	spin_unlock_bh(&cn->lock);  	return NOTIFY_DONE;  } @@ -215,30 +232,44 @@ static struct clusterip_config *  clusterip_config_init(struct net *net, const struct ipt_clusterip_tgt_info *i,  		      __be32 ip, const char *iniface)  { -	struct clusterip_net *cn = net_generic(net, clusterip_net_id); +	struct clusterip_net *cn = clusterip_pernet(net);  	struct clusterip_config *c; +	struct net_device *dev;  	int err; +	if (iniface[0] == '\0') { +		pr_info("Please specify an interface name\n"); +		return ERR_PTR(-EINVAL); +	} +  	c = kzalloc(sizeof(*c), GFP_ATOMIC);  	if (!c)  		return ERR_PTR(-ENOMEM); -	strcpy(c->ifname, iniface); -	c->ifindex = -1; -	c->clusterip = ip; +	dev = dev_get_by_name(net, iniface); +	if (!dev) { +		pr_info("no such interface %s\n", iniface); +		kfree(c); +		return ERR_PTR(-ENOENT); +	} +	c->ifindex = dev->ifindex; +	strcpy(c->ifname, dev->name);  	memcpy(&c->clustermac, &i->clustermac, ETH_ALEN); +	dev_mc_add(dev, c->clustermac); +	dev_put(dev); + +	c->clusterip = ip;  	c->num_total_nodes = i->num_total_nodes;  	clusterip_config_init_nodelist(c, i);  	c->hash_mode = i->hash_mode;  	c->hash_initval = i->hash_initval; +	c->net = net;  	refcount_set(&c->refcount, 1);  	spin_lock_bh(&cn->lock);  	if (__clusterip_config_find(net, ip)) { -		spin_unlock_bh(&cn->lock); -		kfree(c); - -		return ERR_PTR(-EBUSY); +		err = -EBUSY; +		goto out_config_put;  	}  	list_add_rcu(&c->list, &cn->configs); @@ -250,9 +281,11 @@ clusterip_config_init(struct net *net, const struct ipt_clusterip_tgt_info *i,  		/* create proc dir entry */  		sprintf(buffer, "%pI4", &ip); +		mutex_lock(&cn->mutex);  		c->pde = proc_create_data(buffer, 0600,  					  cn->procdir,  					  &clusterip_proc_fops, c); +		mutex_unlock(&cn->mutex);  		if (!c->pde) {  			err = -ENOMEM;  			goto err; @@ -260,22 +293,17 @@ clusterip_config_init(struct net *net, const struct ipt_clusterip_tgt_info *i,  	}  #endif -	c->notifier.notifier_call = clusterip_netdev_event; -	err = register_netdevice_notifier(&c->notifier); -	if (!err) { -		refcount_set(&c->entries, 1); -		return c; -	} +	refcount_set(&c->entries, 1); +	return c;  #ifdef CONFIG_PROC_FS -	proc_remove(c->pde);  err:  #endif  	spin_lock_bh(&cn->lock);  	list_del_rcu(&c->list); +out_config_put:  	spin_unlock_bh(&cn->lock);  	clusterip_config_put(c); -  	return ERR_PTR(err);  } @@ -475,34 +503,20 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par)  				&e->ip.dst.s_addr);  			return -EINVAL;  		} else { -			struct net_device *dev; - -			if (e->ip.iniface[0] == '\0') { -				pr_info("Please specify an interface name\n"); -				return -EINVAL; -			} - -			dev = dev_get_by_name(par->net, e->ip.iniface); -			if (!dev) { -				pr_info("no such interface %s\n", -					e->ip.iniface); -				return -ENOENT; -			} -			dev_put(dev); -  			config = clusterip_config_init(par->net, cipinfo,  						       e->ip.dst.s_addr,  						       e->ip.iniface);  			if (IS_ERR(config))  				return PTR_ERR(config);  		} -	} +	} else if (memcmp(&config->clustermac, &cipinfo->clustermac, ETH_ALEN)) +		return -EINVAL;  	ret = nf_ct_netns_get(par->net, par->family);  	if (ret < 0) {  		pr_info("cannot load conntrack support for proto=%u\n",  			par->family); -		clusterip_config_entry_put(par->net, config); +		clusterip_config_entry_put(config);  		clusterip_config_put(config);  		return ret;  	} @@ -524,7 +538,7 @@ static void clusterip_tg_destroy(const struct xt_tgdtor_param *par)  	/* if no more entries are referencing the config, remove it  	 * from the list and destroy the proc entry */ -	clusterip_config_entry_put(par->net, cipinfo->config); +	clusterip_config_entry_put(cipinfo->config);  	clusterip_config_put(cipinfo->config); @@ -806,7 +820,7 @@ static const struct file_operations clusterip_proc_fops = {  static int clusterip_net_init(struct net *net)  { -	struct clusterip_net *cn = net_generic(net, clusterip_net_id); +	struct clusterip_net *cn = clusterip_pernet(net);  	int ret;  	INIT_LIST_HEAD(&cn->configs); @@ -824,6 +838,7 @@ static int clusterip_net_init(struct net *net)  		pr_err("Unable to proc dir entry\n");  		return -ENOMEM;  	} +	mutex_init(&cn->mutex);  #endif /* CONFIG_PROC_FS */  	return 0; @@ -831,13 +846,15 @@ static int clusterip_net_init(struct net *net)  static void clusterip_net_exit(struct net *net)  { -	struct clusterip_net *cn = net_generic(net, clusterip_net_id); +	struct clusterip_net *cn = clusterip_pernet(net); +  #ifdef CONFIG_PROC_FS +	mutex_lock(&cn->mutex);  	proc_remove(cn->procdir);  	cn->procdir = NULL; +	mutex_unlock(&cn->mutex);  #endif  	nf_unregister_net_hook(net, &cip_arp_ops); -	WARN_ON_ONCE(!list_empty(&cn->configs));  }  static struct pernet_operations clusterip_net_ops = { @@ -847,6 +864,10 @@ static struct pernet_operations clusterip_net_ops = {  	.size = sizeof(struct clusterip_net),  }; +struct notifier_block cip_netdev_notifier = { +	.notifier_call = clusterip_netdev_event +}; +  static int __init clusterip_tg_init(void)  {  	int ret; @@ -859,11 +880,17 @@ static int __init clusterip_tg_init(void)  	if (ret < 0)  		goto cleanup_subsys; +	ret = register_netdevice_notifier(&cip_netdev_notifier); +	if (ret < 0) +		goto unregister_target; +  	pr_info("ClusterIP Version %s loaded successfully\n",  		CLUSTERIP_VERSION);  	return 0; +unregister_target: +	xt_unregister_target(&clusterip_tg_reg);  cleanup_subsys:  	unregister_pernet_subsys(&clusterip_net_ops);  	return ret; @@ -873,11 +900,12 @@ static void __exit clusterip_tg_exit(void)  {  	pr_info("ClusterIP Version %s unloading\n", CLUSTERIP_VERSION); +	unregister_netdevice_notifier(&cip_netdev_notifier);  	xt_unregister_target(&clusterip_tg_reg);  	unregister_pernet_subsys(&clusterip_net_ops); -	/* Wait for completion of call_rcu_bh()'s (clusterip_config_rcu_free) */ -	rcu_barrier_bh(); +	/* Wait for completion of call_rcu()'s (clusterip_config_rcu_free) */ +	rcu_barrier();  }  module_init(clusterip_tg_init); diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c index 78a67f961d86..2687db015b6f 100644 --- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c @@ -62,22 +62,8 @@ static void nf_nat_ipv4_decode_session(struct sk_buff *skb,  }  #endif /* CONFIG_XFRM */ -static bool nf_nat_ipv4_in_range(const struct nf_conntrack_tuple *t, -				 const struct nf_nat_range2 *range) -{ -	return ntohl(t->src.u3.ip) >= ntohl(range->min_addr.ip) && -	       ntohl(t->src.u3.ip) <= ntohl(range->max_addr.ip); -} - -static u32 nf_nat_ipv4_secure_port(const struct nf_conntrack_tuple *t, -				   __be16 dport) -{ -	return secure_ipv4_port_ephemeral(t->src.u3.ip, t->dst.u3.ip, dport); -} -  static bool nf_nat_ipv4_manip_pkt(struct sk_buff *skb,  				  unsigned int iphdroff, -				  const struct nf_nat_l4proto *l4proto,  				  const struct nf_conntrack_tuple *target,  				  enum nf_nat_manip_type maniptype)  { @@ -90,8 +76,8 @@ static bool nf_nat_ipv4_manip_pkt(struct sk_buff *skb,  	iph = (void *)skb->data + iphdroff;  	hdroff = iphdroff + iph->ihl * 4; -	if (!l4proto->manip_pkt(skb, &nf_nat_l3proto_ipv4, iphdroff, hdroff, -				target, maniptype)) +	if (!nf_nat_l4proto_manip_pkt(skb, &nf_nat_l3proto_ipv4, iphdroff, +				      hdroff, target, maniptype))  		return false;  	iph = (void *)skb->data + iphdroff; @@ -161,8 +147,6 @@ static int nf_nat_ipv4_nlattr_to_range(struct nlattr *tb[],  static const struct nf_nat_l3proto nf_nat_l3proto_ipv4 = {  	.l3proto		= NFPROTO_IPV4, -	.in_range		= nf_nat_ipv4_in_range, -	.secure_port		= nf_nat_ipv4_secure_port,  	.manip_pkt		= nf_nat_ipv4_manip_pkt,  	.csum_update		= nf_nat_ipv4_csum_update,  	.csum_recalc		= nf_nat_ipv4_csum_recalc, @@ -186,7 +170,6 @@ int nf_nat_icmp_reply_translation(struct sk_buff *skb,  	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);  	enum nf_nat_manip_type manip = HOOK2MANIP(hooknum);  	unsigned int hdrlen = ip_hdrlen(skb); -	const struct nf_nat_l4proto *l4proto;  	struct nf_conntrack_tuple target;  	unsigned long statusbit; @@ -217,9 +200,8 @@ int nf_nat_icmp_reply_translation(struct sk_buff *skb,  	if (!(ct->status & statusbit))  		return 1; -	l4proto = __nf_nat_l4proto_find(NFPROTO_IPV4, inside->ip.protocol);  	if (!nf_nat_ipv4_manip_pkt(skb, hdrlen + sizeof(inside->icmp), -				   l4proto, &ct->tuplehash[!dir].tuple, !manip)) +				   &ct->tuplehash[!dir].tuple, !manip))  		return 0;  	if (skb->ip_summed != CHECKSUM_PARTIAL) { @@ -233,8 +215,7 @@ int nf_nat_icmp_reply_translation(struct sk_buff *skb,  	/* Change outer to look like the reply to an incoming packet */  	nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple); -	l4proto = __nf_nat_l4proto_find(NFPROTO_IPV4, 0); -	if (!nf_nat_ipv4_manip_pkt(skb, 0, l4proto, &target, manip)) +	if (!nf_nat_ipv4_manip_pkt(skb, 0, &target, manip))  		return 0;  	return 1; @@ -391,26 +372,12 @@ EXPORT_SYMBOL_GPL(nf_nat_l3proto_ipv4_unregister_fn);  static int __init nf_nat_l3proto_ipv4_init(void)  { -	int err; - -	err = nf_nat_l4proto_register(NFPROTO_IPV4, &nf_nat_l4proto_icmp); -	if (err < 0) -		goto err1; -	err = nf_nat_l3proto_register(&nf_nat_l3proto_ipv4); -	if (err < 0) -		goto err2; -	return err; - -err2: -	nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_icmp); -err1: -	return err; +	return nf_nat_l3proto_register(&nf_nat_l3proto_ipv4);  }  static void __exit nf_nat_l3proto_ipv4_exit(void)  {  	nf_nat_l3proto_unregister(&nf_nat_l3proto_ipv4); -	nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_icmp);  }  MODULE_LICENSE("GPL"); diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c index 5d259a12e25f..68b4d450391b 100644 --- a/net/ipv4/netfilter/nf_nat_pptp.c +++ b/net/ipv4/netfilter/nf_nat_pptp.c @@ -299,8 +299,6 @@ pptp_inbound_pkt(struct sk_buff *skb,  static int __init nf_nat_helper_pptp_init(void)  { -	nf_nat_need_gre(); -  	BUG_ON(nf_nat_pptp_hook_outbound != NULL);  	RCU_INIT_POINTER(nf_nat_pptp_hook_outbound, pptp_outbound_pkt); diff --git a/net/ipv4/netfilter/nf_nat_proto_gre.c b/net/ipv4/netfilter/nf_nat_proto_gre.c deleted file mode 100644 index 00fda6331ce5..000000000000 --- a/net/ipv4/netfilter/nf_nat_proto_gre.c +++ /dev/null @@ -1,150 +0,0 @@ -/* - * nf_nat_proto_gre.c - * - * NAT protocol helper module for GRE. - * - * GRE is a generic encapsulation protocol, which is generally not very - * suited for NAT, as it has no protocol-specific part as port numbers. - * - * It has an optional key field, which may help us distinguishing two - * connections between the same two hosts. - * - * GRE is defined in RFC 1701 and RFC 1702, as well as RFC 2784 - * - * PPTP is built on top of a modified version of GRE, and has a mandatory - * field called "CallID", which serves us for the same purpose as the key - * field in plain GRE. - * - * Documentation about PPTP can be found in RFC 2637 - * - * (C) 2000-2005 by Harald Welte <laforge@gnumonks.org> - * - * Development of this code funded by Astaro AG (http://www.astaro.com/) - * - * (C) 2006-2012 Patrick McHardy <kaber@trash.net> - * - */ - -#include <linux/module.h> -#include <linux/skbuff.h> -#include <linux/ip.h> - -#include <net/netfilter/nf_nat.h> -#include <net/netfilter/nf_nat_l4proto.h> -#include <linux/netfilter/nf_conntrack_proto_gre.h> - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>"); -MODULE_DESCRIPTION("Netfilter NAT protocol helper module for GRE"); - -/* generate unique tuple ... */ -static void -gre_unique_tuple(const struct nf_nat_l3proto *l3proto, -		 struct nf_conntrack_tuple *tuple, -		 const struct nf_nat_range2 *range, -		 enum nf_nat_manip_type maniptype, -		 const struct nf_conn *ct) -{ -	static u_int16_t key; -	__be16 *keyptr; -	unsigned int min, i, range_size; - -	/* If there is no master conntrack we are not PPTP, -	   do not change tuples */ -	if (!ct->master) -		return; - -	if (maniptype == NF_NAT_MANIP_SRC) -		keyptr = &tuple->src.u.gre.key; -	else -		keyptr = &tuple->dst.u.gre.key; - -	if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) { -		pr_debug("%p: NATing GRE PPTP\n", ct); -		min = 1; -		range_size = 0xffff; -	} else { -		min = ntohs(range->min_proto.gre.key); -		range_size = ntohs(range->max_proto.gre.key) - min + 1; -	} - -	pr_debug("min = %u, range_size = %u\n", min, range_size); - -	for (i = 0; ; ++key) { -		*keyptr = htons(min + key % range_size); -		if (++i == range_size || !nf_nat_used_tuple(tuple, ct)) -			return; -	} - -	pr_debug("%p: no NAT mapping\n", ct); -	return; -} - -/* manipulate a GRE packet according to maniptype */ -static bool -gre_manip_pkt(struct sk_buff *skb, -	      const struct nf_nat_l3proto *l3proto, -	      unsigned int iphdroff, unsigned int hdroff, -	      const struct nf_conntrack_tuple *tuple, -	      enum nf_nat_manip_type maniptype) -{ -	const struct gre_base_hdr *greh; -	struct pptp_gre_header *pgreh; - -	/* pgreh includes two optional 32bit fields which are not required -	 * to be there.  That's where the magic '8' comes from */ -	if (!skb_make_writable(skb, hdroff + sizeof(*pgreh) - 8)) -		return false; - -	greh = (void *)skb->data + hdroff; -	pgreh = (struct pptp_gre_header *)greh; - -	/* we only have destination manip of a packet, since 'source key' -	 * is not present in the packet itself */ -	if (maniptype != NF_NAT_MANIP_DST) -		return true; - -	switch (greh->flags & GRE_VERSION) { -	case GRE_VERSION_0: -		/* We do not currently NAT any GREv0 packets. -		 * Try to behave like "nf_nat_proto_unknown" */ -		break; -	case GRE_VERSION_1: -		pr_debug("call_id -> 0x%04x\n", ntohs(tuple->dst.u.gre.key)); -		pgreh->call_id = tuple->dst.u.gre.key; -		break; -	default: -		pr_debug("can't nat unknown GRE version\n"); -		return false; -	} -	return true; -} - -static const struct nf_nat_l4proto gre = { -	.l4proto		= IPPROTO_GRE, -	.manip_pkt		= gre_manip_pkt, -	.in_range		= nf_nat_l4proto_in_range, -	.unique_tuple		= gre_unique_tuple, -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) -	.nlattr_to_range	= nf_nat_l4proto_nlattr_to_range, -#endif -}; - -static int __init nf_nat_proto_gre_init(void) -{ -	return nf_nat_l4proto_register(NFPROTO_IPV4, &gre); -} - -static void __exit nf_nat_proto_gre_fini(void) -{ -	nf_nat_l4proto_unregister(NFPROTO_IPV4, &gre); -} - -module_init(nf_nat_proto_gre_init); -module_exit(nf_nat_proto_gre_fini); - -void nf_nat_need_gre(void) -{ -	return; -} -EXPORT_SYMBOL_GPL(nf_nat_need_gre); diff --git a/net/ipv4/netfilter/nf_nat_proto_icmp.c b/net/ipv4/netfilter/nf_nat_proto_icmp.c deleted file mode 100644 index 6d7cf1d79baf..000000000000 --- a/net/ipv4/netfilter/nf_nat_proto_icmp.c +++ /dev/null @@ -1,83 +0,0 @@ -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/types.h> -#include <linux/init.h> -#include <linux/export.h> -#include <linux/ip.h> -#include <linux/icmp.h> - -#include <linux/netfilter.h> -#include <net/netfilter/nf_nat.h> -#include <net/netfilter/nf_nat_core.h> -#include <net/netfilter/nf_nat_l4proto.h> - -static bool -icmp_in_range(const struct nf_conntrack_tuple *tuple, -	      enum nf_nat_manip_type maniptype, -	      const union nf_conntrack_man_proto *min, -	      const union nf_conntrack_man_proto *max) -{ -	return ntohs(tuple->src.u.icmp.id) >= ntohs(min->icmp.id) && -	       ntohs(tuple->src.u.icmp.id) <= ntohs(max->icmp.id); -} - -static void -icmp_unique_tuple(const struct nf_nat_l3proto *l3proto, -		  struct nf_conntrack_tuple *tuple, -		  const struct nf_nat_range2 *range, -		  enum nf_nat_manip_type maniptype, -		  const struct nf_conn *ct) -{ -	static u_int16_t id; -	unsigned int range_size; -	unsigned int i; - -	range_size = ntohs(range->max_proto.icmp.id) - -		     ntohs(range->min_proto.icmp.id) + 1; -	/* If no range specified... */ -	if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) -		range_size = 0xFFFF; - -	for (i = 0; ; ++id) { -		tuple->src.u.icmp.id = htons(ntohs(range->min_proto.icmp.id) + -					     (id % range_size)); -		if (++i == range_size || !nf_nat_used_tuple(tuple, ct)) -			return; -	} -	return; -} - -static bool -icmp_manip_pkt(struct sk_buff *skb, -	       const struct nf_nat_l3proto *l3proto, -	       unsigned int iphdroff, unsigned int hdroff, -	       const struct nf_conntrack_tuple *tuple, -	       enum nf_nat_manip_type maniptype) -{ -	struct icmphdr *hdr; - -	if (!skb_make_writable(skb, hdroff + sizeof(*hdr))) -		return false; - -	hdr = (struct icmphdr *)(skb->data + hdroff); -	inet_proto_csum_replace2(&hdr->checksum, skb, -				 hdr->un.echo.id, tuple->src.u.icmp.id, false); -	hdr->un.echo.id = tuple->src.u.icmp.id; -	return true; -} - -const struct nf_nat_l4proto nf_nat_l4proto_icmp = { -	.l4proto		= IPPROTO_ICMP, -	.manip_pkt		= icmp_manip_pkt, -	.in_range		= icmp_in_range, -	.unique_tuple		= icmp_unique_tuple, -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) -	.nlattr_to_range	= nf_nat_l4proto_nlattr_to_range, -#endif -}; diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c index 5cd06ba3535d..aa8304c618b8 100644 --- a/net/ipv4/netfilter/nf_reject_ipv4.c +++ b/net/ipv4/netfilter/nf_reject_ipv4.c @@ -102,6 +102,7 @@ EXPORT_SYMBOL_GPL(nf_reject_ip_tcphdr_put);  /* Send RST reply */  void nf_send_reset(struct net *net, struct sk_buff *oldskb, int hook)  { +	struct net_device *br_indev __maybe_unused;  	struct sk_buff *nskb;  	struct iphdr *niph;  	const struct tcphdr *oth; @@ -147,10 +148,11 @@ void nf_send_reset(struct net *net, struct sk_buff *oldskb, int hook)  	 * build the eth header using the original destination's MAC as the  	 * source, and send the RST packet directly.  	 */ -	if (oldskb->nf_bridge) { +	br_indev = nf_bridge_get_physindev(oldskb); +	if (br_indev) {  		struct ethhdr *oeth = eth_hdr(oldskb); -		nskb->dev = nf_bridge_get_physindev(oldskb); +		nskb->dev = br_indev;  		niph->tot_len = htons(nskb->len);  		ip_send_check(niph);  		if (dev_hard_header(nskb, nskb->dev, ntohs(nskb->protocol), diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 70289682a670..c3610b37bb4c 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -219,6 +219,7 @@ static const struct snmp_mib snmp4_net_list[] = {  	SNMP_MIB_ITEM("TCPRenoRecoveryFail", LINUX_MIB_TCPRENORECOVERYFAIL),  	SNMP_MIB_ITEM("TCPSackRecoveryFail", LINUX_MIB_TCPSACKRECOVERYFAIL),  	SNMP_MIB_ITEM("TCPRcvCollapsed", LINUX_MIB_TCPRCVCOLLAPSED), +	SNMP_MIB_ITEM("TCPBacklogCoalesce", LINUX_MIB_TCPBACKLOGCOALESCE),  	SNMP_MIB_ITEM("TCPDSACKOldSent", LINUX_MIB_TCPDSACKOLDSENT),  	SNMP_MIB_ITEM("TCPDSACKOfoSent", LINUX_MIB_TCPDSACKOFOSENT),  	SNMP_MIB_ITEM("TCPDSACKRecv", LINUX_MIB_TCPDSACKRECV), diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c index 32a691b7ce2c..92d249e053be 100644 --- a/net/ipv4/protocol.c +++ b/net/ipv4/protocol.c @@ -29,6 +29,7 @@  #include <net/protocol.h>  struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly; +EXPORT_SYMBOL(inet_protos);  const struct net_offload __rcu *inet_offloads[MAX_INET_PROTOS] __read_mostly;  EXPORT_SYMBOL(inet_offloads); diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 169a652b3dd1..c55a5432cf37 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -131,8 +131,7 @@ struct sock *__raw_v4_lookup(struct net *net, struct sock *sk,  		if (net_eq(sock_net(sk), net) && inet->inet_num == num	&&  		    !(inet->inet_daddr && inet->inet_daddr != raddr) 	&&  		    !(inet->inet_rcv_saddr && inet->inet_rcv_saddr != laddr) && -		    !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif && -		      sk->sk_bound_dev_if != sdif)) +		    raw_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif))  			goto found; /* gotcha */  	}  	sk = NULL; @@ -805,7 +804,7 @@ out:  	return copied;  } -static int raw_init(struct sock *sk) +static int raw_sk_init(struct sock *sk)  {  	struct raw_sock *rp = raw_sk(sk); @@ -970,7 +969,7 @@ struct proto raw_prot = {  	.connect	   = ip4_datagram_connect,  	.disconnect	   = __udp_disconnect,  	.ioctl		   = raw_ioctl, -	.init		   = raw_init, +	.init		   = raw_sk_init,  	.setsockopt	   = raw_setsockopt,  	.getsockopt	   = raw_getsockopt,  	.sendmsg	   = raw_sendmsg, @@ -1134,3 +1133,27 @@ void __init raw_proc_exit(void)  	unregister_pernet_subsys(&raw_net_ops);  }  #endif /* CONFIG_PROC_FS */ + +static void raw_sysctl_init_net(struct net *net) +{ +#ifdef CONFIG_NET_L3_MASTER_DEV +	net->ipv4.sysctl_raw_l3mdev_accept = 1; +#endif +} + +static int __net_init raw_sysctl_init(struct net *net) +{ +	raw_sysctl_init_net(net); +	return 0; +} + +static struct pernet_operations __net_initdata raw_sysctl_ops = { +	.init	= raw_sysctl_init, +}; + +void __init raw_init(void) +{ +	raw_sysctl_init_net(&init_net); +	if (register_pernet_subsys(&raw_sysctl_ops)) +		panic("RAW: failed to init sysctl parameters.\n"); +} diff --git a/net/ipv4/route.c b/net/ipv4/route.c index c0a9d26c06ce..ce92f73cf104 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1677,7 +1677,7 @@ static void ip_handle_martian_source(struct net_device *dev,  			print_hex_dump(KERN_WARNING, "ll header: ",  				       DUMP_PREFIX_OFFSET, 16, 1,  				       skb_mac_header(skb), -				       dev->hard_header_len, true); +				       dev->hard_header_len, false);  		}  	}  #endif @@ -2849,6 +2849,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,  			err = -rt->dst.error;  	} else {  		fl4.flowi4_iif = LOOPBACK_IFINDEX; +		skb->dev = net->loopback_dev;  		rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);  		err = 0;  		if (IS_ERR(rt)) diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 891ed2f91467..ba0fc4b18465 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -602,6 +602,17 @@ static struct ctl_table ipv4_net_table[] = {  		.mode		= 0644,  		.proc_handler	= ipv4_ping_group_range,  	}, +#ifdef CONFIG_NET_L3_MASTER_DEV +	{ +		.procname	= "raw_l3mdev_accept", +		.data		= &init_net.ipv4.sysctl_raw_l3mdev_accept, +		.maxlen		= sizeof(int), +		.mode		= 0644, +		.proc_handler	= proc_dointvec_minmax, +		.extra1		= &zero, +		.extra2		= &one, +	}, +#endif  	{  		.procname	= "tcp_ecn",  		.data		= &init_net.ipv4.sysctl_tcp_ecn, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 9e6bc4d6daa7..27e2f6837062 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1423,7 +1423,7 @@ do_error:  	if (copied + copied_syn)  		goto out;  out_err: -	sock_zerocopy_put_abort(uarg); +	sock_zerocopy_put_abort(uarg, true);  	err = sk_stream_error(sk, flags, err);  	/* make sure we wake any epoll edge trigger waiter */  	if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && @@ -2088,7 +2088,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,  		}  		continue; -	found_ok_skb: +found_ok_skb:  		/* Ok so how much can we use? */  		used = skb->len - offset;  		if (len < used) @@ -2147,7 +2147,7 @@ skip_copy:  			sk_eat_skb(sk, skb);  		continue; -	found_fin_ok: +found_fin_ok:  		/* Process the FIN. */  		++*seq;  		if (!(flags & MSG_PEEK)) @@ -2241,10 +2241,6 @@ void tcp_set_state(struct sock *sk, int state)  	 * socket sitting in hash tables.  	 */  	inet_sk_state_store(sk, state); - -#ifdef STATE_TRACE -	SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n", sk, statename[oldstate], statename[state]); -#endif  }  EXPORT_SYMBOL_GPL(tcp_set_state); @@ -3246,6 +3242,7 @@ static size_t tcp_opt_stats_get_size(void)  		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_RETRANS */  		nla_total_size(sizeof(u32)) + /* TCP_NLA_DSACK_DUPS */  		nla_total_size(sizeof(u32)) + /* TCP_NLA_REORD_SEEN */ +		nla_total_size(sizeof(u32)) + /* TCP_NLA_SRTT */  		0;  } @@ -3299,6 +3296,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)  			  TCP_NLA_PAD);  	nla_put_u32(stats, TCP_NLA_DSACK_DUPS, tp->dsack_dups);  	nla_put_u32(stats, TCP_NLA_REORD_SEEN, tp->reord_seen); +	nla_put_u32(stats, TCP_NLA_SRTT, tp->srtt_us >> 3);  	return stats;  } @@ -3658,8 +3656,11 @@ bool tcp_alloc_md5sig_pool(void)  	if (unlikely(!tcp_md5sig_pool_populated)) {  		mutex_lock(&tcp_md5sig_mutex); -		if (!tcp_md5sig_pool_populated) +		if (!tcp_md5sig_pool_populated) {  			__tcp_alloc_md5sig_pool(); +			if (tcp_md5sig_pool_populated) +				static_key_slow_inc(&tcp_md5_needed); +		}  		mutex_unlock(&tcp_md5sig_mutex);  	} diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 9277abdd822a..0f497fc49c3f 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -128,7 +128,12 @@ static const u32 bbr_probe_rtt_mode_ms = 200;  /* Skip TSO below the following bandwidth (bits/sec): */  static const int bbr_min_tso_rate = 1200000; -/* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck. */ +/* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck. + * In order to help drive the network toward lower queues and low latency while + * maintaining high utilization, the average pacing rate aims to be slightly + * lower than the estimated bandwidth. This is an important aspect of the + * design. + */  static const int bbr_pacing_margin_percent = 1;  /* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain @@ -247,13 +252,7 @@ static void bbr_init_pacing_rate_from_rtt(struct sock *sk)  	sk->sk_pacing_rate = bbr_bw_to_pacing_rate(sk, bw, bbr_high_gain);  } -/* Pace using current bw estimate and a gain factor. In order to help drive the - * network toward lower queues while maintaining high utilization and low - * latency, the average pacing rate aims to be slightly (~1%) lower than the - * estimated bandwidth. This is an important aspect of the design. In this - * implementation this slightly lower pacing rate is achieved implicitly by not - * including link-layer headers in the packet size used for the pacing rate. - */ +/* Pace using current bw estimate and a gain factor. */  static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain)  {  	struct tcp_sock *tp = tcp_sk(sk); diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 3b45fe530f91..1bb7321a256d 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -8,6 +8,7 @@  #include <linux/wait.h>  #include <net/inet_common.h> +#include <net/tls.h>  static bool tcp_bpf_stream_read(const struct sock *sk)  { @@ -198,7 +199,7 @@ static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock,  		msg->sg.start = i;  		msg->sg.size -= apply_bytes;  		sk_psock_queue_msg(psock, tmp); -		sk->sk_data_ready(sk); +		sk_psock_data_ready(sk, psock);  	} else {  		sk_msg_free(sk, tmp);  		kfree(tmp); @@ -218,6 +219,8 @@ static int tcp_bpf_push(struct sock *sk, struct sk_msg *msg, u32 apply_bytes,  	u32 off;  	while (1) { +		bool has_tx_ulp; +  		sge = sk_msg_elem(msg, msg->sg.start);  		size = (apply && apply_bytes < sge->length) ?  			apply_bytes : sge->length; @@ -226,7 +229,15 @@ static int tcp_bpf_push(struct sock *sk, struct sk_msg *msg, u32 apply_bytes,  		tcp_rate_check_app_limited(sk);  retry: -		ret = do_tcp_sendpages(sk, page, off, size, flags); +		has_tx_ulp = tls_sw_has_ctx_tx(sk); +		if (has_tx_ulp) { +			flags |= MSG_SENDPAGE_NOPOLICY; +			ret = kernel_sendpage_locked(sk, +						     page, off, size, flags); +		} else { +			ret = do_tcp_sendpages(sk, page, off, size, flags); +		} +  		if (ret <= 0)  			return ret;  		if (apply) @@ -289,12 +300,23 @@ static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock,  {  	bool cork = false, enospc = msg->sg.start == msg->sg.end;  	struct sock *sk_redir; -	u32 tosend; +	u32 tosend, delta = 0;  	int ret;  more_data: -	if (psock->eval == __SK_NONE) +	if (psock->eval == __SK_NONE) { +		/* Track delta in msg size to add/subtract it on SK_DROP from +		 * returned to user copied size. This ensures user doesn't +		 * get a positive return code with msg_cut_data and SK_DROP +		 * verdict. +		 */ +		delta = msg->sg.size;  		psock->eval = sk_psock_msg_verdict(sk, psock, msg); +		if (msg->sg.size < delta) +			delta -= msg->sg.size; +		else +			delta = 0; +	}  	if (msg->cork_bytes &&  	    msg->cork_bytes > msg->sg.size && !enospc) { @@ -350,7 +372,7 @@ more_data:  	default:  		sk_msg_free_partial(sk, msg, tosend);  		sk_msg_apply_bytes(psock, tosend); -		*copied -= tosend; +		*copied -= (tosend + delta);  		return -EACCES;  	} diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index a9d9555a973f..76858b14ebe9 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1865,16 +1865,20 @@ static void tcp_check_reno_reordering(struct sock *sk, const int addend)  /* Emulate SACKs for SACKless connection: account for a new dupack. */ -static void tcp_add_reno_sack(struct sock *sk) +static void tcp_add_reno_sack(struct sock *sk, int num_dupack)  { -	struct tcp_sock *tp = tcp_sk(sk); -	u32 prior_sacked = tp->sacked_out; +	if (num_dupack) { +		struct tcp_sock *tp = tcp_sk(sk); +		u32 prior_sacked = tp->sacked_out; +		s32 delivered; -	tp->sacked_out++; -	tcp_check_reno_reordering(sk, 0); -	if (tp->sacked_out > prior_sacked) -		tp->delivered++; /* Some out-of-order packet is delivered */ -	tcp_verify_left_out(tp); +		tp->sacked_out += num_dupack; +		tcp_check_reno_reordering(sk, 0); +		delivered = tp->sacked_out - prior_sacked; +		if (delivered > 0) +			tp->delivered += delivered; +		tcp_verify_left_out(tp); +	}  }  /* Account for ACK, ACKing some data in Reno Recovery phase. */ @@ -2459,8 +2463,8 @@ void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int flag)  		u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered +  			       tp->prior_cwnd - 1;  		sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out; -	} else if ((flag & FLAG_RETRANS_DATA_ACKED) && -		   !(flag & FLAG_LOST_RETRANS)) { +	} else if ((flag & (FLAG_RETRANS_DATA_ACKED | FLAG_LOST_RETRANS)) == +		   FLAG_RETRANS_DATA_ACKED) {  		sndcnt = min_t(int, delta,  			       max_t(int, tp->prr_delivered - tp->prr_out,  				     newly_acked_sacked) + 1); @@ -2636,7 +2640,7 @@ void tcp_enter_recovery(struct sock *sk, bool ece_ack)  /* Process an ACK in CA_Loss state. Move to CA_Open if lost data are   * recovered or spurious. Otherwise retransmits more on partial ACKs.   */ -static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack, +static void tcp_process_loss(struct sock *sk, int flag, int num_dupack,  			     int *rexmit)  {  	struct tcp_sock *tp = tcp_sk(sk); @@ -2655,7 +2659,7 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack,  			return;  		if (after(tp->snd_nxt, tp->high_seq)) { -			if (flag & FLAG_DATA_SACKED || is_dupack) +			if (flag & FLAG_DATA_SACKED || num_dupack)  				tp->frto = 0; /* Step 3.a. loss was real */  		} else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) {  			tp->high_seq = tp->snd_nxt; @@ -2681,8 +2685,8 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack,  		/* A Reno DUPACK means new data in F-RTO step 2.b above are  		 * delivered. Lower inflight to clock out (re)tranmissions.  		 */ -		if (after(tp->snd_nxt, tp->high_seq) && is_dupack) -			tcp_add_reno_sack(sk); +		if (after(tp->snd_nxt, tp->high_seq) && num_dupack) +			tcp_add_reno_sack(sk, num_dupack);  		else if (flag & FLAG_SND_UNA_ADVANCED)  			tcp_reset_reno_sack(tp);  	} @@ -2759,13 +2763,13 @@ static bool tcp_force_fast_retransmit(struct sock *sk)   * tcp_xmit_retransmit_queue().   */  static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, -				  bool is_dupack, int *ack_flag, int *rexmit) +				  int num_dupack, int *ack_flag, int *rexmit)  {  	struct inet_connection_sock *icsk = inet_csk(sk);  	struct tcp_sock *tp = tcp_sk(sk);  	int fast_rexmit = 0, flag = *ack_flag; -	bool do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) && -				     tcp_force_fast_retransmit(sk)); +	bool do_lost = num_dupack || ((flag & FLAG_DATA_SACKED) && +				      tcp_force_fast_retransmit(sk));  	if (!tp->packets_out && tp->sacked_out)  		tp->sacked_out = 0; @@ -2812,8 +2816,8 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,  	switch (icsk->icsk_ca_state) {  	case TCP_CA_Recovery:  		if (!(flag & FLAG_SND_UNA_ADVANCED)) { -			if (tcp_is_reno(tp) && is_dupack) -				tcp_add_reno_sack(sk); +			if (tcp_is_reno(tp)) +				tcp_add_reno_sack(sk, num_dupack);  		} else {  			if (tcp_try_undo_partial(sk, prior_snd_una))  				return; @@ -2828,7 +2832,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,  		tcp_identify_packet_loss(sk, ack_flag);  		break;  	case TCP_CA_Loss: -		tcp_process_loss(sk, flag, is_dupack, rexmit); +		tcp_process_loss(sk, flag, num_dupack, rexmit);  		tcp_identify_packet_loss(sk, ack_flag);  		if (!(icsk->icsk_ca_state == TCP_CA_Open ||  		      (*ack_flag & FLAG_LOST_RETRANS))) @@ -2839,8 +2843,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,  		if (tcp_is_reno(tp)) {  			if (flag & FLAG_SND_UNA_ADVANCED)  				tcp_reset_reno_sack(tp); -			if (is_dupack) -				tcp_add_reno_sack(sk); +			tcp_add_reno_sack(sk, num_dupack);  		}  		if (icsk->icsk_ca_state <= TCP_CA_Disorder) @@ -3562,7 +3565,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)  	bool is_sack_reneg = tp->is_sack_reneg;  	u32 ack_seq = TCP_SKB_CB(skb)->seq;  	u32 ack = TCP_SKB_CB(skb)->ack_seq; -	bool is_dupack = false; +	int num_dupack = 0;  	int prior_packets = tp->packets_out;  	u32 delivered = tp->delivered;  	u32 lost = tp->lost; @@ -3614,7 +3617,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)  	if (flag & FLAG_UPDATE_TS_RECENT)  		tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq); -	if (!(flag & FLAG_SLOWPATH) && after(ack, prior_snd_una)) { +	if ((flag & (FLAG_SLOWPATH | FLAG_SND_UNA_ADVANCED)) == +	    FLAG_SND_UNA_ADVANCED) {  		/* Window is constant, pure forward advance.  		 * No more checks are required.  		 * Note, we use the fact that SND.UNA>=SND.WL2. @@ -3672,8 +3676,13 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)  		tcp_set_xmit_timer(sk);  	if (tcp_ack_is_dubious(sk, flag)) { -		is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); -		tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag, +		if (!(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP))) { +			num_dupack = 1; +			/* Consider if pure acks were aggregated in tcp_add_backlog() */ +			if (!(flag & FLAG_DATA)) +				num_dupack = max_t(u16, 1, skb_shinfo(skb)->gso_segs); +		} +		tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag,  				      &rexmit);  	} @@ -3691,7 +3700,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)  no_queue:  	/* If data was DSACKed, see if we can undo a cwnd reduction. */  	if (flag & FLAG_DSACKING_ACK) { -		tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag, +		tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag,  				      &rexmit);  		tcp_newly_delivered(sk, delivered, flag);  	} @@ -3716,7 +3725,7 @@ old_ack:  	if (TCP_SKB_CB(skb)->sacked) {  		flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,  						&sack_state); -		tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag, +		tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag,  				      &rexmit);  		tcp_newly_delivered(sk, delivered, flag);  		tcp_xmit_recovery(sk, rexmit); @@ -4606,13 +4615,12 @@ end:  	}  } -static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen, -		  bool *fragstolen) +static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, +				      bool *fragstolen)  {  	int eaten;  	struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue); -	__skb_pull(skb, hdrlen);  	eaten = (tail &&  		 tcp_try_coalesce(sk, tail,  				  skb, fragstolen)) ? 1 : 0; @@ -4663,7 +4671,7 @@ int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size)  	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size;  	TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1; -	if (tcp_queue_rcv(sk, skb, 0, &fragstolen)) { +	if (tcp_queue_rcv(sk, skb, &fragstolen)) {  		WARN_ON_ONCE(fragstolen); /* should not happen */  		__kfree_skb(skb);  	} @@ -4723,7 +4731,7 @@ queue_and_out:  			goto drop;  		} -		eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen); +		eaten = tcp_queue_rcv(sk, skb, &fragstolen);  		if (skb->len)  			tcp_event_data_recv(sk, skb);  		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) @@ -5599,8 +5607,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)  			NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS);  			/* Bulk data transfer: receiver */ -			eaten = tcp_queue_rcv(sk, skb, tcp_header_len, -					      &fragstolen); +			__skb_pull(skb, tcp_header_len); +			eaten = tcp_queue_rcv(sk, skb, &fragstolen);  			tcp_event_data_recv(sk, skb); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index de47038afdf0..efc6fef692ff 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -423,7 +423,7 @@ EXPORT_SYMBOL(tcp_req_err);   *   */ -void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) +int tcp_v4_err(struct sk_buff *icmp_skb, u32 info)  {  	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;  	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2)); @@ -446,20 +446,21 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)  				       inet_iif(icmp_skb), 0);  	if (!sk) {  		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS); -		return; +		return -ENOENT;  	}  	if (sk->sk_state == TCP_TIME_WAIT) {  		inet_twsk_put(inet_twsk(sk)); -		return; +		return 0;  	}  	seq = ntohl(th->seq); -	if (sk->sk_state == TCP_NEW_SYN_RECV) -		return tcp_req_err(sk, seq, -				  type == ICMP_PARAMETERPROB || -				  type == ICMP_TIME_EXCEEDED || -				  (type == ICMP_DEST_UNREACH && -				   (code == ICMP_NET_UNREACH || -				    code == ICMP_HOST_UNREACH))); +	if (sk->sk_state == TCP_NEW_SYN_RECV) { +		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB || +				     type == ICMP_TIME_EXCEEDED || +				     (type == ICMP_DEST_UNREACH && +				      (code == ICMP_NET_UNREACH || +				       code == ICMP_HOST_UNREACH))); +		return 0; +	}  	bh_lock_sock(sk);  	/* If too many ICMPs get dropped on busy @@ -541,7 +542,6 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)  		icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);  		skb = tcp_rtx_queue_head(sk); -		BUG_ON(!skb);  		tcp_mstamp_refresh(tp);  		delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb)); @@ -613,6 +613,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)  out:  	bh_unlock_sock(sk);  	sock_put(sk); +	return 0;  }  void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr) @@ -969,10 +970,13 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req)   * We need to maintain these in the sk structure.   */ +struct static_key tcp_md5_needed __read_mostly; +EXPORT_SYMBOL(tcp_md5_needed); +  /* Find the Key structure for an address.  */ -struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk, -					 const union tcp_md5_addr *addr, -					 int family) +struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, +					   const union tcp_md5_addr *addr, +					   int family)  {  	const struct tcp_sock *tp = tcp_sk(sk);  	struct tcp_md5sig_key *key; @@ -1010,7 +1014,7 @@ struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,  	}  	return best_match;  } -EXPORT_SYMBOL(tcp_md5_do_lookup); +EXPORT_SYMBOL(__tcp_md5_do_lookup);  static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,  						      const union tcp_md5_addr *addr, @@ -1618,12 +1622,14 @@ int tcp_v4_early_demux(struct sk_buff *skb)  bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)  {  	u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf; - -	/* Only socket owner can try to collapse/prune rx queues -	 * to reduce memory overhead, so add a little headroom here. -	 * Few sockets backlog are possibly concurrently non empty. -	 */ -	limit += 64*1024; +	struct skb_shared_info *shinfo; +	const struct tcphdr *th; +	struct tcphdr *thtail; +	struct sk_buff *tail; +	unsigned int hdrlen; +	bool fragstolen; +	u32 gso_segs; +	int delta;  	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),  	 * we can fix skb->truesize to its real value to avoid future drops. @@ -1633,6 +1639,86 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)  	 */  	skb_condense(skb); +	skb_dst_drop(skb); + +	if (unlikely(tcp_checksum_complete(skb))) { +		bh_unlock_sock(sk); +		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); +		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); +		return true; +	} + +	/* Attempt coalescing to last skb in backlog, even if we are +	 * above the limits. +	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS. +	 */ +	th = (const struct tcphdr *)skb->data; +	hdrlen = th->doff * 4; +	shinfo = skb_shinfo(skb); + +	if (!shinfo->gso_size) +		shinfo->gso_size = skb->len - hdrlen; + +	if (!shinfo->gso_segs) +		shinfo->gso_segs = 1; + +	tail = sk->sk_backlog.tail; +	if (!tail) +		goto no_coalesce; +	thtail = (struct tcphdr *)tail->data; + +	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq || +	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield || +	    ((TCP_SKB_CB(tail)->tcp_flags | +	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_URG) || +	    ((TCP_SKB_CB(tail)->tcp_flags ^ +	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) || +#ifdef CONFIG_TLS_DEVICE +	    tail->decrypted != skb->decrypted || +#endif +	    thtail->doff != th->doff || +	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th))) +		goto no_coalesce; + +	__skb_pull(skb, hdrlen); +	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) { +		thtail->window = th->window; + +		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq; + +		if (after(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq)) +			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq; + +		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; + +		if (TCP_SKB_CB(skb)->has_rxtstamp) { +			TCP_SKB_CB(tail)->has_rxtstamp = true; +			tail->tstamp = skb->tstamp; +			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp; +		} + +		/* Not as strict as GRO. We only need to carry mss max value */ +		skb_shinfo(tail)->gso_size = max(shinfo->gso_size, +						 skb_shinfo(tail)->gso_size); + +		gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs; +		skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF); + +		sk->sk_backlog.len += delta; +		__NET_INC_STATS(sock_net(sk), +				LINUX_MIB_TCPBACKLOGCOALESCE); +		kfree_skb_partial(skb, fragstolen); +		return false; +	} +	__skb_push(skb, hdrlen); + +no_coalesce: +	/* Only socket owner can try to collapse/prune rx queues +	 * to reduce memory overhead, so add a little headroom here. +	 * Few sockets backlog are possibly concurrently non empty. +	 */ +	limit += 64*1024; +  	if (unlikely(sk_add_backlog(sk, skb, limit))) {  		bh_unlock_sock(sk);  		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP); @@ -2573,8 +2659,8 @@ static int __net_init tcp_sk_init(struct net *net)  	 * which are too large can cause TCP streams to be bursty.  	 */  	net->ipv4.sysctl_tcp_tso_win_divisor = 3; -	/* Default TSQ limit of four TSO segments */ -	net->ipv4.sysctl_tcp_limit_output_bytes = 262144; +	/* Default TSQ limit of 16 TSO segments */ +	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;  	/* rfc5961 challenge ack rate limiting */  	net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;  	net->ipv4.sysctl_tcp_min_tso_segs = 2; diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 03b51cdcc731..b467a7cabf40 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -1000,7 +1000,7 @@ static int __net_init tcp_net_metrics_init(struct net *net)  	slots = tcpmhash_entries;  	if (!slots) { -		if (totalram_pages >= 128 * 1024) +		if (totalram_pages() >= 128 * 1024)  			slots = 16 * 1024;  		else  			slots = 8 * 1024; diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 870b0a335061..0fbf7d4df9da 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -10,6 +10,7 @@   *	TCPv4 GSO/GRO support   */ +#include <linux/indirect_call_wrapper.h>  #include <linux/skbuff.h>  #include <net/tcp.h>  #include <net/protocol.h> @@ -305,7 +306,8 @@ int tcp_gro_complete(struct sk_buff *skb)  }  EXPORT_SYMBOL(tcp_gro_complete); -static struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb) +INDIRECT_CALLABLE_SCOPE +struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb)  {  	/* Don't bother verifying checksum if we're going to flush anyway. */  	if (!NAPI_GRO_CB(skb)->flush && @@ -318,7 +320,7 @@ static struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *  	return tcp_gro_receive(head, skb);  } -static int tcp4_gro_complete(struct sk_buff *skb, int thoff) +INDIRECT_CALLABLE_SCOPE int tcp4_gro_complete(struct sk_buff *skb, int thoff)  {  	const struct iphdr *iph = ip_hdr(skb);  	struct tcphdr *th = tcp_hdr(skb); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index d1676d8a6ed7..730bc44dbad9 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -233,16 +233,14 @@ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss,  	if (init_rcv_wnd)  		*rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss); -	(*rcv_wscale) = 0; +	*rcv_wscale = 0;  	if (wscale_ok) {  		/* Set window scaling on max possible window */  		space = max_t(u32, space, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);  		space = max_t(u32, space, sysctl_rmem_max);  		space = min_t(u32, space, *window_clamp); -		while (space > U16_MAX && (*rcv_wscale) < TCP_MAX_WSCALE) { -			space >>= 1; -			(*rcv_wscale)++; -		} +		*rcv_wscale = clamp_t(int, ilog2(space) - 15, +				      0, TCP_MAX_WSCALE);  	}  	/* Set the clamp no higher than max representable value */  	(*window_clamp) = min_t(__u32, U16_MAX << (*rcv_wscale), *window_clamp); @@ -596,7 +594,8 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,  	*md5 = NULL;  #ifdef CONFIG_TCP_MD5SIG -	if (unlikely(rcu_access_pointer(tp->md5sig_info))) { +	if (static_key_false(&tcp_md5_needed) && +	    rcu_access_pointer(tp->md5sig_info)) {  		*md5 = tp->af_specific->md5_lookup(sk, sk);  		if (*md5) {  			opts->options |= OPTION_MD5; @@ -732,7 +731,8 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb  	*md5 = NULL;  #ifdef CONFIG_TCP_MD5SIG -	if (unlikely(rcu_access_pointer(tp->md5sig_info))) { +	if (static_key_false(&tcp_md5_needed) && +	    rcu_access_pointer(tp->md5sig_info)) {  		*md5 = tp->af_specific->md5_lookup(sk, sk);  		if (*md5) {  			opts->options |= OPTION_MD5; @@ -1909,18 +1909,22 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,  				 u32 max_segs)  {  	const struct inet_connection_sock *icsk = inet_csk(sk); -	u32 age, send_win, cong_win, limit, in_flight; +	u32 send_win, cong_win, limit, in_flight;  	struct tcp_sock *tp = tcp_sk(sk);  	struct sk_buff *head;  	int win_divisor; +	s64 delta;  	if (icsk->icsk_ca_state >= TCP_CA_Recovery)  		goto send_now;  	/* Avoid bursty behavior by allowing defer -	 * only if the last write was recent. +	 * only if the last write was recent (1 ms). +	 * Note that tp->tcp_wstamp_ns can be in the future if we have +	 * packets waiting in a qdisc or device for EDT delivery.  	 */ -	if ((s32)(tcp_jiffies32 - tp->lsndtime) > 0) +	delta = tp->tcp_clock_cache - tp->tcp_wstamp_ns - NSEC_PER_MSEC; +	if (delta > 0)  		goto send_now;  	in_flight = tcp_packets_in_flight(tp); @@ -1967,9 +1971,9 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,  	head = tcp_rtx_queue_head(sk);  	if (!head)  		goto send_now; -	age = tcp_stamp_us_delta(tp->tcp_mstamp, tcp_skb_timestamp_us(head)); +	delta = tp->tcp_clock_cache - head->tstamp;  	/* If next ACK is likely to come too late (half srtt), do not defer */ -	if (age < (tp->srtt_us >> 4)) +	if ((s64)(delta - (u64)NSEC_PER_USEC * (tp->srtt_us >> 4)) < 0)  		goto send_now;  	/* Ok, it looks like it is advisable to defer. @@ -1991,7 +1995,8 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,  	}  	/* If this packet won't get more data, do not wait. */ -	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) +	if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) || +	    TCP_SKB_CB(skb)->eor)  		goto send_now;  	return true; @@ -2228,8 +2233,9 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,  	limit = max_t(unsigned long,  		      2 * skb->truesize,  		      sk->sk_pacing_rate >> sk->sk_pacing_shift); -	limit = min_t(unsigned long, limit, -		      sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes); +	if (sk->sk_pacing_status == SK_PACING_NONE) +		limit = min_t(unsigned long, limit, +			      sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes);  	limit <<= factor;  	if (refcount_read(&sk->sk_wmem_alloc) > limit) { diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c index c0630013c1ae..33bf8e9c8663 100644 --- a/net/ipv4/tunnel4.c +++ b/net/ipv4/tunnel4.c @@ -149,34 +149,40 @@ drop:  }  #endif -static void tunnel4_err(struct sk_buff *skb, u32 info) +static int tunnel4_err(struct sk_buff *skb, u32 info)  {  	struct xfrm_tunnel *handler;  	for_each_tunnel_rcu(tunnel4_handlers, handler)  		if (!handler->err_handler(skb, info)) -			break; +			return 0; + +	return -ENOENT;  }  #if IS_ENABLED(CONFIG_IPV6) -static void tunnel64_err(struct sk_buff *skb, u32 info) +static int tunnel64_err(struct sk_buff *skb, u32 info)  {  	struct xfrm_tunnel *handler;  	for_each_tunnel_rcu(tunnel64_handlers, handler)  		if (!handler->err_handler(skb, info)) -			break; +			return 0; + +	return -ENOENT;  }  #endif  #if IS_ENABLED(CONFIG_MPLS) -static void tunnelmpls4_err(struct sk_buff *skb, u32 info) +static int tunnelmpls4_err(struct sk_buff *skb, u32 info)  {  	struct xfrm_tunnel *handler;  	for_each_tunnel_rcu(tunnelmpls4_handlers, handler)  		if (!handler->err_handler(skb, info)) -			break; +			return 0; + +	return -ENOENT;  }  #endif diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 1976fddb9e00..3fb0ed5e4789 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -105,6 +105,7 @@  #include <net/net_namespace.h>  #include <net/icmp.h>  #include <net/inet_hashtables.h> +#include <net/ip_tunnels.h>  #include <net/route.h>  #include <net/checksum.h>  #include <net/xfrm.h> @@ -115,6 +116,7 @@  #include "udp_impl.h"  #include <net/sock_reuseport.h>  #include <net/addrconf.h> +#include <net/udp_tunnel.h>  struct udp_table udp_table __read_mostly;  EXPORT_SYMBOL(udp_table); @@ -371,21 +373,19 @@ static int compute_score(struct sock *sk, struct net *net,  {  	int score;  	struct inet_sock *inet; +	bool dev_match;  	if (!net_eq(sock_net(sk), net) ||  	    udp_sk(sk)->udp_port_hash != hnum ||  	    ipv6_only_sock(sk))  		return -1; -	score = (sk->sk_family == PF_INET) ? 2 : 1; -	inet = inet_sk(sk); +	if (sk->sk_rcv_saddr != daddr) +		return -1; -	if (inet->inet_rcv_saddr) { -		if (inet->inet_rcv_saddr != daddr) -			return -1; -		score += 4; -	} +	score = (sk->sk_family == PF_INET) ? 2 : 1; +	inet = inet_sk(sk);  	if (inet->inet_daddr) {  		if (inet->inet_daddr != saddr)  			return -1; @@ -398,15 +398,11 @@ static int compute_score(struct sock *sk, struct net *net,  		score += 4;  	} -	if (sk->sk_bound_dev_if || exact_dif) { -		bool dev_match = (sk->sk_bound_dev_if == dif || -				  sk->sk_bound_dev_if == sdif); - -		if (!dev_match) -			return -1; -		if (sk->sk_bound_dev_if) -			score += 4; -	} +	dev_match = udp_sk_bound_dev_eq(net, sk->sk_bound_dev_if, +					dif, sdif); +	if (!dev_match) +		return -1; +	score += 4;  	if (sk->sk_incoming_cpu == raw_smp_processor_id())  		score++; @@ -465,65 +461,30 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,  		__be16 sport, __be32 daddr, __be16 dport, int dif,  		int sdif, struct udp_table *udptable, struct sk_buff *skb)  { -	struct sock *sk, *result; +	struct sock *result;  	unsigned short hnum = ntohs(dport); -	unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask); -	struct udp_hslot *hslot2, *hslot = &udptable->hash[slot]; +	unsigned int hash2, slot2; +	struct udp_hslot *hslot2;  	bool exact_dif = udp_lib_exact_dif_match(net, skb); -	int score, badness; -	u32 hash = 0; -	if (hslot->count > 10) { -		hash2 = ipv4_portaddr_hash(net, daddr, hnum); +	hash2 = ipv4_portaddr_hash(net, daddr, hnum); +	slot2 = hash2 & udptable->mask; +	hslot2 = &udptable->hash2[slot2]; + +	result = udp4_lib_lookup2(net, saddr, sport, +				  daddr, hnum, dif, sdif, +				  exact_dif, hslot2, skb); +	if (!result) { +		hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum);  		slot2 = hash2 & udptable->mask;  		hslot2 = &udptable->hash2[slot2]; -		if (hslot->count < hslot2->count) -			goto begin;  		result = udp4_lib_lookup2(net, saddr, sport, -					  daddr, hnum, dif, sdif, +					  htonl(INADDR_ANY), hnum, dif, sdif,  					  exact_dif, hslot2, skb); -		if (!result) { -			unsigned int old_slot2 = slot2; -			hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum); -			slot2 = hash2 & udptable->mask; -			/* avoid searching the same slot again. */ -			if (unlikely(slot2 == old_slot2)) -				return result; - -			hslot2 = &udptable->hash2[slot2]; -			if (hslot->count < hslot2->count) -				goto begin; - -			result = udp4_lib_lookup2(net, saddr, sport, -						  daddr, hnum, dif, sdif, -						  exact_dif, hslot2, skb); -		} -		if (unlikely(IS_ERR(result))) -			return NULL; -		return result; -	} -begin: -	result = NULL; -	badness = 0; -	sk_for_each_rcu(sk, &hslot->head) { -		score = compute_score(sk, net, saddr, sport, -				      daddr, hnum, dif, sdif, exact_dif); -		if (score > badness) { -			if (sk->sk_reuseport) { -				hash = udp_ehashfn(net, daddr, hnum, -						   saddr, sport); -				result = reuseport_select_sock(sk, hash, skb, -							sizeof(struct udphdr)); -				if (unlikely(IS_ERR(result))) -					return NULL; -				if (result) -					return result; -			} -			result = sk; -			badness = score; -		}  	} +	if (unlikely(IS_ERR(result))) +		return NULL;  	return result;  }  EXPORT_SYMBOL_GPL(__udp4_lib_lookup); @@ -585,6 +546,89 @@ static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk,  	return true;  } +DEFINE_STATIC_KEY_FALSE(udp_encap_needed_key); +void udp_encap_enable(void) +{ +	static_branch_inc(&udp_encap_needed_key); +} +EXPORT_SYMBOL(udp_encap_enable); + +/* Handler for tunnels with arbitrary destination ports: no socket lookup, go + * through error handlers in encapsulations looking for a match. + */ +static int __udp4_lib_err_encap_no_sk(struct sk_buff *skb, u32 info) +{ +	int i; + +	for (i = 0; i < MAX_IPTUN_ENCAP_OPS; i++) { +		int (*handler)(struct sk_buff *skb, u32 info); + +		if (!iptun_encaps[i]) +			continue; +		handler = rcu_dereference(iptun_encaps[i]->err_handler); +		if (handler && !handler(skb, info)) +			return 0; +	} + +	return -ENOENT; +} + +/* Try to match ICMP errors to UDP tunnels by looking up a socket without + * reversing source and destination port: this will match tunnels that force the + * same destination port on both endpoints (e.g. VXLAN, GENEVE). Note that + * lwtunnels might actually break this assumption by being configured with + * different destination ports on endpoints, in this case we won't be able to + * trace ICMP messages back to them. + * + * If this doesn't match any socket, probe tunnels with arbitrary destination + * ports (e.g. FoU, GUE): there, the receiving socket is useless, as the port + * we've sent packets to won't necessarily match the local destination port. + * + * Then ask the tunnel implementation to match the error against a valid + * association. + * + * Return an error if we can't find a match, the socket if we need further + * processing, zero otherwise. + */ +static struct sock *__udp4_lib_err_encap(struct net *net, +					 const struct iphdr *iph, +					 struct udphdr *uh, +					 struct udp_table *udptable, +					 struct sk_buff *skb, u32 info) +{ +	int network_offset, transport_offset; +	struct sock *sk; + +	network_offset = skb_network_offset(skb); +	transport_offset = skb_transport_offset(skb); + +	/* Network header needs to point to the outer IPv4 header inside ICMP */ +	skb_reset_network_header(skb); + +	/* Transport header needs to point to the UDP header */ +	skb_set_transport_header(skb, iph->ihl << 2); + +	sk = __udp4_lib_lookup(net, iph->daddr, uh->source, +			       iph->saddr, uh->dest, skb->dev->ifindex, 0, +			       udptable, NULL); +	if (sk) { +		int (*lookup)(struct sock *sk, struct sk_buff *skb); +		struct udp_sock *up = udp_sk(sk); + +		lookup = READ_ONCE(up->encap_err_lookup); +		if (!lookup || lookup(sk, skb)) +			sk = NULL; +	} + +	if (!sk) +		sk = ERR_PTR(__udp4_lib_err_encap_no_sk(skb, info)); + +	skb_set_transport_header(skb, transport_offset); +	skb_set_network_header(skb, network_offset); + +	return sk; +} +  /*   * This routine is called by the ICMP module when it gets some   * sort of error condition.  If err < 0 then the socket should @@ -596,13 +640,14 @@ static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk,   * to find the appropriate port.   */ -void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) +int __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)  {  	struct inet_sock *inet;  	const struct iphdr *iph = (const struct iphdr *)skb->data;  	struct udphdr *uh = (struct udphdr *)(skb->data+(iph->ihl<<2));  	const int type = icmp_hdr(skb)->type;  	const int code = icmp_hdr(skb)->code; +	bool tunnel = false;  	struct sock *sk;  	int harderr;  	int err; @@ -612,8 +657,21 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)  			       iph->saddr, uh->source, skb->dev->ifindex,  			       inet_sdif(skb), udptable, NULL);  	if (!sk) { -		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS); -		return;	/* No socket for error */ +		/* No socket for error: try tunnels before discarding */ +		sk = ERR_PTR(-ENOENT); +		if (static_branch_unlikely(&udp_encap_needed_key)) { +			sk = __udp4_lib_err_encap(net, iph, uh, udptable, skb, +						  info); +			if (!sk) +				return 0; +		} + +		if (IS_ERR(sk)) { +			__ICMP_INC_STATS(net, ICMP_MIB_INERRORS); +			return PTR_ERR(sk); +		} + +		tunnel = true;  	}  	err = 0; @@ -656,6 +714,10 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)  	 *      RFC1122: OK.  Passes ICMP errors back to application, as per  	 *	4.1.3.3.  	 */ +	if (tunnel) { +		/* ...not for tunnels though: we don't have a sending socket */ +		goto out; +	}  	if (!inet->recverr) {  		if (!harderr || sk->sk_state != TCP_ESTABLISHED)  			goto out; @@ -665,12 +727,12 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)  	sk->sk_err = err;  	sk->sk_error_report(sk);  out: -	return; +	return 0;  } -void udp_err(struct sk_buff *skb, u32 info) +int udp_err(struct sk_buff *skb, u32 info)  { -	__udp4_lib_err(skb, info, &udp_table); +	return __udp4_lib_err(skb, info, &udp_table);  }  /* @@ -1713,6 +1775,10 @@ try_again:  		memset(sin->sin_zero, 0, sizeof(sin->sin_zero));  		*addr_len = sizeof(*sin);  	} + +	if (udp_sk(sk)->gro_enabled) +		udp_cmsg_recv(msg, sk, skb); +  	if (inet->cmsg_flags)  		ip_cmsg_recv_offset(msg, sk, skb, sizeof(struct udphdr), off); @@ -1889,13 +1955,6 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)  	return 0;  } -DEFINE_STATIC_KEY_FALSE(udp_encap_needed_key); -void udp_encap_enable(void) -{ -	static_branch_enable(&udp_encap_needed_key); -} -EXPORT_SYMBOL(udp_encap_enable); -  /* returns:   *  -1: error   *   0: success @@ -1904,7 +1963,7 @@ EXPORT_SYMBOL(udp_encap_enable);   * Note that in the success and error cases, the skb is assumed to   * have either been requeued or freed.   */ -static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) +static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)  {  	struct udp_sock *up = udp_sk(sk);  	int is_udplite = IS_UDPLITE(sk); @@ -2007,6 +2066,27 @@ drop:  	return -1;  } +static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) +{ +	struct sk_buff *next, *segs; +	int ret; + +	if (likely(!udp_unexpected_gso(sk, skb))) +		return udp_queue_rcv_one_skb(sk, skb); + +	BUILD_BUG_ON(sizeof(struct udp_skb_cb) > SKB_SGO_CB_OFFSET); +	__skb_push(skb, -skb_mac_offset(skb)); +	segs = udp_rcv_segment(sk, skb, true); +	for (skb = segs; skb; skb = next) { +		next = skb->next; +		__skb_pull(skb, skb_transport_offset(skb)); +		ret = udp_queue_rcv_one_skb(sk, skb); +		if (ret > 0) +			ip_protocol_deliver_rcu(dev_net(skb->dev), skb, -ret); +	} +	return 0; +} +  /* For TCP sockets, sk_rx_dst is protected by socket lock   * For UDP, we use xchg() to guard against concurrent changes.   */ @@ -2398,11 +2478,15 @@ void udp_destroy_sock(struct sock *sk)  	bool slow = lock_sock_fast(sk);  	udp_flush_pending_frames(sk);  	unlock_sock_fast(sk, slow); -	if (static_branch_unlikely(&udp_encap_needed_key) && up->encap_type) { -		void (*encap_destroy)(struct sock *sk); -		encap_destroy = READ_ONCE(up->encap_destroy); -		if (encap_destroy) -			encap_destroy(sk); +	if (static_branch_unlikely(&udp_encap_needed_key)) { +		if (up->encap_type) { +			void (*encap_destroy)(struct sock *sk); +			encap_destroy = READ_ONCE(up->encap_destroy); +			if (encap_destroy) +				encap_destroy(sk); +		} +		if (up->encap_enabled) +			static_branch_dec(&udp_encap_needed_key);  	}  } @@ -2447,7 +2531,9 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,  			/* FALLTHROUGH */  		case UDP_ENCAP_L2TPINUDP:  			up->encap_type = val; -			udp_encap_enable(); +			lock_sock(sk); +			udp_tunnel_encap_enable(sk->sk_socket); +			release_sock(sk);  			break;  		default:  			err = -ENOPROTOOPT; @@ -2469,6 +2555,14 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,  		up->gso_size = val;  		break; +	case UDP_GRO: +		lock_sock(sk); +		if (valbool) +			udp_tunnel_encap_enable(sk->sk_socket); +		up->gro_enabled = valbool; +		release_sock(sk); +		break; +  	/*  	 * 	UDP-Lite's partial checksum coverage (RFC 3828).  	 */ diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h index e7d18b140287..322672655419 100644 --- a/net/ipv4/udp_impl.h +++ b/net/ipv4/udp_impl.h @@ -7,7 +7,7 @@  #include <net/inet_common.h>  int __udp4_lib_rcv(struct sk_buff *, struct udp_table *, int); -void __udp4_lib_err(struct sk_buff *, u32, struct udp_table *); +int __udp4_lib_err(struct sk_buff *, u32, struct udp_table *);  int udp_v4_get_port(struct sock *sk, unsigned short snum); diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 802f2bc00d69..64f9715173ac 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -13,6 +13,7 @@  #include <linux/skbuff.h>  #include <net/udp.h>  #include <net/protocol.h> +#include <net/inet_common.h>  static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,  	netdev_features_t features, @@ -343,6 +344,56 @@ out:  	return segs;  } +#define UDP_GRO_CNT_MAX 64 +static struct sk_buff *udp_gro_receive_segment(struct list_head *head, +					       struct sk_buff *skb) +{ +	struct udphdr *uh = udp_hdr(skb); +	struct sk_buff *pp = NULL; +	struct udphdr *uh2; +	struct sk_buff *p; + +	/* requires non zero csum, for symmetry with GSO */ +	if (!uh->check) { +		NAPI_GRO_CB(skb)->flush = 1; +		return NULL; +	} + +	/* pull encapsulating udp header */ +	skb_gro_pull(skb, sizeof(struct udphdr)); +	skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr)); + +	list_for_each_entry(p, head, list) { +		if (!NAPI_GRO_CB(p)->same_flow) +			continue; + +		uh2 = udp_hdr(p); + +		/* Match ports only, as csum is always non zero */ +		if ((*(u32 *)&uh->source != *(u32 *)&uh2->source)) { +			NAPI_GRO_CB(p)->same_flow = 0; +			continue; +		} + +		/* Terminate the flow on len mismatch or if it grow "too much". +		 * Under small packet flood GRO count could elsewhere grow a lot +		 * leading to execessive truesize values +		 */ +		if (!skb_gro_receive(p, skb) && +		    NAPI_GRO_CB(p)->count >= UDP_GRO_CNT_MAX) +			pp = p; +		else if (uh->len != uh2->len) +			pp = p; + +		return pp; +	} + +	/* mismatch, but we never need to flush */ +	return NULL; +} + +INDIRECT_CALLABLE_DECLARE(struct sock *udp6_lib_lookup_skb(struct sk_buff *skb, +						   __be16 sport, __be16 dport));  struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb,  				struct udphdr *uh, udp_lookup_t lookup)  { @@ -353,23 +404,28 @@ struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb,  	int flush = 1;  	struct sock *sk; +	rcu_read_lock(); +	sk = INDIRECT_CALL_INET(lookup, udp6_lib_lookup_skb, +				udp4_lib_lookup_skb, skb, uh->source, uh->dest); +	if (!sk) +		goto out_unlock; + +	if (udp_sk(sk)->gro_enabled) { +		pp = call_gro_receive(udp_gro_receive_segment, head, skb); +		rcu_read_unlock(); +		return pp; +	} +  	if (NAPI_GRO_CB(skb)->encap_mark ||  	    (skb->ip_summed != CHECKSUM_PARTIAL &&  	     NAPI_GRO_CB(skb)->csum_cnt == 0 && -	     !NAPI_GRO_CB(skb)->csum_valid)) -		goto out; +	     !NAPI_GRO_CB(skb)->csum_valid) || +	    !udp_sk(sk)->gro_receive) +		goto out_unlock;  	/* mark that this skb passed once through the tunnel gro layer */  	NAPI_GRO_CB(skb)->encap_mark = 1; -	rcu_read_lock(); -	sk = (*lookup)(skb, uh->source, uh->dest); - -	if (sk && udp_sk(sk)->gro_receive) -		goto unflush; -	goto out_unlock; - -unflush:  	flush = 0;  	list_for_each_entry(p, head, list) { @@ -394,14 +450,13 @@ unflush:  out_unlock:  	rcu_read_unlock(); -out:  	skb_gro_flush_final(skb, pp, flush);  	return pp;  }  EXPORT_SYMBOL(udp_gro_receive); -static struct sk_buff *udp4_gro_receive(struct list_head *head, -					struct sk_buff *skb) +INDIRECT_CALLABLE_SCOPE +struct sk_buff *udp4_gro_receive(struct list_head *head, struct sk_buff *skb)  {  	struct udphdr *uh = udp_gro_udphdr(skb); @@ -427,6 +482,19 @@ flush:  	return NULL;  } +static int udp_gro_complete_segment(struct sk_buff *skb) +{ +	struct udphdr *uh = udp_hdr(skb); + +	skb->csum_start = (unsigned char *)uh - skb->head; +	skb->csum_offset = offsetof(struct udphdr, check); +	skb->ip_summed = CHECKSUM_PARTIAL; + +	skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count; +	skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_L4; +	return 0; +} +  int udp_gro_complete(struct sk_buff *skb, int nhoff,  		     udp_lookup_t lookup)  { @@ -437,16 +505,22 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff,  	uh->len = newlen; -	/* Set encapsulation before calling into inner gro_complete() functions -	 * to make them set up the inner offsets. -	 */ -	skb->encapsulation = 1; -  	rcu_read_lock(); -	sk = (*lookup)(skb, uh->source, uh->dest); -	if (sk && udp_sk(sk)->gro_complete) +	sk = INDIRECT_CALL_INET(lookup, udp6_lib_lookup_skb, +				udp4_lib_lookup_skb, skb, uh->source, uh->dest); +	if (sk && udp_sk(sk)->gro_enabled) { +		err = udp_gro_complete_segment(skb); +	} else if (sk && udp_sk(sk)->gro_complete) { +		skb_shinfo(skb)->gso_type = uh->check ? SKB_GSO_UDP_TUNNEL_CSUM +					: SKB_GSO_UDP_TUNNEL; + +		/* Set encapsulation before calling into inner gro_complete() +		 * functions to make them set up the inner offsets. +		 */ +		skb->encapsulation = 1;  		err = udp_sk(sk)->gro_complete(sk, skb,  				nhoff + sizeof(struct udphdr)); +	}  	rcu_read_unlock();  	if (skb->remcsum_offload) @@ -456,18 +530,14 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff,  }  EXPORT_SYMBOL(udp_gro_complete); -static int udp4_gro_complete(struct sk_buff *skb, int nhoff) +INDIRECT_CALLABLE_SCOPE int udp4_gro_complete(struct sk_buff *skb, int nhoff)  {  	const struct iphdr *iph = ip_hdr(skb);  	struct udphdr *uh = (struct udphdr *)(skb->data + nhoff); -	if (uh->check) { -		skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL_CSUM; +	if (uh->check)  		uh->check = ~udp_v4_check(skb->len - nhoff, iph->saddr,  					  iph->daddr, 0); -	} else { -		skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL; -	}  	return udp_gro_complete(skb, nhoff, udp4_lib_lookup_skb);  } diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c index 6539ff15e9a3..be8b5b2157d8 100644 --- a/net/ipv4/udp_tunnel.c +++ b/net/ipv4/udp_tunnel.c @@ -20,6 +20,23 @@ int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg,  	if (err < 0)  		goto error; +	if (cfg->bind_ifindex) { +		struct net_device *dev; + +		dev = dev_get_by_index(net, cfg->bind_ifindex); +		if (!dev) { +			err = -ENODEV; +			goto error; +		} + +		err = kernel_setsockopt(sock, SOL_SOCKET, SO_BINDTODEVICE, +					dev->name, strlen(dev->name) + 1); +		dev_put(dev); + +		if (err < 0) +			goto error; +	} +  	udp_addr.sin_family = AF_INET;  	udp_addr.sin_addr = cfg->local_ip;  	udp_addr.sin_port = cfg->local_udp_port; @@ -68,6 +85,7 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock,  	udp_sk(sk)->encap_type = cfg->encap_type;  	udp_sk(sk)->encap_rcv = cfg->encap_rcv; +	udp_sk(sk)->encap_err_lookup = cfg->encap_err_lookup;  	udp_sk(sk)->encap_destroy = cfg->encap_destroy;  	udp_sk(sk)->gro_receive = cfg->gro_receive;  	udp_sk(sk)->gro_complete = cfg->gro_complete; diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c index 8545457752fb..39c7f17d916f 100644 --- a/net/ipv4/udplite.c +++ b/net/ipv4/udplite.c @@ -25,9 +25,9 @@ static int udplite_rcv(struct sk_buff *skb)  	return __udp4_lib_rcv(skb, &udplite_table, IPPROTO_UDPLITE);  } -static void udplite_err(struct sk_buff *skb, u32 info) +static int udplite_err(struct sk_buff *skb, u32 info)  { -	__udp4_lib_err(skb, info, &udplite_table); +	return __udp4_lib_err(skb, info, &udplite_table);  }  static const struct net_protocol udplite_protocol = { diff --git a/net/ipv4/xfrm4_protocol.c b/net/ipv4/xfrm4_protocol.c index 8dd0e6ab8606..35c54865dc42 100644 --- a/net/ipv4/xfrm4_protocol.c +++ b/net/ipv4/xfrm4_protocol.c @@ -106,13 +106,15 @@ static int xfrm4_esp_rcv(struct sk_buff *skb)  	return 0;  } -static void xfrm4_esp_err(struct sk_buff *skb, u32 info) +static int xfrm4_esp_err(struct sk_buff *skb, u32 info)  {  	struct xfrm4_protocol *handler;  	for_each_protocol_rcu(esp4_handlers, handler)  		if (!handler->err_handler(skb, info)) -			break; +			return 0; + +	return -ENOENT;  }  static int xfrm4_ah_rcv(struct sk_buff *skb) @@ -132,13 +134,15 @@ static int xfrm4_ah_rcv(struct sk_buff *skb)  	return 0;  } -static void xfrm4_ah_err(struct sk_buff *skb, u32 info) +static int xfrm4_ah_err(struct sk_buff *skb, u32 info)  {  	struct xfrm4_protocol *handler;  	for_each_protocol_rcu(ah4_handlers, handler)  		if (!handler->err_handler(skb, info)) -			break; +			return 0; + +	return -ENOENT;  }  static int xfrm4_ipcomp_rcv(struct sk_buff *skb) @@ -158,13 +162,15 @@ static int xfrm4_ipcomp_rcv(struct sk_buff *skb)  	return 0;  } -static void xfrm4_ipcomp_err(struct sk_buff *skb, u32 info) +static int xfrm4_ipcomp_err(struct sk_buff *skb, u32 info)  {  	struct xfrm4_protocol *handler;  	for_each_protocol_rcu(ipcomp4_handlers, handler)  		if (!handler->err_handler(skb, info)) -			break; +			return 0; + +	return -ENOENT;  }  static const struct net_protocol esp4_protocol = { | 
