diff options
Diffstat (limited to 'net/core')
38 files changed, 2295 insertions, 1025 deletions
| diff --git a/net/core/Makefile b/net/core/Makefile index 6e6548011fae..62be9aef2528 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -26,6 +26,7 @@ obj-$(CONFIG_NETPOLL) += netpoll.o  obj-$(CONFIG_FIB_RULES) += fib_rules.o  obj-$(CONFIG_TRACEPOINTS) += net-traces.o  obj-$(CONFIG_NET_DROP_MONITOR) += drop_monitor.o +obj-$(CONFIG_NET_IEEE8021Q_HELPERS) += ieee8021q_helpers.o  obj-$(CONFIG_NET_SELFTESTS) += selftests.o  obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += timestamping.o  obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o @@ -41,4 +42,4 @@ obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o  obj-$(CONFIG_BPF_SYSCALL) += sock_map.o  obj-$(CONFIG_BPF_SYSCALL) += bpf_sk_storage.o  obj-$(CONFIG_OF)	+= of_net.o -obj-$(CONFIG_NET_TEST) += gso_test.o +obj-$(CONFIG_NET_TEST) += net_test.o diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 6c4d90b24d46..bc01b3aa6b0f 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -496,27 +496,22 @@ bpf_sk_storage_diag_alloc(const struct nlattr *nla_stgs)  	if (!bpf_capable())  		return ERR_PTR(-EPERM); -	nla_for_each_nested(nla, nla_stgs, rem) { -		if (nla_type(nla) == SK_DIAG_BPF_STORAGE_REQ_MAP_FD) { -			if (nla_len(nla) != sizeof(u32)) -				return ERR_PTR(-EINVAL); -			nr_maps++; -		} +	nla_for_each_nested_type(nla, SK_DIAG_BPF_STORAGE_REQ_MAP_FD, +				 nla_stgs, rem) { +		if (nla_len(nla) != sizeof(u32)) +			return ERR_PTR(-EINVAL); +		nr_maps++;  	}  	diag = kzalloc(struct_size(diag, maps, nr_maps), GFP_KERNEL);  	if (!diag)  		return ERR_PTR(-ENOMEM); -	nla_for_each_nested(nla, nla_stgs, rem) { -		struct bpf_map *map; -		int map_fd; - -		if (nla_type(nla) != SK_DIAG_BPF_STORAGE_REQ_MAP_FD) -			continue; +	nla_for_each_nested_type(nla, SK_DIAG_BPF_STORAGE_REQ_MAP_FD, +				 nla_stgs, rem) { +		int map_fd = nla_get_u32(nla); +		struct bpf_map *map = bpf_map_get(map_fd); -		map_fd = nla_get_u32(nla); -		map = bpf_map_get(map_fd);  		if (IS_ERR(map)) {  			err = PTR_ERR(map);  			goto 
err_free; diff --git a/net/core/datagram.c b/net/core/datagram.c index a8b625abe242..a40f733b37d7 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -324,25 +324,6 @@ void skb_free_datagram(struct sock *sk, struct sk_buff *skb)  }  EXPORT_SYMBOL(skb_free_datagram); -void __skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb, int len) -{ -	bool slow; - -	if (!skb_unref(skb)) { -		sk_peek_offset_bwd(sk, len); -		return; -	} - -	slow = lock_sock_fast(sk); -	sk_peek_offset_bwd(sk, len); -	skb_orphan(skb); -	unlock_sock_fast(sk, slow); - -	/* skb is now orphaned, can be freed outside of locked section */ -	__kfree_skb(skb); -} -EXPORT_SYMBOL(__skb_free_datagram_locked); -  int __sk_queue_drop_skb(struct sock *sk, struct sk_buff_head *sk_queue,  			struct sk_buff *skb, unsigned int flags,  			void (*destructor)(struct sock *sk, @@ -435,15 +416,23 @@ static int __skb_datagram_iter(const struct sk_buff *skb, int offset,  		end = start + skb_frag_size(frag);  		if ((copy = end - offset) > 0) { -			struct page *page = skb_frag_page(frag); -			u8 *vaddr = kmap(page); +			u32 p_off, p_len, copied; +			struct page *p; +			u8 *vaddr;  			if (copy > len)  				copy = len; -			n = INDIRECT_CALL_1(cb, simple_copy_to_iter, -					vaddr + skb_frag_off(frag) + offset - start, -					copy, data, to); -			kunmap(page); + +			n = 0; +			skb_frag_foreach_page(frag, +					      skb_frag_off(frag) + offset - start, +					      copy, p, p_off, p_len, copied) { +				vaddr = kmap_local_page(p); +				n += INDIRECT_CALL_1(cb, simple_copy_to_iter, +					vaddr + p_off, p_len, data, to); +				kunmap_local(vaddr); +			} +  			offset += n;  			if (n != copy)  				goto short_copy; @@ -629,16 +618,10 @@ fault:  }  EXPORT_SYMBOL(skb_copy_datagram_from_iter); -int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk, -			    struct sk_buff *skb, struct iov_iter *from, -			    size_t length) +int zerocopy_fill_skb_from_iter(struct sk_buff *skb, +				struct iov_iter *from, size_t 
length)  { -	int frag; - -	if (msg && msg->msg_ubuf && msg->sg_from_iter) -		return msg->sg_from_iter(sk, skb, from, length); - -	frag = skb_shinfo(skb)->nr_frags; +	int frag = skb_shinfo(skb)->nr_frags;  	while (length && iov_iter_count(from)) {  		struct page *head, *last_head = NULL; @@ -646,7 +629,6 @@ int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk,  		int refs, order, n = 0;  		size_t start;  		ssize_t copied; -		unsigned long truesize;  		if (frag == MAX_SKB_FRAGS)  			return -EMSGSIZE; @@ -658,17 +640,9 @@ int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk,  		length -= copied; -		truesize = PAGE_ALIGN(copied + start);  		skb->data_len += copied;  		skb->len += copied; -		skb->truesize += truesize; -		if (sk && sk->sk_type == SOCK_STREAM) { -			sk_wmem_queued_add(sk, truesize); -			if (!skb_zcopy_pure(skb)) -				sk_mem_charge(sk, truesize); -		} else { -			refcount_add(truesize, &skb->sk->sk_wmem_alloc); -		} +		skb->truesize += PAGE_ALIGN(copied + start);  		head = compound_head(pages[n]);  		order = compound_order(head); @@ -711,6 +685,30 @@ int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk,  	}  	return 0;  } + +int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk, +			    struct sk_buff *skb, struct iov_iter *from, +			    size_t length) +{ +	unsigned long orig_size = skb->truesize; +	unsigned long truesize; +	int ret; + +	if (msg && msg->msg_ubuf && msg->sg_from_iter) +		ret = msg->sg_from_iter(skb, from, length); +	else +		ret = zerocopy_fill_skb_from_iter(skb, from, length); + +	truesize = skb->truesize - orig_size; +	if (sk && sk->sk_type == SOCK_STREAM) { +		sk_wmem_queued_add(sk, truesize); +		if (!skb_zcopy_pure(skb)) +			sk_mem_charge(sk, truesize); +	} else { +		refcount_add(truesize, &skb->sk->sk_wmem_alloc); +	} +	return ret; +}  EXPORT_SYMBOL(__zerocopy_sg_from_iter);  /** diff --git a/net/core/dev.c b/net/core/dev.c index 331848eca7d3..f66e61407883 100644 --- a/net/core/dev.c +++ 
b/net/core/dev.c @@ -77,7 +77,9 @@  #include <linux/hash.h>  #include <linux/slab.h>  #include <linux/sched.h> +#include <linux/sched/isolation.h>  #include <linux/sched/mm.h> +#include <linux/smpboot.h>  #include <linux/mutex.h>  #include <linux/rwsem.h>  #include <linux/string.h> @@ -197,37 +199,62 @@ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)  	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];  } -static inline void rps_lock_irqsave(struct softnet_data *sd, -				    unsigned long *flags) +#ifndef CONFIG_PREEMPT_RT + +static DEFINE_STATIC_KEY_FALSE(use_backlog_threads_key); + +static int __init setup_backlog_napi_threads(char *arg) +{ +	static_branch_enable(&use_backlog_threads_key); +	return 0; +} +early_param("thread_backlog_napi", setup_backlog_napi_threads); + +static bool use_backlog_threads(void) +{ +	return static_branch_unlikely(&use_backlog_threads_key); +} + +#else + +static bool use_backlog_threads(void) +{ +	return true; +} + +#endif + +static inline void backlog_lock_irq_save(struct softnet_data *sd, +					 unsigned long *flags)  { -	if (IS_ENABLED(CONFIG_RPS)) +	if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())  		spin_lock_irqsave(&sd->input_pkt_queue.lock, *flags); -	else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) +	else  		local_irq_save(*flags);  } -static inline void rps_lock_irq_disable(struct softnet_data *sd) +static inline void backlog_lock_irq_disable(struct softnet_data *sd)  { -	if (IS_ENABLED(CONFIG_RPS)) +	if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())  		spin_lock_irq(&sd->input_pkt_queue.lock); -	else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) +	else  		local_irq_disable();  } -static inline void rps_unlock_irq_restore(struct softnet_data *sd, -					  unsigned long *flags) +static inline void backlog_unlock_irq_restore(struct softnet_data *sd, +					      unsigned long *flags)  { -	if (IS_ENABLED(CONFIG_RPS)) +	if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())  		
spin_unlock_irqrestore(&sd->input_pkt_queue.lock, *flags); -	else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) +	else  		local_irq_restore(*flags);  } -static inline void rps_unlock_irq_enable(struct softnet_data *sd) +static inline void backlog_unlock_irq_enable(struct softnet_data *sd)  { -	if (IS_ENABLED(CONFIG_RPS)) +	if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())  		spin_unlock_irq(&sd->input_pkt_queue.lock); -	else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) +	else  		local_irq_enable();  } @@ -422,7 +449,9 @@ static RAW_NOTIFIER_HEAD(netdev_chain);   *	queue in the local softnet handler.   */ -DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data); +DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data) = { +	.process_queue_bh_lock = INIT_LOCAL_LOCK(process_queue_bh_lock), +};  EXPORT_PER_CPU_SYMBOL(softnet_data);  /* Page_pool has a lockless array/stack to alloc/recycle pages. @@ -912,6 +941,18 @@ struct net_device *dev_get_by_napi_id(unsigned int napi_id)  }  EXPORT_SYMBOL(dev_get_by_napi_id); +static DEFINE_SEQLOCK(netdev_rename_lock); + +void netdev_copy_name(struct net_device *dev, char *name) +{ +	unsigned int seq; + +	do { +		seq = read_seqbegin(&netdev_rename_lock); +		strscpy(name, dev->name, IFNAMSIZ); +	} while (read_seqretry(&netdev_rename_lock, seq)); +} +  /**   *	netdev_get_name - get a netdevice name, knowing its ifindex.   
*	@net: network namespace @@ -923,7 +964,6 @@ int netdev_get_name(struct net *net, char *name, int ifindex)  	struct net_device *dev;  	int ret; -	down_read(&devnet_rename_sem);  	rcu_read_lock();  	dev = dev_get_by_index_rcu(net, ifindex); @@ -932,12 +972,11 @@ int netdev_get_name(struct net *net, char *name, int ifindex)  		goto out;  	} -	strcpy(name, dev->name); +	netdev_copy_name(dev, name);  	ret = 0;  out:  	rcu_read_unlock(); -	up_read(&devnet_rename_sem);  	return ret;  } @@ -1189,7 +1228,10 @@ int dev_change_name(struct net_device *dev, const char *newname)  	memcpy(oldname, dev->name, IFNAMSIZ); +	write_seqlock_bh(&netdev_rename_lock);  	err = dev_get_valid_name(net, dev, newname); +	write_sequnlock_bh(&netdev_rename_lock); +  	if (err < 0) {  		up_write(&devnet_rename_sem);  		return err; @@ -1229,7 +1271,9 @@ rollback:  		if (err >= 0) {  			err = ret;  			down_write(&devnet_rename_sem); +			write_seqlock_bh(&netdev_rename_lock);  			memcpy(dev->name, oldname, IFNAMSIZ); +			write_sequnlock_bh(&netdev_rename_lock);  			memcpy(oldname, newname, IFNAMSIZ);  			WRITE_ONCE(dev->name_assign_type, old_assign_type);  			old_assign_type = NET_NAME_RENAMED; @@ -2057,6 +2101,11 @@ void net_dec_egress_queue(void)  EXPORT_SYMBOL_GPL(net_dec_egress_queue);  #endif +#ifdef CONFIG_NET_CLS_ACT +DEFINE_STATIC_KEY_FALSE(tcf_bypass_check_needed_key); +EXPORT_SYMBOL(tcf_bypass_check_needed_key); +#endif +  DEFINE_STATIC_KEY_FALSE(netstamp_needed_key);  EXPORT_SYMBOL(netstamp_needed_key);  #ifdef CONFIG_JUMP_LABEL @@ -2113,7 +2162,7 @@ EXPORT_SYMBOL(net_disable_timestamp);  static inline void net_timestamp_set(struct sk_buff *skb)  {  	skb->tstamp = 0; -	skb->mono_delivery_time = 0; +	skb->tstamp_type = SKB_CLOCK_REALTIME;  	if (static_branch_unlikely(&netstamp_needed_key))  		skb->tstamp = ktime_get_real();  } @@ -3893,6 +3942,7 @@ netdev_tx_queue_mapping(struct net_device *dev, struct sk_buff *skb)  	return netdev_get_tx_queue(dev, netdev_cap_txqueue(dev, qm));  } 
+#ifndef CONFIG_PREEMPT_RT  static bool netdev_xmit_txqueue_skipped(void)  {  	return __this_cpu_read(softnet_data.xmit.skip_txqueue); @@ -3903,6 +3953,19 @@ void netdev_xmit_skip_txqueue(bool skip)  	__this_cpu_write(softnet_data.xmit.skip_txqueue, skip);  }  EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue); + +#else +static bool netdev_xmit_txqueue_skipped(void) +{ +	return current->net_xmit.skip_txqueue; +} + +void netdev_xmit_skip_txqueue(bool skip) +{ +	current->net_xmit.skip_txqueue = skip; +} +EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue); +#endif  #endif /* CONFIG_NET_EGRESS */  #ifdef CONFIG_NET_XGRESS @@ -3917,6 +3980,11 @@ static int tc_run(struct tcx_entry *entry, struct sk_buff *skb,  	if (!miniq)  		return ret; +	if (static_branch_unlikely(&tcf_bypass_check_needed_key)) { +		if (tcf_block_bypass_sw(miniq->block)) +			return ret; +	} +  	tc_skb_cb(skb)->mru = 0;  	tc_skb_cb(skb)->post_ct = false;  	tcf_set_drop_reason(skb, *drop_reason); @@ -3977,10 +4045,13 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,  {  	struct bpf_mprog_entry *entry = rcu_dereference_bh(skb->dev->tcx_ingress);  	enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_INGRESS; +	struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;  	int sch_ret;  	if (!entry)  		return skb; + +	bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);  	if (*pt_prev) {  		*ret = deliver_skb(skb, *pt_prev, orig_dev);  		*pt_prev = NULL; @@ -4009,10 +4080,12 @@ ingress_verdict:  			break;  		}  		*ret = NET_RX_SUCCESS; +		bpf_net_ctx_clear(bpf_net_ctx);  		return NULL;  	case TC_ACT_SHOT:  		kfree_skb_reason(skb, drop_reason);  		*ret = NET_RX_DROP; +		bpf_net_ctx_clear(bpf_net_ctx);  		return NULL;  	/* used by tc_run */  	case TC_ACT_STOLEN: @@ -4022,8 +4095,10 @@ ingress_verdict:  		fallthrough;  	case TC_ACT_CONSUMED:  		*ret = NET_RX_SUCCESS; +		bpf_net_ctx_clear(bpf_net_ctx);  		return NULL;  	} +	bpf_net_ctx_clear(bpf_net_ctx);  	return skb;  } @@ -4033,11 +4108,14 @@ 
sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)  {  	struct bpf_mprog_entry *entry = rcu_dereference_bh(dev->tcx_egress);  	enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_EGRESS; +	struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;  	int sch_ret;  	if (!entry)  		return skb; +	bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); +  	/* qdisc_skb_cb(skb)->pkt_len & tcx_set_ingress() was  	 * already set by the caller.  	 */ @@ -4053,10 +4131,12 @@ egress_verdict:  		/* No need to push/pop skb's mac_header here on egress! */  		skb_do_redirect(skb);  		*ret = NET_XMIT_SUCCESS; +		bpf_net_ctx_clear(bpf_net_ctx);  		return NULL;  	case TC_ACT_SHOT:  		kfree_skb_reason(skb, drop_reason);  		*ret = NET_XMIT_DROP; +		bpf_net_ctx_clear(bpf_net_ctx);  		return NULL;  	/* used by tc_run */  	case TC_ACT_STOLEN: @@ -4066,8 +4146,10 @@ egress_verdict:  		fallthrough;  	case TC_ACT_CONSUMED:  		*ret = NET_XMIT_SUCCESS; +		bpf_net_ctx_clear(bpf_net_ctx);  		return NULL;  	} +	bpf_net_ctx_clear(bpf_net_ctx);  	return skb;  } @@ -4410,8 +4492,8 @@ EXPORT_SYMBOL(__dev_direct_xmit);  /*************************************************************************   *			Receiver routines   *************************************************************************/ +static DEFINE_PER_CPU(struct task_struct *, backlog_napi); -unsigned int sysctl_skb_defer_max __read_mostly = 64;  int weight_p __read_mostly = 64;           /* old backlog weight */  int dev_weight_rx_bias __read_mostly = 1;  /* bias for backlog weight */  int dev_weight_tx_bias __read_mostly = 1;  /* bias for output_queue quota */ @@ -4433,18 +4515,16 @@ static inline void ____napi_schedule(struct softnet_data *sd,  		 */  		thread = READ_ONCE(napi->thread);  		if (thread) { -			/* Avoid doing set_bit() if the thread is in -			 * INTERRUPTIBLE state, cause napi_thread_wait() -			 * makes sure to proceed with napi polling -			 * if the thread is explicitly woken from here. 
-			 */ -			if (READ_ONCE(thread->__state) != TASK_INTERRUPTIBLE) -				set_bit(NAPI_STATE_SCHED_THREADED, &napi->state); +			if (use_backlog_threads() && thread == raw_cpu_read(backlog_napi)) +				goto use_local_napi; + +			set_bit(NAPI_STATE_SCHED_THREADED, &napi->state);  			wake_up_process(thread);  			return;  		}  	} +use_local_napi:  	list_add_tail(&napi->poll_list, &sd->poll_list);  	WRITE_ONCE(napi->list_owner, smp_processor_id());  	/* If not called from net_rx_action() @@ -4466,12 +4546,13 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb,  	    struct rps_dev_flow *rflow, u16 next_cpu)  {  	if (next_cpu < nr_cpu_ids) { +		u32 head;  #ifdef CONFIG_RFS_ACCEL  		struct netdev_rx_queue *rxqueue;  		struct rps_dev_flow_table *flow_table;  		struct rps_dev_flow *old_rflow; -		u32 flow_id;  		u16 rxq_index; +		u32 flow_id;  		int rc;  		/* Should we steer this flow to a different hardware queue? */ @@ -4493,16 +4574,16 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb,  			goto out;  		old_rflow = rflow;  		rflow = &flow_table->flows[flow_id]; -		rflow->filter = rc; -		if (old_rflow->filter == rflow->filter) -			old_rflow->filter = RPS_NO_FILTER; +		WRITE_ONCE(rflow->filter, rc); +		if (old_rflow->filter == rc) +			WRITE_ONCE(old_rflow->filter, RPS_NO_FILTER);  	out:  #endif -		rflow->last_qtail = -			per_cpu(softnet_data, next_cpu).input_queue_head; +		head = READ_ONCE(per_cpu(softnet_data, next_cpu).input_queue_head); +		rps_input_queue_tail_save(&rflow->last_qtail, head);  	} -	rflow->cpu = next_cpu; +	WRITE_ONCE(rflow->cpu, next_cpu);  	return rflow;  } @@ -4581,7 +4662,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,  		 */  		if (unlikely(tcpu != next_cpu) &&  		    (tcpu >= nr_cpu_ids || !cpu_online(tcpu) || -		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head - +		     ((int)(READ_ONCE(per_cpu(softnet_data, tcpu).input_queue_head) -  		      rflow->last_qtail)) >= 0)) {  			tcpu = next_cpu;  			rflow = 
set_rps_cpu(dev, skb, rflow, next_cpu); @@ -4635,9 +4716,9 @@ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,  	if (flow_table && flow_id <= flow_table->mask) {  		rflow = &flow_table->flows[flow_id];  		cpu = READ_ONCE(rflow->cpu); -		if (rflow->filter == filter_id && cpu < nr_cpu_ids && -		    ((int)(per_cpu(softnet_data, cpu).input_queue_head - -			   rflow->last_qtail) < +		if (READ_ONCE(rflow->filter) == filter_id && cpu < nr_cpu_ids && +		    ((int)(READ_ONCE(per_cpu(softnet_data, cpu).input_queue_head) - +			   READ_ONCE(rflow->last_qtail)) <  		     (int)(10 * flow_table->mask)))  			expire = false;  	} @@ -4684,6 +4765,11 @@ static void napi_schedule_rps(struct softnet_data *sd)  #ifdef CONFIG_RPS  	if (sd != mysd) { +		if (use_backlog_threads()) { +			__napi_schedule_irqoff(&sd->backlog); +			return; +		} +  		sd->rps_ipi_next = mysd->rps_ipi_list;  		mysd->rps_ipi_list = sd; @@ -4698,6 +4784,23 @@ static void napi_schedule_rps(struct softnet_data *sd)  	__napi_schedule_irqoff(&mysd->backlog);  } +void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu) +{ +	unsigned long flags; + +	if (use_backlog_threads()) { +		backlog_lock_irq_save(sd, &flags); + +		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) +			__napi_schedule_irqoff(&sd->backlog); + +		backlog_unlock_irq_restore(sd, &flags); + +	} else if (!cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) { +		smp_call_function_single_async(cpu, &sd->defer_csd); +	} +} +  #ifdef CONFIG_NET_FLOW_LIMIT  int netdev_flow_limit_table_len __read_mostly = (1 << 12);  #endif @@ -4749,37 +4852,45 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,  	struct softnet_data *sd;  	unsigned long flags;  	unsigned int qlen; +	int max_backlog; +	u32 tail; -	reason = SKB_DROP_REASON_NOT_SPECIFIED; +	reason = SKB_DROP_REASON_DEV_READY; +	if (!netif_running(skb->dev)) +		goto bad_dev; + +	reason = SKB_DROP_REASON_CPU_BACKLOG;  	sd = &per_cpu(softnet_data, cpu); -	rps_lock_irqsave(sd, 
&flags); -	if (!netif_running(skb->dev)) -		goto drop; +	qlen = skb_queue_len_lockless(&sd->input_pkt_queue); +	max_backlog = READ_ONCE(net_hotdata.max_backlog); +	if (unlikely(qlen > max_backlog)) +		goto cpu_backlog_drop; +	backlog_lock_irq_save(sd, &flags);  	qlen = skb_queue_len(&sd->input_pkt_queue); -	if (qlen <= READ_ONCE(net_hotdata.max_backlog) && -	    !skb_flow_limit(skb, qlen)) { -		if (qlen) { -enqueue: -			__skb_queue_tail(&sd->input_pkt_queue, skb); -			input_queue_tail_incr_save(sd, qtail); -			rps_unlock_irq_restore(sd, &flags); -			return NET_RX_SUCCESS; +	if (qlen <= max_backlog && !skb_flow_limit(skb, qlen)) { +		if (!qlen) { +			/* Schedule NAPI for backlog device. We can use +			 * non atomic operation as we own the queue lock. +			 */ +			if (!__test_and_set_bit(NAPI_STATE_SCHED, +						&sd->backlog.state)) +				napi_schedule_rps(sd);  		} +		__skb_queue_tail(&sd->input_pkt_queue, skb); +		tail = rps_input_queue_tail_incr(sd); +		backlog_unlock_irq_restore(sd, &flags); -		/* Schedule NAPI for backlog device -		 * We can use non atomic operation since we own the queue lock -		 */ -		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) -			napi_schedule_rps(sd); -		goto enqueue; +		/* save the tail outside of the critical section */ +		rps_input_queue_tail_save(qtail, tail); +		return NET_RX_SUCCESS;  	} -	reason = SKB_DROP_REASON_CPU_BACKLOG; -drop: -	sd->dropped++; -	rps_unlock_irq_restore(sd, &flags); +	backlog_unlock_irq_restore(sd, &flags); +cpu_backlog_drop: +	atomic_inc(&sd->dropped); +bad_dev:  	dev_core_stats_rx_dropped_inc(skb->dev);  	kfree_skb_reason(skb, reason);  	return NET_RX_DROP; @@ -5015,11 +5126,14 @@ static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key);  int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff **pskb)  { +	struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; +  	if (xdp_prog) {  		struct xdp_buff xdp;  		u32 act;  		int err; +		bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);  		act = 
netif_receive_generic_xdp(pskb, &xdp, xdp_prog);  		if (act != XDP_PASS) {  			switch (act) { @@ -5033,11 +5147,14 @@ int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff **pskb)  				generic_xdp_tx(*pskb, xdp_prog);  				break;  			} +			bpf_net_ctx_clear(bpf_net_ctx);  			return XDP_DROP;  		} +		bpf_net_ctx_clear(bpf_net_ctx);  	}  	return XDP_PASS;  out_redir: +	bpf_net_ctx_clear(bpf_net_ctx);  	kfree_skb_reason(*pskb, SKB_DROP_REASON_XDP);  	return XDP_DROP;  } @@ -5153,7 +5270,7 @@ static __latent_entropy void net_tx_action(struct softirq_action *h)  				trace_consume_skb(skb, net_tx_action);  			else  				trace_kfree_skb(skb, net_tx_action, -						get_kfree_skb_cb(skb)->reason); +						get_kfree_skb_cb(skb)->reason, NULL);  			if (skb->fclone != SKB_FCLONE_UNAVAILABLE)  				__kfree_skb(skb); @@ -5844,23 +5961,25 @@ static void flush_backlog(struct work_struct *work)  	local_bh_disable();  	sd = this_cpu_ptr(&softnet_data); -	rps_lock_irq_disable(sd); +	backlog_lock_irq_disable(sd);  	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {  		if (skb->dev->reg_state == NETREG_UNREGISTERING) {  			__skb_unlink(skb, &sd->input_pkt_queue);  			dev_kfree_skb_irq(skb); -			input_queue_head_incr(sd); +			rps_input_queue_head_incr(sd);  		}  	} -	rps_unlock_irq_enable(sd); +	backlog_unlock_irq_enable(sd); +	local_lock_nested_bh(&softnet_data.process_queue_bh_lock);  	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {  		if (skb->dev->reg_state == NETREG_UNREGISTERING) {  			__skb_unlink(skb, &sd->process_queue);  			kfree_skb(skb); -			input_queue_head_incr(sd); +			rps_input_queue_head_incr(sd);  		}  	} +	local_unlock_nested_bh(&softnet_data.process_queue_bh_lock);  	local_bh_enable();  } @@ -5870,14 +5989,14 @@ static bool flush_required(int cpu)  	struct softnet_data *sd = &per_cpu(softnet_data, cpu);  	bool do_flush; -	rps_lock_irq_disable(sd); +	backlog_lock_irq_disable(sd);  	/* as insertion into process_queue happens with the rps lock held,  	 * 
process_queue access may race only with dequeue  	 */  	do_flush = !skb_queue_empty(&sd->input_pkt_queue) ||  		   !skb_queue_empty_lockless(&sd->process_queue); -	rps_unlock_irq_enable(sd); +	backlog_unlock_irq_enable(sd);  	return do_flush;  #endif @@ -5943,7 +6062,7 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd)  #ifdef CONFIG_RPS  	struct softnet_data *remsd = sd->rps_ipi_list; -	if (remsd) { +	if (!use_backlog_threads() && remsd) {  		sd->rps_ipi_list = NULL;  		local_irq_enable(); @@ -5958,7 +6077,7 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd)  static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)  {  #ifdef CONFIG_RPS -	return sd->rps_ipi_list != NULL; +	return !use_backlog_threads() && sd->rps_ipi_list;  #else  	return false;  #endif @@ -5982,17 +6101,22 @@ static int process_backlog(struct napi_struct *napi, int quota)  	while (again) {  		struct sk_buff *skb; +		local_lock_nested_bh(&softnet_data.process_queue_bh_lock);  		while ((skb = __skb_dequeue(&sd->process_queue))) { +			local_unlock_nested_bh(&softnet_data.process_queue_bh_lock);  			rcu_read_lock();  			__netif_receive_skb(skb);  			rcu_read_unlock(); -			input_queue_head_incr(sd); -			if (++work >= quota) +			if (++work >= quota) { +				rps_input_queue_head_add(sd, work);  				return work; +			} +			local_lock_nested_bh(&softnet_data.process_queue_bh_lock);  		} +		local_unlock_nested_bh(&softnet_data.process_queue_bh_lock); -		rps_lock_irq_disable(sd); +		backlog_lock_irq_disable(sd);  		if (skb_queue_empty(&sd->input_pkt_queue)) {  			/*  			 * Inline a custom version of __napi_complete(). @@ -6002,15 +6126,19 @@ static int process_backlog(struct napi_struct *napi, int quota)  			 * We can use a plain write instead of clear_bit(),  			 * and we dont need an smp_mb() memory barrier.  			 
*/ -			napi->state = 0; +			napi->state &= NAPIF_STATE_THREADED;  			again = false;  		} else { +			local_lock_nested_bh(&softnet_data.process_queue_bh_lock);  			skb_queue_splice_tail_init(&sd->input_pkt_queue,  						   &sd->process_queue); +			local_unlock_nested_bh(&softnet_data.process_queue_bh_lock);  		} -		rps_unlock_irq_enable(sd); +		backlog_unlock_irq_enable(sd);  	} +	if (work) +		rps_input_queue_head_add(sd, work);  	return work;  } @@ -6217,6 +6345,7 @@ enum {  static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock,  			   unsigned flags, u16 budget)  { +	struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;  	bool skip_schedule = false;  	unsigned long timeout;  	int rc; @@ -6234,6 +6363,7 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock,  	clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);  	local_bh_disable(); +	bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);  	if (flags & NAPI_F_PREFER_BUSY_POLL) {  		napi->defer_hard_irqs_count = READ_ONCE(napi->dev->napi_defer_hard_irqs); @@ -6256,6 +6386,7 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock,  	netpoll_poll_unlock(have_poll_lock);  	if (rc == budget)  		__busy_poll_stop(napi, skip_schedule); +	bpf_net_ctx_clear(bpf_net_ctx);  	local_bh_enable();  } @@ -6265,6 +6396,7 @@ static void __napi_busy_loop(unsigned int napi_id,  {  	unsigned long start_time = loop_end ? 
busy_loop_current_time() : 0;  	int (*napi_poll)(struct napi_struct *napi, int budget); +	struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;  	void *have_poll_lock = NULL;  	struct napi_struct *napi; @@ -6283,6 +6415,7 @@ restart:  		int work = 0;  		local_bh_disable(); +		bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);  		if (!napi_poll) {  			unsigned long val = READ_ONCE(napi->state); @@ -6313,6 +6446,7 @@ count:  			__NET_ADD_STATS(dev_net(napi->dev),  					LINUX_MIB_BUSYPOLLRXPACKETS, work);  		skb_defer_free_flush(this_cpu_ptr(&softnet_data)); +		bpf_net_ctx_clear(bpf_net_ctx);  		local_bh_enable();  		if (!loop_end || loop_end(loop_end_arg, start_time)) @@ -6447,7 +6581,7 @@ int dev_set_threaded(struct net_device *dev, bool threaded)  		}  	} -	dev->threaded = threaded; +	WRITE_ONCE(dev->threaded, threaded);  	/* Make sure kthread is created before THREADED bit  	 * is set. @@ -6538,7 +6672,7 @@ void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,  	 * threaded mode will not be enabled in napi_enable().  	 */  	if (dev->threaded && napi_kthread_create(napi)) -		dev->threaded = 0; +		dev->threaded = false;  	netif_napi_set_irq(napi, -1);  }  EXPORT_SYMBOL(netif_napi_add_weight); @@ -6716,8 +6850,6 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll)  static int napi_thread_wait(struct napi_struct *napi)  { -	bool woken = false; -  	set_current_state(TASK_INTERRUPTIBLE);  	while (!kthread_should_stop()) { @@ -6726,15 +6858,13 @@ static int napi_thread_wait(struct napi_struct *napi)  		 * Testing SCHED bit is not enough because SCHED bit might be  		 * set by some other busy poll thread or by napi_disable().  		 */ -		if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state) || woken) { +		if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state)) {  			WARN_ON(!list_empty(&napi->poll_list));  			__set_current_state(TASK_RUNNING);  			return 0;  		}  		schedule(); -		/* woken being true indicates this thread owns this napi. 
*/ -		woken = true;  		set_current_state(TASK_INTERRUPTIBLE);  	}  	__set_current_state(TASK_RUNNING); @@ -6742,43 +6872,52 @@ static int napi_thread_wait(struct napi_struct *napi)  	return -1;  } -static int napi_threaded_poll(void *data) +static void napi_threaded_poll_loop(struct napi_struct *napi)  { -	struct napi_struct *napi = data; +	struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;  	struct softnet_data *sd; -	void *have; +	unsigned long last_qs = jiffies; -	while (!napi_thread_wait(napi)) { -		unsigned long last_qs = jiffies; +	for (;;) { +		bool repoll = false; +		void *have; -		for (;;) { -			bool repoll = false; +		local_bh_disable(); +		bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); -			local_bh_disable(); -			sd = this_cpu_ptr(&softnet_data); -			sd->in_napi_threaded_poll = true; +		sd = this_cpu_ptr(&softnet_data); +		sd->in_napi_threaded_poll = true; -			have = netpoll_poll_lock(napi); -			__napi_poll(napi, &repoll); -			netpoll_poll_unlock(have); +		have = netpoll_poll_lock(napi); +		__napi_poll(napi, &repoll); +		netpoll_poll_unlock(have); -			sd->in_napi_threaded_poll = false; -			barrier(); +		sd->in_napi_threaded_poll = false; +		barrier(); -			if (sd_has_rps_ipi_waiting(sd)) { -				local_irq_disable(); -				net_rps_action_and_irq_enable(sd); -			} -			skb_defer_free_flush(sd); -			local_bh_enable(); +		if (sd_has_rps_ipi_waiting(sd)) { +			local_irq_disable(); +			net_rps_action_and_irq_enable(sd); +		} +		skb_defer_free_flush(sd); +		bpf_net_ctx_clear(bpf_net_ctx); +		local_bh_enable(); -			if (!repoll) -				break; +		if (!repoll) +			break; -			rcu_softirq_qs_periodic(last_qs); -			cond_resched(); -		} +		rcu_softirq_qs_periodic(last_qs); +		cond_resched();  	} +} + +static int napi_threaded_poll(void *data) +{ +	struct napi_struct *napi = data; + +	while (!napi_thread_wait(napi)) +		napi_threaded_poll_loop(napi); +  	return 0;  } @@ -6787,10 +6926,12 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)  	struct 
softnet_data *sd = this_cpu_ptr(&softnet_data);  	unsigned long time_limit = jiffies +  		usecs_to_jiffies(READ_ONCE(net_hotdata.netdev_budget_usecs)); +	struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;  	int budget = READ_ONCE(net_hotdata.netdev_budget);  	LIST_HEAD(list);  	LIST_HEAD(repoll); +	bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);  start:  	sd->in_net_rx_action = true;  	local_irq_disable(); @@ -6843,7 +6984,8 @@ start:  		sd->in_net_rx_action = false;  	net_rps_action_and_irq_enable(sd); -end:; +end: +	bpf_net_ctx_clear(bpf_net_ctx);  }  struct netdev_adjacent { @@ -8459,27 +8601,29 @@ static void dev_change_rx_flags(struct net_device *dev, int flags)  static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)  {  	unsigned int old_flags = dev->flags; +	unsigned int promiscuity, flags;  	kuid_t uid;  	kgid_t gid;  	ASSERT_RTNL(); -	dev->flags |= IFF_PROMISC; -	dev->promiscuity += inc; -	if (dev->promiscuity == 0) { +	promiscuity = dev->promiscuity + inc; +	if (promiscuity == 0) {  		/*  		 * Avoid overflow.  		 * If inc causes overflow, untouch promisc and return error.  		 */ -		if (inc < 0) -			dev->flags &= ~IFF_PROMISC; -		else { -			dev->promiscuity -= inc; +		if (unlikely(inc > 0)) {  			netdev_warn(dev, "promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n");  			return -EOVERFLOW;  		} +		flags = old_flags & ~IFF_PROMISC; +	} else { +		flags = old_flags | IFF_PROMISC;  	} -	if (dev->flags != old_flags) { +	WRITE_ONCE(dev->promiscuity, promiscuity); +	if (flags != old_flags) { +		WRITE_ONCE(dev->flags, flags);  		netdev_info(dev, "%s promiscuous mode\n",  			    dev->flags & IFF_PROMISC ? 
"entered" : "left");  		if (audit_enabled) { @@ -8530,25 +8674,27 @@ EXPORT_SYMBOL(dev_set_promiscuity);  static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)  {  	unsigned int old_flags = dev->flags, old_gflags = dev->gflags; +	unsigned int allmulti, flags;  	ASSERT_RTNL(); -	dev->flags |= IFF_ALLMULTI; -	dev->allmulti += inc; -	if (dev->allmulti == 0) { +	allmulti = dev->allmulti + inc; +	if (allmulti == 0) {  		/*  		 * Avoid overflow.  		 * If inc causes overflow, untouch allmulti and return error.  		 */ -		if (inc < 0) -			dev->flags &= ~IFF_ALLMULTI; -		else { -			dev->allmulti -= inc; +		if (unlikely(inc > 0)) {  			netdev_warn(dev, "allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n");  			return -EOVERFLOW;  		} +		flags = old_flags & ~IFF_ALLMULTI; +	} else { +		flags = old_flags | IFF_ALLMULTI;  	} -	if (dev->flags ^ old_flags) { +	WRITE_ONCE(dev->allmulti, allmulti); +	if (flags != old_flags) { +		WRITE_ONCE(dev->flags, flags);  		netdev_info(dev, "%s allmulticast mode\n",  			    dev->flags & IFF_ALLMULTI ? 
"entered" : "left");  		dev_change_rx_flags(dev, IFF_ALLMULTI); @@ -8874,7 +9020,7 @@ int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len)  		return -ERANGE;  	if (new_len != orig_len) { -		dev->tx_queue_len = new_len; +		WRITE_ONCE(dev->tx_queue_len, new_len);  		res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev);  		res = notifier_to_errno(res);  		if (res) @@ -8888,7 +9034,7 @@ int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len)  err_rollback:  	netdev_err(dev, "refused to change device tx_queue_len\n"); -	dev->tx_queue_len = orig_len; +	WRITE_ONCE(dev->tx_queue_len, orig_len);  	return res;  } @@ -9134,7 +9280,7 @@ int dev_change_proto_down(struct net_device *dev, bool proto_down)  		netif_carrier_off(dev);  	else  		netif_carrier_on(dev); -	dev->proto_down = proto_down; +	WRITE_ONCE(dev->proto_down, proto_down);  	return 0;  } @@ -9148,18 +9294,21 @@ int dev_change_proto_down(struct net_device *dev, bool proto_down)  void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask,  				  u32 value)  { +	u32 proto_down_reason;  	int b;  	if (!mask) { -		dev->proto_down_reason = value; +		proto_down_reason = value;  	} else { +		proto_down_reason = dev->proto_down_reason;  		for_each_set_bit(b, &mask, 32) {  			if (value & (1 << b)) -				dev->proto_down_reason |= BIT(b); +				proto_down_reason |= BIT(b);  			else -				dev->proto_down_reason &= ~BIT(b); +				proto_down_reason &= ~BIT(b);  		}  	} +	WRITE_ONCE(dev->proto_down_reason, proto_down_reason);  }  struct bpf_xdp_link { @@ -9763,6 +9912,15 @@ static void netdev_sync_lower_features(struct net_device *upper,  	}  } +static bool netdev_has_ip_or_hw_csum(netdev_features_t features) +{ +	netdev_features_t ip_csum_mask = NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM; +	bool ip_csum = (features & ip_csum_mask) == ip_csum_mask; +	bool hw_csum = features & NETIF_F_HW_CSUM; + +	return ip_csum || hw_csum; +} +  static netdev_features_t 
netdev_fix_features(struct net_device *dev,  	netdev_features_t features)  { @@ -9844,15 +10002,9 @@ static netdev_features_t netdev_fix_features(struct net_device *dev,  		features &= ~NETIF_F_LRO;  	} -	if (features & NETIF_F_HW_TLS_TX) { -		bool ip_csum = (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) == -			(NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM); -		bool hw_csum = features & NETIF_F_HW_CSUM; - -		if (!ip_csum && !hw_csum) { -			netdev_dbg(dev, "Dropping TLS TX HW offload feature since no CSUM feature.\n"); -			features &= ~NETIF_F_HW_TLS_TX; -		} +	if ((features & NETIF_F_HW_TLS_TX) && !netdev_has_ip_or_hw_csum(features)) { +		netdev_dbg(dev, "Dropping TLS TX HW offload feature since no CSUM feature.\n"); +		features &= ~NETIF_F_HW_TLS_TX;  	}  	if ((features & NETIF_F_HW_TLS_RX) && !(features & NETIF_F_RXCSUM)) { @@ -9860,6 +10012,11 @@ static netdev_features_t netdev_fix_features(struct net_device *dev,  		features &= ~NETIF_F_HW_TLS_RX;  	} +	if ((features & NETIF_F_GSO_UDP_L4) && !netdev_has_ip_or_hw_csum(features)) { +		netdev_dbg(dev, "Dropping USO feature since no CSUM feature.\n"); +		features &= ~NETIF_F_GSO_UDP_L4; +	} +  	return features;  } @@ -10193,6 +10350,10 @@ int register_netdevice(struct net_device *dev)  	if (ret)  		return ret; +	/* rss ctx ID 0 is reserved for the default context, start from 1 */ +	xa_init_flags(&dev->ethtool->rss_ctx, XA_FLAGS_ALLOC1); +	mutex_init(&dev->ethtool->rss_lock); +  	spin_lock_init(&dev->addr_list_lock);  	netdev_set_addr_lockdep_class(dev); @@ -10349,25 +10510,12 @@ err_free_name:  }  EXPORT_SYMBOL(register_netdevice); -/** - *	init_dummy_netdev	- init a dummy network device for NAPI - *	@dev: device to init - * - *	This takes a network device structure and initialize the minimum - *	amount of fields so it can be used to schedule NAPI polls without - *	registering a full blown interface. 
This is to be used by drivers - *	that need to tie several hardware interfaces to a single NAPI - *	poll scheduler due to HW limitations. +/* Initialize the core of a dummy net device. + * This is useful if you are calling this function after alloc_netdev(), + * since it does not memset the net_device fields.   */ -void init_dummy_netdev(struct net_device *dev) +static void init_dummy_netdev_core(struct net_device *dev)  { -	/* Clear everything. Note we don't initialize spinlocks -	 * are they aren't supposed to be taken by any of the -	 * NAPI code and this dummy netdev is supposed to be -	 * only ever used for NAPI polls -	 */ -	memset(dev, 0, sizeof(struct net_device)); -  	/* make sure we BUG if trying to hit standard  	 * register/unregister code path  	 */ @@ -10388,8 +10536,28 @@ void init_dummy_netdev(struct net_device *dev)  	 * its refcount.  	 */  } -EXPORT_SYMBOL_GPL(init_dummy_netdev); +/** + *	init_dummy_netdev	- init a dummy network device for NAPI + *	@dev: device to init + * + *	This takes a network device structure and initializes the minimum + *	amount of fields so it can be used to schedule NAPI polls without + *	registering a full blown interface. This is to be used by drivers + *	that need to tie several hardware interfaces to a single NAPI + *	poll scheduler due to HW limitations. + */ +void init_dummy_netdev(struct net_device *dev) +{ +	/* Clear everything. 
Note we don't initialize spinlocks +	 * as they aren't supposed to be taken by any of the +	 * NAPI code and this dummy netdev is supposed to be +	 * only ever used for NAPI polls +	 */ +	memset(dev, 0, sizeof(struct net_device)); +	init_dummy_netdev_core(dev); +} +EXPORT_SYMBOL_GPL(init_dummy_netdev);  /**   *	register_netdev	- register a network device @@ -10488,8 +10656,9 @@ static struct net_device *netdev_wait_allrefs_any(struct list_head *list)  			rebroadcast_time = jiffies;  		} +		rcu_barrier(); +  		if (!wait) { -			rcu_barrier();  			wait = WAIT_REFS_MIN_MSECS;  		} else {  			msleep(wait); @@ -10603,6 +10772,54 @@ void netdev_run_todo(void)  		wake_up(&netdev_unregistering_wq);  } +/* Collate per-cpu network dstats statistics + * + * Read per-cpu network statistics from dev->dstats and populate the related + * fields in @s. + */ +static void dev_fetch_dstats(struct rtnl_link_stats64 *s, +			     const struct pcpu_dstats __percpu *dstats) +{ +	int cpu; + +	for_each_possible_cpu(cpu) { +		u64 rx_packets, rx_bytes, rx_drops; +		u64 tx_packets, tx_bytes, tx_drops; +		const struct pcpu_dstats *stats; +		unsigned int start; + +		stats = per_cpu_ptr(dstats, cpu); +		do { +			start = u64_stats_fetch_begin(&stats->syncp); +			rx_packets = u64_stats_read(&stats->rx_packets); +			rx_bytes   = u64_stats_read(&stats->rx_bytes); +			rx_drops   = u64_stats_read(&stats->rx_drops); +			tx_packets = u64_stats_read(&stats->tx_packets); +			tx_bytes   = u64_stats_read(&stats->tx_bytes); +			tx_drops   = u64_stats_read(&stats->tx_drops); +		} while (u64_stats_fetch_retry(&stats->syncp, start)); + +		s->rx_packets += rx_packets; +		s->rx_bytes   += rx_bytes; +		s->rx_dropped += rx_drops; +		s->tx_packets += tx_packets; +		s->tx_bytes   += tx_bytes; +		s->tx_dropped += tx_drops; +	} +} + +/* ndo_get_stats64 implementation for dtstats-based accounting. + * + * Populate @s from dev->stats and dev->dstats. 
This is used internally by the + * core for NETDEV_PCPU_STAT_DSTAT-type stats collection. + */ +static void dev_get_dstats64(const struct net_device *dev, +			     struct rtnl_link_stats64 *s) +{ +	netdev_stats_to_stats64(s, &dev->stats); +	dev_fetch_dstats(s, dev->dstats); +} +  /* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has   * all the same fields in the same order as net_device_stats, with only   * the type differing, but rtnl_link_stats64 may have additional fields @@ -10679,6 +10896,8 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,  		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));  	} else if (dev->pcpu_stat_type == NETDEV_PCPU_STAT_TSTATS) {  		dev_get_tstats64(dev, storage); +	} else if (dev->pcpu_stat_type == NETDEV_PCPU_STAT_DSTATS) { +		dev_get_dstats64(dev, storage);  	} else {  		netdev_stats_to_stats64(storage, &dev->stats);  	} @@ -10796,13 +11015,6 @@ void netdev_sw_irq_coalesce_default_on(struct net_device *dev)  }  EXPORT_SYMBOL_GPL(netdev_sw_irq_coalesce_default_on); -void netdev_freemem(struct net_device *dev) -{ -	char *addr = (char *)dev - dev->padded; - -	kvfree(addr); -} -  /**   * alloc_netdev_mqs - allocate network device   * @sizeof_priv: size of private data to allocate space for @@ -10822,8 +11034,6 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,  		unsigned int txqs, unsigned int rxqs)  {  	struct net_device *dev; -	unsigned int alloc_size; -	struct net_device *p;  	BUG_ON(strlen(name) >= sizeof(dev->name)); @@ -10837,21 +11047,12 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,  		return NULL;  	} -	alloc_size = sizeof(struct net_device); -	if (sizeof_priv) { -		/* ensure 32-byte alignment of private area */ -		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN); -		alloc_size += sizeof_priv; -	} -	/* ensure 32-byte alignment of whole construct */ -	alloc_size += NETDEV_ALIGN - 1; - -	p = kvzalloc(alloc_size, GFP_KERNEL_ACCOUNT | 
__GFP_RETRY_MAYFAIL); -	if (!p) +	dev = kvzalloc(struct_size(dev, priv, sizeof_priv), +		       GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); +	if (!dev)  		return NULL; -	dev = PTR_ALIGN(p, NETDEV_ALIGN); -	dev->padded = (char *)dev - (char *)p; +	dev->priv_len = sizeof_priv;  	ref_tracker_dir_init(&dev->refcnt_tracker, 128, name);  #ifdef CONFIG_PCPU_DEV_REFCNT @@ -10915,6 +11116,9 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,  	dev->real_num_rx_queues = rxqs;  	if (netif_alloc_rx_queues(dev))  		goto free_all; +	dev->ethtool = kzalloc(sizeof(*dev->ethtool), GFP_KERNEL_ACCOUNT); +	if (!dev->ethtool) +		goto free_all;  	strcpy(dev->name, name);  	dev->name_assign_type = name_assign_type; @@ -10935,7 +11139,7 @@ free_pcpu:  	free_percpu(dev->pcpu_refcnt);  free_dev:  #endif -	netdev_freemem(dev); +	kvfree(dev);  	return NULL;  }  EXPORT_SYMBOL(alloc_netdev_mqs); @@ -10965,6 +11169,7 @@ void free_netdev(struct net_device *dev)  		return;  	} +	kfree(dev->ethtool);  	netif_free_tx_queues(dev);  	netif_free_rx_queues(dev); @@ -10987,8 +11192,9 @@ void free_netdev(struct net_device *dev)  	dev->xdp_bulkq = NULL;  	/*  Compatibility with error handling in drivers */ -	if (dev->reg_state == NETREG_UNINITIALIZED) { -		netdev_freemem(dev); +	if (dev->reg_state == NETREG_UNINITIALIZED || +	    dev->reg_state == NETREG_DUMMY) { +		kvfree(dev);  		return;  	} @@ -11001,6 +11207,19 @@ void free_netdev(struct net_device *dev)  EXPORT_SYMBOL(free_netdev);  /** + * alloc_netdev_dummy - Allocate and initialize a dummy net device. 
+ * @sizeof_priv: size of private data to allocate space for + * + * Return: the allocated net_device on success, NULL otherwise + */ +struct net_device *alloc_netdev_dummy(int sizeof_priv) +{ +	return alloc_netdev(sizeof_priv, "dummy#", NET_NAME_UNKNOWN, +			    init_dummy_netdev_core); +} +EXPORT_SYMBOL_GPL(alloc_netdev_dummy); + +/**   *	synchronize_net -  Synchronize with packet receive processing   *   *	Wait for packets currently being received to be done. @@ -11016,6 +11235,34 @@ void synchronize_net(void)  }  EXPORT_SYMBOL(synchronize_net); +static void netdev_rss_contexts_free(struct net_device *dev) +{ +	struct ethtool_rxfh_context *ctx; +	unsigned long context; + +	mutex_lock(&dev->ethtool->rss_lock); +	xa_for_each(&dev->ethtool->rss_ctx, context, ctx) { +		struct ethtool_rxfh_param rxfh; + +		rxfh.indir = ethtool_rxfh_context_indir(ctx); +		rxfh.key = ethtool_rxfh_context_key(ctx); +		rxfh.hfunc = ctx->hfunc; +		rxfh.input_xfrm = ctx->input_xfrm; +		rxfh.rss_context = context; +		rxfh.rss_delete = true; + +		xa_erase(&dev->ethtool->rss_ctx, context); +		if (dev->ethtool_ops->create_rxfh_context) +			dev->ethtool_ops->remove_rxfh_context(dev, ctx, +							      context, NULL); +		else +			dev->ethtool_ops->set_rxfh(dev, &rxfh, NULL); +		kfree(ctx); +	} +	xa_destroy(&dev->ethtool->rss_ctx); +	mutex_unlock(&dev->ethtool->rss_lock); +} +  /**   *	unregister_netdevice_queue - remove device from the kernel   *	@dev: device @@ -11119,11 +11366,15 @@ void unregister_netdevice_many_notify(struct list_head *head,  		netdev_name_node_alt_flush(dev);  		netdev_name_node_free(dev->name_node); +		netdev_rss_contexts_free(dev); +  		call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev);  		if (dev->netdev_ops->ndo_uninit)  			dev->netdev_ops->ndo_uninit(dev); +		mutex_destroy(&dev->ethtool->rss_lock); +  		if (skb)  			rtmsg_ifinfo_send(skb, dev, GFP_KERNEL, portid, nlh); @@ -11303,8 +11554,12 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net,  	
dev_net_set(dev, net);  	dev->ifindex = new_ifindex; -	if (new_name[0]) /* Rename the netdev to prepared name */ +	if (new_name[0]) { +		/* Rename the netdev to prepared name */ +		write_seqlock_bh(&netdev_rename_lock);  		strscpy(dev->name, new_name, IFNAMSIZ); +		write_sequnlock_bh(&netdev_rename_lock); +	}  	/* Fixup kobjects */  	dev_set_uevent_suppress(&dev->dev, 1); @@ -11379,7 +11634,7 @@ static int dev_cpu_dead(unsigned int oldcpu)  		list_del_init(&napi->poll_list);  		if (napi->poll == process_backlog) -			napi->state = 0; +			napi->state &= NAPIF_STATE_THREADED;  		else  			____napi_schedule(sd, napi);  	} @@ -11387,21 +11642,23 @@ static int dev_cpu_dead(unsigned int oldcpu)  	raise_softirq_irqoff(NET_TX_SOFTIRQ);  	local_irq_enable(); +	if (!use_backlog_threads()) {  #ifdef CONFIG_RPS -	remsd = oldsd->rps_ipi_list; -	oldsd->rps_ipi_list = NULL; +		remsd = oldsd->rps_ipi_list; +		oldsd->rps_ipi_list = NULL;  #endif -	/* send out pending IPI's on offline CPU */ -	net_rps_send_ipi(remsd); +		/* send out pending IPI's on offline CPU */ +		net_rps_send_ipi(remsd); +	}  	/* Process offline CPU's input_pkt_queue */  	while ((skb = __skb_dequeue(&oldsd->process_queue))) {  		netif_rx(skb); -		input_queue_head_incr(oldsd); +		rps_input_queue_head_incr(oldsd);  	}  	while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {  		netif_rx(skb); -		input_queue_head_incr(oldsd); +		rps_input_queue_head_incr(oldsd);  	}  	return 0; @@ -11718,7 +11975,7 @@ static int net_page_pool_create(int cpuid)  	struct page_pool_params page_pool_params = {  		.pool_size = SYSTEM_PERCPU_PAGE_POOL_SIZE,  		.flags = PP_FLAG_SYSTEM_POOL, -		.nid = NUMA_NO_NODE, +		.nid = cpu_to_mem(cpuid),  	};  	struct page_pool *pp_ptr; @@ -11731,6 +11988,38 @@ static int net_page_pool_create(int cpuid)  	return 0;  } +static int backlog_napi_should_run(unsigned int cpu) +{ +	struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu); +	struct napi_struct *napi = &sd->backlog; + +	return 
test_bit(NAPI_STATE_SCHED_THREADED, &napi->state); +} + +static void run_backlog_napi(unsigned int cpu) +{ +	struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu); + +	napi_threaded_poll_loop(&sd->backlog); +} + +static void backlog_napi_setup(unsigned int cpu) +{ +	struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu); +	struct napi_struct *napi = &sd->backlog; + +	napi->thread = this_cpu_read(backlog_napi); +	set_bit(NAPI_STATE_THREADED, &napi->state); +} + +static struct smp_hotplug_thread backlog_threads = { +	.store			= &backlog_napi, +	.thread_should_run	= backlog_napi_should_run, +	.thread_fn		= run_backlog_napi, +	.thread_comm		= "backlog_napi/%u", +	.setup			= backlog_napi_setup, +}; +  /*   *       This is called single threaded during boot, so no need   *       to take the rtnl semaphore. @@ -11782,10 +12071,13 @@ static int __init net_dev_init(void)  		init_gro_hash(&sd->backlog);  		sd->backlog.poll = process_backlog;  		sd->backlog.weight = weight_p; +		INIT_LIST_HEAD(&sd->backlog.poll_list);  		if (net_page_pool_create(i))  			goto out;  	} +	if (use_backlog_threads()) +		smpboot_register_percpu_thread(&backlog_threads);  	dev_boot_phase = 0; @@ -11811,6 +12103,10 @@ static int __init net_dev_init(void)  				       NULL, dev_cpu_dead);  	WARN_ON(rc < 0);  	rc = 0; + +	/* avoid static key IPIs to isolated CPUs */ +	if (housekeeping_enabled(HK_TYPE_MISC)) +		net_enable_timestamp();  out:  	if (rc < 0) {  		for_each_possible_cpu(i) { diff --git a/net/core/dev.h b/net/core/dev.h index 2bcaf8eee50c..5654325c5b71 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -4,11 +4,9 @@  #include <linux/types.h>  #include <linux/rwsem.h> +#include <linux/netdevice.h>  struct net; -struct net_device; -struct netdev_bpf; -struct netdev_phys_item_id;  struct netlink_ext_ack;  struct cpumask; @@ -38,7 +36,6 @@ int dev_addr_init(struct net_device *dev);  void dev_addr_check(struct net_device *dev);  /* sysctls not referred to from outside net/core/ */ -extern 
unsigned int	sysctl_skb_defer_max;  extern int		netdev_unregister_timeout_secs;  extern int		weight_p;  extern int		dev_weight_rx_bias; @@ -150,4 +147,45 @@ static inline void xdp_do_check_flushed(struct napi_struct *napi) { }  #endif  struct napi_struct *napi_by_id(unsigned int napi_id); +void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu); + +#define XMIT_RECURSION_LIMIT	8 + +#ifndef CONFIG_PREEMPT_RT +static inline bool dev_xmit_recursion(void) +{ +	return unlikely(__this_cpu_read(softnet_data.xmit.recursion) > +			XMIT_RECURSION_LIMIT); +} + +static inline void dev_xmit_recursion_inc(void) +{ +	__this_cpu_inc(softnet_data.xmit.recursion); +} + +static inline void dev_xmit_recursion_dec(void) +{ +	__this_cpu_dec(softnet_data.xmit.recursion); +} +#else +static inline bool dev_xmit_recursion(void) +{ +	return unlikely(current->net_xmit.recursion > XMIT_RECURSION_LIMIT); +} + +static inline void dev_xmit_recursion_inc(void) +{ +	current->net_xmit.recursion++; +} + +static inline void dev_xmit_recursion_dec(void) +{ +	current->net_xmit.recursion--; +} +#endif + +int dev_set_hwtstamp_phylib(struct net_device *dev, +			    struct kernel_hwtstamp_config *cfg, +			    struct netlink_ext_ack *extack); +  #endif diff --git a/net/core/dev_addr_lists_test.c b/net/core/dev_addr_lists_test.c index 4dbd0dc6aea2..8e1dba825e94 100644 --- a/net/core/dev_addr_lists_test.c +++ b/net/core/dev_addr_lists_test.c @@ -49,7 +49,6 @@ static int dev_addr_test_init(struct kunit *test)  		KUNIT_FAIL(test, "Can't register netdev %d", err);  	} -	rtnl_lock();  	return 0;  } @@ -57,7 +56,6 @@ static void dev_addr_test_exit(struct kunit *test)  {  	struct net_device *netdev = test->priv; -	rtnl_unlock();  	unregister_netdev(netdev);  	free_netdev(netdev);  } @@ -67,6 +65,7 @@ static void dev_addr_test_basic(struct kunit *test)  	struct net_device *netdev = test->priv;  	u8 addr[ETH_ALEN]; +	rtnl_lock();  	KUNIT_EXPECT_TRUE(test, !!netdev->dev_addr);  	memset(addr, 2, 
sizeof(addr)); @@ -76,6 +75,7 @@ static void dev_addr_test_basic(struct kunit *test)  	memset(addr, 3, sizeof(addr));  	dev_addr_set(netdev, addr);  	KUNIT_EXPECT_MEMEQ(test, netdev->dev_addr, addr, sizeof(addr)); +	rtnl_unlock();  }  static void dev_addr_test_sync_one(struct kunit *test) @@ -86,6 +86,7 @@ static void dev_addr_test_sync_one(struct kunit *test)  	datp = netdev_priv(netdev); +	rtnl_lock();  	memset(addr, 1, sizeof(addr));  	eth_hw_addr_set(netdev, addr); @@ -103,6 +104,7 @@ static void dev_addr_test_sync_one(struct kunit *test)  	 * considered synced and we overwrite in place.  	 */  	KUNIT_EXPECT_EQ(test, 0, datp->addr_seen); +	rtnl_unlock();  }  static void dev_addr_test_add_del(struct kunit *test) @@ -114,6 +116,7 @@ static void dev_addr_test_add_del(struct kunit *test)  	datp = netdev_priv(netdev); +	rtnl_lock();  	for (i = 1; i < 4; i++) {  		memset(addr, i, sizeof(addr));  		KUNIT_EXPECT_EQ(test, 0, dev_addr_add(netdev, addr, @@ -143,6 +146,7 @@ static void dev_addr_test_add_del(struct kunit *test)  	__hw_addr_sync_dev(&netdev->dev_addrs, netdev, dev_addr_test_sync,  			   dev_addr_test_unsync);  	KUNIT_EXPECT_EQ(test, 1, datp->addr_seen); +	rtnl_unlock();  }  static void dev_addr_test_del_main(struct kunit *test) @@ -150,6 +154,7 @@ static void dev_addr_test_del_main(struct kunit *test)  	struct net_device *netdev = test->priv;  	u8 addr[ETH_ALEN]; +	rtnl_lock();  	memset(addr, 1, sizeof(addr));  	eth_hw_addr_set(netdev, addr); @@ -161,6 +166,7 @@ static void dev_addr_test_del_main(struct kunit *test)  					      NETDEV_HW_ADDR_T_LAN));  	KUNIT_EXPECT_EQ(test, -ENOENT, dev_addr_del(netdev, addr,  						    NETDEV_HW_ADDR_T_LAN)); +	rtnl_unlock();  }  static void dev_addr_test_add_set(struct kunit *test) @@ -172,6 +178,7 @@ static void dev_addr_test_add_set(struct kunit *test)  	datp = netdev_priv(netdev); +	rtnl_lock();  	/* There is no external API like dev_addr_add_excl(),  	 * so shuffle the tree a little bit and exploit aliasing.  	 
*/ @@ -191,6 +198,7 @@ static void dev_addr_test_add_set(struct kunit *test)  	__hw_addr_sync_dev(&netdev->dev_addrs, netdev, dev_addr_test_sync,  			   dev_addr_test_unsync);  	KUNIT_EXPECT_EQ(test, 0xffff, datp->addr_seen); +	rtnl_unlock();  }  static void dev_addr_test_add_excl(struct kunit *test) @@ -199,6 +207,7 @@ static void dev_addr_test_add_excl(struct kunit *test)  	u8 addr[ETH_ALEN];  	int i; +	rtnl_lock();  	for (i = 0; i < 10; i++) {  		memset(addr, i, sizeof(addr));  		KUNIT_EXPECT_EQ(test, 0, dev_uc_add_excl(netdev, addr)); @@ -213,6 +222,7 @@ static void dev_addr_test_add_excl(struct kunit *test)  		memset(addr, i, sizeof(addr));  		KUNIT_EXPECT_EQ(test, -EEXIST, dev_uc_add_excl(netdev, addr));  	} +	rtnl_unlock();  }  static struct kunit_case dev_addr_test_cases[] = { diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 9a66cf5015f2..8592c052c0f4 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -259,9 +259,7 @@ static int dev_eth_ioctl(struct net_device *dev,   * @dev: Network device   * @cfg: Timestamping configuration structure   * - * Helper for enforcing a common policy that phylib timestamping, if available, - * should take precedence in front of hardware timestamping provided by the - * netdev. + * Helper for calling the default hardware provider timestamping.   *   * Note: phy_mii_ioctl() only handles SIOCSHWTSTAMP (not SIOCGHWTSTAMP), and   * there only exists a phydev->mii_ts->hwtstamp() method. 
So this will return @@ -271,7 +269,7 @@ static int dev_eth_ioctl(struct net_device *dev,  static int dev_get_hwtstamp_phylib(struct net_device *dev,  				   struct kernel_hwtstamp_config *cfg)  { -	if (phy_has_hwtstamp(dev->phydev)) +	if (phy_is_default_hwtstamp(dev->phydev))  		return phy_hwtstamp_get(dev->phydev, cfg);  	return dev->netdev_ops->ndo_hwtstamp_get(dev, cfg); @@ -327,7 +325,7 @@ int dev_set_hwtstamp_phylib(struct net_device *dev,  			    struct netlink_ext_ack *extack)  {  	const struct net_device_ops *ops = dev->netdev_ops; -	bool phy_ts = phy_has_hwtstamp(dev->phydev); +	bool phy_ts = phy_is_default_hwtstamp(dev->phydev);  	struct kernel_hwtstamp_config old_cfg = {};  	bool changed = false;  	int err; @@ -363,7 +361,6 @@ int dev_set_hwtstamp_phylib(struct net_device *dev,  	return 0;  } -EXPORT_SYMBOL_GPL(dev_set_hwtstamp_phylib);  static int dev_set_hwtstamp(struct net_device *dev, struct ifreq *ifr)  { diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index b0f221d658be..2e0ae3328232 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -74,7 +74,7 @@ struct net_dm_hw_entries {  };  struct per_cpu_dm_data { -	spinlock_t		lock;	/* Protects 'skb', 'hw_entries' and +	raw_spinlock_t		lock;	/* Protects 'skb', 'hw_entries' and  					 * 'send_timer'  					 */  	union { @@ -109,7 +109,8 @@ static u32 net_dm_queue_len = 1000;  struct net_dm_alert_ops {  	void (*kfree_skb_probe)(void *ignore, struct sk_buff *skb,  				void *location, -				enum skb_drop_reason reason); +				enum skb_drop_reason reason, +				struct sock *rx_sk);  	void (*napi_poll_probe)(void *ignore, struct napi_struct *napi,  				int work, int budget);  	void (*work_item_func)(struct work_struct *work); @@ -168,9 +169,9 @@ static struct sk_buff *reset_per_cpu_data(struct per_cpu_dm_data *data)  err:  	mod_timer(&data->send_timer, jiffies + HZ / 10);  out: -	spin_lock_irqsave(&data->lock, flags); +	raw_spin_lock_irqsave(&data->lock, flags);  	swap(data->skb, 
skb); -	spin_unlock_irqrestore(&data->lock, flags); +	raw_spin_unlock_irqrestore(&data->lock, flags);  	if (skb) {  		struct nlmsghdr *nlh = (struct nlmsghdr *)skb->data; @@ -225,7 +226,7 @@ static void trace_drop_common(struct sk_buff *skb, void *location)  	local_irq_save(flags);  	data = this_cpu_ptr(&dm_cpu_data); -	spin_lock(&data->lock); +	raw_spin_lock(&data->lock);  	dskb = data->skb;  	if (!dskb) @@ -259,12 +260,13 @@ static void trace_drop_common(struct sk_buff *skb, void *location)  	}  out: -	spin_unlock_irqrestore(&data->lock, flags); +	raw_spin_unlock_irqrestore(&data->lock, flags);  }  static void trace_kfree_skb_hit(void *ignore, struct sk_buff *skb,  				void *location, -				enum skb_drop_reason reason) +				enum skb_drop_reason reason, +				struct sock *rx_sk)  {  	trace_drop_common(skb, location);  } @@ -314,9 +316,9 @@ net_dm_hw_reset_per_cpu_data(struct per_cpu_dm_data *hw_data)  		mod_timer(&hw_data->send_timer, jiffies + HZ / 10);  	} -	spin_lock_irqsave(&hw_data->lock, flags); +	raw_spin_lock_irqsave(&hw_data->lock, flags);  	swap(hw_data->hw_entries, hw_entries); -	spin_unlock_irqrestore(&hw_data->lock, flags); +	raw_spin_unlock_irqrestore(&hw_data->lock, flags);  	return hw_entries;  } @@ -448,7 +450,7 @@ net_dm_hw_trap_summary_probe(void *ignore, const struct devlink *devlink,  		return;  	hw_data = this_cpu_ptr(&dm_hw_cpu_data); -	spin_lock_irqsave(&hw_data->lock, flags); +	raw_spin_lock_irqsave(&hw_data->lock, flags);  	hw_entries = hw_data->hw_entries;  	if (!hw_entries) @@ -477,7 +479,7 @@ net_dm_hw_trap_summary_probe(void *ignore, const struct devlink *devlink,  	}  out: -	spin_unlock_irqrestore(&hw_data->lock, flags); +	raw_spin_unlock_irqrestore(&hw_data->lock, flags);  }  static const struct net_dm_alert_ops net_dm_alert_summary_ops = { @@ -491,7 +493,8 @@ static const struct net_dm_alert_ops net_dm_alert_summary_ops = {  static void net_dm_packet_trace_kfree_skb_hit(void *ignore,  					      struct sk_buff *skb,  					      void 
*location, -					      enum skb_drop_reason reason) +					      enum skb_drop_reason reason, +					      struct sock *rx_sk)  {  	ktime_t tstamp = ktime_get_real();  	struct per_cpu_dm_data *data; @@ -1673,7 +1676,7 @@ static struct notifier_block dropmon_net_notifier = {  static void __net_dm_cpu_data_init(struct per_cpu_dm_data *data)  { -	spin_lock_init(&data->lock); +	raw_spin_lock_init(&data->lock);  	skb_queue_head_init(&data->drop_queue);  	u64_stats_init(&data->stats.syncp);  } diff --git a/net/core/dst_cache.c b/net/core/dst_cache.c index 0ccfd5fa5cb9..70c634b9e7b0 100644 --- a/net/core/dst_cache.c +++ b/net/core/dst_cache.c @@ -27,6 +27,7 @@ struct dst_cache_pcpu {  static void dst_cache_per_cpu_dst_set(struct dst_cache_pcpu *dst_cache,  				      struct dst_entry *dst, u32 cookie)  { +	DEBUG_NET_WARN_ON_ONCE(!in_softirq());  	dst_release(dst_cache->dst);  	if (dst)  		dst_hold(dst); @@ -40,6 +41,7 @@ static struct dst_entry *dst_cache_per_cpu_get(struct dst_cache *dst_cache,  {  	struct dst_entry *dst; +	DEBUG_NET_WARN_ON_ONCE(!in_softirq());  	dst = idst->dst;  	if (!dst)  		goto fail; @@ -47,7 +49,8 @@ static struct dst_entry *dst_cache_per_cpu_get(struct dst_cache *dst_cache,  	/* the cache already hold a dst reference; it can't go away */  	dst_hold(dst); -	if (unlikely(!time_after(idst->refresh_ts, dst_cache->reset_ts) || +	if (unlikely(!time_after(idst->refresh_ts, +				 READ_ONCE(dst_cache->reset_ts)) ||  		     (dst->obsolete && !dst->ops->check(dst, idst->cookie)))) {  		dst_cache_per_cpu_dst_set(idst, NULL, 0);  		dst_release(dst); @@ -83,7 +86,7 @@ struct rtable *dst_cache_get_ip4(struct dst_cache *dst_cache, __be32 *saddr)  		return NULL;  	*saddr = idst->in_saddr.s_addr; -	return container_of(dst, struct rtable, dst); +	return dst_rtable(dst);  }  EXPORT_SYMBOL_GPL(dst_cache_get_ip4); @@ -111,8 +114,8 @@ void dst_cache_set_ip6(struct dst_cache *dst_cache, struct dst_entry *dst,  		return;  	idst = this_cpu_ptr(dst_cache->cache); -	
dst_cache_per_cpu_dst_set(this_cpu_ptr(dst_cache->cache), dst, -				  rt6_get_cookie((struct rt6_info *)dst)); +	dst_cache_per_cpu_dst_set(idst, dst, +				  rt6_get_cookie(dst_rt6_info(dst)));  	idst->in6_saddr = *saddr;  }  EXPORT_SYMBOL_GPL(dst_cache_set_ip6); @@ -170,7 +173,7 @@ void dst_cache_reset_now(struct dst_cache *dst_cache)  	if (!dst_cache->cache)  		return; -	dst_cache->reset_ts = jiffies; +	dst_cache_reset(dst_cache);  	for_each_possible_cpu(i) {  		struct dst_cache_pcpu *idst = per_cpu_ptr(dst_cache->cache, i);  		struct dst_entry *dst = idst->dst; diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 3f933ffcefc3..6ebffbc63236 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -1142,10 +1142,10 @@ static int fib_nl_dumprule(struct sk_buff *skb, struct netlink_callback *cb)  	const struct nlmsghdr *nlh = cb->nlh;  	struct net *net = sock_net(skb->sk);  	struct fib_rules_ops *ops; -	int idx = 0, family; +	int err, idx = 0, family;  	if (cb->strict_check) { -		int err = fib_valid_dumprule_req(nlh, cb->extack); +		err = fib_valid_dumprule_req(nlh, cb->extack);  		if (err < 0)  			return err; @@ -1158,17 +1158,17 @@ static int fib_nl_dumprule(struct sk_buff *skb, struct netlink_callback *cb)  		if (ops == NULL)  			return -EAFNOSUPPORT; -		dump_rules(skb, cb, ops); - -		return skb->len; +		return dump_rules(skb, cb, ops);  	} +	err = 0;  	rcu_read_lock();  	list_for_each_entry_rcu(ops, &net->rules_ops, list) {  		if (idx < cb->args[0] || !try_module_get(ops->owner))  			goto skip; -		if (dump_rules(skb, cb, ops) < 0) +		err = dump_rules(skb, cb, ops); +		if (err < 0)  			break;  		cb->args[1] = 0; @@ -1178,7 +1178,7 @@ skip:  	rcu_read_unlock();  	cb->args[0] = idx; -	return skb->len; +	return err;  }  static void notify_rule_change(int event, struct fib_rule *rule, @@ -1293,7 +1293,8 @@ static int __init fib_rules_init(void)  	int err;  	rtnl_register(PF_UNSPEC, RTM_NEWRULE, fib_nl_newrule, NULL, 0);  	rtnl_register(PF_UNSPEC, 
RTM_DELRULE, fib_nl_delrule, NULL, 0); -	rtnl_register(PF_UNSPEC, RTM_GETRULE, NULL, fib_nl_dumprule, 0); +	rtnl_register(PF_UNSPEC, RTM_GETRULE, NULL, fib_nl_dumprule, +		      RTNL_FLAG_DUMP_UNLOCKED);  	err = register_pernet_subsys(&fib_rules_net_ops);  	if (err < 0) diff --git a/net/core/filter.c b/net/core/filter.c index ae5254f712c9..f3c72cf86099 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -87,6 +87,9 @@  #include "dev.h" +/* Keep the struct bpf_fib_lookup small so that it fits into a cacheline */ +static_assert(sizeof(struct bpf_fib_lookup) == 64, "struct bpf_fib_lookup size check"); +  static const struct bpf_func_proto *  bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog); @@ -1655,13 +1658,21 @@ struct bpf_scratchpad {  		__be32 diff[MAX_BPF_STACK / sizeof(__be32)];  		u8     buff[MAX_BPF_STACK];  	}; +	local_lock_t	bh_lock;  }; -static DEFINE_PER_CPU(struct bpf_scratchpad, bpf_sp); +static DEFINE_PER_CPU(struct bpf_scratchpad, bpf_sp) = { +	.bh_lock	= INIT_LOCAL_LOCK(bh_lock), +};  static inline int __bpf_try_make_writable(struct sk_buff *skb,  					  unsigned int write_len)  { +#ifdef CONFIG_DEBUG_NET +	/* Avoid a splat in pskb_may_pull_reason() */ +	if (write_len > INT_MAX) +		return -EINVAL; +#endif  	return skb_ensure_writable(skb, write_len);  } @@ -2013,6 +2024,7 @@ BPF_CALL_5(bpf_csum_diff, __be32 *, from, u32, from_size,  	struct bpf_scratchpad *sp = this_cpu_ptr(&bpf_sp);  	u32 diff_size = from_size + to_size;  	int i, j = 0; +	__wsum ret;  	/* This is quite flexible, some examples:  	 * @@ -2026,12 +2038,15 @@ BPF_CALL_5(bpf_csum_diff, __be32 *, from, u32, from_size,  		     diff_size > sizeof(sp->diff)))  		return -EINVAL; +	local_lock_nested_bh(&bpf_sp.bh_lock);  	for (i = 0; i < from_size / sizeof(__be32); i++, j++)  		sp->diff[j] = ~from[i];  	for (i = 0; i <   to_size / sizeof(__be32); i++, j++)  		sp->diff[j] = to[i]; -	return csum_partial(sp->diff, diff_size, seed); +	ret = csum_partial(sp->diff, 
diff_size, seed); +	local_unlock_nested_bh(&bpf_sp.bh_lock); +	return ret;  }  static const struct bpf_func_proto bpf_csum_diff_proto = { @@ -2215,7 +2230,7 @@ static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb,  	rcu_read_lock();  	if (!nh) {  		dst = skb_dst(skb); -		nexthop = rt6_nexthop(container_of(dst, struct rt6_info, dst), +		nexthop = rt6_nexthop(dst_rt6_info(dst),  				      &ipv6_hdr(skb)->daddr);  	} else {  		nexthop = &nh->ipv6_nh; @@ -2271,12 +2286,12 @@ static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev,  	err = bpf_out_neigh_v6(net, skb, dev, nh);  	if (unlikely(net_xmit_eval(err))) -		dev->stats.tx_errors++; +		DEV_STATS_INC(dev, tx_errors);  	else  		ret = NET_XMIT_SUCCESS;  	goto out_xmit;  out_drop: -	dev->stats.tx_errors++; +	DEV_STATS_INC(dev, tx_errors);  	kfree_skb(skb);  out_xmit:  	return ret; @@ -2314,8 +2329,7 @@ static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb,  	rcu_read_lock();  	if (!nh) { -		struct dst_entry *dst = skb_dst(skb); -		struct rtable *rt = container_of(dst, struct rtable, dst); +		struct rtable *rt = skb_rtable(skb);  		neigh = ip_neigh_for_gw(rt, skb, &is_v6gw);  	} else if (nh->nh_family == AF_INET6) { @@ -2378,12 +2392,12 @@ static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev,  	err = bpf_out_neigh_v4(net, skb, dev, nh);  	if (unlikely(net_xmit_eval(err))) -		dev->stats.tx_errors++; +		DEV_STATS_INC(dev, tx_errors);  	else  		ret = NET_XMIT_SUCCESS;  	goto out_xmit;  out_drop: -	dev->stats.tx_errors++; +	DEV_STATS_INC(dev, tx_errors);  	kfree_skb(skb);  out_xmit:  	return ret; @@ -2469,9 +2483,6 @@ static const struct bpf_func_proto bpf_clone_redirect_proto = {  	.arg3_type      = ARG_ANYTHING,  }; -DEFINE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info); -EXPORT_PER_CPU_SYMBOL_GPL(bpf_redirect_info); -  static struct net_device *skb_get_peer_dev(struct net_device *dev)  {  	const struct net_device_ops *ops = dev->netdev_ops; @@ 
-2484,7 +2495,7 @@ static struct net_device *skb_get_peer_dev(struct net_device *dev)  int skb_do_redirect(struct sk_buff *skb)  { -	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); +	struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();  	struct net *net = dev_net(skb->dev);  	struct net_device *dev;  	u32 flags = ri->flags; @@ -2517,7 +2528,7 @@ out_drop:  BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags)  { -	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); +	struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();  	if (unlikely(flags & (~(BPF_F_INGRESS) | BPF_F_REDIRECT_INTERNAL)))  		return TC_ACT_SHOT; @@ -2538,7 +2549,7 @@ static const struct bpf_func_proto bpf_redirect_proto = {  BPF_CALL_2(bpf_redirect_peer, u32, ifindex, u64, flags)  { -	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); +	struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();  	if (unlikely(flags))  		return TC_ACT_SHOT; @@ -2560,7 +2571,7 @@ static const struct bpf_func_proto bpf_redirect_peer_proto = {  BPF_CALL_4(bpf_redirect_neigh, u32, ifindex, struct bpf_redir_neigh *, params,  	   int, plen, u64, flags)  { -	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); +	struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();  	if (unlikely((plen && plen < sizeof(*params)) || flags))  		return TC_ACT_SHOT; @@ -3537,13 +3548,20 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff,  	if (skb_is_gso(skb)) {  		struct skb_shared_info *shinfo = skb_shinfo(skb); -		/* Due to header grow, MSS needs to be downgraded. */ -		if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) -			skb_decrease_gso_size(shinfo, len_diff); -  		/* Header must be checked, and gso_segs recomputed. */  		shinfo->gso_type |= gso_type;  		shinfo->gso_segs = 0; + +		/* Due to header growth, MSS needs to be downgraded. +		 * There is a BUG_ON() when segmenting the frag_list with +		 * head_frag true, so linearize the skb after downgrading +		 * the MSS. 
+		 */ +		if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) { +			skb_decrease_gso_size(shinfo, len_diff); +			if (shinfo->frag_list) +				return skb_linearize(skb); +		}  	}  	return 0; @@ -4266,50 +4284,50 @@ static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = {   */  void xdp_do_flush(void)  { -	__dev_flush(); -	__cpu_map_flush(); -	__xsk_map_flush(); +	struct list_head *lh_map, *lh_dev, *lh_xsk; + +	bpf_net_ctx_get_all_used_flush_lists(&lh_map, &lh_dev, &lh_xsk); +	if (lh_dev) +		__dev_flush(lh_dev); +	if (lh_map) +		__cpu_map_flush(lh_map); +	if (lh_xsk) +		__xsk_map_flush(lh_xsk);  }  EXPORT_SYMBOL_GPL(xdp_do_flush);  #if defined(CONFIG_DEBUG_NET) && defined(CONFIG_BPF_SYSCALL)  void xdp_do_check_flushed(struct napi_struct *napi)  { -	bool ret; +	struct list_head *lh_map, *lh_dev, *lh_xsk; +	bool missed = false; -	ret = dev_check_flush(); -	ret |= cpu_map_check_flush(); -	ret |= xsk_map_check_flush(); +	bpf_net_ctx_get_all_used_flush_lists(&lh_map, &lh_dev, &lh_xsk); +	if (lh_dev) { +		__dev_flush(lh_dev); +		missed = true; +	} +	if (lh_map) { +		__cpu_map_flush(lh_map); +		missed = true; +	} +	if (lh_xsk) { +		__xsk_map_flush(lh_xsk); +		missed = true; +	} -	WARN_ONCE(ret, "Missing xdp_do_flush() invocation after NAPI by %ps\n", +	WARN_ONCE(missed, "Missing xdp_do_flush() invocation after NAPI by %ps\n",  		  napi->poll);  }  #endif -void bpf_clear_redirect_map(struct bpf_map *map) -{ -	struct bpf_redirect_info *ri; -	int cpu; - -	for_each_possible_cpu(cpu) { -		ri = per_cpu_ptr(&bpf_redirect_info, cpu); -		/* Avoid polluting remote cacheline due to writes if -		 * not needed. Once we pass this test, we need the -		 * cmpxchg() to make sure it hasn't been changed in -		 * the meantime by remote CPU. 
-		 */ -		if (unlikely(READ_ONCE(ri->map) == map)) -			cmpxchg(&ri->map, map, NULL); -	} -} -  DEFINE_STATIC_KEY_FALSE(bpf_master_redirect_enabled_key);  EXPORT_SYMBOL_GPL(bpf_master_redirect_enabled_key);  u32 xdp_master_redirect(struct xdp_buff *xdp)  { +	struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();  	struct net_device *master, *slave; -	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);  	master = netdev_master_upper_dev_get_rcu(xdp->rxq->dev);  	slave = master->netdev_ops->ndo_xdp_get_xmit_slave(master, xdp); @@ -4381,7 +4399,7 @@ static __always_inline int __xdp_do_redirect_frame(struct bpf_redirect_info *ri,  			map = READ_ONCE(ri->map);  			/* The map pointer is cleared when the map is being torn -			 * down by bpf_clear_redirect_map() +			 * down by dev_map_free()  			 */  			if (unlikely(!map)) {  				err = -ENOENT; @@ -4426,7 +4444,7 @@ err:  int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,  		    struct bpf_prog *xdp_prog)  { -	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); +	struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();  	enum bpf_map_type map_type = ri->map_type;  	if (map_type == BPF_MAP_TYPE_XSKMAP) @@ -4440,7 +4458,7 @@ EXPORT_SYMBOL_GPL(xdp_do_redirect);  int xdp_do_redirect_frame(struct net_device *dev, struct xdp_buff *xdp,  			  struct xdp_frame *xdpf, struct bpf_prog *xdp_prog)  { -	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); +	struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();  	enum bpf_map_type map_type = ri->map_type;  	if (map_type == BPF_MAP_TYPE_XSKMAP) @@ -4457,7 +4475,7 @@ static int xdp_do_generic_redirect_map(struct net_device *dev,  				       enum bpf_map_type map_type, u32 map_id,  				       u32 flags)  { -	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); +	struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();  	struct bpf_map *map;  	int err; @@ -4469,7 +4487,7 @@ static int xdp_do_generic_redirect_map(struct net_device *dev,  		
	map = READ_ONCE(ri->map);  			/* The map pointer is cleared when the map is being torn -			 * down by bpf_clear_redirect_map() +			 * down by dev_map_free()  			 */  			if (unlikely(!map)) {  				err = -ENOENT; @@ -4511,7 +4529,7 @@ err:  int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,  			    struct xdp_buff *xdp, struct bpf_prog *xdp_prog)  { -	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); +	struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();  	enum bpf_map_type map_type = ri->map_type;  	void *fwd = ri->tgt_value;  	u32 map_id = ri->map_id; @@ -4547,7 +4565,7 @@ err:  BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags)  { -	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); +	struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();  	if (unlikely(flags))  		return XDP_ABORTED; @@ -4684,7 +4702,7 @@ set_compat:  	to->tunnel_tos = info->key.tos;  	to->tunnel_ttl = info->key.ttl;  	if (flags & BPF_F_TUNINFO_FLAGS) -		to->tunnel_flags = info->key.tun_flags; +		to->tunnel_flags = ip_tunnel_flags_to_be16(info->key.tun_flags);  	else  		to->tunnel_ext = 0; @@ -4727,7 +4745,7 @@ BPF_CALL_3(bpf_skb_get_tunnel_opt, struct sk_buff *, skb, u8 *, to, u32, size)  	int err;  	if (unlikely(!info || -		     !(info->key.tun_flags & TUNNEL_OPTIONS_PRESENT))) { +		     !ip_tunnel_is_options_present(info->key.tun_flags))) {  		err = -ENOENT;  		goto err_clear;  	} @@ -4797,15 +4815,15 @@ BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb,  	memset(info, 0, sizeof(*info));  	info->mode = IP_TUNNEL_INFO_TX; -	info->key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_NOCACHE; -	if (flags & BPF_F_DONT_FRAGMENT) -		info->key.tun_flags |= TUNNEL_DONT_FRAGMENT; -	if (flags & BPF_F_ZERO_CSUM_TX) -		info->key.tun_flags &= ~TUNNEL_CSUM; -	if (flags & BPF_F_SEQ_NUMBER) -		info->key.tun_flags |= TUNNEL_SEQ; -	if (flags & BPF_F_NO_TUNNEL_KEY) -		info->key.tun_flags &= ~TUNNEL_KEY; +	__set_bit(IP_TUNNEL_NOCACHE_BIT, 
info->key.tun_flags); +	__assign_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, info->key.tun_flags, +		     flags & BPF_F_DONT_FRAGMENT); +	__assign_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags, +		     !(flags & BPF_F_ZERO_CSUM_TX)); +	__assign_bit(IP_TUNNEL_SEQ_BIT, info->key.tun_flags, +		     flags & BPF_F_SEQ_NUMBER); +	__assign_bit(IP_TUNNEL_KEY_BIT, info->key.tun_flags, +		     !(flags & BPF_F_NO_TUNNEL_KEY));  	info->key.tun_id = cpu_to_be64(from->tunnel_id);  	info->key.tos = from->tunnel_tos; @@ -4843,13 +4861,15 @@ BPF_CALL_3(bpf_skb_set_tunnel_opt, struct sk_buff *, skb,  {  	struct ip_tunnel_info *info = skb_tunnel_info(skb);  	const struct metadata_dst *md = this_cpu_ptr(md_dst); +	IP_TUNNEL_DECLARE_FLAGS(present) = { };  	if (unlikely(info != &md->u.tun_info || (size & (sizeof(u32) - 1))))  		return -EINVAL;  	if (unlikely(size > IP_TUNNEL_OPTS_MAX))  		return -ENOMEM; -	ip_tunnel_info_opts_set(info, from, size, TUNNEL_OPTIONS_PRESENT); +	ip_tunnel_set_options_present(present); +	ip_tunnel_info_opts_set(info, from, size, present);  	return 0;  } @@ -5906,7 +5926,10 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,  		err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);  	} else { -		fl4.flowi4_mark = 0; +		if (flags & BPF_FIB_LOOKUP_MARK) +			fl4.flowi4_mark = params->mark; +		else +			fl4.flowi4_mark = 0;  		fl4.flowi4_secid = 0;  		fl4.flowi4_tun_key.tun_id = 0;  		fl4.flowi4_uid = sock_net_uid(net, NULL); @@ -6049,7 +6072,10 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,  		err = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, &res,  						   strict);  	} else { -		fl6.flowi6_mark = 0; +		if (flags & BPF_FIB_LOOKUP_MARK) +			fl6.flowi6_mark = params->mark; +		else +			fl6.flowi6_mark = 0;  		fl6.flowi6_secid = 0;  		fl6.flowi6_tun_key.tun_id = 0;  		fl6.flowi6_uid = sock_net_uid(net, NULL); @@ -6127,7 +6153,7 @@ set_fwd_params:  #define BPF_FIB_LOOKUP_MASK (BPF_FIB_LOOKUP_DIRECT | 
BPF_FIB_LOOKUP_OUTPUT | \  			     BPF_FIB_LOOKUP_SKIP_NEIGH | BPF_FIB_LOOKUP_TBID | \ -			     BPF_FIB_LOOKUP_SRC) +			     BPF_FIB_LOOKUP_SRC | BPF_FIB_LOOKUP_MARK)  BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx,  	   struct bpf_fib_lookup *, params, int, plen, u32, flags) @@ -6440,6 +6466,7 @@ BPF_CALL_4(bpf_lwt_seg6_store_bytes, struct sk_buff *, skb, u32, offset,  	void *srh_tlvs, *srh_end, *ptr;  	int srhoff = 0; +	lockdep_assert_held(&srh_state->bh_lock);  	if (srh == NULL)  		return -EINVAL; @@ -6496,6 +6523,7 @@ BPF_CALL_4(bpf_lwt_seg6_action, struct sk_buff *, skb,  	int hdroff = 0;  	int err; +	lockdep_assert_held(&srh_state->bh_lock);  	switch (action) {  	case SEG6_LOCAL_ACTION_END_X:  		if (!seg6_bpf_has_valid_srh(skb)) @@ -6572,6 +6600,7 @@ BPF_CALL_3(bpf_lwt_seg6_adjust_srh, struct sk_buff *, skb, u32, offset,  	int srhoff = 0;  	int ret; +	lockdep_assert_held(&srh_state->bh_lock);  	if (unlikely(srh == NULL))  		return -EINVAL; @@ -6805,7 +6834,7 @@ static const struct bpf_func_proto bpf_skc_lookup_tcp_proto = {  	.ret_type	= RET_PTR_TO_SOCK_COMMON_OR_NULL,  	.arg1_type	= ARG_PTR_TO_CTX,  	.arg2_type	= ARG_PTR_TO_MEM | MEM_RDONLY, -	.arg3_type	= ARG_CONST_SIZE, +	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,  	.arg4_type	= ARG_ANYTHING,  	.arg5_type	= ARG_ANYTHING,  }; @@ -6824,7 +6853,7 @@ static const struct bpf_func_proto bpf_sk_lookup_tcp_proto = {  	.ret_type	= RET_PTR_TO_SOCKET_OR_NULL,  	.arg1_type	= ARG_PTR_TO_CTX,  	.arg2_type	= ARG_PTR_TO_MEM | MEM_RDONLY, -	.arg3_type	= ARG_CONST_SIZE, +	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,  	.arg4_type	= ARG_ANYTHING,  	.arg5_type	= ARG_ANYTHING,  }; @@ -6843,7 +6872,7 @@ static const struct bpf_func_proto bpf_sk_lookup_udp_proto = {  	.ret_type	= RET_PTR_TO_SOCKET_OR_NULL,  	.arg1_type	= ARG_PTR_TO_CTX,  	.arg2_type	= ARG_PTR_TO_MEM | MEM_RDONLY, -	.arg3_type	= ARG_CONST_SIZE, +	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,  	.arg4_type	= ARG_ANYTHING,  	.arg5_type	= ARG_ANYTHING,  }; @@ -6867,7 +6896,7 @@ 
static const struct bpf_func_proto bpf_tc_skc_lookup_tcp_proto = {  	.ret_type	= RET_PTR_TO_SOCK_COMMON_OR_NULL,  	.arg1_type	= ARG_PTR_TO_CTX,  	.arg2_type	= ARG_PTR_TO_MEM | MEM_RDONLY, -	.arg3_type	= ARG_CONST_SIZE, +	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,  	.arg4_type	= ARG_ANYTHING,  	.arg5_type	= ARG_ANYTHING,  }; @@ -6891,7 +6920,7 @@ static const struct bpf_func_proto bpf_tc_sk_lookup_tcp_proto = {  	.ret_type	= RET_PTR_TO_SOCKET_OR_NULL,  	.arg1_type	= ARG_PTR_TO_CTX,  	.arg2_type	= ARG_PTR_TO_MEM | MEM_RDONLY, -	.arg3_type	= ARG_CONST_SIZE, +	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,  	.arg4_type	= ARG_ANYTHING,  	.arg5_type	= ARG_ANYTHING,  }; @@ -6915,7 +6944,7 @@ static const struct bpf_func_proto bpf_tc_sk_lookup_udp_proto = {  	.ret_type	= RET_PTR_TO_SOCKET_OR_NULL,  	.arg1_type	= ARG_PTR_TO_CTX,  	.arg2_type	= ARG_PTR_TO_MEM | MEM_RDONLY, -	.arg3_type	= ARG_CONST_SIZE, +	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,  	.arg4_type	= ARG_ANYTHING,  	.arg5_type	= ARG_ANYTHING,  }; @@ -6953,7 +6982,7 @@ static const struct bpf_func_proto bpf_xdp_sk_lookup_udp_proto = {  	.ret_type       = RET_PTR_TO_SOCKET_OR_NULL,  	.arg1_type      = ARG_PTR_TO_CTX,  	.arg2_type      = ARG_PTR_TO_MEM | MEM_RDONLY, -	.arg3_type      = ARG_CONST_SIZE, +	.arg3_type      = ARG_CONST_SIZE_OR_ZERO,  	.arg4_type      = ARG_ANYTHING,  	.arg5_type      = ARG_ANYTHING,  }; @@ -6977,7 +7006,7 @@ static const struct bpf_func_proto bpf_xdp_skc_lookup_tcp_proto = {  	.ret_type       = RET_PTR_TO_SOCK_COMMON_OR_NULL,  	.arg1_type      = ARG_PTR_TO_CTX,  	.arg2_type      = ARG_PTR_TO_MEM | MEM_RDONLY, -	.arg3_type      = ARG_CONST_SIZE, +	.arg3_type      = ARG_CONST_SIZE_OR_ZERO,  	.arg4_type      = ARG_ANYTHING,  	.arg5_type      = ARG_ANYTHING,  }; @@ -7001,7 +7030,7 @@ static const struct bpf_func_proto bpf_xdp_sk_lookup_tcp_proto = {  	.ret_type       = RET_PTR_TO_SOCKET_OR_NULL,  	.arg1_type      = ARG_PTR_TO_CTX,  	.arg2_type      = ARG_PTR_TO_MEM | MEM_RDONLY, -	.arg3_type      = 
ARG_CONST_SIZE, +	.arg3_type      = ARG_CONST_SIZE_OR_ZERO,  	.arg4_type      = ARG_ANYTHING,  	.arg5_type      = ARG_ANYTHING,  }; @@ -7021,7 +7050,7 @@ static const struct bpf_func_proto bpf_sock_addr_skc_lookup_tcp_proto = {  	.ret_type	= RET_PTR_TO_SOCK_COMMON_OR_NULL,  	.arg1_type	= ARG_PTR_TO_CTX,  	.arg2_type	= ARG_PTR_TO_MEM | MEM_RDONLY, -	.arg3_type	= ARG_CONST_SIZE, +	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,  	.arg4_type	= ARG_ANYTHING,  	.arg5_type	= ARG_ANYTHING,  }; @@ -7040,7 +7069,7 @@ static const struct bpf_func_proto bpf_sock_addr_sk_lookup_tcp_proto = {  	.ret_type	= RET_PTR_TO_SOCKET_OR_NULL,  	.arg1_type	= ARG_PTR_TO_CTX,  	.arg2_type	= ARG_PTR_TO_MEM | MEM_RDONLY, -	.arg3_type	= ARG_CONST_SIZE, +	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,  	.arg4_type	= ARG_ANYTHING,  	.arg5_type	= ARG_ANYTHING,  }; @@ -7059,7 +7088,7 @@ static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = {  	.ret_type	= RET_PTR_TO_SOCKET_OR_NULL,  	.arg1_type	= ARG_PTR_TO_CTX,  	.arg2_type	= ARG_PTR_TO_MEM | MEM_RDONLY, -	.arg3_type	= ARG_CONST_SIZE, +	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,  	.arg4_type	= ARG_ANYTHING,  	.arg5_type	= ARG_ANYTHING,  }; @@ -7716,17 +7745,21 @@ BPF_CALL_3(bpf_skb_set_tstamp, struct sk_buff *, skb,  		return -EOPNOTSUPP;  	switch (tstamp_type) { -	case BPF_SKB_TSTAMP_DELIVERY_MONO: +	case BPF_SKB_CLOCK_REALTIME: +		skb->tstamp = tstamp; +		skb->tstamp_type = SKB_CLOCK_REALTIME; +		break; +	case BPF_SKB_CLOCK_MONOTONIC:  		if (!tstamp)  			return -EINVAL;  		skb->tstamp = tstamp; -		skb->mono_delivery_time = 1; +		skb->tstamp_type = SKB_CLOCK_MONOTONIC;  		break; -	case BPF_SKB_TSTAMP_UNSPEC: -		if (tstamp) +	case BPF_SKB_CLOCK_TAI: +		if (!tstamp)  			return -EINVAL; -		skb->tstamp = 0; -		skb->mono_delivery_time = 0; +		skb->tstamp = tstamp; +		skb->tstamp_type = SKB_CLOCK_TAI;  		break;  	default:  		return -EINVAL; @@ -8364,8 +8397,6 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)  		return 
&bpf_event_output_data_proto;  	case BPF_FUNC_get_current_uid_gid:  		return &bpf_get_current_uid_gid_proto; -	case BPF_FUNC_get_current_pid_tgid: -		return &bpf_get_current_pid_tgid_proto;  	case BPF_FUNC_sk_storage_get:  		return &bpf_sk_storage_get_proto;  	case BPF_FUNC_sk_storage_delete: @@ -9379,16 +9410,17 @@ static struct bpf_insn *bpf_convert_tstamp_type_read(const struct bpf_insn *si,  {  	__u8 value_reg = si->dst_reg;  	__u8 skb_reg = si->src_reg; -	/* AX is needed because src_reg and dst_reg could be the same */ -	__u8 tmp_reg = BPF_REG_AX; - -	*insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, -			      SKB_BF_MONO_TC_OFFSET); -	*insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, -				SKB_MONO_DELIVERY_TIME_MASK, 2); -	*insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_TSTAMP_UNSPEC); -	*insn++ = BPF_JMP_A(1); -	*insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_TSTAMP_DELIVERY_MONO); +	BUILD_BUG_ON(__SKB_CLOCK_MAX != (int)BPF_SKB_CLOCK_TAI); +	BUILD_BUG_ON(SKB_CLOCK_REALTIME != (int)BPF_SKB_CLOCK_REALTIME); +	BUILD_BUG_ON(SKB_CLOCK_MONOTONIC != (int)BPF_SKB_CLOCK_MONOTONIC); +	BUILD_BUG_ON(SKB_CLOCK_TAI != (int)BPF_SKB_CLOCK_TAI); +	*insn++ = BPF_LDX_MEM(BPF_B, value_reg, skb_reg, SKB_BF_MONO_TC_OFFSET); +	*insn++ = BPF_ALU32_IMM(BPF_AND, value_reg, SKB_TSTAMP_TYPE_MASK); +#ifdef __BIG_ENDIAN_BITFIELD +	*insn++ = BPF_ALU32_IMM(BPF_RSH, value_reg, SKB_TSTAMP_TYPE_RSHIFT); +#else +	BUILD_BUG_ON(!(SKB_TSTAMP_TYPE_MASK & 0x1)); +#endif  	return insn;  } @@ -9431,11 +9463,12 @@ static struct bpf_insn *bpf_convert_tstamp_read(const struct bpf_prog *prog,  		__u8 tmp_reg = BPF_REG_AX;  		*insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, SKB_BF_MONO_TC_OFFSET); -		*insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, -					TC_AT_INGRESS_MASK | SKB_MONO_DELIVERY_TIME_MASK); -		*insn++ = BPF_JMP32_IMM(BPF_JNE, tmp_reg, -					TC_AT_INGRESS_MASK | SKB_MONO_DELIVERY_TIME_MASK, 2); -		/* skb->tc_at_ingress && skb->mono_delivery_time, +		/* check if ingress mask bits is set */ +		*insn++ = 
BPF_JMP32_IMM(BPF_JSET, tmp_reg, TC_AT_INGRESS_MASK, 1); +		*insn++ = BPF_JMP_A(4); +		*insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, SKB_TSTAMP_TYPE_MASK, 1); +		*insn++ = BPF_JMP_A(2); +		/* skb->tc_at_ingress && skb->tstamp_type,  		 * read 0 as the (rcv) timestamp.  		 */  		*insn++ = BPF_MOV64_IMM(value_reg, 0); @@ -9460,7 +9493,7 @@ static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_prog *prog,  	 * the bpf prog is aware the tstamp could have delivery time.  	 * Thus, write skb->tstamp as is if tstamp_type_access is true.  	 * Otherwise, writing at ingress will have to clear the -	 * mono_delivery_time bit also. +	 * skb->tstamp_type bit also.  	 */  	if (!prog->tstamp_type_access) {  		__u8 tmp_reg = BPF_REG_AX; @@ -9470,8 +9503,8 @@ static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_prog *prog,  		*insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, TC_AT_INGRESS_MASK, 1);  		/* goto <store> */  		*insn++ = BPF_JMP_A(2); -		/* <clear>: mono_delivery_time */ -		*insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, ~SKB_MONO_DELIVERY_TIME_MASK); +		/* <clear>: skb->tstamp_type */ +		*insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, ~SKB_TSTAMP_TYPE_MASK);  		*insn++ = BPF_STX_MEM(BPF_B, skb_reg, tmp_reg, SKB_BF_MONO_TC_OFFSET);  	}  #endif @@ -11027,7 +11060,6 @@ const struct bpf_verifier_ops lwt_seg6local_verifier_ops = {  };  const struct bpf_prog_ops lwt_seg6local_prog_ops = { -	.test_run		= bpf_prog_test_run_skb,  };  const struct bpf_verifier_ops cg_sock_verifier_ops = { @@ -11845,28 +11877,34 @@ bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)  }  __bpf_kfunc_start_defs(); -__bpf_kfunc int bpf_dynptr_from_skb(struct sk_buff *skb, u64 flags, -				    struct bpf_dynptr_kern *ptr__uninit) +__bpf_kfunc int bpf_dynptr_from_skb(struct __sk_buff *s, u64 flags, +				    struct bpf_dynptr *ptr__uninit)  { +	struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)ptr__uninit; +	struct sk_buff *skb = (struct sk_buff *)s; +  	if (flags) { -	
	bpf_dynptr_set_null(ptr__uninit); +		bpf_dynptr_set_null(ptr);  		return -EINVAL;  	} -	bpf_dynptr_init(ptr__uninit, skb, BPF_DYNPTR_TYPE_SKB, 0, skb->len); +	bpf_dynptr_init(ptr, skb, BPF_DYNPTR_TYPE_SKB, 0, skb->len);  	return 0;  } -__bpf_kfunc int bpf_dynptr_from_xdp(struct xdp_buff *xdp, u64 flags, -				    struct bpf_dynptr_kern *ptr__uninit) +__bpf_kfunc int bpf_dynptr_from_xdp(struct xdp_md *x, u64 flags, +				    struct bpf_dynptr *ptr__uninit)  { +	struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)ptr__uninit; +	struct xdp_buff *xdp = (struct xdp_buff *)x; +  	if (flags) { -		bpf_dynptr_set_null(ptr__uninit); +		bpf_dynptr_set_null(ptr);  		return -EINVAL;  	} -	bpf_dynptr_init(ptr__uninit, xdp, BPF_DYNPTR_TYPE_XDP, 0, xdp_get_buff_len(xdp)); +	bpf_dynptr_init(ptr, xdp, BPF_DYNPTR_TYPE_XDP, 0, xdp_get_buff_len(xdp));  	return 0;  } @@ -11892,10 +11930,11 @@ __bpf_kfunc int bpf_sock_addr_set_sun_path(struct bpf_sock_addr_kern *sa_kern,  	return 0;  } -__bpf_kfunc int bpf_sk_assign_tcp_reqsk(struct sk_buff *skb, struct sock *sk, +__bpf_kfunc int bpf_sk_assign_tcp_reqsk(struct __sk_buff *s, struct sock *sk,  					struct bpf_tcp_req_attrs *attrs, int attrs__sz)  {  #if IS_ENABLED(CONFIG_SYN_COOKIES) +	struct sk_buff *skb = (struct sk_buff *)s;  	const struct request_sock_ops *ops;  	struct inet_request_sock *ireq;  	struct tcp_request_sock *treq; @@ -11990,16 +12029,17 @@ __bpf_kfunc int bpf_sk_assign_tcp_reqsk(struct sk_buff *skb, struct sock *sk,  __bpf_kfunc_end_defs(); -int bpf_dynptr_from_skb_rdonly(struct sk_buff *skb, u64 flags, -			       struct bpf_dynptr_kern *ptr__uninit) +int bpf_dynptr_from_skb_rdonly(struct __sk_buff *skb, u64 flags, +			       struct bpf_dynptr *ptr__uninit)  { +	struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)ptr__uninit;  	int err;  	err = bpf_dynptr_from_skb(skb, flags, ptr__uninit);  	if (err)  		return err; -	bpf_dynptr_set_rdonly(ptr__uninit); +	bpf_dynptr_set_rdonly(ptr);  	return 0;  } diff --git 
a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 272f09251343..0e638a37aa09 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -299,9 +299,10 @@ void skb_flow_dissect_meta(const struct sk_buff *skb,  EXPORT_SYMBOL(skb_flow_dissect_meta);  static void -skb_flow_dissect_set_enc_addr_type(enum flow_dissector_key_id type, -				   struct flow_dissector *flow_dissector, -				   void *target_container) +skb_flow_dissect_set_enc_control(enum flow_dissector_key_id type, +				 u32 ctrl_flags, +				 struct flow_dissector *flow_dissector, +				 void *target_container)  {  	struct flow_dissector_key_control *ctrl; @@ -312,6 +313,7 @@ skb_flow_dissect_set_enc_addr_type(enum flow_dissector_key_id type,  					 FLOW_DISSECTOR_KEY_ENC_CONTROL,  					 target_container);  	ctrl->addr_type = type; +	ctrl->flags = ctrl_flags;  }  void @@ -367,6 +369,7 @@ skb_flow_dissect_tunnel_info(const struct sk_buff *skb,  {  	struct ip_tunnel_info *info;  	struct ip_tunnel_key *key; +	u32 ctrl_flags = 0;  	/* A quick check to see if there might be something to do. 
*/  	if (!dissector_uses_key(flow_dissector, @@ -391,11 +394,20 @@ skb_flow_dissect_tunnel_info(const struct sk_buff *skb,  	key = &info->key; +	if (test_bit(IP_TUNNEL_CSUM_BIT, key->tun_flags)) +		ctrl_flags |= FLOW_DIS_F_TUNNEL_CSUM; +	if (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, key->tun_flags)) +		ctrl_flags |= FLOW_DIS_F_TUNNEL_DONT_FRAGMENT; +	if (test_bit(IP_TUNNEL_OAM_BIT, key->tun_flags)) +		ctrl_flags |= FLOW_DIS_F_TUNNEL_OAM; +	if (test_bit(IP_TUNNEL_CRIT_OPT_BIT, key->tun_flags)) +		ctrl_flags |= FLOW_DIS_F_TUNNEL_CRIT_OPT; +  	switch (ip_tunnel_info_af(info)) {  	case AF_INET: -		skb_flow_dissect_set_enc_addr_type(FLOW_DISSECTOR_KEY_IPV4_ADDRS, -						   flow_dissector, -						   target_container); +		skb_flow_dissect_set_enc_control(FLOW_DISSECTOR_KEY_IPV4_ADDRS, +						 ctrl_flags, flow_dissector, +						 target_container);  		if (dissector_uses_key(flow_dissector,  				       FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS)) {  			struct flow_dissector_key_ipv4_addrs *ipv4; @@ -408,9 +420,9 @@ skb_flow_dissect_tunnel_info(const struct sk_buff *skb,  		}  		break;  	case AF_INET6: -		skb_flow_dissect_set_enc_addr_type(FLOW_DISSECTOR_KEY_IPV6_ADDRS, -						   flow_dissector, -						   target_container); +		skb_flow_dissect_set_enc_control(FLOW_DISSECTOR_KEY_IPV6_ADDRS, +						 ctrl_flags, flow_dissector, +						 target_container);  		if (dissector_uses_key(flow_dissector,  				       FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS)) {  			struct flow_dissector_key_ipv6_addrs *ipv6; @@ -422,6 +434,10 @@ skb_flow_dissect_tunnel_info(const struct sk_buff *skb,  			ipv6->dst = key->u.ipv6.dst;  		}  		break; +	default: +		skb_flow_dissect_set_enc_control(0, ctrl_flags, flow_dissector, +						 target_container); +		break;  	}  	if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_KEYID)) { @@ -455,17 +471,25 @@ skb_flow_dissect_tunnel_info(const struct sk_buff *skb,  	if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_OPTS)) {  		struct flow_dissector_key_enc_opts 
*enc_opt; +		IP_TUNNEL_DECLARE_FLAGS(flags) = { }; +		u32 val;  		enc_opt = skb_flow_dissector_target(flow_dissector,  						    FLOW_DISSECTOR_KEY_ENC_OPTS,  						    target_container); -		if (info->options_len) { -			enc_opt->len = info->options_len; -			ip_tunnel_info_opts_get(enc_opt->data, info); -			enc_opt->dst_opt_type = info->key.tun_flags & -						TUNNEL_OPTIONS_PRESENT; -		} +		if (!info->options_len) +			return; + +		enc_opt->len = info->options_len; +		ip_tunnel_info_opts_get(enc_opt->data, info); + +		ip_tunnel_set_options_present(flags); +		ip_tunnel_flags_and(flags, info->key.tun_flags, flags); + +		val = find_next_bit(flags, __IP_TUNNEL_FLAG_NUM, +				    IP_TUNNEL_GENEVE_OPT_BIT); +		enc_opt->dst_opt_type = val < __IP_TUNNEL_FLAG_NUM ? val : 0;  	}  }  EXPORT_SYMBOL(skb_flow_dissect_tunnel_info); @@ -1093,7 +1117,7 @@ bool __skb_flow_dissect(const struct net *net,  		}  	} -	WARN_ON_ONCE(!net); +	DEBUG_NET_WARN_ON_ONCE(!net);  	if (net) {  		enum netns_bpf_attach_type type = NETNS_BPF_FLOW_DISSECTOR;  		struct bpf_prog_array *run_array; @@ -1784,6 +1808,13 @@ u32 flow_hash_from_keys(struct flow_keys *keys)  }  EXPORT_SYMBOL(flow_hash_from_keys); +u32 flow_hash_from_keys_seed(struct flow_keys *keys, +			     const siphash_key_t *keyval) +{ +	return __flow_hash_from_keys(keys, keyval); +} +EXPORT_SYMBOL(flow_hash_from_keys_seed); +  static inline u32 ___skb_get_hash(const struct sk_buff *skb,  				  struct flow_keys *keys,  				  const siphash_key_t *keyval) @@ -1823,22 +1854,23 @@ EXPORT_SYMBOL(make_flow_keys_digest);  static struct flow_dissector flow_keys_dissector_symmetric __read_mostly; -u32 __skb_get_hash_symmetric(const struct sk_buff *skb) +u32 __skb_get_hash_symmetric_net(const struct net *net, const struct sk_buff *skb)  {  	struct flow_keys keys;  	__flow_hash_secret_init();  	memset(&keys, 0, sizeof(keys)); -	__skb_flow_dissect(NULL, skb, &flow_keys_dissector_symmetric, +	__skb_flow_dissect(net, skb, &flow_keys_dissector_symmetric,  			
   &keys, NULL, 0, 0, 0, 0);  	return __flow_hash_from_keys(&keys, &hashrnd);  } -EXPORT_SYMBOL_GPL(__skb_get_hash_symmetric); +EXPORT_SYMBOL_GPL(__skb_get_hash_symmetric_net);  /** - * __skb_get_hash: calculate a flow hash + * __skb_get_hash_net: calculate a flow hash + * @net: associated network namespace, derived from @skb if NULL   * @skb: sk_buff to calculate flow hash from   *   * This function calculates a flow hash based on src/dst addresses @@ -1846,18 +1878,24 @@ EXPORT_SYMBOL_GPL(__skb_get_hash_symmetric);   * on success, zero indicates no valid hash.  Also, sets l4_hash in skb   * if hash is a canonical 4-tuple hash over transport ports.   */ -void __skb_get_hash(struct sk_buff *skb) +void __skb_get_hash_net(const struct net *net, struct sk_buff *skb)  {  	struct flow_keys keys;  	u32 hash; +	memset(&keys, 0, sizeof(keys)); + +	__skb_flow_dissect(net, skb, &flow_keys_dissector, +			   &keys, NULL, 0, 0, 0, +			   FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); +  	__flow_hash_secret_init(); -	hash = ___skb_get_hash(skb, &keys, &hashrnd); +	hash = __flow_hash_from_keys(&keys, &hashrnd);  	__skb_set_sw_hash(skb, hash, flow_keys_have_l4(&keys));  } -EXPORT_SYMBOL(__skb_get_hash); +EXPORT_SYMBOL(__skb_get_hash_net);  __u32 skb_get_hash_perturb(const struct sk_buff *skb,  			   const siphash_key_t *perturb) diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c index fae9c4694186..412816076b8b 100644 --- a/net/core/gen_estimator.c +++ b/net/core/gen_estimator.c @@ -206,7 +206,7 @@ void gen_kill_estimator(struct net_rate_estimator __rcu **rate_est)  {  	struct net_rate_estimator *est; -	est = xchg((__force struct net_rate_estimator **)rate_est, NULL); +	est = unrcu_pointer(xchg(rate_est, NULL));  	if (est) {  		timer_shutdown_sync(&est->timer);  		kfree_rcu(est, rcu); diff --git a/net/core/gro.c b/net/core/gro.c index c7901253a1a8..b3b43de1a650 100644 --- a/net/core/gro.c +++ b/net/core/gro.c @@ -3,6 +3,7 @@  #include <net/dst_metadata.h>  #include 
<net/busy_poll.h>  #include <trace/events/net.h> +#include <linux/skbuff_ref.h>  #define MAX_GRO_SKBS 8 @@ -230,6 +231,33 @@ done:  	return 0;  } +int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb) +{ +	if (unlikely(p->len + skb->len >= 65536)) +		return -E2BIG; + +	if (NAPI_GRO_CB(p)->last == p) +		skb_shinfo(p)->frag_list = skb; +	else +		NAPI_GRO_CB(p)->last->next = skb; + +	skb_pull(skb, skb_gro_offset(skb)); + +	NAPI_GRO_CB(p)->last = skb; +	NAPI_GRO_CB(p)->count++; +	p->data_len += skb->len; + +	/* sk ownership - if any - completely transferred to the aggregated packet */ +	skb->destructor = NULL; +	skb->sk = NULL; +	p->truesize += skb->truesize; +	p->len += skb->len; + +	NAPI_GRO_CB(skb)->same_flow = 1; + +	return 0; +} +  static void napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb)  { @@ -330,8 +358,6 @@ static void gro_list_prepare(const struct list_head *head,  	list_for_each_entry(p, head, list) {  		unsigned long diffs; -		NAPI_GRO_CB(p)->flush = 0; -  		if (hash != skb_get_hash_raw(p)) {  			NAPI_GRO_CB(p)->same_flow = 0;  			continue; @@ -471,7 +497,6 @@ found_ptype:  					sizeof(u32))); /* Avoid slow unaligned acc */  	*(u32 *)&NAPI_GRO_CB(skb)->zeroed = 0;  	NAPI_GRO_CB(skb)->flush = skb_has_frag_list(skb); -	NAPI_GRO_CB(skb)->is_atomic = 1;  	NAPI_GRO_CB(skb)->count = 1;  	if (unlikely(skb_is_gso(skb))) {  		NAPI_GRO_CB(skb)->count = skb_shinfo(skb)->gso_segs; diff --git a/net/core/hotdata.c b/net/core/hotdata.c index c8a7a451c18a..d0aaaaa556f2 100644 --- a/net/core/hotdata.c +++ b/net/core/hotdata.c @@ -1,9 +1,9 @@  // SPDX-License-Identifier: GPL-2.0-or-later -#include <net/hotdata.h>  #include <linux/cache.h>  #include <linux/jiffies.h>  #include <linux/list.h> - +#include <net/hotdata.h> +#include <net/proto_memory.h>  struct net_hotdata net_hotdata __cacheline_aligned = {  	.offload_base = LIST_HEAD_INIT(net_hotdata.offload_base), @@ -18,5 +18,8 @@ struct net_hotdata net_hotdata __cacheline_aligned = {  	
.max_backlog = 1000,  	.dev_tx_weight = 64,  	.dev_rx_weight = 64, +	.sysctl_max_skb_frags = MAX_SKB_FRAGS, +	.sysctl_skb_defer_max = 64, +	.sysctl_mem_pcpu_rsv = SK_MEMORY_PCPU_RESERVE  };  EXPORT_SYMBOL(net_hotdata); diff --git a/net/core/ieee8021q_helpers.c b/net/core/ieee8021q_helpers.c new file mode 100644 index 000000000000..759a9b9f3f89 --- /dev/null +++ b/net/core/ieee8021q_helpers.c @@ -0,0 +1,242 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2024 Pengutronix, Oleksij Rempel <kernel@pengutronix.de> + +#include <linux/array_size.h> +#include <linux/printk.h> +#include <linux/types.h> +#include <net/dscp.h> +#include <net/ieee8021q.h> + +/* The following arrays map Traffic Types (TT) to traffic classes (TC) for + * different number of queues as shown in the example provided by + * IEEE 802.1Q-2022 in Annex I "I.3 Traffic type to traffic class mapping" and + * Table I-1 "Traffic type to traffic class mapping". + */ +static const u8 ieee8021q_8queue_tt_tc_map[] = { +	[IEEE8021Q_TT_BK] = 0, +	[IEEE8021Q_TT_BE] = 1, +	[IEEE8021Q_TT_EE] = 2, +	[IEEE8021Q_TT_CA] = 3, +	[IEEE8021Q_TT_VI] = 4, +	[IEEE8021Q_TT_VO] = 5, +	[IEEE8021Q_TT_IC] = 6, +	[IEEE8021Q_TT_NC] = 7, +}; + +static const u8 ieee8021q_7queue_tt_tc_map[] = { +	[IEEE8021Q_TT_BK] = 0, +	[IEEE8021Q_TT_BE] = 1, +	[IEEE8021Q_TT_EE] = 2, +	[IEEE8021Q_TT_CA] = 3, +	[IEEE8021Q_TT_VI] = 4,	[IEEE8021Q_TT_VO] = 4, +	[IEEE8021Q_TT_IC] = 5, +	[IEEE8021Q_TT_NC] = 6, +}; + +static const u8 ieee8021q_6queue_tt_tc_map[] = { +	[IEEE8021Q_TT_BK] = 0, +	[IEEE8021Q_TT_BE] = 1, +	[IEEE8021Q_TT_EE] = 2,	[IEEE8021Q_TT_CA] = 2, +	[IEEE8021Q_TT_VI] = 3,	[IEEE8021Q_TT_VO] = 3, +	[IEEE8021Q_TT_IC] = 4, +	[IEEE8021Q_TT_NC] = 5, +}; + +static const u8 ieee8021q_5queue_tt_tc_map[] = { +	[IEEE8021Q_TT_BK] = 0, [IEEE8021Q_TT_BE] = 0, +	[IEEE8021Q_TT_EE] = 1, [IEEE8021Q_TT_CA] = 1, +	[IEEE8021Q_TT_VI] = 2, [IEEE8021Q_TT_VO] = 2, +	[IEEE8021Q_TT_IC] = 3, +	[IEEE8021Q_TT_NC] = 4, +}; + +static const u8 
ieee8021q_4queue_tt_tc_map[] = { +	[IEEE8021Q_TT_BK] = 0, [IEEE8021Q_TT_BE] = 0, +	[IEEE8021Q_TT_EE] = 1, [IEEE8021Q_TT_CA] = 1, +	[IEEE8021Q_TT_VI] = 2, [IEEE8021Q_TT_VO] = 2, +	[IEEE8021Q_TT_IC] = 3, [IEEE8021Q_TT_NC] = 3, +}; + +static const u8 ieee8021q_3queue_tt_tc_map[] = { +	[IEEE8021Q_TT_BK] = 0, [IEEE8021Q_TT_BE] = 0, +	[IEEE8021Q_TT_EE] = 0, [IEEE8021Q_TT_CA] = 0, +	[IEEE8021Q_TT_VI] = 1, [IEEE8021Q_TT_VO] = 1, +	[IEEE8021Q_TT_IC] = 2, [IEEE8021Q_TT_NC] = 2, +}; + +static const u8 ieee8021q_2queue_tt_tc_map[] = { +	[IEEE8021Q_TT_BK] = 0, [IEEE8021Q_TT_BE] = 0, +	[IEEE8021Q_TT_EE] = 0, [IEEE8021Q_TT_CA] = 0, +	[IEEE8021Q_TT_VI] = 1, [IEEE8021Q_TT_VO] = 1, +	[IEEE8021Q_TT_IC] = 1, [IEEE8021Q_TT_NC] = 1, +}; + +static const u8 ieee8021q_1queue_tt_tc_map[] = { +	[IEEE8021Q_TT_BK] = 0, [IEEE8021Q_TT_BE] = 0, +	[IEEE8021Q_TT_EE] = 0, [IEEE8021Q_TT_CA] = 0, +	[IEEE8021Q_TT_VI] = 0, [IEEE8021Q_TT_VO] = 0, +	[IEEE8021Q_TT_IC] = 0, [IEEE8021Q_TT_NC] = 0, +}; + +/** + * ieee8021q_tt_to_tc - Map IEEE 802.1Q Traffic Type to Traffic Class + * @tt: IEEE 802.1Q Traffic Type + * @num_queues: Number of queues + * + * This function maps an IEEE 802.1Q Traffic Type to a Traffic Class (TC) based + * on the number of queues configured on the NIC. The mapping is based on the + * example provided by IEEE 802.1Q-2022 in Annex I "I.3 Traffic type to traffic + * class mapping" and Table I-1 "Traffic type to traffic class mapping". + * + * Return: Traffic Class corresponding to the given Traffic Type or negative + * value in case of error. 
+ */ +int ieee8021q_tt_to_tc(enum ieee8021q_traffic_type tt, unsigned int num_queues) +{ +	if (tt < 0 || tt >= IEEE8021Q_TT_MAX) { +		pr_err("Requested Traffic Type (%d) is out of range (%d)\n", tt, +		       IEEE8021Q_TT_MAX); +		return -EINVAL; +	} + +	switch (num_queues) { +	case 8: +		compiletime_assert(ARRAY_SIZE(ieee8021q_8queue_tt_tc_map) != +				   IEEE8021Q_TT_MAX - 1, +				   "ieee8021q_8queue_tt_tc_map != max - 1"); +		return ieee8021q_8queue_tt_tc_map[tt]; +	case 7: +		compiletime_assert(ARRAY_SIZE(ieee8021q_7queue_tt_tc_map) != +				   IEEE8021Q_TT_MAX - 1, +				   "ieee8021q_7queue_tt_tc_map != max - 1"); + +		return ieee8021q_7queue_tt_tc_map[tt]; +	case 6: +		compiletime_assert(ARRAY_SIZE(ieee8021q_6queue_tt_tc_map) != +				   IEEE8021Q_TT_MAX - 1, +				   "ieee8021q_6queue_tt_tc_map != max - 1"); + +		return ieee8021q_6queue_tt_tc_map[tt]; +	case 5: +		compiletime_assert(ARRAY_SIZE(ieee8021q_5queue_tt_tc_map) != +				   IEEE8021Q_TT_MAX - 1, +				   "ieee8021q_5queue_tt_tc_map != max - 1"); + +		return ieee8021q_5queue_tt_tc_map[tt]; +	case 4: +		compiletime_assert(ARRAY_SIZE(ieee8021q_4queue_tt_tc_map) != +				   IEEE8021Q_TT_MAX - 1, +				   "ieee8021q_4queue_tt_tc_map != max - 1"); + +		return ieee8021q_4queue_tt_tc_map[tt]; +	case 3: +		compiletime_assert(ARRAY_SIZE(ieee8021q_3queue_tt_tc_map) != +				   IEEE8021Q_TT_MAX - 1, +				   "ieee8021q_3queue_tt_tc_map != max - 1"); + +		return ieee8021q_3queue_tt_tc_map[tt]; +	case 2: +		compiletime_assert(ARRAY_SIZE(ieee8021q_2queue_tt_tc_map) != +				   IEEE8021Q_TT_MAX - 1, +				   "ieee8021q_2queue_tt_tc_map != max - 1"); + +		return ieee8021q_2queue_tt_tc_map[tt]; +	case 1: +		compiletime_assert(ARRAY_SIZE(ieee8021q_1queue_tt_tc_map) != +				   IEEE8021Q_TT_MAX - 1, +				   "ieee8021q_1queue_tt_tc_map != max - 1"); + +		return ieee8021q_1queue_tt_tc_map[tt]; +	} + +	pr_err("Invalid number of queues %d\n", num_queues); + +	return -EINVAL; +} +EXPORT_SYMBOL_GPL(ieee8021q_tt_to_tc); + +/** + * 
ietf_dscp_to_ieee8021q_tt - Map IETF DSCP to IEEE 802.1Q Traffic Type + * @dscp: IETF DSCP value + * + * This function maps an IETF DSCP value to an IEEE 802.1Q Traffic Type (TT). + * Since there is no corresponding mapping between DSCP and IEEE 802.1Q Traffic + * Type, this function is inspired by the RFC8325 documentation which describes + * the mapping between DSCP and 802.11 User Priority (UP) values. + * + * Return: IEEE 802.1Q Traffic Type corresponding to the given DSCP value + */ +int ietf_dscp_to_ieee8021q_tt(u8 dscp) +{ +	switch (dscp) { +	case DSCP_CS0: +	/* Comment from RFC8325: +	 * [RFC4594], Section 4.8, recommends High-Throughput Data be marked +	 * AF1x (that is, AF11, AF12, and AF13, according to the rules defined +	 * in [RFC2475]). +	 * +	 * By default (as described in Section 2.3), High-Throughput Data will +	 * map to UP 1 and, thus, to the Background Access Category (AC_BK), +	 * which is contrary to the intent expressed in [RFC4594]. + +	 * Unfortunately, there really is no corresponding fit for the High- +	 * Throughput Data service class within the constrained 4 Access +	 * Category [IEEE.802.11-2016] model.  If the High-Throughput Data +	 * service class is assigned to the Best Effort Access Category (AC_BE), +	 * then it would contend with Low-Latency Data (while [RFC4594] +	 * recommends a distinction in servicing between these service classes) +	 * as well as with the default service class; alternatively, if it is +	 * assigned to the Background Access Category (AC_BK), then it would +	 * receive a less-than-best-effort service and contend with Low-Priority +	 * Data (as discussed in Section 4.2.10). +	 * +	 * As such, since there is no directly corresponding fit for the High- +	 * Throughput Data service class within the [IEEE.802.11-2016] model, it +	 * is generally RECOMMENDED to map High-Throughput Data to UP 0, thereby +	 * admitting it to the Best Effort Access Category (AC_BE). 
+	 * +	 * Note: The above text is from RFC8325 which is describing the mapping +	 * between DSCP and 802.11 User Priority (UP) values. The mapping +	 * between UP and IEEE 802.1Q Traffic Type is not defined in the RFC but +	 * the 802.11 AC_BK and AC_BE are closely related to the IEEE 802.1Q +	 * Traffic Types BE and BK. +	 */ +	case DSCP_AF11: +	case DSCP_AF12: +	case DSCP_AF13: +		return IEEE8021Q_TT_BE; +	/* Comment from RFC8325: +	 * RFC3662 and RFC4594 both recommend Low-Priority Data be marked +	 * with DSCP CS1. The Low-Priority Data service class loosely +	 * corresponds to the [IEEE.802.11-2016] Background Access Category +	 */ +	case DSCP_CS1: +		return IEEE8021Q_TT_BK; +	case DSCP_CS2: +	case DSCP_AF21: +	case DSCP_AF22: +	case DSCP_AF23: +		return IEEE8021Q_TT_EE; +	case DSCP_CS3: +	case DSCP_AF31: +	case DSCP_AF32: +	case DSCP_AF33: +		return IEEE8021Q_TT_CA; +	case DSCP_CS4: +	case DSCP_AF41: +	case DSCP_AF42: +	case DSCP_AF43: +		return IEEE8021Q_TT_VI; +	case DSCP_CS5: +	case DSCP_EF: +	case DSCP_VOICE_ADMIT: +		return IEEE8021Q_TT_VO; +	case DSCP_CS6: +		return IEEE8021Q_TT_IC; +	case DSCP_CS7: +		return IEEE8021Q_TT_NC; +	} + +	return SIMPLE_IETF_DSCP_TO_IEEE8021Q_TT(dscp); +} +EXPORT_SYMBOL_GPL(ietf_dscp_to_ieee8021q_tt); diff --git a/net/core/link_watch.c b/net/core/link_watch.c index 8ec35194bfcb..ab150641142a 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -148,9 +148,9 @@ static void linkwatch_schedule_work(int urgent)  	 * override the existing timer.  	 
*/  	if (test_bit(LW_URGENT, &linkwatch_flags)) -		mod_delayed_work(system_wq, &linkwatch_work, 0); +		mod_delayed_work(system_unbound_wq, &linkwatch_work, 0);  	else -		schedule_delayed_work(&linkwatch_work, delay); +		queue_delayed_work(system_unbound_wq, &linkwatch_work, delay);  } diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c index 4a0797f0a154..afb05f58b64c 100644 --- a/net/core/lwt_bpf.c +++ b/net/core/lwt_bpf.c @@ -38,13 +38,14 @@ static inline struct bpf_lwt *bpf_lwt_lwtunnel(struct lwtunnel_state *lwt)  static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,  		       struct dst_entry *dst, bool can_redirect)  { +	struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;  	int ret; -	/* Migration disable and BH disable are needed to protect per-cpu -	 * redirect_info between BPF prog and skb_do_redirect(). +	/* Disabling BH is needed to protect per-CPU bpf_redirect_info between +	 * BPF prog and skb_do_redirect().  	 */ -	migrate_disable();  	local_bh_disable(); +	bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);  	bpf_compute_data_pointers(skb);  	ret = bpf_prog_run_save_cb(lwt->prog, skb); @@ -77,8 +78,8 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,  		break;  	} +	bpf_net_ctx_clear(bpf_net_ctx);  	local_bh_enable(); -	migrate_enable();  	return ret;  } diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 552719c3bbc3..a6fe88eca939 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -734,7 +734,9 @@ out_neigh_release:  struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey,  				 struct net_device *dev, bool want_ref)  { -	return ___neigh_create(tbl, pkey, dev, 0, false, want_ref); +	bool exempt_from_gc = !!(dev->flags & IFF_LOOPBACK); + +	return ___neigh_create(tbl, pkey, dev, 0, exempt_from_gc, want_ref);  }  EXPORT_SYMBOL(__neigh_create); @@ -1769,7 +1771,7 @@ static void neigh_parms_destroy(struct neigh_parms *parms)  static struct lock_class_key 
neigh_table_proxy_queue_class; -static struct neigh_table *neigh_tables[NEIGH_NR_TABLES] __read_mostly; +static struct neigh_table __rcu *neigh_tables[NEIGH_NR_TABLES] __read_mostly;  void neigh_table_init(int index, struct neigh_table *tbl)  { @@ -1826,13 +1828,19 @@ void neigh_table_init(int index, struct neigh_table *tbl)  	tbl->last_flush = now;  	tbl->last_rand	= now + tbl->parms.reachable_time * 20; -	neigh_tables[index] = tbl; +	rcu_assign_pointer(neigh_tables[index], tbl);  }  EXPORT_SYMBOL(neigh_table_init); +/* + * Only called from ndisc_cleanup(), which means this is dead code + * because we no longer can unload IPv6 module. + */  int neigh_table_clear(int index, struct neigh_table *tbl)  { -	neigh_tables[index] = NULL; +	RCU_INIT_POINTER(neigh_tables[index], NULL); +	synchronize_rcu(); +  	/* It is not clean... Fix it to unload IPv6 module safely */  	cancel_delayed_work_sync(&tbl->managed_work);  	cancel_delayed_work_sync(&tbl->gc_work); @@ -1864,10 +1872,10 @@ static struct neigh_table *neigh_find_table(int family)  	switch (family) {  	case AF_INET: -		tbl = neigh_tables[NEIGH_ARP_TABLE]; +		tbl = rcu_dereference_rtnl(neigh_tables[NEIGH_ARP_TABLE]);  		break;  	case AF_INET6: -		tbl = neigh_tables[NEIGH_ND_TABLE]; +		tbl = rcu_dereference_rtnl(neigh_tables[NEIGH_ND_TABLE]);  		break;  	} @@ -2331,7 +2339,7 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh,  	ndtmsg = nlmsg_data(nlh);  	for (tidx = 0; tidx < NEIGH_NR_TABLES; tidx++) { -		tbl = neigh_tables[tidx]; +		tbl = rcu_dereference_rtnl(neigh_tables[tidx]);  		if (!tbl)  			continue;  		if (ndtmsg->ndtm_family && tbl->family != ndtmsg->ndtm_family) @@ -2519,7 +2527,7 @@ static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb)  	for (tidx = 0; tidx < NEIGH_NR_TABLES; tidx++) {  		struct neigh_parms *p; -		tbl = neigh_tables[tidx]; +		tbl = rcu_dereference_rtnl(neigh_tables[tidx]);  		if (!tbl)  			continue; @@ -2674,7 +2682,7 @@ static bool 
neigh_master_filtered(struct net_device *dev, int master_idx)  	if (!master_idx)  		return false; -	master = dev ? netdev_master_upper_dev_get(dev) : NULL; +	master = dev ? netdev_master_upper_dev_get_rcu(dev) : NULL;  	/* 0 is already used to denote NDA_MASTER wasn't passed, therefore need another  	 * invalid value for ifindex to denote "no master". @@ -2707,7 +2715,7 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,  {  	struct net *net = sock_net(skb->sk);  	struct neighbour *n; -	int rc, h, s_h = cb->args[1]; +	int err = 0, h, s_h = cb->args[1];  	int idx, s_idx = idx = cb->args[2];  	struct neigh_hash_table *nht;  	unsigned int flags = NLM_F_MULTI; @@ -2715,7 +2723,6 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,  	if (filter->dev_idx || filter->master_idx)  		flags |= NLM_F_DUMP_FILTERED; -	rcu_read_lock();  	nht = rcu_dereference(tbl->nht);  	for (h = s_h; h < (1 << nht->hash_shift); h++) { @@ -2729,23 +2736,19 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,  			if (neigh_ifindex_filtered(n->dev, filter->dev_idx) ||  			    neigh_master_filtered(n->dev, filter->master_idx))  				goto next; -			if (neigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, -					    cb->nlh->nlmsg_seq, -					    RTM_NEWNEIGH, -					    flags) < 0) { -				rc = -1; +			err = neigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, +					      cb->nlh->nlmsg_seq, +					      RTM_NEWNEIGH, flags); +			if (err < 0)  				goto out; -			}  next:  			idx++;  		}  	} -	rc = skb->len;  out: -	rcu_read_unlock();  	cb->args[1] = h;  	cb->args[2] = idx; -	return rc; +	return err;  }  static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, @@ -2754,7 +2757,7 @@ static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,  {  	struct pneigh_entry *n;  	struct net *net = sock_net(skb->sk); -	int rc, h, s_h = cb->args[3]; +	int err = 0, h, s_h = cb->args[3];  	int idx, s_idx = idx = 
cb->args[4];  	unsigned int flags = NLM_F_MULTI; @@ -2772,11 +2775,11 @@ static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,  			if (neigh_ifindex_filtered(n->dev, filter->dev_idx) ||  			    neigh_master_filtered(n->dev, filter->master_idx))  				goto next; -			if (pneigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, -					    cb->nlh->nlmsg_seq, -					    RTM_NEWNEIGH, flags, tbl) < 0) { +			err = pneigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, +					       cb->nlh->nlmsg_seq, +					       RTM_NEWNEIGH, flags, tbl); +			if (err < 0) {  				read_unlock_bh(&tbl->lock); -				rc = -1;  				goto out;  			}  		next: @@ -2785,12 +2788,10 @@ static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,  	}  	read_unlock_bh(&tbl->lock); -	rc = skb->len;  out:  	cb->args[3] = h;  	cb->args[4] = idx; -	return rc; - +	return err;  }  static int neigh_valid_dump_req(const struct nlmsghdr *nlh, @@ -2878,8 +2879,9 @@ static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb)  	s_t = cb->args[0]; +	rcu_read_lock();  	for (t = 0; t < NEIGH_NR_TABLES; t++) { -		tbl = neigh_tables[t]; +		tbl = rcu_dereference(neigh_tables[t]);  		if (!tbl)  			continue; @@ -2895,9 +2897,10 @@ static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb)  		if (err < 0)  			break;  	} +	rcu_read_unlock();  	cb->args[0] = t; -	return skb->len; +	return err;  }  static int neigh_valid_get_req(const struct nlmsghdr *nlh, @@ -3143,14 +3146,15 @@ int neigh_xmit(int index, struct net_device *dev,  	       const void *addr, struct sk_buff *skb)  {  	int err = -EAFNOSUPPORT; +  	if (likely(index < NEIGH_NR_TABLES)) {  		struct neigh_table *tbl;  		struct neighbour *neigh; -		tbl = neigh_tables[index]; -		if (!tbl) -			goto out;  		rcu_read_lock(); +		tbl = rcu_dereference(neigh_tables[index]); +		if (!tbl) +			goto out_unlock;  		if (index == NEIGH_ARP_TABLE) {  			u32 key = *((u32 *)addr); @@ -3166,6 +3170,7 @@ int neigh_xmit(int 
index, struct net_device *dev,  			goto out_kfree_skb;  		}  		err = READ_ONCE(neigh->output)(neigh, skb); +out_unlock:  		rcu_read_unlock();  	}  	else if (index == NEIGH_LINK_TABLE) { @@ -3538,7 +3543,7 @@ EXPORT_SYMBOL(neigh_app_ns);  #ifdef CONFIG_SYSCTL  static int unres_qlen_max = INT_MAX / SKB_TRUESIZE(ETH_FRAME_LEN); -static int proc_unres_qlen(struct ctl_table *ctl, int write, +static int proc_unres_qlen(const struct ctl_table *ctl, int write,  			   void *buffer, size_t *lenp, loff_t *ppos)  {  	int size, ret; @@ -3573,7 +3578,7 @@ static void neigh_copy_dflt_parms(struct net *net, struct neigh_parms *p,  	rcu_read_unlock();  } -static void neigh_proc_update(struct ctl_table *ctl, int write) +static void neigh_proc_update(const struct ctl_table *ctl, int write)  {  	struct net_device *dev = ctl->extra1;  	struct neigh_parms *p = ctl->extra2; @@ -3590,7 +3595,7 @@ static void neigh_proc_update(struct ctl_table *ctl, int write)  		neigh_copy_dflt_parms(net, p, index);  } -static int neigh_proc_dointvec_zero_intmax(struct ctl_table *ctl, int write, +static int neigh_proc_dointvec_zero_intmax(const struct ctl_table *ctl, int write,  					   void *buffer, size_t *lenp,  					   loff_t *ppos)  { @@ -3605,7 +3610,7 @@ static int neigh_proc_dointvec_zero_intmax(struct ctl_table *ctl, int write,  	return ret;  } -static int neigh_proc_dointvec_ms_jiffies_positive(struct ctl_table *ctl, int write, +static int neigh_proc_dointvec_ms_jiffies_positive(const struct ctl_table *ctl, int write,  						   void *buffer, size_t *lenp, loff_t *ppos)  {  	struct ctl_table tmp = *ctl; @@ -3621,7 +3626,7 @@ static int neigh_proc_dointvec_ms_jiffies_positive(struct ctl_table *ctl, int wr  	return ret;  } -int neigh_proc_dointvec(struct ctl_table *ctl, int write, void *buffer, +int neigh_proc_dointvec(const struct ctl_table *ctl, int write, void *buffer,  			size_t *lenp, loff_t *ppos)  {  	int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); @@ -3631,7 +3636,7 @@ int 
neigh_proc_dointvec(struct ctl_table *ctl, int write, void *buffer,  }  EXPORT_SYMBOL(neigh_proc_dointvec); -int neigh_proc_dointvec_jiffies(struct ctl_table *ctl, int write, void *buffer, +int neigh_proc_dointvec_jiffies(const struct ctl_table *ctl, int write, void *buffer,  				size_t *lenp, loff_t *ppos)  {  	int ret = proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos); @@ -3641,7 +3646,7 @@ int neigh_proc_dointvec_jiffies(struct ctl_table *ctl, int write, void *buffer,  }  EXPORT_SYMBOL(neigh_proc_dointvec_jiffies); -static int neigh_proc_dointvec_userhz_jiffies(struct ctl_table *ctl, int write, +static int neigh_proc_dointvec_userhz_jiffies(const struct ctl_table *ctl, int write,  					      void *buffer, size_t *lenp,  					      loff_t *ppos)  { @@ -3651,7 +3656,7 @@ static int neigh_proc_dointvec_userhz_jiffies(struct ctl_table *ctl, int write,  	return ret;  } -int neigh_proc_dointvec_ms_jiffies(struct ctl_table *ctl, int write, +int neigh_proc_dointvec_ms_jiffies(const struct ctl_table *ctl, int write,  				   void *buffer, size_t *lenp, loff_t *ppos)  {  	int ret = proc_dointvec_ms_jiffies(ctl, write, buffer, lenp, ppos); @@ -3661,7 +3666,7 @@ int neigh_proc_dointvec_ms_jiffies(struct ctl_table *ctl, int write,  }  EXPORT_SYMBOL(neigh_proc_dointvec_ms_jiffies); -static int neigh_proc_dointvec_unres_qlen(struct ctl_table *ctl, int write, +static int neigh_proc_dointvec_unres_qlen(const struct ctl_table *ctl, int write,  					  void *buffer, size_t *lenp,  					  loff_t *ppos)  { @@ -3671,7 +3676,7 @@ static int neigh_proc_dointvec_unres_qlen(struct ctl_table *ctl, int write,  	return ret;  } -static int neigh_proc_base_reachable_time(struct ctl_table *ctl, int write, +static int neigh_proc_base_reachable_time(const struct ctl_table *ctl, int write,  					  void *buffer, size_t *lenp,  					  loff_t *ppos)  { @@ -3728,7 +3733,7 @@ static int neigh_proc_base_reachable_time(struct ctl_table *ctl, int write,  static struct neigh_sysctl_table {  	struct 
ctl_table_header *sysctl_header; -	struct ctl_table neigh_vars[NEIGH_VAR_MAX + 1]; +	struct ctl_table neigh_vars[NEIGH_VAR_MAX];  } neigh_sysctl_template __read_mostly = {  	.neigh_vars = {  		NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(MCAST_PROBES, "mcast_solicit"), @@ -3779,7 +3784,6 @@ static struct neigh_sysctl_table {  			.extra2		= SYSCTL_INT_MAX,  			.proc_handler	= proc_dointvec_minmax,  		}, -		{},  	},  }; @@ -3807,8 +3811,6 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p,  	if (dev) {  		dev_name_source = dev->name;  		/* Terminate the table early */ -		memset(&t->neigh_vars[NEIGH_VAR_GC_INTERVAL], 0, -		       sizeof(t->neigh_vars[NEIGH_VAR_GC_INTERVAL]));  		neigh_vars_size = NEIGH_VAR_BASE_REACHABLE_TIME_MS + 1;  	} else {  		struct neigh_table *tbl = p->tbl; @@ -3889,7 +3891,8 @@ static int __init neigh_init(void)  {  	rtnl_register(PF_UNSPEC, RTM_NEWNEIGH, neigh_add, NULL, 0);  	rtnl_register(PF_UNSPEC, RTM_DELNEIGH, neigh_delete, NULL, 0); -	rtnl_register(PF_UNSPEC, RTM_GETNEIGH, neigh_get, neigh_dump_info, 0); +	rtnl_register(PF_UNSPEC, RTM_GETNEIGH, neigh_get, neigh_dump_info, +		      RTNL_FLAG_DUMP_UNLOCKED);  	rtnl_register(PF_UNSPEC, RTM_GETNEIGHTBL, NULL, neightbl_dump_info,  		      0); diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c index a97eceb84e61..fa6d3969734a 100644 --- a/net/core/net-procfs.c +++ b/net/core/net-procfs.c @@ -144,7 +144,8 @@ static int softnet_seq_show(struct seq_file *seq, void *v)  	seq_printf(seq,  		   "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x "  		   "%08x %08x\n", -		   sd->processed, sd->dropped, sd->time_squeeze, 0, +		   sd->processed, atomic_read(&sd->dropped), +		   sd->time_squeeze, 0,  		   0, 0, 0, 0, /* was fastroute */  		   0,	/* was cpu_collision */  		   sd->received_rps, flow_limit_count, diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index e3d7a8cfa20b..444f23e74f8e 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -235,7 
+235,7 @@ static ssize_t speed_show(struct device *dev,  	if (!rtnl_trylock())  		return restart_syscall(); -	if (netif_running(netdev) && netif_device_present(netdev)) { +	if (netif_running(netdev)) {  		struct ethtool_link_ksettings cmd;  		if (!__ethtool_get_link_ksettings(netdev, &cmd)) @@ -605,13 +605,13 @@ static ssize_t threaded_show(struct device *dev,  	struct net_device *netdev = to_net_dev(dev);  	ssize_t ret = -EINVAL; -	if (!rtnl_trylock()) -		return restart_syscall(); +	rcu_read_lock();  	if (dev_isalive(netdev)) -		ret = sysfs_emit(buf, fmt_dec, netdev->threaded); +		ret = sysfs_emit(buf, fmt_dec, READ_ONCE(netdev->threaded)); + +	rcu_read_unlock(); -	rtnl_unlock();  	return ret;  } @@ -1419,7 +1419,7 @@ static ssize_t bql_show_stall_thrs(struct netdev_queue *queue, char *buf)  {  	struct dql *dql = &queue->dql; -	return sprintf(buf, "%u\n", jiffies_to_msecs(dql->stall_thrs)); +	return sysfs_emit(buf, "%u\n", jiffies_to_msecs(dql->stall_thrs));  }  static ssize_t bql_set_stall_thrs(struct netdev_queue *queue, @@ -1451,7 +1451,7 @@ static struct netdev_queue_attribute bql_stall_thrs_attribute __ro_after_init =  static ssize_t bql_show_stall_max(struct netdev_queue *queue, char *buf)  { -	return sprintf(buf, "%u\n", READ_ONCE(queue->dql.stall_max)); +	return sysfs_emit(buf, "%u\n", READ_ONCE(queue->dql.stall_max));  }  static ssize_t bql_set_stall_max(struct netdev_queue *queue, @@ -1468,7 +1468,7 @@ static ssize_t bql_show_stall_cnt(struct netdev_queue *queue, char *buf)  {  	struct dql *dql = &queue->dql; -	return sprintf(buf, "%lu\n", dql->stall_cnt); +	return sysfs_emit(buf, "%lu\n", dql->stall_cnt);  }  static struct netdev_queue_attribute bql_stall_cnt_attribute __ro_after_init = @@ -2028,7 +2028,7 @@ static void netdev_release(struct device *d)  	 * device is dead and about to be freed.  	 
*/  	kfree(rcu_access_pointer(dev->ifalias)); -	netdev_freemem(dev); +	kvfree(dev);  }  static const void *net_namespace(const struct device *d) @@ -2046,7 +2046,7 @@ static void net_get_ownership(const struct device *d, kuid_t *uid, kgid_t *gid)  	net_ns_get_ownership(net, uid, gid);  } -static struct class net_class __ro_after_init = { +static const struct class net_class = {  	.name = "net",  	.dev_release = netdev_release,  	.dev_groups = net_class_groups, diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 9d690d32da33..6a823ba906c6 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -693,11 +693,16 @@ EXPORT_SYMBOL_GPL(__put_net);   * get_net_ns - increment the refcount of the network namespace   * @ns: common namespace (net)   * - * Returns the net's common namespace. + * Returns the net's common namespace or ERR_PTR() if ref is zero.   */  struct ns_common *get_net_ns(struct ns_common *ns)  { -	return &get_net(container_of(ns, struct net, ns))->ns; +	struct net *net; + +	net = maybe_get_net(container_of(ns, struct net, ns)); +	if (net) +		return &net->ns; +	return ERR_PTR(-EINVAL);  }  EXPORT_SYMBOL_GPL(get_net_ns); @@ -1093,7 +1098,7 @@ static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb)  end:  	if (net_cb.fillargs.add_ref)  		put_net(net_cb.tgt_net); -	return err < 0 ? 
err : skb->len; +	return err;  }  static void rtnl_net_notifyid(struct net *net, int cmd, int id, u32 portid, @@ -1208,7 +1213,8 @@ void __init net_ns_init(void)  	rtnl_register(PF_UNSPEC, RTM_NEWNSID, rtnl_net_newid, NULL,  		      RTNL_FLAG_DOIT_UNLOCKED);  	rtnl_register(PF_UNSPEC, RTM_GETNSID, rtnl_net_getid, rtnl_net_dumpid, -		      RTNL_FLAG_DOIT_UNLOCKED); +		      RTNL_FLAG_DOIT_UNLOCKED | +		      RTNL_FLAG_DUMP_UNLOCKED);  }  static void free_exit_list(struct pernet_operations *ops, struct list_head *net_exit_list) diff --git a/net/core/gso_test.c b/net/core/net_test.c index 358c44680d91..9c3a590865d2 100644 --- a/net/core/gso_test.c +++ b/net/core/net_test.c @@ -1,6 +1,9 @@  // SPDX-License-Identifier: GPL-2.0-or-later  #include <kunit/test.h> + +/* GSO */ +  #include <linux/skbuff.h>  static const char hdr[] = "abcdefgh"; @@ -258,17 +261,127 @@ free_gso_skb:  	consume_skb(skb);  } -static struct kunit_case gso_test_cases[] = { -	KUNIT_CASE_PARAM(gso_test_func, gso_test_gen_params), -	{} +/* IP tunnel flags */ + +#include <net/ip_tunnels.h> + +struct ip_tunnel_flags_test { +	const char	*name; + +	const u16	*src_bits; +	const u16	*exp_bits; +	u8		src_num; +	u8		exp_num; + +	__be16		exp_val; +	bool		exp_comp;  }; -static struct kunit_suite gso_test_suite = { -	.name = "net_core_gso", -	.test_cases = gso_test_cases, +#define IP_TUNNEL_FLAGS_TEST(n, src, comp, eval, exp) {	\ +	.name		= (n),				\ +	.src_bits	= (src),			\ +	.src_num	= ARRAY_SIZE(src),		\ +	.exp_comp	= (comp),			\ +	.exp_val	= (eval),			\ +	.exp_bits	= (exp),			\ +	.exp_num	= ARRAY_SIZE(exp),		\ +} + +/* These are __be16-compatible and can be compared as is */ +static const u16 ip_tunnel_flags_1[] = { +	IP_TUNNEL_KEY_BIT, +	IP_TUNNEL_STRICT_BIT, +	IP_TUNNEL_ERSPAN_OPT_BIT,  }; -kunit_test_suite(gso_test_suite); +/* Due to the previous flags design limitation, setting either + * ``IP_TUNNEL_CSUM_BIT`` (on Big Endian) or ``IP_TUNNEL_DONT_FRAGMENT_BIT`` + * (on Little) also sets VTI/ISATAP bit. 
In the bitmap implementation, they + * correspond to ``BIT(16)``, which is bigger than ``U16_MAX``, but still is + * backward-compatible. + */ +#ifdef __LITTLE_ENDIAN +#define IP_TUNNEL_CONFLICT_BIT	IP_TUNNEL_DONT_FRAGMENT_BIT +#else +#define IP_TUNNEL_CONFLICT_BIT	IP_TUNNEL_CSUM_BIT +#endif + +static const u16 ip_tunnel_flags_2_src[] = { +	IP_TUNNEL_CONFLICT_BIT, +}; + +static const u16 ip_tunnel_flags_2_exp[] = { +	IP_TUNNEL_CONFLICT_BIT, +	IP_TUNNEL_SIT_ISATAP_BIT, +}; + +/* Bits 17 and higher are not compatible with __be16 flags */ +static const u16 ip_tunnel_flags_3_src[] = { +	IP_TUNNEL_VXLAN_OPT_BIT, +	17, +	18, +	20, +}; + +static const u16 ip_tunnel_flags_3_exp[] = { +	IP_TUNNEL_VXLAN_OPT_BIT, +}; + +static const struct ip_tunnel_flags_test ip_tunnel_flags_test[] = { +	IP_TUNNEL_FLAGS_TEST("compat", ip_tunnel_flags_1, true, +			     cpu_to_be16(BIT(IP_TUNNEL_KEY_BIT) | +					 BIT(IP_TUNNEL_STRICT_BIT) | +					 BIT(IP_TUNNEL_ERSPAN_OPT_BIT)), +			     ip_tunnel_flags_1), +	IP_TUNNEL_FLAGS_TEST("conflict", ip_tunnel_flags_2_src, true, +			     VTI_ISVTI, ip_tunnel_flags_2_exp), +	IP_TUNNEL_FLAGS_TEST("new", ip_tunnel_flags_3_src, false, +			     cpu_to_be16(BIT(IP_TUNNEL_VXLAN_OPT_BIT)), +			     ip_tunnel_flags_3_exp), +}; + +static void +ip_tunnel_flags_test_case_to_desc(const struct ip_tunnel_flags_test *t, +				  char *desc) +{ +	strscpy(desc, t->name, KUNIT_PARAM_DESC_SIZE); +} +KUNIT_ARRAY_PARAM(ip_tunnel_flags_test, ip_tunnel_flags_test, +		  ip_tunnel_flags_test_case_to_desc); + +static void ip_tunnel_flags_test_run(struct kunit *test) +{ +	const struct ip_tunnel_flags_test *t = test->param_value; +	IP_TUNNEL_DECLARE_FLAGS(src) = { }; +	IP_TUNNEL_DECLARE_FLAGS(exp) = { }; +	IP_TUNNEL_DECLARE_FLAGS(out); + +	for (u32 j = 0; j < t->src_num; j++) +		__set_bit(t->src_bits[j], src); +	for (u32 j = 0; j < t->exp_num; j++) +		__set_bit(t->exp_bits[j], exp); + +	KUNIT_ASSERT_EQ(test, t->exp_comp, +			ip_tunnel_flags_is_be16_compat(src)); +	
KUNIT_ASSERT_EQ(test, (__force u16)t->exp_val, +			(__force u16)ip_tunnel_flags_to_be16(src)); + +	ip_tunnel_flags_from_be16(out, t->exp_val); +	KUNIT_ASSERT_TRUE(test, __ipt_flag_op(bitmap_equal, exp, out)); +} + +static struct kunit_case net_test_cases[] = { +	KUNIT_CASE_PARAM(gso_test_func, gso_test_gen_params), +	KUNIT_CASE_PARAM(ip_tunnel_flags_test_run, +			 ip_tunnel_flags_test_gen_params), +	{ }, +}; + +static struct kunit_suite net_test_suite = { +	.name		= "net_core", +	.test_cases	= net_test_cases, +}; +kunit_test_suite(net_test_suite); +MODULE_DESCRIPTION("KUnit tests for networking core");  MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("KUnit tests for segmentation offload"); diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c index 8d8ace9ef87f..8350a0afa9ec 100644 --- a/net/core/netdev-genl-gen.c +++ b/net/core/netdev-genl-gen.c @@ -70,6 +70,7 @@ static const struct nla_policy netdev_napi_get_dump_nl_policy[NETDEV_A_NAPI_IFIN  /* NETDEV_CMD_QSTATS_GET - dump */  static const struct nla_policy netdev_qstats_get_nl_policy[NETDEV_A_QSTATS_SCOPE + 1] = { +	[NETDEV_A_QSTATS_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1),  	[NETDEV_A_QSTATS_SCOPE] = NLA_POLICY_MASK(NLA_UINT, 0x1),  }; diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 7004b3399c2b..05f9515d2c05 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -59,22 +59,22 @@ XDP_METADATA_KFUNC_xxx  	    nla_put_u64_64bit(rsp, NETDEV_A_DEV_XDP_RX_METADATA_FEATURES,  			      xdp_rx_meta, NETDEV_A_DEV_PAD) ||  	    nla_put_u64_64bit(rsp, NETDEV_A_DEV_XSK_FEATURES, -			      xsk_features, NETDEV_A_DEV_PAD)) { -		genlmsg_cancel(rsp, hdr); -		return -EINVAL; -	} +			      xsk_features, NETDEV_A_DEV_PAD)) +		goto err_cancel_msg;  	if (netdev->xdp_features & NETDEV_XDP_ACT_XSK_ZEROCOPY) {  		if (nla_put_u32(rsp, NETDEV_A_DEV_XDP_ZC_MAX_SEGS, -				netdev->xdp_zc_max_segs)) { -			genlmsg_cancel(rsp, hdr); -			return -EINVAL; -		} +				netdev->xdp_zc_max_segs)) +			goto 
err_cancel_msg;  	}  	genlmsg_end(rsp, hdr);  	return 0; + +err_cancel_msg: +	genlmsg_cancel(rsp, hdr); +	return -EMSGSIZE;  }  static void @@ -489,7 +489,17 @@ netdev_nl_stats_write_rx(struct sk_buff *rsp, struct netdev_queue_stats_rx *rx)  {  	if (netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_PACKETS, rx->packets) ||  	    netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_BYTES, rx->bytes) || -	    netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_ALLOC_FAIL, rx->alloc_fail)) +	    netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_ALLOC_FAIL, rx->alloc_fail) || +	    netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_DROPS, rx->hw_drops) || +	    netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_DROP_OVERRUNS, rx->hw_drop_overruns) || +	    netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_UNNECESSARY, rx->csum_unnecessary) || +	    netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_NONE, rx->csum_none) || +	    netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_BAD, rx->csum_bad) || +	    netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_PACKETS, rx->hw_gro_packets) || +	    netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_BYTES, rx->hw_gro_bytes) || +	    netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_WIRE_PACKETS, rx->hw_gro_wire_packets) || +	    netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_WIRE_BYTES, rx->hw_gro_wire_bytes) || +	    netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_DROP_RATELIMITS, rx->hw_drop_ratelimits))  		return -EMSGSIZE;  	return 0;  } @@ -498,7 +508,18 @@ static int  netdev_nl_stats_write_tx(struct sk_buff *rsp, struct netdev_queue_stats_tx *tx)  {  	if (netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_PACKETS, tx->packets) || -	    netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_BYTES, tx->bytes)) +	    netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_BYTES, tx->bytes) || +	    netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_DROPS, tx->hw_drops) || +	    netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_DROP_ERRORS, tx->hw_drop_errors) || +	    netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_CSUM_NONE, tx->csum_none) || +	    netdev_stat_put(rsp, 
NETDEV_A_QSTATS_TX_NEEDS_CSUM, tx->needs_csum) || +	    netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_PACKETS, tx->hw_gso_packets) || +	    netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_BYTES, tx->hw_gso_bytes) || +	    netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_WIRE_PACKETS, tx->hw_gso_wire_packets) || +	    netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_WIRE_BYTES, tx->hw_gso_wire_bytes) || +	    netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_DROP_RATELIMITS, tx->hw_drop_ratelimits) || +	    netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_STOP, tx->stop) || +	    netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_WAKE, tx->wake))  		return -EMSGSIZE;  	return 0;  } @@ -639,6 +660,24 @@ nla_put_failure:  	return -EMSGSIZE;  } +static int +netdev_nl_qstats_get_dump_one(struct net_device *netdev, unsigned int scope, +			      struct sk_buff *skb, const struct genl_info *info, +			      struct netdev_nl_dump_ctx *ctx) +{ +	if (!netdev->stat_ops) +		return 0; + +	switch (scope) { +	case 0: +		return netdev_nl_stats_by_netdev(netdev, skb, info); +	case NETDEV_QSTATS_SCOPE_QUEUE: +		return netdev_nl_stats_by_queue(netdev, skb, info, ctx); +	} + +	return -EINVAL;	/* Should not happen, per netlink policy */ +} +  int netdev_nl_qstats_get_dumpit(struct sk_buff *skb,  				struct netlink_callback *cb)  { @@ -646,6 +685,7 @@ int netdev_nl_qstats_get_dumpit(struct sk_buff *skb,  	const struct genl_info *info = genl_info_dump(cb);  	struct net *net = sock_net(skb->sk);  	struct net_device *netdev; +	unsigned int ifindex;  	unsigned int scope;  	int err = 0; @@ -653,21 +693,28 @@ int netdev_nl_qstats_get_dumpit(struct sk_buff *skb,  	if (info->attrs[NETDEV_A_QSTATS_SCOPE])  		scope = nla_get_uint(info->attrs[NETDEV_A_QSTATS_SCOPE]); -	rtnl_lock(); -	for_each_netdev_dump(net, netdev, ctx->ifindex) { -		if (!netdev->stat_ops) -			continue; +	ifindex = 0; +	if (info->attrs[NETDEV_A_QSTATS_IFINDEX]) +		ifindex = nla_get_u32(info->attrs[NETDEV_A_QSTATS_IFINDEX]); -		switch (scope) { -		case 0: -			
err = netdev_nl_stats_by_netdev(netdev, skb, info); -			break; -		case NETDEV_QSTATS_SCOPE_QUEUE: -			err = netdev_nl_stats_by_queue(netdev, skb, info, ctx); -			break; +	rtnl_lock(); +	if (ifindex) { +		netdev = __dev_get_by_index(net, ifindex); +		if (netdev && netdev->stat_ops) { +			err = netdev_nl_qstats_get_dump_one(netdev, scope, skb, +							    info, ctx); +		} else { +			NL_SET_BAD_ATTR(info->extack, +					info->attrs[NETDEV_A_QSTATS_IFINDEX]); +			err = netdev ? -EOPNOTSUPP : -ENODEV; +		} +	} else { +		for_each_netdev_dump(net, netdev, ctx->ifindex) { +			err = netdev_nl_qstats_get_dump_one(netdev, scope, skb, +							    info, ctx); +			if (err < 0) +				break;  		} -		if (err < 0) -			break;  	}  	rtnl_unlock(); diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 543007f159f9..d657b042d5a0 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -228,7 +228,6 @@ void netpoll_poll_disable(struct net_device *dev)  		down(&ni->dev_lock);  	srcu_read_unlock(&netpoll_srcu, idx);  } -EXPORT_SYMBOL(netpoll_poll_disable);  void netpoll_poll_enable(struct net_device *dev)  { @@ -239,7 +238,6 @@ void netpoll_poll_enable(struct net_device *dev)  		up(&ni->dev_lock);  	rcu_read_unlock();  } -EXPORT_SYMBOL(netpoll_poll_enable);  static void refill_skbs(void)  { @@ -316,7 +314,7 @@ static int netpoll_owner_active(struct net_device *dev)  	struct napi_struct *napi;  	list_for_each_entry_rcu(napi, &dev->napi_list, dev_list) { -		if (napi->poll_owner == smp_processor_id()) +		if (READ_ONCE(napi->poll_owner) == smp_processor_id())  			return 1;  	}  	return 0; diff --git a/net/core/page_pool.c b/net/core/page_pool.c index dd364d738c00..2abe6e919224 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -5,6 +5,7 @@   *	Copyright (C) 2016 Red Hat, Inc.   
*/ +#include <linux/error-injection.h>  #include <linux/types.h>  #include <linux/kernel.h>  #include <linux/slab.h> @@ -123,9 +124,9 @@ int page_pool_ethtool_stats_get_count(void)  }  EXPORT_SYMBOL(page_pool_ethtool_stats_get_count); -u64 *page_pool_ethtool_stats_get(u64 *data, void *stats) +u64 *page_pool_ethtool_stats_get(u64 *data, const void *stats)  { -	struct page_pool_stats *pool_stats = stats; +	const struct page_pool_stats *pool_stats = stats;  	*data++ = pool_stats->alloc_stats.fast;  	*data++ = pool_stats->alloc_stats.slow; @@ -172,19 +173,30 @@ static void page_pool_producer_unlock(struct page_pool *pool,  		spin_unlock_bh(&pool->ring.producer_lock);  } +static void page_pool_struct_check(void) +{ +	CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_users); +	CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_page); +	CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_offset); +	CACHELINE_ASSERT_GROUP_SIZE(struct page_pool, frag, +				    PAGE_POOL_FRAG_GROUP_ALIGN); +} +  static int page_pool_init(struct page_pool *pool,  			  const struct page_pool_params *params,  			  int cpuid)  {  	unsigned int ring_qsize = 1024; /* Default */ +	page_pool_struct_check(); +  	memcpy(&pool->p, ¶ms->fast, sizeof(pool->p));  	memcpy(&pool->slow, ¶ms->slow, sizeof(pool->slow));  	pool->cpuid = cpuid;  	/* Validate only known flags were used */ -	if (pool->p.flags & ~(PP_FLAG_ALL)) +	if (pool->slow.flags & ~PP_FLAG_ALL)  		return -EINVAL;  	if (pool->p.pool_size) @@ -198,22 +210,26 @@ static int page_pool_init(struct page_pool *pool,  	 * DMA_BIDIRECTIONAL is for allowing page used for DMA sending,  	 * which is the XDP_TX use-case.  	 
*/ -	if (pool->p.flags & PP_FLAG_DMA_MAP) { +	if (pool->slow.flags & PP_FLAG_DMA_MAP) {  		if ((pool->p.dma_dir != DMA_FROM_DEVICE) &&  		    (pool->p.dma_dir != DMA_BIDIRECTIONAL))  			return -EINVAL; + +		pool->dma_map = true;  	} -	if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) { +	if (pool->slow.flags & PP_FLAG_DMA_SYNC_DEV) {  		/* In order to request DMA-sync-for-device the page  		 * needs to be mapped  		 */ -		if (!(pool->p.flags & PP_FLAG_DMA_MAP)) +		if (!(pool->slow.flags & PP_FLAG_DMA_MAP))  			return -EINVAL;  		if (!pool->p.max_len)  			return -EINVAL; +		pool->dma_sync = true; +  		/* pool->p.offset has to be set according to the address  		 * offset used by the DMA engine to start copying rx data  		 */ @@ -222,7 +238,7 @@ static int page_pool_init(struct page_pool *pool,  	pool->has_init_callback = !!pool->slow.init_callback;  #ifdef CONFIG_PAGE_POOL_STATS -	if (!(pool->p.flags & PP_FLAG_SYSTEM_POOL)) { +	if (!(pool->slow.flags & PP_FLAG_SYSTEM_POOL)) {  		pool->recycle_stats = alloc_percpu(struct page_pool_recycle_stats);  		if (!pool->recycle_stats)  			return -ENOMEM; @@ -232,12 +248,13 @@ static int page_pool_init(struct page_pool *pool,  		 * (also percpu) page pool instance.  		 
*/  		pool->recycle_stats = &pp_system_recycle_stats; +		pool->system = true;  	}  #endif  	if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) {  #ifdef CONFIG_PAGE_POOL_STATS -		if (!(pool->p.flags & PP_FLAG_SYSTEM_POOL)) +		if (!pool->system)  			free_percpu(pool->recycle_stats);  #endif  		return -ENOMEM; @@ -248,7 +265,7 @@ static int page_pool_init(struct page_pool *pool,  	/* Driver calling page_pool_create() also call page_pool_destroy() */  	refcount_set(&pool->user_cnt, 1); -	if (pool->p.flags & PP_FLAG_DMA_MAP) +	if (pool->dma_map)  		get_device(pool->p.dev);  	return 0; @@ -258,11 +275,11 @@ static void page_pool_uninit(struct page_pool *pool)  {  	ptr_ring_cleanup(&pool->ring, NULL); -	if (pool->p.flags & PP_FLAG_DMA_MAP) +	if (pool->dma_map)  		put_device(pool->p.dev);  #ifdef CONFIG_PAGE_POOL_STATS -	if (!(pool->p.flags & PP_FLAG_SYSTEM_POOL)) +	if (!pool->system)  		free_percpu(pool->recycle_stats);  #endif  } @@ -311,19 +328,18 @@ struct page_pool *page_pool_create(const struct page_pool_params *params)  }  EXPORT_SYMBOL(page_pool_create); -static void page_pool_return_page(struct page_pool *pool, struct page *page); +static void page_pool_return_page(struct page_pool *pool, netmem_ref netmem); -noinline -static struct page *page_pool_refill_alloc_cache(struct page_pool *pool) +static noinline netmem_ref page_pool_refill_alloc_cache(struct page_pool *pool)  {  	struct ptr_ring *r = &pool->ring; -	struct page *page; +	netmem_ref netmem;  	int pref_nid; /* preferred NUMA node */  	/* Quicker fallback, avoid locks when ring is empty */  	if (__ptr_ring_empty(r)) {  		alloc_stat_inc(pool, empty); -		return NULL; +		return 0;  	}  	/* Softirq guarantee CPU and thus NUMA node is stable. 
This, @@ -338,64 +354,74 @@ static struct page *page_pool_refill_alloc_cache(struct page_pool *pool)  	/* Refill alloc array, but only if NUMA match */  	do { -		page = __ptr_ring_consume(r); -		if (unlikely(!page)) +		netmem = (__force netmem_ref)__ptr_ring_consume(r); +		if (unlikely(!netmem))  			break; -		if (likely(page_to_nid(page) == pref_nid)) { -			pool->alloc.cache[pool->alloc.count++] = page; +		if (likely(page_to_nid(netmem_to_page(netmem)) == pref_nid)) { +			pool->alloc.cache[pool->alloc.count++] = netmem;  		} else {  			/* NUMA mismatch;  			 * (1) release 1 page to page-allocator and  			 * (2) break out to fallthrough to alloc_pages_node.  			 * This limit stress on page buddy alloactor.  			 */ -			page_pool_return_page(pool, page); +			page_pool_return_page(pool, netmem);  			alloc_stat_inc(pool, waive); -			page = NULL; +			netmem = 0;  			break;  		}  	} while (pool->alloc.count < PP_ALLOC_CACHE_REFILL);  	/* Return last page */  	if (likely(pool->alloc.count > 0)) { -		page = pool->alloc.cache[--pool->alloc.count]; +		netmem = pool->alloc.cache[--pool->alloc.count];  		alloc_stat_inc(pool, refill);  	} -	return page; +	return netmem;  }  /* fast path */ -static struct page *__page_pool_get_cached(struct page_pool *pool) +static netmem_ref __page_pool_get_cached(struct page_pool *pool)  { -	struct page *page; +	netmem_ref netmem;  	/* Caller MUST guarantee safe non-concurrent access, e.g. 
softirq */  	if (likely(pool->alloc.count)) {  		/* Fast-path */ -		page = pool->alloc.cache[--pool->alloc.count]; +		netmem = pool->alloc.cache[--pool->alloc.count];  		alloc_stat_inc(pool, fast);  	} else { -		page = page_pool_refill_alloc_cache(pool); +		netmem = page_pool_refill_alloc_cache(pool);  	} -	return page; +	return netmem;  } -static void page_pool_dma_sync_for_device(struct page_pool *pool, -					  struct page *page, -					  unsigned int dma_sync_size) +static void __page_pool_dma_sync_for_device(const struct page_pool *pool, +					    netmem_ref netmem, +					    u32 dma_sync_size)  { -	dma_addr_t dma_addr = page_pool_get_dma_addr(page); +#if defined(CONFIG_HAS_DMA) && defined(CONFIG_DMA_NEED_SYNC) +	dma_addr_t dma_addr = page_pool_get_dma_addr_netmem(netmem);  	dma_sync_size = min(dma_sync_size, pool->p.max_len); -	dma_sync_single_range_for_device(pool->p.dev, dma_addr, -					 pool->p.offset, dma_sync_size, -					 pool->p.dma_dir); +	__dma_sync_single_for_device(pool->p.dev, dma_addr + pool->p.offset, +				     dma_sync_size, pool->p.dma_dir); +#endif  } -static bool page_pool_dma_map(struct page_pool *pool, struct page *page) +static __always_inline void +page_pool_dma_sync_for_device(const struct page_pool *pool, +			      netmem_ref netmem, +			      u32 dma_sync_size) +{ +	if (pool->dma_sync && dma_dev_need_sync(pool->p.dev)) +		__page_pool_dma_sync_for_device(pool, netmem, dma_sync_size); +} + +static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem)  {  	dma_addr_t dma; @@ -404,32 +430,32 @@ static bool page_pool_dma_map(struct page_pool *pool, struct page *page)  	 * into page private data (i.e 32bit cpu with 64bit DMA caps)  	 * This mapping is kept for lifetime of page, until leaving pool.  	 
*/ -	dma = dma_map_page_attrs(pool->p.dev, page, 0, -				 (PAGE_SIZE << pool->p.order), -				 pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC | -						  DMA_ATTR_WEAK_ORDERING); +	dma = dma_map_page_attrs(pool->p.dev, netmem_to_page(netmem), 0, +				 (PAGE_SIZE << pool->p.order), pool->p.dma_dir, +				 DMA_ATTR_SKIP_CPU_SYNC | +					 DMA_ATTR_WEAK_ORDERING);  	if (dma_mapping_error(pool->p.dev, dma))  		return false; -	if (page_pool_set_dma_addr(page, dma)) +	if (page_pool_set_dma_addr_netmem(netmem, dma))  		goto unmap_failed; -	if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) -		page_pool_dma_sync_for_device(pool, page, pool->p.max_len); +	page_pool_dma_sync_for_device(pool, netmem, pool->p.max_len);  	return true;  unmap_failed: -	WARN_ON_ONCE("unexpected DMA address, please report to netdev@"); +	WARN_ONCE(1, "unexpected DMA address, please report to netdev@");  	dma_unmap_page_attrs(pool->p.dev, dma,  			     PAGE_SIZE << pool->p.order, pool->p.dma_dir,  			     DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);  	return false;  } -static void page_pool_set_pp_info(struct page_pool *pool, -				  struct page *page) +static void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem)  { +	struct page *page = netmem_to_page(netmem); +  	page->pp = pool;  	page->pp_magic |= PP_SIGNATURE; @@ -439,13 +465,15 @@ static void page_pool_set_pp_info(struct page_pool *pool,  	 * is dirtying the same cache line as the page->pp_magic above, so  	 * the overhead is negligible.  	 
*/ -	page_pool_fragment_page(page, 1); +	page_pool_fragment_netmem(netmem, 1);  	if (pool->has_init_callback) -		pool->slow.init_callback(page, pool->slow.init_arg); +		pool->slow.init_callback(netmem, pool->slow.init_arg);  } -static void page_pool_clear_pp_info(struct page *page) +static void page_pool_clear_pp_info(netmem_ref netmem)  { +	struct page *page = netmem_to_page(netmem); +  	page->pp_magic = 0;  	page->pp = NULL;  } @@ -460,35 +488,34 @@ static struct page *__page_pool_alloc_page_order(struct page_pool *pool,  	if (unlikely(!page))  		return NULL; -	if ((pool->p.flags & PP_FLAG_DMA_MAP) && -	    unlikely(!page_pool_dma_map(pool, page))) { +	if (pool->dma_map && unlikely(!page_pool_dma_map(pool, page_to_netmem(page)))) {  		put_page(page);  		return NULL;  	}  	alloc_stat_inc(pool, slow_high_order); -	page_pool_set_pp_info(pool, page); +	page_pool_set_pp_info(pool, page_to_netmem(page));  	/* Track how many pages are held 'in-flight' */  	pool->pages_state_hold_cnt++; -	trace_page_pool_state_hold(pool, page, pool->pages_state_hold_cnt); +	trace_page_pool_state_hold(pool, page_to_netmem(page), +				   pool->pages_state_hold_cnt);  	return page;  }  /* slow path */ -noinline -static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool, -						 gfp_t gfp) +static noinline netmem_ref __page_pool_alloc_pages_slow(struct page_pool *pool, +							gfp_t gfp)  {  	const int bulk = PP_ALLOC_CACHE_REFILL; -	unsigned int pp_flags = pool->p.flags;  	unsigned int pp_order = pool->p.order; -	struct page *page; +	bool dma_map = pool->dma_map; +	netmem_ref netmem;  	int i, nr_pages;  	/* Don't support bulk alloc for high-order pages */  	if (unlikely(pp_order)) -		return __page_pool_alloc_page_order(pool, gfp); +		return page_to_netmem(__page_pool_alloc_page_order(pool, gfp));  	/* Unnecessary as alloc cache is empty, but guarantees zero count */  	if (unlikely(pool->alloc.count > 0)) @@ -497,59 +524,66 @@ static struct page 
*__page_pool_alloc_pages_slow(struct page_pool *pool,  	/* Mark empty alloc.cache slots "empty" for alloc_pages_bulk_array */  	memset(&pool->alloc.cache, 0, sizeof(void *) * bulk); -	nr_pages = alloc_pages_bulk_array_node(gfp, pool->p.nid, bulk, -					       pool->alloc.cache); +	nr_pages = alloc_pages_bulk_array_node(gfp, +					       pool->p.nid, bulk, +					       (struct page **)pool->alloc.cache);  	if (unlikely(!nr_pages)) -		return NULL; +		return 0;  	/* Pages have been filled into alloc.cache array, but count is zero and  	 * page element have not been (possibly) DMA mapped.  	 */  	for (i = 0; i < nr_pages; i++) { -		page = pool->alloc.cache[i]; -		if ((pp_flags & PP_FLAG_DMA_MAP) && -		    unlikely(!page_pool_dma_map(pool, page))) { -			put_page(page); +		netmem = pool->alloc.cache[i]; +		if (dma_map && unlikely(!page_pool_dma_map(pool, netmem))) { +			put_page(netmem_to_page(netmem));  			continue;  		} -		page_pool_set_pp_info(pool, page); -		pool->alloc.cache[pool->alloc.count++] = page; +		page_pool_set_pp_info(pool, netmem); +		pool->alloc.cache[pool->alloc.count++] = netmem;  		/* Track how many pages are held 'in-flight' */  		pool->pages_state_hold_cnt++; -		trace_page_pool_state_hold(pool, page, +		trace_page_pool_state_hold(pool, netmem,  					   pool->pages_state_hold_cnt);  	}  	/* Return last page */  	if (likely(pool->alloc.count > 0)) { -		page = pool->alloc.cache[--pool->alloc.count]; +		netmem = pool->alloc.cache[--pool->alloc.count];  		alloc_stat_inc(pool, slow);  	} else { -		page = NULL; +		netmem = 0;  	}  	/* When page just alloc'ed is should/must have refcnt 1. */ -	return page; +	return netmem;  }  /* For using page_pool replace: alloc_pages() API calls, but provide   * synchronization guarantee for allocation side.   
*/ -struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp) +netmem_ref page_pool_alloc_netmem(struct page_pool *pool, gfp_t gfp)  { -	struct page *page; +	netmem_ref netmem;  	/* Fast-path: Get a page from cache */ -	page = __page_pool_get_cached(pool); -	if (page) -		return page; +	netmem = __page_pool_get_cached(pool); +	if (netmem) +		return netmem;  	/* Slow-path: cache empty, do real allocation */ -	page = __page_pool_alloc_pages_slow(pool, gfp); -	return page; +	netmem = __page_pool_alloc_pages_slow(pool, gfp); +	return netmem; +} +EXPORT_SYMBOL(page_pool_alloc_netmem); + +struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp) +{ +	return netmem_to_page(page_pool_alloc_netmem(pool, gfp));  }  EXPORT_SYMBOL(page_pool_alloc_pages); +ALLOW_ERROR_INJECTION(page_pool_alloc_pages, NULL);  /* Calculate distance between two u32 values, valid if distance is below 2^(31)   *  https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution @@ -575,24 +609,24 @@ s32 page_pool_inflight(const struct page_pool *pool, bool strict)  	return inflight;  } -static __always_inline -void __page_pool_release_page_dma(struct page_pool *pool, struct page *page) +static __always_inline void __page_pool_release_page_dma(struct page_pool *pool, +							 netmem_ref netmem)  {  	dma_addr_t dma; -	if (!(pool->p.flags & PP_FLAG_DMA_MAP)) +	if (!pool->dma_map)  		/* Always account for inflight pages, even if we didn't  		 * map them  		 */  		return; -	dma = page_pool_get_dma_addr(page); +	dma = page_pool_get_dma_addr_netmem(netmem);  	/* When page is unmapped, it cannot be returned to our pool */  	dma_unmap_page_attrs(pool->p.dev, dma,  			     PAGE_SIZE << pool->p.order, pool->p.dma_dir,  			     DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING); -	page_pool_set_dma_addr(page, 0); +	page_pool_set_dma_addr_netmem(netmem, 0);  }  /* Disconnects a page (from a page_pool).  
API users can have a need @@ -600,35 +634,34 @@ void __page_pool_release_page_dma(struct page_pool *pool, struct page *page)   * a regular page (that will eventually be returned to the normal   * page-allocator via put_page).   */ -void page_pool_return_page(struct page_pool *pool, struct page *page) +void page_pool_return_page(struct page_pool *pool, netmem_ref netmem)  {  	int count; -	__page_pool_release_page_dma(pool, page); - -	page_pool_clear_pp_info(page); +	__page_pool_release_page_dma(pool, netmem);  	/* This may be the last page returned, releasing the pool, so  	 * it is not safe to reference pool afterwards.  	 */  	count = atomic_inc_return_relaxed(&pool->pages_state_release_cnt); -	trace_page_pool_state_release(pool, page, count); +	trace_page_pool_state_release(pool, netmem, count); -	put_page(page); +	page_pool_clear_pp_info(netmem); +	put_page(netmem_to_page(netmem));  	/* An optimization would be to call __free_pages(page, pool->p.order)  	 * knowing page is not part of page-cache (thus avoiding a  	 * __page_cache_release() call).  	 */  } -static bool page_pool_recycle_in_ring(struct page_pool *pool, struct page *page) +static bool page_pool_recycle_in_ring(struct page_pool *pool, netmem_ref netmem)  {  	int ret;  	/* BH protection not needed if current is softirq */  	if (in_softirq()) -		ret = ptr_ring_produce(&pool->ring, page); +		ret = ptr_ring_produce(&pool->ring, (__force void *)netmem);  	else -		ret = ptr_ring_produce_bh(&pool->ring, page); +		ret = ptr_ring_produce_bh(&pool->ring, (__force void *)netmem);  	if (!ret) {  		recycle_stat_inc(pool, ring); @@ -643,7 +676,7 @@ static bool page_pool_recycle_in_ring(struct page_pool *pool, struct page *page)   *   * Caller must provide appropriate safe context.   
*/ -static bool page_pool_recycle_in_cache(struct page *page, +static bool page_pool_recycle_in_cache(netmem_ref netmem,  				       struct page_pool *pool)  {  	if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE)) { @@ -652,24 +685,25 @@ static bool page_pool_recycle_in_cache(struct page *page,  	}  	/* Caller MUST have verified/know (page_ref_count(page) == 1) */ -	pool->alloc.cache[pool->alloc.count++] = page; +	pool->alloc.cache[pool->alloc.count++] = netmem;  	recycle_stat_inc(pool, cached);  	return true;  } -static bool __page_pool_page_can_be_recycled(const struct page *page) +static bool __page_pool_page_can_be_recycled(netmem_ref netmem)  { -	return page_ref_count(page) == 1 && !page_is_pfmemalloc(page); +	return page_ref_count(netmem_to_page(netmem)) == 1 && +	       !page_is_pfmemalloc(netmem_to_page(netmem));  }  /* If the page refcnt == 1, this will try to recycle the page. - * if PP_FLAG_DMA_SYNC_DEV is set, we'll try to sync the DMA area for + * If pool->dma_sync is set, we'll try to sync the DMA area for   * the configured size min(dma_sync_size, pool->max_len).   * If the page refcnt != 1, then the page will be returned to memory   * subsystem.   */ -static __always_inline struct page * -__page_pool_put_page(struct page_pool *pool, struct page *page, +static __always_inline netmem_ref +__page_pool_put_page(struct page_pool *pool, netmem_ref netmem,  		     unsigned int dma_sync_size, bool allow_direct)  {  	lockdep_assert_no_hardirq(); @@ -683,19 +717,16 @@ __page_pool_put_page(struct page_pool *pool, struct page *page,  	 * page is NOT reusable when allocated when system is under  	 * some pressure. 
(page_is_pfmemalloc)  	 */ -	if (likely(__page_pool_page_can_be_recycled(page))) { +	if (likely(__page_pool_page_can_be_recycled(netmem))) {  		/* Read barrier done in page_ref_count / READ_ONCE */ -		if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) -			page_pool_dma_sync_for_device(pool, page, -						      dma_sync_size); +		page_pool_dma_sync_for_device(pool, netmem, dma_sync_size); -		if (allow_direct && in_softirq() && -		    page_pool_recycle_in_cache(page, pool)) -			return NULL; +		if (allow_direct && page_pool_recycle_in_cache(netmem, pool)) +			return 0;  		/* Page found as candidate for recycling */ -		return page; +		return netmem;  	}  	/* Fallback/non-XDP mode: API user have elevated refcnt.  	 * @@ -711,21 +742,56 @@ __page_pool_put_page(struct page_pool *pool, struct page *page,  	 * will be invoking put_page.  	 */  	recycle_stat_inc(pool, released_refcnt); -	page_pool_return_page(pool, page); +	page_pool_return_page(pool, netmem); -	return NULL; +	return 0;  } -void page_pool_put_unrefed_page(struct page_pool *pool, struct page *page, -				unsigned int dma_sync_size, bool allow_direct) +static bool page_pool_napi_local(const struct page_pool *pool) +{ +	const struct napi_struct *napi; +	u32 cpuid; + +	if (unlikely(!in_softirq())) +		return false; + +	/* Allow direct recycle if we have reasons to believe that we are +	 * in the same context as the consumer would run, so there's +	 * no possible race. +	 * __page_pool_put_page() makes sure we're not in hardirq context +	 * and interrupts are enabled prior to accessing the cache. 
+	 */ +	cpuid = smp_processor_id(); +	if (READ_ONCE(pool->cpuid) == cpuid) +		return true; + +	napi = READ_ONCE(pool->p.napi); + +	return napi && READ_ONCE(napi->list_owner) == cpuid; +} + +void page_pool_put_unrefed_netmem(struct page_pool *pool, netmem_ref netmem, +				  unsigned int dma_sync_size, bool allow_direct)  { -	page = __page_pool_put_page(pool, page, dma_sync_size, allow_direct); -	if (page && !page_pool_recycle_in_ring(pool, page)) { +	if (!allow_direct) +		allow_direct = page_pool_napi_local(pool); + +	netmem = +		__page_pool_put_page(pool, netmem, dma_sync_size, allow_direct); +	if (netmem && !page_pool_recycle_in_ring(pool, netmem)) {  		/* Cache full, fallback to free pages */  		recycle_stat_inc(pool, ring_full); -		page_pool_return_page(pool, page); +		page_pool_return_page(pool, netmem);  	}  } +EXPORT_SYMBOL(page_pool_put_unrefed_netmem); + +void page_pool_put_unrefed_page(struct page_pool *pool, struct page *page, +				unsigned int dma_sync_size, bool allow_direct) +{ +	page_pool_put_unrefed_netmem(pool, page_to_netmem(page), dma_sync_size, +				     allow_direct); +}  EXPORT_SYMBOL(page_pool_put_unrefed_page);  /** @@ -747,22 +813,25 @@ void page_pool_put_page_bulk(struct page_pool *pool, void **data,  			     int count)  {  	int i, bulk_len = 0; +	bool allow_direct;  	bool in_softirq; +	allow_direct = page_pool_napi_local(pool); +  	for (i = 0; i < count; i++) { -		struct page *page = virt_to_head_page(data[i]); +		netmem_ref netmem = page_to_netmem(virt_to_head_page(data[i]));  		/* It is not the last user for the page frag case */ -		if (!page_pool_is_last_ref(page)) +		if (!page_pool_is_last_ref(netmem))  			continue; -		page = __page_pool_put_page(pool, page, -1, false); +		netmem = __page_pool_put_page(pool, netmem, -1, allow_direct);  		/* Approved for bulk recycling in ptr_ring cache */ -		if (page) -			data[bulk_len++] = page; +		if (netmem) +			data[bulk_len++] = (__force void *)netmem;  	} -	if (unlikely(!bulk_len)) +	if 
(!bulk_len)  		return;  	/* Bulk producer into ptr_ring page_pool cache */ @@ -785,100 +854,106 @@ void page_pool_put_page_bulk(struct page_pool *pool, void **data,  	 * since put_page() with refcnt == 1 can be an expensive operation  	 */  	for (; i < bulk_len; i++) -		page_pool_return_page(pool, data[i]); +		page_pool_return_page(pool, (__force netmem_ref)data[i]);  }  EXPORT_SYMBOL(page_pool_put_page_bulk); -static struct page *page_pool_drain_frag(struct page_pool *pool, -					 struct page *page) +static netmem_ref page_pool_drain_frag(struct page_pool *pool, +				       netmem_ref netmem)  {  	long drain_count = BIAS_MAX - pool->frag_users;  	/* Some user is still using the page frag */ -	if (likely(page_pool_unref_page(page, drain_count))) -		return NULL; - -	if (__page_pool_page_can_be_recycled(page)) { -		if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) -			page_pool_dma_sync_for_device(pool, page, -1); +	if (likely(page_pool_unref_netmem(netmem, drain_count))) +		return 0; -		return page; +	if (__page_pool_page_can_be_recycled(netmem)) { +		page_pool_dma_sync_for_device(pool, netmem, -1); +		return netmem;  	} -	page_pool_return_page(pool, page); -	return NULL; +	page_pool_return_page(pool, netmem); +	return 0;  }  static void page_pool_free_frag(struct page_pool *pool)  {  	long drain_count = BIAS_MAX - pool->frag_users; -	struct page *page = pool->frag_page; +	netmem_ref netmem = pool->frag_page; -	pool->frag_page = NULL; +	pool->frag_page = 0; -	if (!page || page_pool_unref_page(page, drain_count)) +	if (!netmem || page_pool_unref_netmem(netmem, drain_count))  		return; -	page_pool_return_page(pool, page); +	page_pool_return_page(pool, netmem);  } -struct page *page_pool_alloc_frag(struct page_pool *pool, -				  unsigned int *offset, -				  unsigned int size, gfp_t gfp) +netmem_ref page_pool_alloc_frag_netmem(struct page_pool *pool, +				       unsigned int *offset, unsigned int size, +				       gfp_t gfp)  {  	unsigned int max_size = PAGE_SIZE << 
pool->p.order; -	struct page *page = pool->frag_page; +	netmem_ref netmem = pool->frag_page;  	if (WARN_ON(size > max_size)) -		return NULL; +		return 0;  	size = ALIGN(size, dma_get_cache_alignment());  	*offset = pool->frag_offset; -	if (page && *offset + size > max_size) { -		page = page_pool_drain_frag(pool, page); -		if (page) { +	if (netmem && *offset + size > max_size) { +		netmem = page_pool_drain_frag(pool, netmem); +		if (netmem) {  			alloc_stat_inc(pool, fast);  			goto frag_reset;  		}  	} -	if (!page) { -		page = page_pool_alloc_pages(pool, gfp); -		if (unlikely(!page)) { -			pool->frag_page = NULL; -			return NULL; +	if (!netmem) { +		netmem = page_pool_alloc_netmem(pool, gfp); +		if (unlikely(!netmem)) { +			pool->frag_page = 0; +			return 0;  		} -		pool->frag_page = page; +		pool->frag_page = netmem;  frag_reset:  		pool->frag_users = 1;  		*offset = 0;  		pool->frag_offset = size; -		page_pool_fragment_page(page, BIAS_MAX); -		return page; +		page_pool_fragment_netmem(netmem, BIAS_MAX); +		return netmem;  	}  	pool->frag_users++;  	pool->frag_offset = *offset + size;  	alloc_stat_inc(pool, fast); -	return page; +	return netmem; +} +EXPORT_SYMBOL(page_pool_alloc_frag_netmem); + +struct page *page_pool_alloc_frag(struct page_pool *pool, unsigned int *offset, +				  unsigned int size, gfp_t gfp) +{ +	return netmem_to_page(page_pool_alloc_frag_netmem(pool, offset, size, +							  gfp));  }  EXPORT_SYMBOL(page_pool_alloc_frag);  static void page_pool_empty_ring(struct page_pool *pool)  { -	struct page *page; +	netmem_ref netmem;  	/* Empty recycle ring */ -	while ((page = ptr_ring_consume_bh(&pool->ring))) { +	while ((netmem = (__force netmem_ref)ptr_ring_consume_bh(&pool->ring))) {  		/* Verify the refcnt invariant of cached pages */ -		if (!(page_ref_count(page) == 1)) +		if (!(page_ref_count(netmem_to_page(netmem)) == 1))  			pr_crit("%s() page_pool refcnt %d violation\n", -				__func__, page_ref_count(page)); +				__func__, 
netmem_ref_count(netmem)); -		page_pool_return_page(pool, page); +		page_pool_return_page(pool, netmem);  	}  } @@ -894,7 +969,7 @@ static void __page_pool_destroy(struct page_pool *pool)  static void page_pool_empty_alloc_cache_once(struct page_pool *pool)  { -	struct page *page; +	netmem_ref netmem;  	if (pool->destroy_cnt)  		return; @@ -904,8 +979,8 @@ static void page_pool_empty_alloc_cache_once(struct page_pool *pool)  	 * call concurrently.  	 */  	while (pool->alloc.count) { -		page = pool->alloc.cache[--pool->alloc.count]; -		page_pool_return_page(pool, page); +		netmem = pool->alloc.cache[--pool->alloc.count]; +		page_pool_return_page(pool, netmem);  	}  } @@ -959,17 +1034,17 @@ static void page_pool_release_retry(struct work_struct *wq)  }  void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *), -			   struct xdp_mem_info *mem) +			   const struct xdp_mem_info *mem)  {  	refcount_inc(&pool->user_cnt);  	pool->disconnect = disconnect;  	pool->xdp_mem_id = mem->id;  } -static void page_pool_disable_direct_recycling(struct page_pool *pool) +void page_pool_disable_direct_recycling(struct page_pool *pool)  {  	/* Disable direct recycling based on pool->cpuid. -	 * Paired with READ_ONCE() in napi_pp_put_page(). +	 * Paired with READ_ONCE() in page_pool_napi_local().  	 */  	WRITE_ONCE(pool->cpuid, -1); @@ -979,11 +1054,12 @@ static void page_pool_disable_direct_recycling(struct page_pool *pool)  	/* To avoid races with recycling and additional barriers make sure  	 * pool and NAPI are unlinked when NAPI is disabled.  	 
*/ -	WARN_ON(!test_bit(NAPI_STATE_SCHED, &pool->p.napi->state) || -		READ_ONCE(pool->p.napi->list_owner) != -1); +	WARN_ON(!test_bit(NAPI_STATE_SCHED, &pool->p.napi->state)); +	WARN_ON(READ_ONCE(pool->p.napi->list_owner) != -1);  	WRITE_ONCE(pool->p.napi, NULL);  } +EXPORT_SYMBOL(page_pool_disable_direct_recycling);  void page_pool_destroy(struct page_pool *pool)  { @@ -1011,15 +1087,15 @@ EXPORT_SYMBOL(page_pool_destroy);  /* Caller must provide appropriate safe context, e.g. NAPI. */  void page_pool_update_nid(struct page_pool *pool, int new_nid)  { -	struct page *page; +	netmem_ref netmem;  	trace_page_pool_update_nid(pool, new_nid);  	pool->p.nid = new_nid;  	/* Flush pool alloc cache, as refill will check NUMA node */  	while (pool->alloc.count) { -		page = pool->alloc.cache[--pool->alloc.count]; -		page_pool_return_page(pool, page); +		netmem = pool->alloc.cache[--pool->alloc.count]; +		page_pool_return_page(pool, netmem);  	}  }  EXPORT_SYMBOL(page_pool_update_nid); diff --git a/net/core/pktgen.c b/net/core/pktgen.c index ea55a758a475..197a50ef8e2e 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3654,7 +3654,7 @@ static int pktgen_thread_worker(void *arg)  	struct pktgen_dev *pkt_dev = NULL;  	int cpu = t->cpu; -	WARN_ON(smp_processor_id() != cpu); +	WARN_ON_ONCE(smp_processor_id() != cpu);  	init_waitqueue_head(&t->queue);  	complete(&t->start_done); @@ -3989,6 +3989,7 @@ static int __net_init pg_net_init(struct net *net)  		goto remove;  	} +	cpus_read_lock();  	for_each_online_cpu(cpu) {  		int err; @@ -3997,6 +3998,7 @@ static int __net_init pg_net_init(struct net *net)  			pr_warn("Cannot create thread for cpu %d (%d)\n",  				   cpu, err);  	} +	cpus_read_unlock();  	if (list_empty(&pn->pktgen_threads)) {  		pr_err("Initialization failed for all threads\n"); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 8ba6a4e4be26..73fd7f543fd0 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1036,8 +1036,8 @@ static size_t 
rtnl_proto_down_size(const struct net_device *dev)  {  	size_t size = nla_total_size(1); -	if (dev->proto_down_reason) -		size += nla_total_size(0) + nla_total_size(4); +	/* Assume dev->proto_down_reason is not zero. */ +	size += nla_total_size(0) + nla_total_size(4);  	return size;  } @@ -1477,13 +1477,15 @@ static int rtnl_fill_link_ifmap(struct sk_buff *skb,  static u32 rtnl_xdp_prog_skb(struct net_device *dev)  {  	const struct bpf_prog *generic_xdp_prog; +	u32 res = 0; -	ASSERT_RTNL(); +	rcu_read_lock(); +	generic_xdp_prog = rcu_dereference(dev->xdp_prog); +	if (generic_xdp_prog) +		res = generic_xdp_prog->aux->id; +	rcu_read_unlock(); -	generic_xdp_prog = rtnl_dereference(dev->xdp_prog); -	if (!generic_xdp_prog) -		return 0; -	return generic_xdp_prog->aux->id; +	return res;  }  static u32 rtnl_xdp_prog_drv(struct net_device *dev) @@ -1603,7 +1605,8 @@ static int put_master_ifindex(struct sk_buff *skb, struct net_device *dev)  	upper_dev = netdev_master_upper_dev_get_rcu(dev);  	if (upper_dev) -		ret = nla_put_u32(skb, IFLA_MASTER, upper_dev->ifindex); +		ret = nla_put_u32(skb, IFLA_MASTER, +				  READ_ONCE(upper_dev->ifindex));  	rcu_read_unlock();  	return ret; @@ -1736,10 +1739,10 @@ static int rtnl_fill_proto_down(struct sk_buff *skb,  	struct nlattr *pr;  	u32 preason; -	if (nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down)) +	if (nla_put_u8(skb, IFLA_PROTO_DOWN, READ_ONCE(dev->proto_down)))  		goto nla_put_failure; -	preason = dev->proto_down_reason; +	preason = READ_ONCE(dev->proto_down_reason);  	if (!preason)  		return 0; @@ -1812,6 +1815,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,  			    u32 event, int *new_nsid, int new_ifindex,  			    int tgt_netnsid, gfp_t gfp)  { +	char devname[IFNAMSIZ];  	struct ifinfomsg *ifm;  	struct nlmsghdr *nlh;  	struct Qdisc *qdisc; @@ -1824,41 +1828,51 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,  	ifm = nlmsg_data(nlh);  	ifm->ifi_family = AF_UNSPEC;  	ifm->__ifi_pad = 0; -	ifm->ifi_type = 
dev->type; -	ifm->ifi_index = dev->ifindex; +	ifm->ifi_type = READ_ONCE(dev->type); +	ifm->ifi_index = READ_ONCE(dev->ifindex);  	ifm->ifi_flags = dev_get_flags(dev);  	ifm->ifi_change = change;  	if (tgt_netnsid >= 0 && nla_put_s32(skb, IFLA_TARGET_NETNSID, tgt_netnsid))  		goto nla_put_failure; -	qdisc = rtnl_dereference(dev->qdisc); -	if (nla_put_string(skb, IFLA_IFNAME, dev->name) || -	    nla_put_u32(skb, IFLA_TXQLEN, dev->tx_queue_len) || +	netdev_copy_name(dev, devname); +	if (nla_put_string(skb, IFLA_IFNAME, devname)) +		goto nla_put_failure; + +	if (nla_put_u32(skb, IFLA_TXQLEN, READ_ONCE(dev->tx_queue_len)) ||  	    nla_put_u8(skb, IFLA_OPERSTATE, -		       netif_running(dev) ? dev->operstate : IF_OPER_DOWN) || -	    nla_put_u8(skb, IFLA_LINKMODE, dev->link_mode) || -	    nla_put_u32(skb, IFLA_MTU, dev->mtu) || -	    nla_put_u32(skb, IFLA_MIN_MTU, dev->min_mtu) || -	    nla_put_u32(skb, IFLA_MAX_MTU, dev->max_mtu) || -	    nla_put_u32(skb, IFLA_GROUP, dev->group) || -	    nla_put_u32(skb, IFLA_PROMISCUITY, dev->promiscuity) || -	    nla_put_u32(skb, IFLA_ALLMULTI, dev->allmulti) || -	    nla_put_u32(skb, IFLA_NUM_TX_QUEUES, dev->num_tx_queues) || -	    nla_put_u32(skb, IFLA_GSO_MAX_SEGS, dev->gso_max_segs) || -	    nla_put_u32(skb, IFLA_GSO_MAX_SIZE, dev->gso_max_size) || -	    nla_put_u32(skb, IFLA_GRO_MAX_SIZE, dev->gro_max_size) || -	    nla_put_u32(skb, IFLA_GSO_IPV4_MAX_SIZE, dev->gso_ipv4_max_size) || -	    nla_put_u32(skb, IFLA_GRO_IPV4_MAX_SIZE, dev->gro_ipv4_max_size) || -	    nla_put_u32(skb, IFLA_TSO_MAX_SIZE, dev->tso_max_size) || -	    nla_put_u32(skb, IFLA_TSO_MAX_SEGS, dev->tso_max_segs) || +		       netif_running(dev) ? 
READ_ONCE(dev->operstate) : +					    IF_OPER_DOWN) || +	    nla_put_u8(skb, IFLA_LINKMODE, READ_ONCE(dev->link_mode)) || +	    nla_put_u32(skb, IFLA_MTU, READ_ONCE(dev->mtu)) || +	    nla_put_u32(skb, IFLA_MIN_MTU, READ_ONCE(dev->min_mtu)) || +	    nla_put_u32(skb, IFLA_MAX_MTU, READ_ONCE(dev->max_mtu)) || +	    nla_put_u32(skb, IFLA_GROUP, READ_ONCE(dev->group)) || +	    nla_put_u32(skb, IFLA_PROMISCUITY, READ_ONCE(dev->promiscuity)) || +	    nla_put_u32(skb, IFLA_ALLMULTI, READ_ONCE(dev->allmulti)) || +	    nla_put_u32(skb, IFLA_NUM_TX_QUEUES, +			READ_ONCE(dev->num_tx_queues)) || +	    nla_put_u32(skb, IFLA_GSO_MAX_SEGS, +			READ_ONCE(dev->gso_max_segs)) || +	    nla_put_u32(skb, IFLA_GSO_MAX_SIZE, +			READ_ONCE(dev->gso_max_size)) || +	    nla_put_u32(skb, IFLA_GRO_MAX_SIZE, +			READ_ONCE(dev->gro_max_size)) || +	    nla_put_u32(skb, IFLA_GSO_IPV4_MAX_SIZE, +			READ_ONCE(dev->gso_ipv4_max_size)) || +	    nla_put_u32(skb, IFLA_GRO_IPV4_MAX_SIZE, +			READ_ONCE(dev->gro_ipv4_max_size)) || +	    nla_put_u32(skb, IFLA_TSO_MAX_SIZE, +			READ_ONCE(dev->tso_max_size)) || +	    nla_put_u32(skb, IFLA_TSO_MAX_SEGS, +			READ_ONCE(dev->tso_max_segs)) ||  #ifdef CONFIG_RPS -	    nla_put_u32(skb, IFLA_NUM_RX_QUEUES, dev->num_rx_queues) || +	    nla_put_u32(skb, IFLA_NUM_RX_QUEUES, +			READ_ONCE(dev->num_rx_queues)) ||  #endif  	    put_master_ifindex(skb, dev) ||  	    nla_put_u8(skb, IFLA_CARRIER, netif_carrier_ok(dev)) || -	    (qdisc && -	     nla_put_string(skb, IFLA_QDISC, qdisc->ops->id)) ||  	    nla_put_ifalias(skb, dev) ||  	    nla_put_u32(skb, IFLA_CARRIER_CHANGES,  			atomic_read(&dev->carrier_up_count) + @@ -1909,9 +1923,6 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,  			goto nla_put_failure;  	} -	if (rtnl_fill_link_netnsid(skb, dev, src_net, gfp)) -		goto nla_put_failure; -  	if (new_nsid &&  	    nla_put_s32(skb, IFLA_NEW_NETNSID, *new_nsid) < 0)  		goto nla_put_failure; @@ -1924,6 +1935,11 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,  		goto 
nla_put_failure;  	rcu_read_lock(); +	if (rtnl_fill_link_netnsid(skb, dev, src_net, GFP_ATOMIC)) +		goto nla_put_failure_rcu; +	qdisc = rcu_dereference(dev->qdisc); +	if (qdisc && nla_put_string(skb, IFLA_QDISC, qdisc->ops->id)) +		goto nla_put_failure_rcu;  	if (rtnl_fill_link_af(skb, dev, ext_filter_mask))  		goto nla_put_failure_rcu;  	if (rtnl_fill_link_ifmap(skb, dev)) @@ -3272,7 +3288,7 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh,  	if (ifm->ifi_index > 0)  		dev = __dev_get_by_index(tgt_net, ifm->ifi_index);  	else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) -		dev = rtnl_dev_get(net, tb); +		dev = rtnl_dev_get(tgt_net, tb);  	else if (tb[IFLA_GROUP])  		err = rtnl_group_dellink(tgt_net, nla_get_u32(tb[IFLA_GROUP]));  	else @@ -3953,22 +3969,28 @@ static int rtnl_dellinkprop(struct sk_buff *skb, struct nlmsghdr *nlh,  	return rtnl_linkprop(RTM_DELLINKPROP, skb, nlh, extack);  } -static u32 rtnl_calcit(struct sk_buff *skb, struct nlmsghdr *nlh) +static noinline_for_stack u32 rtnl_calcit(struct sk_buff *skb, +					  struct nlmsghdr *nlh)  {  	struct net *net = sock_net(skb->sk);  	size_t min_ifinfo_dump_size = 0; -	struct nlattr *tb[IFLA_MAX+1];  	u32 ext_filter_mask = 0;  	struct net_device *dev; -	int hdrlen; +	struct nlattr *nla; +	int hdrlen, rem;  	/* Same kernel<->userspace interface hack as in rtnl_dump_ifinfo. */  	hdrlen = nlmsg_len(nlh) < sizeof(struct ifinfomsg) ?  		 
sizeof(struct rtgenmsg) : sizeof(struct ifinfomsg); -	if (nlmsg_parse_deprecated(nlh, hdrlen, tb, IFLA_MAX, ifla_policy, NULL) >= 0) { -		if (tb[IFLA_EXT_MASK]) -			ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]); +	if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen)) +		return NLMSG_GOODSIZE; + +	nla_for_each_attr_type(nla, IFLA_EXT_MASK, +			       nlmsg_attrdata(nlh, hdrlen), +			       nlmsg_attrlen(nlh, hdrlen), rem) { +		if (nla_len(nla) == sizeof(u32)) +			ext_filter_mask = nla_get_u32(nla);  	}  	if (!ext_filter_mask) @@ -5245,15 +5267,14 @@ static int rtnl_bridge_dellink(struct sk_buff *skb, struct nlmsghdr *nlh,  	br_spec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC);  	if (br_spec) { -		nla_for_each_nested(attr, br_spec, rem) { -			if (nla_type(attr) == IFLA_BRIDGE_FLAGS) { -				if (nla_len(attr) < sizeof(flags)) -					return -EINVAL; +		nla_for_each_nested_type(attr, IFLA_BRIDGE_FLAGS, br_spec, +					 rem) { +			if (nla_len(attr) < sizeof(flags)) +				return -EINVAL; -				have_flags = true; -				flags = nla_get_u16(attr); -				break; -			} +			have_flags = true; +			flags = nla_get_u16(attr); +			break;  		}  	} @@ -5962,19 +5983,17 @@ static int rtnl_stats_get(struct sk_buff *skb, struct nlmsghdr *nlh,  static int rtnl_stats_dump(struct sk_buff *skb, struct netlink_callback *cb)  {  	struct netlink_ext_ack *extack = cb->extack; -	int h, s_h, err, s_idx, s_idxattr, s_prividx;  	struct rtnl_stats_dump_filters filters;  	struct net *net = sock_net(skb->sk);  	unsigned int flags = NLM_F_MULTI;  	struct if_stats_msg *ifsm; -	struct hlist_head *head; +	struct { +		unsigned long ifindex; +		int idxattr; +		int prividx; +	} *ctx = (void *)cb->ctx;  	struct net_device *dev; -	int idx = 0; - -	s_h = cb->args[0]; -	s_idx = cb->args[1]; -	s_idxattr = cb->args[2]; -	s_prividx = cb->args[3]; +	int err;  	cb->seq = net->dev_base_seq; @@ -5993,39 +6012,26 @@ static int rtnl_stats_dump(struct sk_buff *skb, struct netlink_callback *cb)  	if (err)  		return 
err; -	for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { -		idx = 0; -		head = &net->dev_index_head[h]; -		hlist_for_each_entry(dev, head, index_hlist) { -			if (idx < s_idx) -				goto cont; -			err = rtnl_fill_statsinfo(skb, dev, RTM_NEWSTATS, -						  NETLINK_CB(cb->skb).portid, -						  cb->nlh->nlmsg_seq, 0, -						  flags, &filters, -						  &s_idxattr, &s_prividx, -						  extack); -			/* If we ran out of room on the first message, -			 * we're in trouble -			 */ -			WARN_ON((err == -EMSGSIZE) && (skb->len == 0)); +	for_each_netdev_dump(net, dev, ctx->ifindex) { +		err = rtnl_fill_statsinfo(skb, dev, RTM_NEWSTATS, +					  NETLINK_CB(cb->skb).portid, +					  cb->nlh->nlmsg_seq, 0, +					  flags, &filters, +					  &ctx->idxattr, &ctx->prividx, +					  extack); +		/* If we ran out of room on the first message, +		 * we're in trouble. +		 */ +		WARN_ON((err == -EMSGSIZE) && (skb->len == 0)); -			if (err < 0) -				goto out; -			s_prividx = 0; -			s_idxattr = 0; -			nl_dump_check_consistent(cb, nlmsg_hdr(skb)); -cont: -			idx++; -		} +		if (err < 0) +			break; +		ctx->prividx = 0; +		ctx->idxattr = 0; +		nl_dump_check_consistent(cb, nlmsg_hdr(skb));  	} -out: -	cb->args[3] = s_prividx; -	cb->args[2] = s_idxattr; -	cb->args[1] = idx; -	cb->args[0] = h; -	return skb->len; +	return err;  }  void rtnl_offload_xstats_notify(struct net_device *dev) @@ -6484,6 +6490,52 @@ static int rtnl_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,  /* Process one rtnetlink message. */ +static int rtnl_dumpit(struct sk_buff *skb, struct netlink_callback *cb) +{ +	const bool needs_lock = !(cb->flags & RTNL_FLAG_DUMP_UNLOCKED); +	rtnl_dumpit_func dumpit = cb->data; +	int err; + +	/* Previous iteration have already finished, avoid calling->dumpit() +	 * again, it may not expect to be called after it reached the end. 
+	 */ +	if (!dumpit) +		return 0; + +	if (needs_lock) +		rtnl_lock(); +	err = dumpit(skb, cb); +	if (needs_lock) +		rtnl_unlock(); + +	/* Old dump handlers used to send NLM_DONE as in a separate recvmsg(). +	 * Some applications which parse netlink manually depend on this. +	 */ +	if (cb->flags & RTNL_FLAG_DUMP_SPLIT_NLM_DONE) { +		if (err < 0 && err != -EMSGSIZE) +			return err; +		if (!err) +			cb->data = NULL; + +		return skb->len; +	} +	return err; +} + +static int rtnetlink_dump_start(struct sock *ssk, struct sk_buff *skb, +				const struct nlmsghdr *nlh, +				struct netlink_dump_control *control) +{ +	if (control->flags & RTNL_FLAG_DUMP_SPLIT_NLM_DONE || +	    !(control->flags & RTNL_FLAG_DUMP_UNLOCKED)) { +		WARN_ON(control->data); +		control->data = control->dump; +		control->dump = rtnl_dumpit; +	} + +	return netlink_dump_start(ssk, skb, nlh, control); +} +  static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,  			     struct netlink_ext_ack *extack)  { @@ -6548,7 +6600,7 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,  				.module		= owner,  				.flags		= flags,  			}; -			err = netlink_dump_start(rtnl, skb, nlh, &c); +			err = rtnetlink_dump_start(rtnl, skb, nlh, &c);  			/* netlink_dump_start() will keep a reference on  			 * module if dump is still in progress.  			 
*/ @@ -6663,7 +6715,6 @@ static int __net_init rtnetlink_net_init(struct net *net)  	struct netlink_kernel_cfg cfg = {  		.groups		= RTNLGRP_MAX,  		.input		= rtnetlink_rcv, -		.cb_mutex	= &rtnl_mutex,  		.flags		= NL_CFG_F_NONROOT_RECV,  		.bind		= rtnetlink_bind,  	}; @@ -6694,7 +6745,7 @@ void __init rtnetlink_init(void)  	register_netdevice_notifier(&rtnetlink_dev_notifier);  	rtnl_register(PF_UNSPEC, RTM_GETLINK, rtnl_getlink, -		      rtnl_dump_ifinfo, 0); +		      rtnl_dump_ifinfo, RTNL_FLAG_DUMP_SPLIT_NLM_DONE);  	rtnl_register(PF_UNSPEC, RTM_SETLINK, rtnl_setlink, NULL, 0);  	rtnl_register(PF_UNSPEC, RTM_NEWLINK, rtnl_newlink, NULL, 0);  	rtnl_register(PF_UNSPEC, RTM_DELLINK, rtnl_dellink, NULL, 0); diff --git a/net/core/scm.c b/net/core/scm.c index 9cd4b0a01cd6..4f6a14babe5a 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -89,6 +89,12 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)  		fpl->count_unix = 0;  		fpl->max = SCM_MAX_FD;  		fpl->user = NULL; +#if IS_ENABLED(CONFIG_UNIX) +		fpl->inflight = false; +		fpl->dead = false; +		fpl->edges = NULL; +		INIT_LIST_HEAD(&fpl->vertices); +#endif  	}  	fpp = &fpl->fp[fpl->count]; @@ -376,8 +382,14 @@ struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl)  	if (new_fpl) {  		for (i = 0; i < fpl->count; i++)  			get_file(fpl->fp[i]); +  		new_fpl->max = new_fpl->count;  		new_fpl->user = get_uid(fpl->user); +#if IS_ENABLED(CONFIG_UNIX) +		new_fpl->inflight = false; +		new_fpl->edges = NULL; +		INIT_LIST_HEAD(&new_fpl->vertices); +#endif  	}  	return new_fpl;  } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 4096e679f61c..83f8cd8aa2d1 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -51,6 +51,7 @@  #endif  #include <linux/string.h>  #include <linux/skbuff.h> +#include <linux/skbuff_ref.h>  #include <linux/splice.h>  #include <linux/cache.h>  #include <linux/rtnetlink.h> @@ -108,9 +109,6 @@ static struct kmem_cache *skbuff_ext_cache __ro_after_init;  #define 
SKB_SMALL_HEAD_HEADROOM						\  	SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE) -int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS; -EXPORT_SYMBOL(sysctl_max_skb_frags); -  /* kcm_write_msgs() relies on casting paged frags to bio_vec to use   * iov_iter_bvec(). These static asserts ensure the cast is valid is long as the   * netmem is a page. @@ -279,6 +277,7 @@ static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp_mask)  #endif  struct napi_alloc_cache { +	local_lock_t bh_lock;  	struct page_frag_cache page;  	struct page_frag_1k page_small;  	unsigned int skb_count; @@ -286,7 +285,9 @@ struct napi_alloc_cache {  };  static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache); -static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache); +static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache) = { +	.bh_lock = INIT_LOCAL_LOCK(bh_lock), +};  /* Double check that napi_get_frags() allocates skbs with   * skb->head being backed by slab, not a page fragment. 
@@ -308,11 +309,16 @@ void napi_get_frags_check(struct napi_struct *napi)  void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)  {  	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); +	void *data;  	fragsz = SKB_DATA_ALIGN(fragsz); -	return __page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, +	local_lock_nested_bh(&napi_alloc_cache.bh_lock); +	data = __page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC,  				       align_mask); +	local_unlock_nested_bh(&napi_alloc_cache.bh_lock); +	return data; +  }  EXPORT_SYMBOL(__napi_alloc_frag_align); @@ -320,19 +326,15 @@ void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)  {  	void *data; -	fragsz = SKB_DATA_ALIGN(fragsz);  	if (in_hardirq() || irqs_disabled()) {  		struct page_frag_cache *nc = this_cpu_ptr(&netdev_alloc_cache); +		fragsz = SKB_DATA_ALIGN(fragsz);  		data = __page_frag_alloc_align(nc, fragsz, GFP_ATOMIC,  					       align_mask);  	} else { -		struct napi_alloc_cache *nc; -  		local_bh_disable(); -		nc = this_cpu_ptr(&napi_alloc_cache); -		data = __page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, -					       align_mask); +		data = __napi_alloc_frag_align(fragsz, align_mask);  		local_bh_enable();  	}  	return data; @@ -344,16 +346,20 @@ static struct sk_buff *napi_skb_cache_get(void)  	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);  	struct sk_buff *skb; +	local_lock_nested_bh(&napi_alloc_cache.bh_lock);  	if (unlikely(!nc->skb_count)) {  		nc->skb_count = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache,  						      GFP_ATOMIC,  						      NAPI_SKB_CACHE_BULK,  						      nc->skb_cache); -		if (unlikely(!nc->skb_count)) +		if (unlikely(!nc->skb_count)) { +			local_unlock_nested_bh(&napi_alloc_cache.bh_lock);  			return NULL; +		}  	}  	skb = nc->skb_cache[--nc->skb_count]; +	local_unlock_nested_bh(&napi_alloc_cache.bh_lock);  	kasan_mempool_unpoison_object(skb, kmem_cache_size(net_hotdata.skbuff_cache));  	return skb; @@ 
-746,9 +752,13 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,  		pfmemalloc = nc->pfmemalloc;  	} else {  		local_bh_disable(); +		local_lock_nested_bh(&napi_alloc_cache.bh_lock); +  		nc = this_cpu_ptr(&napi_alloc_cache.page);  		data = page_frag_alloc(nc, len, gfp_mask);  		pfmemalloc = nc->pfmemalloc; + +		local_unlock_nested_bh(&napi_alloc_cache.bh_lock);  		local_bh_enable();  	} @@ -775,10 +785,9 @@ skb_fail:  EXPORT_SYMBOL(__netdev_alloc_skb);  /** - *	__napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance + *	napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance   *	@napi: napi instance this buffer was allocated for   *	@len: length to allocate - *	@gfp_mask: get_free_pages mask, passed to alloc_skb and alloc_pages   *   *	Allocate a new sk_buff for use in NAPI receive.  This buffer will   *	attempt to allocate the head from a special reserved region used @@ -787,9 +796,9 @@ EXPORT_SYMBOL(__netdev_alloc_skb);   *   *	%NULL is returned if there is no free memory.   
*/ -struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, -				 gfp_t gfp_mask) +struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len)  { +	gfp_t gfp_mask = GFP_ATOMIC | __GFP_NOWARN;  	struct napi_alloc_cache *nc;  	struct sk_buff *skb;  	bool pfmemalloc; @@ -813,11 +822,11 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,  		goto skb_success;  	} -	nc = this_cpu_ptr(&napi_alloc_cache); -  	if (sk_memalloc_socks())  		gfp_mask |= __GFP_MEMALLOC; +	local_lock_nested_bh(&napi_alloc_cache.bh_lock); +	nc = this_cpu_ptr(&napi_alloc_cache);  	if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) {  		/* we are artificially inflating the allocation size, but  		 * that is not as bad as it may look like, as: @@ -839,6 +848,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,  		data = page_frag_alloc(&nc->page, len, gfp_mask);  		pfmemalloc = nc->page.pfmemalloc;  	} +	local_unlock_nested_bh(&napi_alloc_cache.bh_lock);  	if (unlikely(!data))  		return NULL; @@ -860,7 +870,7 @@ skb_success:  skb_fail:  	return skb;  } -EXPORT_SYMBOL(__napi_alloc_skb); +EXPORT_SYMBOL(napi_alloc_skb);  void skb_add_rx_frag_netmem(struct sk_buff *skb, int i, netmem_ref netmem,  			    int off, int size, unsigned int truesize) @@ -1005,10 +1015,9 @@ int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb,  EXPORT_SYMBOL(skb_cow_data_for_xdp);  #if IS_ENABLED(CONFIG_PAGE_POOL) -bool napi_pp_put_page(struct page *page, bool napi_safe) +bool napi_pp_put_page(netmem_ref netmem)  { -	bool allow_direct = false; -	struct page_pool *pp; +	struct page *page = netmem_to_page(netmem);  	page = compound_head(page); @@ -1022,39 +1031,18 @@ bool napi_pp_put_page(struct page *page, bool napi_safe)  	if (unlikely(!is_pp_page(page)))  		return false; -	pp = page->pp; - -	/* Allow direct recycle if we have reasons to believe that we are -	 * in the same context as the consumer would 
run, so there's -	 * no possible race. -	 * __page_pool_put_page() makes sure we're not in hardirq context -	 * and interrupts are enabled prior to accessing the cache. -	 */ -	if (napi_safe || in_softirq()) { -		const struct napi_struct *napi = READ_ONCE(pp->p.napi); -		unsigned int cpuid = smp_processor_id(); - -		allow_direct = napi && READ_ONCE(napi->list_owner) == cpuid; -		allow_direct |= READ_ONCE(pp->cpuid) == cpuid; -	} - -	/* Driver set this to memory recycling info. Reset it on recycle. -	 * This will *not* work for NIC using a split-page memory model. -	 * The page will be returned to the pool here regardless of the -	 * 'flipped' fragment being in use or not. -	 */ -	page_pool_put_full_page(pp, page, allow_direct); +	page_pool_put_full_netmem(page->pp, page_to_netmem(page), false);  	return true;  }  EXPORT_SYMBOL(napi_pp_put_page);  #endif -static bool skb_pp_recycle(struct sk_buff *skb, void *data, bool napi_safe) +static bool skb_pp_recycle(struct sk_buff *skb, void *data)  {  	if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle)  		return false; -	return napi_pp_put_page(virt_to_page(data), napi_safe); +	return napi_pp_put_page(page_to_netmem(virt_to_page(data)));  }  /** @@ -1096,12 +1084,12 @@ static void skb_kfree_head(void *head, unsigned int end_offset)  		kfree(head);  } -static void skb_free_head(struct sk_buff *skb, bool napi_safe) +static void skb_free_head(struct sk_buff *skb)  {  	unsigned char *head = skb->head;  	if (skb->head_frag) { -		if (skb_pp_recycle(skb, head, napi_safe)) +		if (skb_pp_recycle(skb, head))  			return;  		skb_free_frag(head);  	} else { @@ -1109,8 +1097,7 @@ static void skb_free_head(struct sk_buff *skb, bool napi_safe)  	}  } -static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason, -			     bool napi_safe) +static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason)  {  	struct skb_shared_info *shinfo = skb_shinfo(skb);  	int i; @@ -1127,13 +1114,13 @@ static void 
skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason,  	}  	for (i = 0; i < shinfo->nr_frags; i++) -		napi_frag_unref(&shinfo->frags[i], skb->pp_recycle, napi_safe); +		__skb_frag_unref(&shinfo->frags[i], skb->pp_recycle);  free_head:  	if (shinfo->frag_list)  		kfree_skb_list_reason(shinfo->frag_list, reason); -	skb_free_head(skb, napi_safe); +	skb_free_head(skb);  exit:  	/* When we clone an SKB we copy the reycling bit. The pp_recycle  	 * bit is only set on the head though, so in order to avoid races @@ -1194,12 +1181,11 @@ void skb_release_head_state(struct sk_buff *skb)  }  /* Free everything but the sk_buff shell. */ -static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason, -			    bool napi_safe) +static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason)  {  	skb_release_head_state(skb);  	if (likely(skb->head)) -		skb_release_data(skb, reason, napi_safe); +		skb_release_data(skb, reason);  }  /** @@ -1213,13 +1199,14 @@ static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason,  void __kfree_skb(struct sk_buff *skb)  { -	skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED, false); +	skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED);  	kfree_skbmem(skb);  }  EXPORT_SYMBOL(__kfree_skb);  static __always_inline -bool __kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason) +bool __sk_skb_reason_drop(struct sock *sk, struct sk_buff *skb, +			  enum skb_drop_reason reason)  {  	if (unlikely(!skb_unref(skb)))  		return false; @@ -1232,26 +1219,27 @@ bool __kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason)  	if (reason == SKB_CONSUMED)  		trace_consume_skb(skb, __builtin_return_address(0));  	else -		trace_kfree_skb(skb, __builtin_return_address(0), reason); +		trace_kfree_skb(skb, __builtin_return_address(0), reason, sk);  	return true;  }  /** - *	kfree_skb_reason - free an sk_buff with special reason + *	sk_skb_reason_drop - free an sk_buff with special 
reason + *	@sk: the socket to receive @skb, or NULL if not applicable   *	@skb: buffer to free   *	@reason: reason why this skb is dropped   * - *	Drop a reference to the buffer and free it if the usage count has - *	hit zero. Meanwhile, pass the drop reason to 'kfree_skb' - *	tracepoint. + *	Drop a reference to the buffer and free it if the usage count has hit + *	zero. Meanwhile, pass the receiving socket and drop reason to + *	'kfree_skb' tracepoint.   */  void __fix_address -kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason) +sk_skb_reason_drop(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason reason)  { -	if (__kfree_skb_reason(skb, reason)) +	if (__sk_skb_reason_drop(sk, skb, reason))  		__kfree_skb(skb);  } -EXPORT_SYMBOL(kfree_skb_reason); +EXPORT_SYMBOL(sk_skb_reason_drop);  #define KFREE_SKB_BULK_SIZE	16 @@ -1270,7 +1258,7 @@ static void kfree_skb_add_bulk(struct sk_buff *skb,  		return;  	} -	skb_release_all(skb, reason, false); +	skb_release_all(skb, reason);  	sa->skb_array[sa->skb_count++] = skb;  	if (unlikely(sa->skb_count == KFREE_SKB_BULK_SIZE)) { @@ -1290,7 +1278,7 @@ kfree_skb_list_reason(struct sk_buff *segs, enum skb_drop_reason reason)  	while (segs) {  		struct sk_buff *next = segs->next; -		if (__kfree_skb_reason(segs, reason)) { +		if (__sk_skb_reason_drop(NULL, segs, reason)) {  			skb_poison_list(segs);  			kfree_skb_add_bulk(segs, &sa, reason);  		} @@ -1331,22 +1319,28 @@ void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt)  	has_trans = skb_transport_header_was_set(skb);  	printk("%sskb len=%u headroom=%u headlen=%u tailroom=%u\n" -	       "mac=(%d,%d) net=(%d,%d) trans=%d\n" +	       "mac=(%d,%d) mac_len=%u net=(%d,%d) trans=%d\n"  	       "shinfo(txflags=%u nr_frags=%u gso(size=%hu type=%u segs=%hu))\n" -	       "csum(0x%x ip_summed=%u complete_sw=%u valid=%u level=%u)\n" -	       "hash(0x%x sw=%u l4=%u) proto=0x%04x pkttype=%u iif=%d\n", +	       "csum(0x%x start=%u offset=%u 
ip_summed=%u complete_sw=%u valid=%u level=%u)\n" +	       "hash(0x%x sw=%u l4=%u) proto=0x%04x pkttype=%u iif=%d\n" +	       "priority=0x%x mark=0x%x alloc_cpu=%u vlan_all=0x%x\n" +	       "encapsulation=%d inner(proto=0x%04x, mac=%u, net=%u, trans=%u)\n",  	       level, skb->len, headroom, skb_headlen(skb), tailroom,  	       has_mac ? skb->mac_header : -1,  	       has_mac ? skb_mac_header_len(skb) : -1, +	       skb->mac_len,  	       skb->network_header,  	       has_trans ? skb_network_header_len(skb) : -1,  	       has_trans ? skb->transport_header : -1,  	       sh->tx_flags, sh->nr_frags,  	       sh->gso_size, sh->gso_type, sh->gso_segs, -	       skb->csum, skb->ip_summed, skb->csum_complete_sw, -	       skb->csum_valid, skb->csum_level, +	       skb->csum, skb->csum_start, skb->csum_offset, skb->ip_summed, +	       skb->csum_complete_sw, skb->csum_valid, skb->csum_level,  	       skb->hash, skb->sw_hash, skb->l4_hash, -	       ntohs(skb->protocol), skb->pkt_type, skb->skb_iif); +	       ntohs(skb->protocol), skb->pkt_type, skb->skb_iif, +	       skb->priority, skb->mark, skb->alloc_cpu, skb->vlan_all, +	       skb->encapsulation, skb->inner_protocol, skb->inner_mac_header, +	       skb->inner_network_header, skb->inner_transport_header);  	if (dev)  		printk("%sdev name=%s feat=%pNF\n", @@ -1444,7 +1438,7 @@ EXPORT_SYMBOL(consume_skb);  void __consume_stateless_skb(struct sk_buff *skb)  {  	trace_consume_skb(skb, __builtin_return_address(0)); -	skb_release_data(skb, SKB_CONSUMED, false); +	skb_release_data(skb, SKB_CONSUMED);  	kfree_skbmem(skb);  } @@ -1456,6 +1450,7 @@ static void napi_skb_cache_put(struct sk_buff *skb)  	if (!kasan_mempool_poison_object(skb))  		return; +	local_lock_nested_bh(&napi_alloc_cache.bh_lock);  	nc->skb_cache[nc->skb_count++] = skb;  	if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) { @@ -1467,11 +1462,12 @@ static void napi_skb_cache_put(struct sk_buff *skb)  				     nc->skb_cache + NAPI_SKB_CACHE_HALF);  		
nc->skb_count = NAPI_SKB_CACHE_HALF;  	} +	local_unlock_nested_bh(&napi_alloc_cache.bh_lock);  }  void __napi_kfree_skb(struct sk_buff *skb, enum skb_drop_reason reason)  { -	skb_release_all(skb, reason, true); +	skb_release_all(skb, reason);  	napi_skb_cache_put(skb);  } @@ -1509,7 +1505,7 @@ void napi_consume_skb(struct sk_buff *skb, int budget)  		return;  	} -	skb_release_all(skb, SKB_CONSUMED, !!budget); +	skb_release_all(skb, SKB_CONSUMED);  	napi_skb_cache_put(skb);  }  EXPORT_SYMBOL(napi_consume_skb); @@ -1640,7 +1636,7 @@ EXPORT_SYMBOL_GPL(alloc_skb_for_msg);   */  struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)  { -	skb_release_all(dst, SKB_CONSUMED, false); +	skb_release_all(dst, SKB_CONSUMED);  	return __skb_clone(dst, src);  }  EXPORT_SYMBOL_GPL(skb_morph); @@ -1708,7 +1704,7 @@ static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size)  		return NULL;  	} -	uarg->ubuf.callback = msg_zerocopy_callback; +	uarg->ubuf.ops = &msg_zerocopy_ubuf_ops;  	uarg->id = ((u32)atomic_inc_return(&sk->sk_zckey)) - 1;  	uarg->len = 1;  	uarg->bytelen = size; @@ -1734,7 +1730,7 @@ struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,  		u32 bytelen, next;  		/* there might be non MSG_ZEROCOPY users */ -		if (uarg->callback != msg_zerocopy_callback) +		if (uarg->ops != &msg_zerocopy_ubuf_ops)  			return NULL;  		/* realloc only when socket is locked (TCP, UDP cork), @@ -1845,8 +1841,8 @@ release:  	sock_put(sk);  } -void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg, -			   bool success) +static void msg_zerocopy_complete(struct sk_buff *skb, struct ubuf_info *uarg, +				  bool success)  {  	struct ubuf_info_msgzc *uarg_zc = uarg_to_msgzc(uarg); @@ -1855,7 +1851,6 @@ void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg,  	if (refcount_dec_and_test(&uarg->refcnt))  		__msg_zerocopy_callback(uarg_zc);  } -EXPORT_SYMBOL_GPL(msg_zerocopy_callback);  void msg_zerocopy_put_abort(struct 
ubuf_info *uarg, bool have_uref)  { @@ -1865,22 +1860,35 @@ void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref)  	uarg_to_msgzc(uarg)->len--;  	if (have_uref) -		msg_zerocopy_callback(NULL, uarg, true); +		msg_zerocopy_complete(NULL, uarg, true);  }  EXPORT_SYMBOL_GPL(msg_zerocopy_put_abort); +const struct ubuf_info_ops msg_zerocopy_ubuf_ops = { +	.complete = msg_zerocopy_complete, +}; +EXPORT_SYMBOL_GPL(msg_zerocopy_ubuf_ops); +  int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,  			     struct msghdr *msg, int len,  			     struct ubuf_info *uarg)  { -	struct ubuf_info *orig_uarg = skb_zcopy(skb);  	int err, orig_len = skb->len; -	/* An skb can only point to one uarg. This edge case happens when -	 * TCP appends to an skb, but zerocopy_realloc triggered a new alloc. -	 */ -	if (orig_uarg && uarg != orig_uarg) -		return -EEXIST; +	if (uarg->ops->link_skb) { +		err = uarg->ops->link_skb(skb, uarg); +		if (err) +			return err; +	} else { +		struct ubuf_info *orig_uarg = skb_zcopy(skb); + +		/* An skb can only point to one uarg. This edge case happens +		 * when TCP appends to an skb, but zerocopy_realloc triggered +		 * a new alloc. 
+		 */ +		if (orig_uarg && uarg != orig_uarg) +			return -EEXIST; +	}  	err = __zerocopy_sg_from_iter(msg, sk, skb, &msg->msg_iter, len);  	if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) { @@ -2278,9 +2286,9 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,  		if (skb_has_frag_list(skb))  			skb_clone_fraglist(skb); -		skb_release_data(skb, SKB_CONSUMED, false); +		skb_release_data(skb, SKB_CONSUMED);  	} else { -		skb_free_head(skb, false); +		skb_free_head(skb);  	}  	off = (data + nhead) - skb->head; @@ -4150,6 +4158,9 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)  	if (skb_zcopy(tgt) || skb_zcopy(skb))  		return 0; +	DEBUG_NET_WARN_ON_ONCE(tgt->pp_recycle != skb->pp_recycle); +	DEBUG_NET_WARN_ON_ONCE(skb_cmp_decrypted(tgt, skb)); +  	todo = shiftlen;  	from = 0;  	to = skb_shinfo(tgt)->nr_frags; @@ -6586,12 +6597,12 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off,  			skb_frag_ref(skb, i);  		if (skb_has_frag_list(skb))  			skb_clone_fraglist(skb); -		skb_release_data(skb, SKB_CONSUMED, false); +		skb_release_data(skb, SKB_CONSUMED);  	} else {  		/* we can reuse existing recount- all we did was  		 * relocate values  		 */ -		skb_free_head(skb, false); +		skb_free_head(skb);  	}  	skb->head = data; @@ -6726,7 +6737,7 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,  		skb_kfree_head(data, size);  		return -ENOMEM;  	} -	skb_release_data(skb, SKB_CONSUMED, false); +	skb_release_data(skb, SKB_CONSUMED);  	skb->head = data;  	skb->head_frag = 0; @@ -7006,6 +7017,19 @@ free_now:  EXPORT_SYMBOL(__skb_ext_put);  #endif /* CONFIG_SKB_EXTENSIONS */ +static void kfree_skb_napi_cache(struct sk_buff *skb) +{ +	/* if SKB is a clone, don't handle this case */ +	if (skb->fclone != SKB_FCLONE_UNAVAILABLE) { +		__kfree_skb(skb); +		return; +	} + +	local_bh_disable(); +	__napi_kfree_skb(skb, SKB_CONSUMED); +	local_bh_enable(); +} +  /**   * 
skb_attempt_defer_free - queue skb for remote freeing   * @skb: buffer @@ -7021,10 +7045,10 @@ void skb_attempt_defer_free(struct sk_buff *skb)  	unsigned int defer_max;  	bool kick; -	if (WARN_ON_ONCE(cpu >= nr_cpu_ids) || -	    !cpu_online(cpu) || -	    cpu == raw_smp_processor_id()) { -nodefer:	__kfree_skb(skb); +	if (cpu == raw_smp_processor_id() || +	    WARN_ON_ONCE(cpu >= nr_cpu_ids) || +	    !cpu_online(cpu)) { +nodefer:	kfree_skb_napi_cache(skb);  		return;  	} @@ -7032,7 +7056,7 @@ nodefer:	__kfree_skb(skb);  	DEBUG_NET_WARN_ON_ONCE(skb->destructor);  	sd = &per_cpu(softnet_data, cpu); -	defer_max = READ_ONCE(sysctl_skb_defer_max); +	defer_max = READ_ONCE(net_hotdata.sysctl_skb_defer_max);  	if (READ_ONCE(sd->defer_count) >= defer_max)  		goto nodefer; @@ -7050,8 +7074,8 @@ nodefer:	__kfree_skb(skb);  	/* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU  	 * if we are unlucky enough (this seems very unlikely).  	 */ -	if (unlikely(kick) && !cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) -		smp_call_function_single_async(cpu, &sd->defer_csd); +	if (unlikely(kick)) +		kick_defer_list_purge(sd, cpu);  }  static void skb_splice_csum_page(struct sk_buff *skb, struct page *page, @@ -7084,7 +7108,7 @@ static void skb_splice_csum_page(struct sk_buff *skb, struct page *page,  ssize_t skb_splice_from_iter(struct sk_buff *skb, struct iov_iter *iter,  			     ssize_t maxsize, gfp_t gfp)  { -	size_t frag_limit = READ_ONCE(sysctl_max_skb_frags); +	size_t frag_limit = READ_ONCE(net_hotdata.sysctl_max_skb_frags);  	struct page *pages[8], **ppages = pages;  	ssize_t spliced = 0, ret = 0;  	unsigned int i; diff --git a/net/core/skmsg.c b/net/core/skmsg.c index fd20aae30be2..bbf40b999713 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -434,7 +434,8 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,  			page = sg_page(sge);  			if (copied + copy > len)  				copy = len - copied; -			copy = copy_page_to_iter(page, sge->offset, copy, 
iter); +			if (copy) +				copy = copy_page_to_iter(page, sge->offset, copy, iter);  			if (!copy) {  				copied = copied ? copied : -EFAULT;  				goto out; diff --git a/net/core/sock.c b/net/core/sock.c index 0963689a5950..9abc4fe25953 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -127,6 +127,7 @@  #include <net/net_namespace.h>  #include <net/request_sock.h>  #include <net/sock.h> +#include <net/proto_memory.h>  #include <linux/net_tstamp.h>  #include <net/xfrm.h>  #include <linux/ipsec.h> @@ -283,7 +284,6 @@ __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;  EXPORT_SYMBOL(sysctl_rmem_max);  __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;  __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX; -int sysctl_mem_pcpu_rsv __read_mostly = SK_MEMORY_PCPU_RESERVE;  int sysctl_tstamp_allow_data __read_mostly = 1; @@ -1083,6 +1083,17 @@ bool sockopt_capable(int cap)  }  EXPORT_SYMBOL(sockopt_capable); +static int sockopt_validate_clockid(__kernel_clockid_t value) +{ +	switch (value) { +	case CLOCK_REALTIME: +	case CLOCK_MONOTONIC: +	case CLOCK_TAI: +		return 0; +	} +	return -EINVAL; +} +  /*   *	This is meant for all protocols to use and covers goings on   *	at the socket level. Everything here is generic. 
@@ -1497,6 +1508,11 @@ set_sndbuf:  			ret = -EPERM;  			break;  		} + +		ret = sockopt_validate_clockid(sk_txtime.clockid); +		if (ret) +			break; +  		sock_valbool_flag(sk, SOCK_TXTIME, true);  		sk->sk_clockid = sk_txtime.clockid;  		sk->sk_txtime_deadline_mode = @@ -2262,7 +2278,12 @@ static void sk_init_common(struct sock *sk)  	lockdep_set_class_and_name(&sk->sk_error_queue.lock,  			af_elock_keys + sk->sk_family,  			af_family_elock_key_strings[sk->sk_family]); -	lockdep_set_class_and_name(&sk->sk_callback_lock, +	if (sk->sk_kern_sock) +		lockdep_set_class_and_name(&sk->sk_callback_lock, +			af_kern_callback_keys + sk->sk_family, +			af_family_kern_clock_key_strings[sk->sk_family]); +	else +		lockdep_set_class_and_name(&sk->sk_callback_lock,  			af_callback_keys + sk->sk_family,  			af_family_clock_key_strings[sk->sk_family]);  } @@ -2526,13 +2547,12 @@ EXPORT_SYMBOL(skb_set_owner_w);  static bool can_skb_orphan_partial(const struct sk_buff *skb)  { -#ifdef CONFIG_TLS_DEVICE  	/* Drivers depend on in-order delivery for crypto offload,  	 * partial orphan breaks out-of-order-OK logic.  	 
*/ -	if (skb->decrypted) +	if (skb_is_decrypted(skb))  		return false; -#endif +  	return (skb->destructor == sock_wfree ||  		(IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));  } @@ -3242,8 +3262,8 @@ int sock_no_socketpair(struct socket *sock1, struct socket *sock2)  }  EXPORT_SYMBOL(sock_no_socketpair); -int sock_no_accept(struct socket *sock, struct socket *newsock, int flags, -		   bool kern) +int sock_no_accept(struct socket *sock, struct socket *newsock, +		   struct proto_accept_arg *arg)  {  	return -EOPNOTSUPP;  } @@ -3338,7 +3358,7 @@ static void sock_def_error_report(struct sock *sk)  	wq = rcu_dereference(sk->sk_wq);  	if (skwq_has_sleeper(wq))  		wake_up_interruptible_poll(&wq->wait, EPOLLERR); -	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR); +	sk_wake_async_rcu(sk, SOCK_WAKE_IO, POLL_ERR);  	rcu_read_unlock();  } @@ -3353,7 +3373,7 @@ void sock_def_readable(struct sock *sk)  	if (skwq_has_sleeper(wq))  		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |  						EPOLLRDNORM | EPOLLRDBAND); -	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); +	sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN);  	rcu_read_unlock();  } @@ -3373,7 +3393,7 @@ static void sock_def_write_space(struct sock *sk)  						EPOLLWRNORM | EPOLLWRBAND);  		/* Should agree with poll, otherwise some programs break */ -		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); +		sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);  	}  	rcu_read_unlock(); @@ -3398,7 +3418,7 @@ static void sock_def_write_space_wfree(struct sock *sk)  						EPOLLWRNORM | EPOLLWRBAND);  		/* Should agree with poll, otherwise some programs break */ -		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); +		sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);  	}  } @@ -3461,18 +3481,6 @@ void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)  	}  	sk->sk_uid	=	uid; -	rwlock_init(&sk->sk_callback_lock); -	if (sk->sk_kern_sock) -		lockdep_set_class_and_name( -			&sk->sk_callback_lock, -			
af_kern_callback_keys + sk->sk_family, -			af_family_kern_clock_key_strings[sk->sk_family]); -	else -		lockdep_set_class_and_name( -			&sk->sk_callback_lock, -			af_callback_keys + sk->sk_family, -			af_family_clock_key_strings[sk->sk_family]); -  	sk->sk_state_change	=	sock_def_wakeup;  	sk->sk_data_ready	=	sock_def_readable;  	sk->sk_write_space	=	sock_def_write_space; @@ -3743,6 +3751,9 @@ void sk_common_release(struct sock *sk)  	sk->sk_prot->unhash(sk); +	if (sk->sk_socket) +		sk->sk_socket->sk = NULL; +  	/*  	 * In this point socket cannot receive new packets, but it is possible  	 * that some packets are in flight because some CPU runs receiver and diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index 654122838025..a08eed9b9142 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -18,7 +18,7 @@  static const struct sock_diag_handler __rcu *sock_diag_handlers[AF_MAX]; -static struct sock_diag_inet_compat __rcu *inet_rcv_compat; +static const struct sock_diag_inet_compat __rcu *inet_rcv_compat;  static struct workqueue_struct *broadcast_wq; @@ -187,8 +187,7 @@ void sock_diag_broadcast_destroy(struct sock *sk)  void sock_diag_register_inet_compat(const struct sock_diag_inet_compat *ptr)  { -	xchg((__force const struct sock_diag_inet_compat **)&inet_rcv_compat, -	     ptr); +	xchg(&inet_rcv_compat, RCU_INITIALIZER(ptr));  }  EXPORT_SYMBOL_GPL(sock_diag_register_inet_compat); @@ -196,8 +195,7 @@ void sock_diag_unregister_inet_compat(const struct sock_diag_inet_compat *ptr)  {  	const struct sock_diag_inet_compat *old; -	old = xchg((__force const struct sock_diag_inet_compat **)&inet_rcv_compat, -		   NULL); +	old = unrcu_pointer(xchg(&inet_rcv_compat, NULL));  	WARN_ON_ONCE(old != ptr);  }  EXPORT_SYMBOL_GPL(sock_diag_unregister_inet_compat); diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 8598466a3805..d3dbb92153f2 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -24,8 +24,16 @@ struct bpf_stab {  #define 
SOCK_CREATE_FLAG_MASK				\  	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) +/* This mutex is used to + *  - protect race between prog/link attach/detach and link prog update, and + *  - protect race between releasing and accessing map in bpf_link. + * A single global mutex lock is used since it is expected contention is low. + */ +static DEFINE_MUTEX(sockmap_mutex); +  static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, -				struct bpf_prog *old, u32 which); +				struct bpf_prog *old, struct bpf_link *link, +				u32 which);  static struct sk_psock_progs *sock_map_progs(struct bpf_map *map);  static struct bpf_map *sock_map_alloc(union bpf_attr *attr) @@ -71,7 +79,9 @@ int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog)  	map = __bpf_map_get(f);  	if (IS_ERR(map))  		return PTR_ERR(map); -	ret = sock_map_prog_update(map, prog, NULL, attr->attach_type); +	mutex_lock(&sockmap_mutex); +	ret = sock_map_prog_update(map, prog, NULL, NULL, attr->attach_type); +	mutex_unlock(&sockmap_mutex);  	fdput(f);  	return ret;  } @@ -103,7 +113,9 @@ int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype)  		goto put_prog;  	} -	ret = sock_map_prog_update(map, NULL, prog, attr->attach_type); +	mutex_lock(&sockmap_mutex); +	ret = sock_map_prog_update(map, NULL, prog, NULL, attr->attach_type); +	mutex_unlock(&sockmap_mutex);  put_prog:  	bpf_prog_put(prog);  put_map: @@ -411,9 +423,6 @@ static int __sock_map_delete(struct bpf_stab *stab, struct sock *sk_test,  	struct sock *sk;  	int err = 0; -	if (irqs_disabled()) -		return -EOPNOTSUPP; /* locks here are hardirq-unsafe */ -  	spin_lock_bh(&stab->lock);  	sk = *psk;  	if (!sk_test || sk_test == sk) @@ -936,9 +945,6 @@ static long sock_hash_delete_elem(struct bpf_map *map, void *key)  	struct bpf_shtab_elem *elem;  	int ret = -ENOENT; -	if (irqs_disabled()) -		return -EOPNOTSUPP; /* locks here are hardirq-unsafe */ -  	hash = sock_hash_bucket_hash(key, key_size);  	
bucket = sock_hash_select_bucket(htab, hash); @@ -1460,55 +1466,84 @@ static struct sk_psock_progs *sock_map_progs(struct bpf_map *map)  	return NULL;  } -static int sock_map_prog_lookup(struct bpf_map *map, struct bpf_prog ***pprog, -				u32 which) +static int sock_map_prog_link_lookup(struct bpf_map *map, struct bpf_prog ***pprog, +				     struct bpf_link ***plink, u32 which)  {  	struct sk_psock_progs *progs = sock_map_progs(map); +	struct bpf_prog **cur_pprog; +	struct bpf_link **cur_plink;  	if (!progs)  		return -EOPNOTSUPP;  	switch (which) {  	case BPF_SK_MSG_VERDICT: -		*pprog = &progs->msg_parser; +		cur_pprog = &progs->msg_parser; +		cur_plink = &progs->msg_parser_link;  		break;  #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)  	case BPF_SK_SKB_STREAM_PARSER: -		*pprog = &progs->stream_parser; +		cur_pprog = &progs->stream_parser; +		cur_plink = &progs->stream_parser_link;  		break;  #endif  	case BPF_SK_SKB_STREAM_VERDICT:  		if (progs->skb_verdict)  			return -EBUSY; -		*pprog = &progs->stream_verdict; +		cur_pprog = &progs->stream_verdict; +		cur_plink = &progs->stream_verdict_link;  		break;  	case BPF_SK_SKB_VERDICT:  		if (progs->stream_verdict)  			return -EBUSY; -		*pprog = &progs->skb_verdict; +		cur_pprog = &progs->skb_verdict; +		cur_plink = &progs->skb_verdict_link;  		break;  	default:  		return -EOPNOTSUPP;  	} +	*pprog = cur_pprog; +	if (plink) +		*plink = cur_plink;  	return 0;  } +/* Handle the following four cases: + * prog_attach: prog != NULL, old == NULL, link == NULL + * prog_detach: prog == NULL, old != NULL, link == NULL + * link_attach: prog != NULL, old == NULL, link != NULL + * link_detach: prog == NULL, old != NULL, link != NULL + */  static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, -				struct bpf_prog *old, u32 which) +				struct bpf_prog *old, struct bpf_link *link, +				u32 which)  {  	struct bpf_prog **pprog; +	struct bpf_link **plink;  	int ret; -	ret = sock_map_prog_lookup(map, &pprog, which); +	
ret = sock_map_prog_link_lookup(map, &pprog, &plink, which);  	if (ret)  		return ret; -	if (old) -		return psock_replace_prog(pprog, prog, old); +	/* for prog_attach/prog_detach/link_attach, return error if a bpf_link +	 * exists for that prog. +	 */ +	if ((!link || prog) && *plink) +		return -EBUSY; -	psock_set_prog(pprog, prog); -	return 0; +	if (old) { +		ret = psock_replace_prog(pprog, prog, old); +		if (!ret) +			*plink = NULL; +	} else { +		psock_set_prog(pprog, prog); +		if (link) +			*plink = link; +	} + +	return ret;  }  int sock_map_bpf_prog_query(const union bpf_attr *attr, @@ -1533,7 +1568,7 @@ int sock_map_bpf_prog_query(const union bpf_attr *attr,  	rcu_read_lock(); -	ret = sock_map_prog_lookup(map, &pprog, attr->query.attach_type); +	ret = sock_map_prog_link_lookup(map, &pprog, NULL, attr->query.attach_type);  	if (ret)  		goto end; @@ -1639,19 +1674,23 @@ void sock_map_close(struct sock *sk, long timeout)  	lock_sock(sk);  	rcu_read_lock(); -	psock = sk_psock_get(sk); -	if (unlikely(!psock)) { -		rcu_read_unlock(); -		release_sock(sk); -		saved_close = READ_ONCE(sk->sk_prot)->close; -	} else { +	psock = sk_psock(sk); +	if (likely(psock)) {  		saved_close = psock->saved_close;  		sock_map_remove_links(sk, psock); +		psock = sk_psock_get(sk); +		if (unlikely(!psock)) +			goto no_psock;  		rcu_read_unlock();  		sk_psock_stop(psock);  		release_sock(sk);  		cancel_delayed_work_sync(&psock->work);  		sk_psock_put(sk, psock); +	} else { +		saved_close = READ_ONCE(sk->sk_prot)->close; +no_psock: +		rcu_read_unlock(); +		release_sock(sk);  	}  	/* Make sure we do not recurse. This is a bug. 
@@ -1663,6 +1702,196 @@ void sock_map_close(struct sock *sk, long timeout)  }  EXPORT_SYMBOL_GPL(sock_map_close); +struct sockmap_link { +	struct bpf_link link; +	struct bpf_map *map; +	enum bpf_attach_type attach_type; +}; + +static void sock_map_link_release(struct bpf_link *link) +{ +	struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link); + +	mutex_lock(&sockmap_mutex); +	if (!sockmap_link->map) +		goto out; + +	WARN_ON_ONCE(sock_map_prog_update(sockmap_link->map, NULL, link->prog, link, +					  sockmap_link->attach_type)); + +	bpf_map_put_with_uref(sockmap_link->map); +	sockmap_link->map = NULL; +out: +	mutex_unlock(&sockmap_mutex); +} + +static int sock_map_link_detach(struct bpf_link *link) +{ +	sock_map_link_release(link); +	return 0; +} + +static void sock_map_link_dealloc(struct bpf_link *link) +{ +	kfree(link); +} + +/* Handle the following two cases: + * case 1: link != NULL, prog != NULL, old != NULL + * case 2: link != NULL, prog != NULL, old == NULL + */ +static int sock_map_link_update_prog(struct bpf_link *link, +				     struct bpf_prog *prog, +				     struct bpf_prog *old) +{ +	const struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link); +	struct bpf_prog **pprog, *old_link_prog; +	struct bpf_link **plink; +	int ret = 0; + +	mutex_lock(&sockmap_mutex); + +	/* If old prog is not NULL, ensure old prog is the same as link->prog. */ +	if (old && link->prog != old) { +		ret = -EPERM; +		goto out; +	} +	/* Ensure link->prog has the same type/attach_type as the new prog. */ +	if (link->prog->type != prog->type || +	    link->prog->expected_attach_type != prog->expected_attach_type) { +		ret = -EINVAL; +		goto out; +	} + +	ret = sock_map_prog_link_lookup(sockmap_link->map, &pprog, &plink, +					sockmap_link->attach_type); +	if (ret) +		goto out; + +	/* return error if the stored bpf_link does not match the incoming bpf_link. 
*/ +	if (link != *plink) { +		ret = -EBUSY; +		goto out; +	} + +	if (old) { +		ret = psock_replace_prog(pprog, prog, old); +		if (ret) +			goto out; +	} else { +		psock_set_prog(pprog, prog); +	} + +	bpf_prog_inc(prog); +	old_link_prog = xchg(&link->prog, prog); +	bpf_prog_put(old_link_prog); + +out: +	mutex_unlock(&sockmap_mutex); +	return ret; +} + +static u32 sock_map_link_get_map_id(const struct sockmap_link *sockmap_link) +{ +	u32 map_id = 0; + +	mutex_lock(&sockmap_mutex); +	if (sockmap_link->map) +		map_id = sockmap_link->map->id; +	mutex_unlock(&sockmap_mutex); +	return map_id; +} + +static int sock_map_link_fill_info(const struct bpf_link *link, +				   struct bpf_link_info *info) +{ +	const struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link); +	u32 map_id = sock_map_link_get_map_id(sockmap_link); + +	info->sockmap.map_id = map_id; +	info->sockmap.attach_type = sockmap_link->attach_type; +	return 0; +} + +static void sock_map_link_show_fdinfo(const struct bpf_link *link, +				      struct seq_file *seq) +{ +	const struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link); +	u32 map_id = sock_map_link_get_map_id(sockmap_link); + +	seq_printf(seq, "map_id:\t%u\n", map_id); +	seq_printf(seq, "attach_type:\t%u\n", sockmap_link->attach_type); +} + +static const struct bpf_link_ops sock_map_link_ops = { +	.release = sock_map_link_release, +	.dealloc = sock_map_link_dealloc, +	.detach = sock_map_link_detach, +	.update_prog = sock_map_link_update_prog, +	.fill_link_info = sock_map_link_fill_info, +	.show_fdinfo = sock_map_link_show_fdinfo, +}; + +int sock_map_link_create(const union bpf_attr *attr, struct bpf_prog *prog) +{ +	struct bpf_link_primer link_primer; +	struct sockmap_link *sockmap_link; +	enum bpf_attach_type attach_type; +	struct bpf_map *map; +	int ret; + +	if (attr->link_create.flags) +		return -EINVAL; + +	map = bpf_map_get_with_uref(attr->link_create.target_fd); +	if (IS_ERR(map)) +		return 
PTR_ERR(map); +	if (map->map_type != BPF_MAP_TYPE_SOCKMAP && map->map_type != BPF_MAP_TYPE_SOCKHASH) { +		ret = -EINVAL; +		goto out; +	} + +	sockmap_link = kzalloc(sizeof(*sockmap_link), GFP_USER); +	if (!sockmap_link) { +		ret = -ENOMEM; +		goto out; +	} + +	attach_type = attr->link_create.attach_type; +	bpf_link_init(&sockmap_link->link, BPF_LINK_TYPE_SOCKMAP, &sock_map_link_ops, prog); +	sockmap_link->map = map; +	sockmap_link->attach_type = attach_type; + +	ret = bpf_link_prime(&sockmap_link->link, &link_primer); +	if (ret) { +		kfree(sockmap_link); +		goto out; +	} + +	mutex_lock(&sockmap_mutex); +	ret = sock_map_prog_update(map, prog, NULL, &sockmap_link->link, attach_type); +	mutex_unlock(&sockmap_mutex); +	if (ret) { +		bpf_link_cleanup(&link_primer); +		goto out; +	} + +	/* Increase refcnt for the prog since when old prog is replaced with +	 * psock_replace_prog() and psock_set_prog() its refcnt will be decreased. +	 * +	 * Actually, we do not need to increase refcnt for the prog since bpf_link +	 * will hold a reference. But in order to have less complexity w.r.t. +	 * replacing/setting prog, let us increase the refcnt to make things simpler. 
+	 */ +	bpf_prog_inc(prog); + +	return bpf_link_settle(&link_primer); + +out: +	bpf_map_put_with_uref(map); +	return ret; +} +  static int sock_map_iter_attach_target(struct bpf_prog *prog,  				       union bpf_iter_link_info *linfo,  				       struct bpf_iter_aux_info *aux) diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 6973dda3abda..86a2476678c4 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -24,6 +24,7 @@  #include <net/busy_poll.h>  #include <net/pkt_sched.h>  #include <net/hotdata.h> +#include <net/proto_memory.h>  #include <net/rps.h>  #include "dev.h" @@ -94,7 +95,7 @@ static struct cpumask *rps_default_mask_cow_alloc(struct net *net)  	return rps_default_mask;  } -static int rps_default_mask_sysctl(struct ctl_table *table, int write, +static int rps_default_mask_sysctl(const struct ctl_table *table, int write,  				   void *buffer, size_t *lenp, loff_t *ppos)  {  	struct net *net = (struct net *)table->data; @@ -125,7 +126,7 @@ done:  	return err;  } -static int rps_sock_flow_sysctl(struct ctl_table *table, int write, +static int rps_sock_flow_sysctl(const struct ctl_table *table, int write,  				void *buffer, size_t *lenp, loff_t *ppos)  {  	unsigned int orig_size, size; @@ -197,7 +198,7 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write,  #ifdef CONFIG_NET_FLOW_LIMIT  static DEFINE_MUTEX(flow_limit_update_mutex); -static int flow_limit_cpu_sysctl(struct ctl_table *table, int write, +static int flow_limit_cpu_sysctl(const struct ctl_table *table, int write,  				 void *buffer, size_t *lenp, loff_t *ppos)  {  	struct sd_flow_limit *cur; @@ -254,7 +255,7 @@ done:  	return ret;  } -static int flow_limit_table_len_sysctl(struct ctl_table *table, int write, +static int flow_limit_table_len_sysctl(const struct ctl_table *table, int write,  				       void *buffer, size_t *lenp, loff_t *ppos)  {  	unsigned int old, *ptr; @@ -276,7 +277,7 @@ static int flow_limit_table_len_sysctl(struct 
ctl_table *table, int write,  #endif /* CONFIG_NET_FLOW_LIMIT */  #ifdef CONFIG_NET_SCHED -static int set_default_qdisc(struct ctl_table *table, int write, +static int set_default_qdisc(const struct ctl_table *table, int write,  			     void *buffer, size_t *lenp, loff_t *ppos)  {  	char id[IFNAMSIZ]; @@ -295,7 +296,7 @@ static int set_default_qdisc(struct ctl_table *table, int write,  }  #endif -static int proc_do_dev_weight(struct ctl_table *table, int write, +static int proc_do_dev_weight(const struct ctl_table *table, int write,  			   void *buffer, size_t *lenp, loff_t *ppos)  {  	static DEFINE_MUTEX(dev_weight_mutex); @@ -313,7 +314,7 @@ static int proc_do_dev_weight(struct ctl_table *table, int write,  	return ret;  } -static int proc_do_rss_key(struct ctl_table *table, int write, +static int proc_do_rss_key(const struct ctl_table *table, int write,  			   void *buffer, size_t *lenp, loff_t *ppos)  {  	struct ctl_table fake_table; @@ -326,7 +327,7 @@ static int proc_do_rss_key(struct ctl_table *table, int write,  }  #ifdef CONFIG_BPF_JIT -static int proc_dointvec_minmax_bpf_enable(struct ctl_table *table, int write, +static int proc_dointvec_minmax_bpf_enable(const struct ctl_table *table, int write,  					   void *buffer, size_t *lenp,  					   loff_t *ppos)  { @@ -359,7 +360,7 @@ static int proc_dointvec_minmax_bpf_enable(struct ctl_table *table, int write,  # ifdef CONFIG_HAVE_EBPF_JIT  static int -proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write, +proc_dointvec_minmax_bpf_restricted(const struct ctl_table *table, int write,  				    void *buffer, size_t *lenp, loff_t *ppos)  {  	if (!capable(CAP_SYS_ADMIN)) @@ -370,7 +371,7 @@ proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write,  # endif /* CONFIG_HAVE_EBPF_JIT */  static int -proc_dolongvec_minmax_bpf_restricted(struct ctl_table *table, int write, +proc_dolongvec_minmax_bpf_restricted(const struct ctl_table *table, int write,  				     void *buffer, size_t *lenp, 
loff_t *ppos)  {  	if (!capable(CAP_SYS_ADMIN)) @@ -382,40 +383,8 @@ proc_dolongvec_minmax_bpf_restricted(struct ctl_table *table, int write,  static struct ctl_table net_core_table[] = {  	{ -		.procname	= "wmem_max", -		.data		= &sysctl_wmem_max, -		.maxlen		= sizeof(int), -		.mode		= 0644, -		.proc_handler	= proc_dointvec_minmax, -		.extra1		= &min_sndbuf, -	}, -	{ -		.procname	= "rmem_max", -		.data		= &sysctl_rmem_max, -		.maxlen		= sizeof(int), -		.mode		= 0644, -		.proc_handler	= proc_dointvec_minmax, -		.extra1		= &min_rcvbuf, -	}, -	{ -		.procname	= "wmem_default", -		.data		= &sysctl_wmem_default, -		.maxlen		= sizeof(int), -		.mode		= 0644, -		.proc_handler	= proc_dointvec_minmax, -		.extra1		= &min_sndbuf, -	}, -	{ -		.procname	= "rmem_default", -		.data		= &sysctl_rmem_default, -		.maxlen		= sizeof(int), -		.mode		= 0644, -		.proc_handler	= proc_dointvec_minmax, -		.extra1		= &min_rcvbuf, -	}, -	{  		.procname	= "mem_pcpu_rsv", -		.data		= &sysctl_mem_pcpu_rsv, +		.data		= &net_hotdata.sysctl_mem_pcpu_rsv,  		.maxlen		= sizeof(int),  		.mode		= 0644,  		.proc_handler	= proc_dointvec_minmax, @@ -595,7 +564,7 @@ static struct ctl_table net_core_table[] = {  	},  	{  		.procname	= "max_skb_frags", -		.data		= &sysctl_max_skb_frags, +		.data		= &net_hotdata.sysctl_max_skb_frags,  		.maxlen		= sizeof(int),  		.mode		= 0644,  		.proc_handler	= proc_dointvec_minmax, @@ -654,13 +623,12 @@ static struct ctl_table net_core_table[] = {  	},  	{  		.procname	= "skb_defer_max", -		.data		= &sysctl_skb_defer_max, +		.data		= &net_hotdata.sysctl_skb_defer_max,  		.maxlen		= sizeof(unsigned int),  		.mode		= 0644,  		.proc_handler	= proc_dointvec_minmax,  		.extra1		= SYSCTL_ZERO,  	}, -	{ }  };  static struct ctl_table netns_core_table[] = { @@ -697,7 +665,41 @@ static struct ctl_table netns_core_table[] = {  		.extra2		= SYSCTL_ONE,  		.proc_handler	= proc_dou8vec_minmax,  	}, -	{ } +	/* sysctl_core_net_init() will set the values after this +	 * to readonly in 
network namespaces +	 */ +	{ +		.procname	= "wmem_max", +		.data		= &sysctl_wmem_max, +		.maxlen		= sizeof(int), +		.mode		= 0644, +		.proc_handler	= proc_dointvec_minmax, +		.extra1		= &min_sndbuf, +	}, +	{ +		.procname	= "rmem_max", +		.data		= &sysctl_rmem_max, +		.maxlen		= sizeof(int), +		.mode		= 0644, +		.proc_handler	= proc_dointvec_minmax, +		.extra1		= &min_rcvbuf, +	}, +	{ +		.procname	= "wmem_default", +		.data		= &sysctl_wmem_default, +		.maxlen		= sizeof(int), +		.mode		= 0644, +		.proc_handler	= proc_dointvec_minmax, +		.extra1		= &min_sndbuf, +	}, +	{ +		.procname	= "rmem_default", +		.data		= &sysctl_rmem_default, +		.maxlen		= sizeof(int), +		.mode		= 0644, +		.proc_handler	= proc_dointvec_minmax, +		.extra1		= &min_rcvbuf, +	},  };  static int __init fb_tunnels_only_for_init_net_sysctl_setup(char *str) @@ -715,20 +717,27 @@ __setup("fb_tunnels=", fb_tunnels_only_for_init_net_sysctl_setup);  static __net_init int sysctl_core_net_init(struct net *net)  { -	struct ctl_table *tbl, *tmp; +	size_t table_size = ARRAY_SIZE(netns_core_table); +	struct ctl_table *tbl;  	tbl = netns_core_table;  	if (!net_eq(net, &init_net)) { +		int i;  		tbl = kmemdup(tbl, sizeof(netns_core_table), GFP_KERNEL);  		if (tbl == NULL)  			goto err_dup; -		for (tmp = tbl; tmp->procname; tmp++) -			tmp->data += (char *)net - (char *)&init_net; +		for (i = 0; i < table_size; ++i) { +			if (tbl[i].data == &sysctl_wmem_max) +				break; + +			tbl[i].data += (char *)net - (char *)&init_net; +		} +		for (; i < table_size; ++i) +			tbl[i].mode &= ~0222;  	} -	net->core.sysctl_hdr = register_net_sysctl_sz(net, "net/core", tbl, -						      ARRAY_SIZE(netns_core_table)); +	net->core.sysctl_hdr = register_net_sysctl_sz(net, "net/core", tbl, table_size);  	if (net->core.sysctl_hdr == NULL)  		goto err_reg; @@ -743,7 +752,7 @@ err_dup:  static __net_exit void sysctl_core_net_exit(struct net *net)  { -	struct ctl_table *tbl; +	const struct ctl_table *tbl;  	tbl = 
net->core.sysctl_hdr->ctl_table_arg;  	unregister_net_sysctl_table(net->core.sysctl_hdr); diff --git a/net/core/timestamping.c b/net/core/timestamping.c index 04840697fe79..3717fb152ecc 100644 --- a/net/core/timestamping.c +++ b/net/core/timestamping.c @@ -25,7 +25,8 @@ void skb_clone_tx_timestamp(struct sk_buff *skb)  	struct sk_buff *clone;  	unsigned int type; -	if (!skb->sk) +	if (!skb->sk || !skb->dev || +	    !phy_is_default_hwtstamp(skb->dev->phydev))  		return;  	type = classify(skb); @@ -47,7 +48,7 @@ bool skb_defer_rx_timestamp(struct sk_buff *skb)  	struct mii_timestamper *mii_ts;  	unsigned int type; -	if (!skb->dev || !skb->dev->phydev || !skb->dev->phydev->mii_ts) +	if (!skb->dev || !phy_is_default_hwtstamp(skb->dev->phydev))  		return false;  	if (skb_headroom(skb) < ETH_HLEN) diff --git a/net/core/xdp.c b/net/core/xdp.c index 41693154e426..bcc5551c6424 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -127,10 +127,8 @@ void xdp_unreg_mem_model(struct xdp_mem_info *mem)  		return;  	if (type == MEM_TYPE_PAGE_POOL) { -		rcu_read_lock(); -		xa = rhashtable_lookup(mem_id_ht, &id, mem_id_rht_params); +		xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params);  		page_pool_destroy(xa->page_pool); -		rcu_read_unlock();  	}  }  EXPORT_SYMBOL_GPL(xdp_unreg_mem_model); @@ -295,10 +293,8 @@ static struct xdp_mem_allocator *__xdp_reg_mem_model(struct xdp_mem_info *mem,  		mutex_lock(&mem_id_lock);  		ret = __mem_id_init_hash_table();  		mutex_unlock(&mem_id_lock); -		if (ret < 0) { -			WARN_ON(1); +		if (ret < 0)  			return ERR_PTR(ret); -		}  	}  	xdp_alloc = kzalloc(sizeof(*xdp_alloc), gfp); | 
