diff options
Diffstat (limited to 'net/core')
-rw-r--r-- | net/core/Makefile | 1 | ||||
-rw-r--r-- | net/core/datagram.c | 53 | ||||
-rw-r--r-- | net/core/dev.c | 868 | ||||
-rw-r--r-- | net/core/devlink.c | 100 | ||||
-rw-r--r-- | net/core/drop_monitor.c | 21 | ||||
-rw-r--r-- | net/core/ethtool.c | 97 | ||||
-rw-r--r-- | net/core/fib_rules.c | 78 | ||||
-rw-r--r-- | net/core/filter.c | 299 | ||||
-rw-r--r-- | net/core/flow_dissector.c | 2 | ||||
-rw-r--r-- | net/core/lwt_bpf.c | 396 | ||||
-rw-r--r-- | net/core/lwtunnel.c | 17 | ||||
-rw-r--r-- | net/core/neighbour.c | 15 | ||||
-rw-r--r-- | net/core/net-sysfs.c | 65 | ||||
-rw-r--r-- | net/core/net_namespace.c | 16 | ||||
-rw-r--r-- | net/core/netpoll.c | 6 | ||||
-rw-r--r-- | net/core/pktgen.c | 2 | ||||
-rw-r--r-- | net/core/rtnetlink.c | 20 | ||||
-rw-r--r-- | net/core/secure_seq.c | 11 | ||||
-rw-r--r-- | net/core/skbuff.c | 38 | ||||
-rw-r--r-- | net/core/sock.c | 84 | ||||
-rw-r--r-- | net/core/stream.c | 28 |
21 files changed, 1651 insertions, 566 deletions
diff --git a/net/core/Makefile b/net/core/Makefile index d6508c2ddca5..f6761b6e3b29 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -24,6 +24,7 @@ obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o obj-$(CONFIG_LWTUNNEL) += lwtunnel.o +obj-$(CONFIG_LWTUNNEL_BPF) += lwt_bpf.o obj-$(CONFIG_DST_CACHE) += dst_cache.o obj-$(CONFIG_HWBM) += hwbm.o obj-$(CONFIG_NET_DEVLINK) += devlink.o diff --git a/net/core/datagram.c b/net/core/datagram.c index b7de71f8d5d3..49816af8586b 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -165,6 +165,7 @@ done: * __skb_try_recv_datagram - Receive a datagram skbuff * @sk: socket * @flags: MSG_ flags + * @destructor: invoked under the receive lock on successful dequeue * @peeked: returns non-zero if this packet has been seen before * @off: an offset in bytes to peek skb from. Returns an offset * within an skb where data actually starts @@ -197,6 +198,8 @@ done: * the standard around please. */ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags, + void (*destructor)(struct sock *sk, + struct sk_buff *skb), int *peeked, int *off, int *err, struct sk_buff **last) { @@ -241,9 +244,11 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags, } atomic_inc(&skb->users); - } else + } else { __skb_unlink(skb, queue); - + if (destructor) + destructor(sk, skb); + } spin_unlock_irqrestore(&queue->lock, cpu_flags); *off = _off; return skb; @@ -262,6 +267,8 @@ no_packet: EXPORT_SYMBOL(__skb_try_recv_datagram); struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags, + void (*destructor)(struct sock *sk, + struct sk_buff *skb), int *peeked, int *off, int *err) { struct sk_buff *skb, *last; @@ -270,8 +277,8 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags, timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); do { - skb = __skb_try_recv_datagram(sk, flags, peeked, off, err, - &last); + skb = __skb_try_recv_datagram(sk, flags, destructor, peeked, + off, err, &last); if (skb) return skb; @@ -290,7 +297,7 @@ struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned int flags, int peeked, off = 0; return __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0), - &peeked, &off, err); + NULL, &peeked, &off, err); } EXPORT_SYMBOL(skb_recv_datagram); @@ -323,6 +330,27 @@ void __skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb, int len) } EXPORT_SYMBOL(__skb_free_datagram_locked); +int __sk_queue_drop_skb(struct sock *sk, struct sk_buff *skb, + unsigned int flags) +{ + int err = 0; + + if (flags & MSG_PEEK) { + err = -ENOENT; + spin_lock_bh(&sk->sk_receive_queue.lock); + if (skb == skb_peek(&sk->sk_receive_queue)) { + __skb_unlink(skb, &sk->sk_receive_queue); + atomic_dec(&skb->users); + err = 0; + } + spin_unlock_bh(&sk->sk_receive_queue.lock); + } + + atomic_inc(&sk->sk_drops); + return err; +} +EXPORT_SYMBOL(__sk_queue_drop_skb); + /** * skb_kill_datagram - Free a datagram skbuff forcibly * @sk: socket @@ -346,23 +374,10 @@ EXPORT_SYMBOL(__skb_free_datagram_locked); int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags) { - int err = 0; - - if (flags & MSG_PEEK) { - err = -ENOENT; - spin_lock_bh(&sk->sk_receive_queue.lock); - if (skb == skb_peek(&sk->sk_receive_queue)) { - __skb_unlink(skb, &sk->sk_receive_queue); - atomic_dec(&skb->users); - err = 0; - } - spin_unlock_bh(&sk->sk_receive_queue.lock); - } + int err = __sk_queue_drop_skb(sk, skb, flags); kfree_skb(skb); - atomic_inc(&sk->sk_drops); sk_mem_reclaim_partial(sk); - return err; } EXPORT_SYMBOL(skb_kill_datagram); diff --git a/net/core/dev.c b/net/core/dev.c index 6666b28b6815..bffb5253e778 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -139,7 +139,6 @@ #include <linux/errqueue.h> #include <linux/hrtimer.h> #include <linux/netfilter_ingress.h> -#include <linux/sctp.h> #include <linux/crash_dump.h> #include "net-sysfs.h" @@ -1944,37 +1943,80 @@ static void netif_setup_tc(struct net_device *dev, unsigned int txq) } } +int netdev_txq_to_tc(struct net_device *dev, unsigned int txq) +{ + if (dev->num_tc) { + struct netdev_tc_txq *tc = &dev->tc_to_txq[0]; + int i; + + for (i = 0; i < TC_MAX_QUEUE; i++, tc++) { + if ((txq - tc->offset) < tc->count) + return i; + } + + return -1; + } + + return 0; +} + #ifdef CONFIG_XPS static DEFINE_MUTEX(xps_map_mutex); #define xmap_dereference(P) \ rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex)) -static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps, - int cpu, u16 index) +static bool remove_xps_queue(struct xps_dev_maps *dev_maps, + int tci, u16 index) { struct xps_map *map = NULL; int pos; if (dev_maps) - map = xmap_dereference(dev_maps->cpu_map[cpu]); + map = xmap_dereference(dev_maps->cpu_map[tci]); + if (!map) + return false; - for (pos = 0; map && pos < map->len; pos++) { - if (map->queues[pos] == index) { - if (map->len > 1) { - map->queues[pos] = map->queues[--map->len]; - } else { - RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL); - kfree_rcu(map, rcu); - map = NULL; - } + for (pos = map->len; pos--;) { + if (map->queues[pos] != index) + continue; + + if (map->len > 1) { + map->queues[pos] = map->queues[--map->len]; break; } + + RCU_INIT_POINTER(dev_maps->cpu_map[tci], NULL); + kfree_rcu(map, rcu); + return false; } - return map; + return true; } -static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index) +static bool remove_xps_queue_cpu(struct net_device *dev, + struct xps_dev_maps *dev_maps, + int cpu, u16 offset, u16 count) +{ + int num_tc = dev->num_tc ? : 1; + bool active = false; + int tci; + + for (tci = cpu * num_tc; num_tc--; tci++) { + int i, j; + + for (i = count, j = offset; i--; j++) { + if (!remove_xps_queue(dev_maps, cpu, j)) + break; + } + + active |= i < 0; + } + + return active; +} + +static void netif_reset_xps_queues(struct net_device *dev, u16 offset, + u16 count) { struct xps_dev_maps *dev_maps; int cpu, i; @@ -1986,21 +2028,16 @@ static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index) if (!dev_maps) goto out_no_maps; - for_each_possible_cpu(cpu) { - for (i = index; i < dev->num_tx_queues; i++) { - if (!remove_xps_queue(dev_maps, cpu, i)) - break; - } - if (i == dev->num_tx_queues) - active = true; - } + for_each_possible_cpu(cpu) + active |= remove_xps_queue_cpu(dev, dev_maps, cpu, + offset, count); if (!active) { RCU_INIT_POINTER(dev->xps_maps, NULL); kfree_rcu(dev_maps, rcu); } - for (i = index; i < dev->num_tx_queues; i++) + for (i = offset + (count - 1); count--; i--) netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i), NUMA_NO_NODE); @@ -2008,6 +2045,11 @@ out_no_maps: mutex_unlock(&xps_map_mutex); } +static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index) +{ + netif_reset_xps_queues(dev, index, dev->num_tx_queues - index); +} + static struct xps_map *expand_xps_map(struct xps_map *map, int cpu, u16 index) { @@ -2047,20 +2089,28 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, u16 index) { struct xps_dev_maps *dev_maps, *new_dev_maps = NULL; + int i, cpu, tci, numa_node_id = -2; + int maps_sz, num_tc = 1, tc = 0; struct xps_map *map, *new_map; - int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES); - int cpu, numa_node_id = -2; bool active = false; + if (dev->num_tc) { + num_tc = dev->num_tc; + tc = netdev_txq_to_tc(dev, index); + if (tc < 0) + return -EINVAL; + } + + maps_sz = XPS_DEV_MAPS_SIZE(num_tc); + if (maps_sz < L1_CACHE_BYTES) + maps_sz = L1_CACHE_BYTES; + mutex_lock(&xps_map_mutex); dev_maps = xmap_dereference(dev->xps_maps); /* allocate memory for queue storage */ - for_each_online_cpu(cpu) { - if (!cpumask_test_cpu(cpu, mask)) - continue; - + for_each_cpu_and(cpu, cpu_online_mask, mask) { if (!new_dev_maps) new_dev_maps = kzalloc(maps_sz, GFP_KERNEL); if (!new_dev_maps) { @@ -2068,25 +2118,38 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, return -ENOMEM; } - map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) : + tci = cpu * num_tc + tc; + map = dev_maps ? xmap_dereference(dev_maps->cpu_map[tci]) : NULL; map = expand_xps_map(map, cpu, index); if (!map) goto error; - RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map); + RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map); } if (!new_dev_maps) goto out_no_new_maps; for_each_possible_cpu(cpu) { + /* copy maps belonging to foreign traffic classes */ + for (i = tc, tci = cpu * num_tc; dev_maps && i--; tci++) { + /* fill in the new device map from the old device map */ + map = xmap_dereference(dev_maps->cpu_map[tci]); + RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map); + } + + /* We need to explicitly update tci as prevous loop + * could break out early if dev_maps is NULL. + */ + tci = cpu * num_tc + tc; + if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) { /* add queue to CPU maps */ int pos = 0; - map = xmap_dereference(new_dev_maps->cpu_map[cpu]); + map = xmap_dereference(new_dev_maps->cpu_map[tci]); while ((pos < map->len) && (map->queues[pos] != index)) pos++; @@ -2100,26 +2163,36 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, #endif } else if (dev_maps) { /* fill in the new device map from the old device map */ - map = xmap_dereference(dev_maps->cpu_map[cpu]); - RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map); + map = xmap_dereference(dev_maps->cpu_map[tci]); + RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map); } + /* copy maps belonging to foreign traffic classes */ + for (i = num_tc - tc, tci++; dev_maps && --i; tci++) { + /* fill in the new device map from the old device map */ + map = xmap_dereference(dev_maps->cpu_map[tci]); + RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map); + } } rcu_assign_pointer(dev->xps_maps, new_dev_maps); /* Cleanup old maps */ - if (dev_maps) { - for_each_possible_cpu(cpu) { - new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]); - map = xmap_dereference(dev_maps->cpu_map[cpu]); + if (!dev_maps) + goto out_no_old_maps; + + for_each_possible_cpu(cpu) { + for (i = num_tc, tci = cpu * num_tc; i--; tci++) { + new_map = xmap_dereference(new_dev_maps->cpu_map[tci]); + map = xmap_dereference(dev_maps->cpu_map[tci]); if (map && map != new_map) kfree_rcu(map, rcu); } - - kfree_rcu(dev_maps, rcu); } + kfree_rcu(dev_maps, rcu); + +out_no_old_maps: dev_maps = new_dev_maps; active = true; @@ -2134,11 +2207,12 @@ out_no_new_maps: /* removes queue from unused CPUs */ for_each_possible_cpu(cpu) { - if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) - continue; - - if (remove_xps_queue(dev_maps, cpu, index)) - active = true; + for (i = tc, tci = cpu * num_tc; i--; tci++) + active |= remove_xps_queue(dev_maps, tci, index); + if (!cpumask_test_cpu(cpu, mask) || !cpu_online(cpu)) + active |= remove_xps_queue(dev_maps, tci, index); + for (i = num_tc - tc, tci++; --i; tci++) + active |= remove_xps_queue(dev_maps, tci, index); } /* free map if not active */ @@ -2154,11 +2228,14 @@ out_no_maps: error: /* remove any maps that we added */ for_each_possible_cpu(cpu) { - new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]); - map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) : - NULL; - if (new_map && new_map != map) - kfree(new_map); + for (i = num_tc, tci = cpu * num_tc; i--; tci++) { + new_map = xmap_dereference(new_dev_maps->cpu_map[tci]); + map = dev_maps ? + xmap_dereference(dev_maps->cpu_map[tci]) : + NULL; + if (new_map && new_map != map) + kfree(new_map); + } } mutex_unlock(&xps_map_mutex); @@ -2169,6 +2246,44 @@ error: EXPORT_SYMBOL(netif_set_xps_queue); #endif +void netdev_reset_tc(struct net_device *dev) +{ +#ifdef CONFIG_XPS + netif_reset_xps_queues_gt(dev, 0); +#endif + dev->num_tc = 0; + memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq)); + memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map)); +} +EXPORT_SYMBOL(netdev_reset_tc); + +int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset) +{ + if (tc >= dev->num_tc) + return -EINVAL; + +#ifdef CONFIG_XPS + netif_reset_xps_queues(dev, offset, count); +#endif + dev->tc_to_txq[tc].count = count; + dev->tc_to_txq[tc].offset = offset; + return 0; +} +EXPORT_SYMBOL(netdev_set_tc_queue); + +int netdev_set_num_tc(struct net_device *dev, u8 num_tc) +{ + if (num_tc > TC_MAX_QUEUE) + return -EINVAL; + +#ifdef CONFIG_XPS + netif_reset_xps_queues_gt(dev, 0); +#endif + dev->num_tc = num_tc; + return 0; +} +EXPORT_SYMBOL(netdev_set_num_tc); + /* * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues * greater then real_num_tx_queues stale skbs on the qdisc must be flushed. @@ -2487,141 +2602,6 @@ out: } EXPORT_SYMBOL(skb_checksum_help); -/* skb_csum_offload_check - Driver helper function to determine if a device - * with limited checksum offload capabilities is able to offload the checksum - * for a given packet. - * - * Arguments: - * skb - sk_buff for the packet in question - * spec - contains the description of what device can offload - * csum_encapped - returns true if the checksum being offloaded is - * encpasulated. That is it is checksum for the transport header - * in the inner headers. - * checksum_help - when set indicates that helper function should - * call skb_checksum_help if offload checks fail - * - * Returns: - * true: Packet has passed the checksum checks and should be offloadable to - * the device (a driver may still need to check for additional - * restrictions of its device) - * false: Checksum is not offloadable. If checksum_help was set then - * skb_checksum_help was called to resolve checksum for non-GSO - * packets and when IP protocol is not SCTP - */ -bool __skb_csum_offload_chk(struct sk_buff *skb, - const struct skb_csum_offl_spec *spec, - bool *csum_encapped, - bool csum_help) -{ - struct iphdr *iph; - struct ipv6hdr *ipv6; - void *nhdr; - int protocol; - u8 ip_proto; - - if (skb->protocol == htons(ETH_P_8021Q) || - skb->protocol == htons(ETH_P_8021AD)) { - if (!spec->vlan_okay) - goto need_help; - } - - /* We check whether the checksum refers to a transport layer checksum in - * the outermost header or an encapsulated transport layer checksum that - * corresponds to the inner headers of the skb. If the checksum is for - * something else in the packet we need help. - */ - if (skb_checksum_start_offset(skb) == skb_transport_offset(skb)) { - /* Non-encapsulated checksum */ - protocol = eproto_to_ipproto(vlan_get_protocol(skb)); - nhdr = skb_network_header(skb); - *csum_encapped = false; - if (spec->no_not_encapped) - goto need_help; - } else if (skb->encapsulation && spec->encap_okay && - skb_checksum_start_offset(skb) == - skb_inner_transport_offset(skb)) { - /* Encapsulated checksum */ - *csum_encapped = true; - switch (skb->inner_protocol_type) { - case ENCAP_TYPE_ETHER: - protocol = eproto_to_ipproto(skb->inner_protocol); - break; - case ENCAP_TYPE_IPPROTO: - protocol = skb->inner_protocol; - break; - } - nhdr = skb_inner_network_header(skb); - } else { - goto need_help; - } - - switch (protocol) { - case IPPROTO_IP: - if (!spec->ipv4_okay) - goto need_help; - iph = nhdr; - ip_proto = iph->protocol; - if (iph->ihl != 5 && !spec->ip_options_okay) - goto need_help; - break; - case IPPROTO_IPV6: - if (!spec->ipv6_okay) - goto need_help; - if (spec->no_encapped_ipv6 && *csum_encapped) - goto need_help; - ipv6 = nhdr; - nhdr += sizeof(*ipv6); - ip_proto = ipv6->nexthdr; - break; - default: - goto need_help; - } - -ip_proto_again: - switch (ip_proto) { - case IPPROTO_TCP: - if (!spec->tcp_okay || - skb->csum_offset != offsetof(struct tcphdr, check)) - goto need_help; - break; - case IPPROTO_UDP: - if (!spec->udp_okay || - skb->csum_offset != offsetof(struct udphdr, check)) - goto need_help; - break; - case IPPROTO_SCTP: - if (!spec->sctp_okay || - skb->csum_offset != offsetof(struct sctphdr, checksum)) - goto cant_help; - break; - case NEXTHDR_HOP: - case NEXTHDR_ROUTING: - case NEXTHDR_DEST: { - u8 *opthdr = nhdr; - - if (protocol != IPPROTO_IPV6 || !spec->ext_hdrs_okay) - goto need_help; - - ip_proto = opthdr[0]; - nhdr += (opthdr[1] + 1) << 3; - - goto ip_proto_again; - } - default: - goto need_help; - } - - /* Passed the tests for offloading checksum */ - return true; - -need_help: - if (csum_help && !skb_shinfo(skb)->gso_size) - skb_checksum_help(skb); -cant_help: - return false; -} -EXPORT_SYMBOL(__skb_csum_offload_chk); - __be16 skb_network_protocol(struct sk_buff *skb, int *depth) { __be16 type = skb->protocol; @@ -3216,8 +3196,14 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) rcu_read_lock(); dev_maps = rcu_dereference(dev->xps_maps); if (dev_maps) { - map = rcu_dereference( - dev_maps->cpu_map[skb->sender_cpu - 1]); + unsigned int tci = skb->sender_cpu - 1; + + if (dev->num_tc) { + tci *= dev->num_tc; + tci += netdev_get_prio_tc_map(dev, skb->priority); + } + + map = rcu_dereference(dev_maps->cpu_map[tci]); if (map) { if (map->len == 1) queue_index = map->queues[0]; @@ -4491,7 +4477,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff if (!(skb->dev->features & NETIF_F_GRO)) goto normal; - if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad) + if (skb->csum_bad) goto normal; gro_list_prepare(napi, skb); @@ -4504,7 +4490,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff skb_set_network_header(skb, skb_gro_offset(skb)); skb_reset_mac_len(skb); NAPI_GRO_CB(skb)->same_flow = 0; - NAPI_GRO_CB(skb)->flush = 0; + NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb); NAPI_GRO_CB(skb)->free = 0; NAPI_GRO_CB(skb)->encap_mark = 0; NAPI_GRO_CB(skb)->recursion_counter = 0; @@ -4912,26 +4898,36 @@ void __napi_schedule_irqoff(struct napi_struct *n) } EXPORT_SYMBOL(__napi_schedule_irqoff); -void __napi_complete(struct napi_struct *n) +bool __napi_complete(struct napi_struct *n) { BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state)); + /* Some drivers call us directly, instead of calling + * napi_complete_done(). + */ + if (unlikely(test_bit(NAPI_STATE_IN_BUSY_POLL, &n->state))) + return false; + list_del_init(&n->poll_list); smp_mb__before_atomic(); clear_bit(NAPI_STATE_SCHED, &n->state); + return true; } EXPORT_SYMBOL(__napi_complete); -void napi_complete_done(struct napi_struct *n, int work_done) +bool napi_complete_done(struct napi_struct *n, int work_done) { unsigned long flags; /* - * don't let napi dequeue from the cpu poll list - * just in case its running on a different cpu + * 1) Don't let napi dequeue from the cpu poll list + * just in case its running on a different cpu. + * 2) If we are busy polling, do nothing here, we have + * the guarantee we will be called later. */ - if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state))) - return; + if (unlikely(n->state & (NAPIF_STATE_NPSVC | + NAPIF_STATE_IN_BUSY_POLL))) + return false; if (n->gro_list) { unsigned long timeout = 0; @@ -4953,6 +4949,7 @@ void napi_complete_done(struct napi_struct *n, int work_done) __napi_complete(n); local_irq_restore(flags); } + return true; } EXPORT_SYMBOL(napi_complete_done); @@ -4970,13 +4967,41 @@ static struct napi_struct *napi_by_id(unsigned int napi_id) } #if defined(CONFIG_NET_RX_BUSY_POLL) + #define BUSY_POLL_BUDGET 8 + +static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock) +{ + int rc; + + clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state); + + local_bh_disable(); + + /* All we really want here is to re-enable device interrupts. + * Ideally, a new ndo_busy_poll_stop() could avoid another round. + */ + rc = napi->poll(napi, BUSY_POLL_BUDGET); + netpoll_poll_unlock(have_poll_lock); + if (rc == BUSY_POLL_BUDGET) + __napi_schedule(napi); + local_bh_enable(); + if (local_softirq_pending()) + do_softirq(); +} + bool sk_busy_loop(struct sock *sk, int nonblock) { unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0; + int (*napi_poll)(struct napi_struct *napi, int budget); int (*busy_poll)(struct napi_struct *dev); + void *have_poll_lock = NULL; struct napi_struct *napi; - int rc = false; + int rc; + +restart: + rc = false; + napi_poll = NULL; rcu_read_lock(); @@ -4987,24 +5012,33 @@ bool sk_busy_loop(struct sock *sk, int nonblock) /* Note: ndo_busy_poll method is optional in linux-4.5 */ busy_poll = napi->dev->netdev_ops->ndo_busy_poll; - do { + preempt_disable(); + for (;;) { rc = 0; local_bh_disable(); if (busy_poll) { rc = busy_poll(napi); - } else if (napi_schedule_prep(napi)) { - void *have = netpoll_poll_lock(napi); - - if (test_bit(NAPI_STATE_SCHED, &napi->state)) { - rc = napi->poll(napi, BUSY_POLL_BUDGET); - trace_napi_poll(napi, rc, BUSY_POLL_BUDGET); - if (rc == BUSY_POLL_BUDGET) { - napi_complete_done(napi, rc); - napi_schedule(napi); - } - } - netpoll_poll_unlock(have); + goto count; } + if (!napi_poll) { + unsigned long val = READ_ONCE(napi->state); + + /* If multiple threads are competing for this napi, + * we avoid dirtying napi->state as much as we can. + */ + if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED | + NAPIF_STATE_IN_BUSY_POLL)) + goto count; + if (cmpxchg(&napi->state, val, + val | NAPIF_STATE_IN_BUSY_POLL | + NAPIF_STATE_SCHED) != val) + goto count; + have_poll_lock = netpoll_poll_lock(napi); + napi_poll = napi->poll; + } + rc = napi_poll(napi, BUSY_POLL_BUDGET); + trace_napi_poll(napi, rc, BUSY_POLL_BUDGET); +count: if (rc > 0) __NET_ADD_STATS(sock_net(sk), LINUX_MIB_BUSYPOLLRXPACKETS, rc); @@ -5013,10 +5047,26 @@ bool sk_busy_loop(struct sock *sk, int nonblock) if (rc == LL_FLUSH_FAILED) break; /* permanent failure */ - cpu_relax(); - } while (!nonblock && skb_queue_empty(&sk->sk_receive_queue) && - !need_resched() && !busy_loop_timeout(end_time)); + if (nonblock || !skb_queue_empty(&sk->sk_receive_queue) || + busy_loop_timeout(end_time)) + break; + if (unlikely(need_resched())) { + if (napi_poll) + busy_poll_stop(napi, have_poll_lock); + preempt_enable(); + rcu_read_unlock(); + cond_resched(); + rc = !skb_queue_empty(&sk->sk_receive_queue); + if (rc || busy_loop_timeout(end_time)) + return rc; + goto restart; + } + cpu_relax_lowlatency(); + } + if (napi_poll) + busy_poll_stop(napi, have_poll_lock); + preempt_enable(); rc = !skb_queue_empty(&sk->sk_receive_queue); out: rcu_read_unlock(); @@ -5026,7 +5076,7 @@ EXPORT_SYMBOL(sk_busy_loop); #endif /* CONFIG_NET_RX_BUSY_POLL */ -void napi_hash_add(struct napi_struct *napi) +static void napi_hash_add(struct napi_struct *napi) { if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) || test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) @@ -5046,7 +5096,6 @@ void napi_hash_add(struct napi_struct *napi) spin_unlock(&napi_hash_lock); } -EXPORT_SYMBOL_GPL(napi_hash_add); /* Warning : caller is responsible to make sure rcu grace period * is respected before freeing memory containing @napi @@ -5094,7 +5143,6 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi, list_add(&napi->dev_list, &dev->napi_list); napi->dev = dev; #ifdef CONFIG_NETPOLL - spin_lock_init(&napi->poll_lock); napi->poll_owner = -1; #endif set_bit(NAPI_STATE_SCHED, &napi->state); @@ -5212,7 +5260,7 @@ static __latent_entropy void net_rx_action(struct softirq_action *h) if (list_empty(&list)) { if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll)) - return; + goto out; break; } @@ -5230,7 +5278,6 @@ static __latent_entropy void net_rx_action(struct softirq_action *h) } } - __kfree_skb_flush(); local_irq_disable(); list_splice_tail_init(&sd->poll_list, &list); @@ -5240,6 +5287,8 @@ static __latent_entropy void net_rx_action(struct softirq_action *h) __raise_softirq_irqoff(NET_RX_SOFTIRQ); net_rps_action_and_irq_enable(sd); +out: + __kfree_skb_flush(); } struct netdev_adjacent { @@ -5270,6 +5319,13 @@ static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev, return NULL; } +static int __netdev_has_upper_dev(struct net_device *upper_dev, void *data) +{ + struct net_device *dev = data; + + return upper_dev == dev; +} + /** * netdev_has_upper_dev - Check if device is linked to an upper device * @dev: device @@ -5284,11 +5340,30 @@ bool netdev_has_upper_dev(struct net_device *dev, { ASSERT_RTNL(); - return __netdev_find_adj(upper_dev, &dev->all_adj_list.upper); + return netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev, + upper_dev); } EXPORT_SYMBOL(netdev_has_upper_dev); /** + * netdev_has_upper_dev_all - Check if device is linked to an upper device + * @dev: device + * @upper_dev: upper device to check + * + * Find out if a device is linked to specified upper device and return true + * in case it is. Note that this checks the entire upper device chain. + * The caller must hold rcu lock. + */ + +bool netdev_has_upper_dev_all_rcu(struct net_device *dev, + struct net_device *upper_dev) +{ + return !!netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev, + upper_dev); +} +EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu); + +/** * netdev_has_any_upper_dev - Check if device is linked to some device * @dev: device * @@ -5299,7 +5374,7 @@ static bool netdev_has_any_upper_dev(struct net_device *dev) { ASSERT_RTNL(); - return !list_empty(&dev->all_adj_list.upper); + return !list_empty(&dev->adj_list.upper); } /** @@ -5326,6 +5401,20 @@ struct net_device *netdev_master_upper_dev_get(struct net_device *dev) } EXPORT_SYMBOL(netdev_master_upper_dev_get); +/** + * netdev_has_any_lower_dev - Check if device is linked to some device + * @dev: device + * + * Find out if a device is linked to a lower device and return true in case + * it is. The caller must hold the RTNL lock. + */ +static bool netdev_has_any_lower_dev(struct net_device *dev) +{ + ASSERT_RTNL(); + + return !list_empty(&dev->adj_list.lower); +} + void *netdev_adjacent_get_private(struct list_head *adj_list) { struct netdev_adjacent *adj; @@ -5362,16 +5451,8 @@ struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev, } EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu); -/** - * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list - * @dev: device - * @iter: list_head ** of the current position - * - * Gets the next device from the dev's upper list, starting from iter - * position. The caller must hold RCU read lock. - */ -struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev, - struct list_head **iter) +static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev, + struct list_head **iter) { struct netdev_adjacent *upper; @@ -5379,14 +5460,41 @@ struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev, upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); - if (&upper->list == &dev->all_adj_list.upper) + if (&upper->list == &dev->adj_list.upper) return NULL; *iter = &upper->list; return upper->dev; } -EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu); + +int netdev_walk_all_upper_dev_rcu(struct net_device *dev, + int (*fn)(struct net_device *dev, + void *data), + void *data) +{ + struct net_device *udev; + struct list_head *iter; + int ret; + + for (iter = &dev->adj_list.upper, + udev = netdev_next_upper_dev_rcu(dev, &iter); + udev; + udev = netdev_next_upper_dev_rcu(dev, &iter)) { + /* first is the upper device itself */ + ret = fn(udev, data); + if (ret) + return ret; + + /* then look at all of its upper devices */ + ret = netdev_walk_all_upper_dev_rcu(udev, fn, data); + if (ret) + return ret; + } + + return 0; +} +EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu); /** * netdev_lower_get_next_private - Get the next ->private from the @@ -5469,55 +5577,90 @@ void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter) } EXPORT_SYMBOL(netdev_lower_get_next); -/** - * netdev_all_lower_get_next - Get the next device from all lower neighbour list - * @dev: device - * @iter: list_head ** of the current position - * - * Gets the next netdev_adjacent from the dev's all lower neighbour - * list, starting from iter position. The caller must hold RTNL lock or - * its own locking that guarantees that the neighbour all lower - * list will remain unchanged. - */ -struct net_device *netdev_all_lower_get_next(struct net_device *dev, struct list_head **iter) +static struct net_device *netdev_next_lower_dev(struct net_device *dev, + struct list_head **iter) { struct netdev_adjacent *lower; - lower = list_entry(*iter, struct netdev_adjacent, list); + lower = list_entry((*iter)->next, struct netdev_adjacent, list); - if (&lower->list == &dev->all_adj_list.lower) + if (&lower->list == &dev->adj_list.lower) return NULL; - *iter = lower->list.next; + *iter = &lower->list; return lower->dev; } -EXPORT_SYMBOL(netdev_all_lower_get_next); -/** - * netdev_all_lower_get_next_rcu - Get the next device from all - * lower neighbour list, RCU variant - * @dev: device - * @iter: list_head ** of the current position - * - * Gets the next netdev_adjacent from the dev's all lower neighbour - * list, starting from iter position. The caller must hold RCU read lock. - */ -struct net_device *netdev_all_lower_get_next_rcu(struct net_device *dev, - struct list_head **iter) +int netdev_walk_all_lower_dev(struct net_device *dev, + int (*fn)(struct net_device *dev, + void *data), + void *data) +{ + struct net_device *ldev; + struct list_head *iter; + int ret; + + for (iter = &dev->adj_list.lower, + ldev = netdev_next_lower_dev(dev, &iter); + ldev; + ldev = netdev_next_lower_dev(dev, &iter)) { + /* first is the lower device itself */ + ret = fn(ldev, data); + if (ret) + return ret; + + /* then look at all of its lower devices */ + ret = netdev_walk_all_lower_dev(ldev, fn, data); + if (ret) + return ret; + } + + return 0; +} +EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev); + +static struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev, + struct list_head **iter) { struct netdev_adjacent *lower; lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); - - if (&lower->list == &dev->all_adj_list.lower) + if (&lower->list == &dev->adj_list.lower) return NULL; *iter = &lower->list; return lower->dev; } -EXPORT_SYMBOL(netdev_all_lower_get_next_rcu); + +int netdev_walk_all_lower_dev_rcu(struct net_device *dev, + int (*fn)(struct net_device *dev, + void *data), + void *data) +{ + struct net_device *ldev; + struct list_head *iter; + int ret; + + for (iter = &dev->adj_list.lower, + ldev = netdev_next_lower_dev_rcu(dev, &iter); + ldev; + ldev = netdev_next_lower_dev_rcu(dev, &iter)) { + /* first is the lower device itself */ + ret = fn(ldev, data); + if (ret) + return ret; + + /* then look at all of its lower devices */ + ret = netdev_walk_all_lower_dev_rcu(ldev, fn, data); + if (ret) + return ret; + } + + return 0; +} +EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu); /** * netdev_lower_get_first_private_rcu - Get the first ->private from the @@ -5590,7 +5733,6 @@ static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev, static int __netdev_adjacent_dev_insert(struct net_device *dev, struct net_device *adj_dev, - u16 ref_nr, struct list_head *dev_list, void *private, bool master) { @@ -5600,7 +5742,10 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev, adj = __netdev_find_adj(adj_dev, dev_list); if (adj) { - adj->ref_nr += ref_nr; + adj->ref_nr += 1; + pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n", + dev->name, adj_dev->name, adj->ref_nr); + return 0; } @@ -5610,12 +5755,12 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev, adj->dev = adj_dev; adj->master = master; - adj->ref_nr = ref_nr; + adj->ref_nr = 1; adj->private = private; dev_hold(adj_dev); - pr_debug("dev_hold for %s, because of link added from %s to %s\n", - adj_dev->name, dev->name, adj_dev->name); + pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n", + dev->name, adj_dev->name, adj->ref_nr, adj_dev->name); if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) { ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list); @@ -5654,17 +5799,22 @@ static void __netdev_adjacent_dev_remove(struct net_device *dev, { struct netdev_adjacent *adj; + pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n", + dev->name, adj_dev->name, ref_nr); + adj = __netdev_find_adj(adj_dev, dev_list); if (!adj) { - pr_err("tried to remove device %s from %s\n", + pr_err("Adjacency does not exist for device %s from %s\n", dev->name, adj_dev->name); - BUG(); + WARN_ON(1); + return; } if (adj->ref_nr > ref_nr) { - pr_debug("%s to %s ref_nr-%d = %d\n", dev->name, adj_dev->name, - ref_nr, adj->ref_nr-ref_nr); + pr_debug("adjacency: %s to %s ref_nr - %d = %d\n", + dev->name, adj_dev->name, ref_nr, + adj->ref_nr - ref_nr); adj->ref_nr -= ref_nr; return; } @@ -5676,7 +5826,7 @@ static void __netdev_adjacent_dev_remove(struct net_device *dev, netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list); list_del_rcu(&adj->list); - pr_debug("dev_put for %s, because link removed from %s to %s\n", + pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n", adj_dev->name, dev->name, adj_dev->name); dev_put(adj_dev); kfree_rcu(adj, rcu); @@ -5684,38 +5834,27 @@ static void __netdev_adjacent_dev_remove(struct net_device *dev, static int __netdev_adjacent_dev_link_lists(struct net_device *dev, struct net_device *upper_dev, - u16 ref_nr, struct list_head *up_list, struct list_head *down_list, void *private, bool master) { int ret; - ret = __netdev_adjacent_dev_insert(dev, upper_dev, ref_nr, up_list, + ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private, master); if (ret) return ret; - ret = __netdev_adjacent_dev_insert(upper_dev, dev, ref_nr, down_list, + ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private, false); if (ret) { - __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list); + __netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list); return ret; } return 0; } -static int __netdev_adjacent_dev_link(struct net_device *dev, - struct net_device *upper_dev, - u16 ref_nr) -{ - return __netdev_adjacent_dev_link_lists(dev, upper_dev, ref_nr, - &dev->all_adj_list.upper, - &upper_dev->all_adj_list.lower, - NULL, false); -} - static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev, struct net_device *upper_dev, u16 ref_nr, @@ -5726,40 +5865,19 @@ static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev, __netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list); } -static void __netdev_adjacent_dev_unlink(struct net_device *dev, - struct net_device *upper_dev, - u16 ref_nr) -{ - __netdev_adjacent_dev_unlink_lists(dev, upper_dev, ref_nr, - &dev->all_adj_list.upper, - &upper_dev->all_adj_list.lower); -} - static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev, struct net_device *upper_dev, void *private, bool master) { - int ret = __netdev_adjacent_dev_link(dev, upper_dev, 1); - - if (ret) - return ret; - - ret = __netdev_adjacent_dev_link_lists(dev, upper_dev, 1, - &dev->adj_list.upper, - &upper_dev->adj_list.lower, - private, master); - if (ret) { - __netdev_adjacent_dev_unlink(dev, upper_dev, 1); - return ret; - } - - return 0; + return __netdev_adjacent_dev_link_lists(dev, upper_dev, + &dev->adj_list.upper, + &upper_dev->adj_list.lower, + private, master); } static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev, struct net_device *upper_dev) { - __netdev_adjacent_dev_unlink(dev, upper_dev, 1); __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1, &dev->adj_list.upper, &upper_dev->adj_list.lower); @@ -5770,7 +5888,6 @@ static int __netdev_upper_dev_link(struct net_device *dev, void *upper_priv, void *upper_info) { struct netdev_notifier_changeupper_info changeupper_info; - struct netdev_adjacent *i, *j, *to_i, *to_j; int ret = 0; ASSERT_RTNL(); @@ -5779,10 +5896,10 @@ static int __netdev_upper_dev_link(struct net_device *dev, return -EBUSY; /* To prevent loops, check if dev is not upper device to upper_dev. */ - if (__netdev_find_adj(dev, &upper_dev->all_adj_list.upper)) + if (netdev_has_upper_dev(upper_dev, dev)) return -EBUSY; - if (__netdev_find_adj(upper_dev, &dev->adj_list.upper)) + if (netdev_has_upper_dev(dev, upper_dev)) return -EEXIST; if (master && netdev_master_upper_dev_get(dev)) @@ -5804,80 +5921,15 @@ static int __netdev_upper_dev_link(struct net_device *dev, if (ret) return ret; - /* Now that we linked these devs, make all the upper_dev's - * all_adj_list.upper visible to every dev's all_adj_list.lower an - * versa, and don't forget the devices itself. All of these - * links are non-neighbours. - */ - list_for_each_entry(i, &dev->all_adj_list.lower, list) { - list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) { - pr_debug("Interlinking %s with %s, non-neighbour\n", - i->dev->name, j->dev->name); - ret = __netdev_adjacent_dev_link(i->dev, j->dev, i->ref_nr); - if (ret) - goto rollback_mesh; - } - } - - /* add dev to every upper_dev's upper device */ - list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) { - pr_debug("linking %s's upper device %s with %s\n", - upper_dev->name, i->dev->name, dev->name); - ret = __netdev_adjacent_dev_link(dev, i->dev, i->ref_nr); - if (ret) - goto rollback_upper_mesh; - } - - /* add upper_dev to every dev's lower device */ - list_for_each_entry(i, &dev->all_adj_list.lower, list) { - pr_debug("linking %s's lower device %s with %s\n", dev->name, - i->dev->name, upper_dev->name); - ret = __netdev_adjacent_dev_link(i->dev, upper_dev, i->ref_nr); - if (ret) - goto rollback_lower_mesh; - } - ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev, &changeupper_info.info); ret = notifier_to_errno(ret); if (ret) - goto rollback_lower_mesh; + goto rollback; return 0; -rollback_lower_mesh: - to_i = i; - list_for_each_entry(i, &dev->all_adj_list.lower, list) { - if (i == to_i) - break; - __netdev_adjacent_dev_unlink(i->dev, upper_dev, i->ref_nr); - } - - i = NULL; - -rollback_upper_mesh: - to_i = i; - list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) { - if (i == to_i) - break; - __netdev_adjacent_dev_unlink(dev, i->dev, i->ref_nr); - } - - i = j = NULL; - -rollback_mesh: - to_i = i; - to_j = j; - list_for_each_entry(i, &dev->all_adj_list.lower, list) { - list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) { - if (i == to_i && j == to_j) - break; - __netdev_adjacent_dev_unlink(i->dev, j->dev, i->ref_nr); - } - if (i == to_i) - break; - } - +rollback: __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); return ret; @@ -5934,7 +5986,6 @@ void netdev_upper_dev_unlink(struct net_device *dev, struct net_device *upper_dev) { struct netdev_notifier_changeupper_info changeupper_info; - struct netdev_adjacent *i, *j; ASSERT_RTNL(); changeupper_info.upper_dev = upper_dev; @@ -5946,23 +5997,6 @@ void netdev_upper_dev_unlink(struct net_device *dev, __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); - /* Here is the tricky part. We must remove all dev's lower - * devices from all upper_dev's upper devices and vice - * versa, to maintain the graph relationship. - */ - list_for_each_entry(i, &dev->all_adj_list.lower, list) - list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) - __netdev_adjacent_dev_unlink(i->dev, j->dev, i->ref_nr); - - /* remove also the devices itself from lower/upper device - * list - */ - list_for_each_entry(i, &dev->all_adj_list.lower, list) - __netdev_adjacent_dev_unlink(i->dev, upper_dev, i->ref_nr); - - list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) - __netdev_adjacent_dev_unlink(dev, i->dev, i->ref_nr); - call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev, &changeupper_info.info); } @@ -6500,9 +6534,18 @@ int dev_set_mtu(struct net_device *dev, int new_mtu) if (new_mtu == dev->mtu) return 0; - /* MTU must be positive. */ - if (new_mtu < 0) + /* MTU must be positive, and in range */ + if (new_mtu < 0 || new_mtu < dev->min_mtu) { + net_err_ratelimited("%s: Invalid MTU %d requested, hw min %d\n", + dev->name, new_mtu, dev->min_mtu); return -EINVAL; + } + + if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) { + net_err_ratelimited("%s: Invalid MTU %d requested, hw max %d\n", + dev->name, new_mtu, dev->max_mtu); + return -EINVAL; + } if (!netif_device_present(dev)) return -ENODEV; @@ -6649,26 +6692,42 @@ EXPORT_SYMBOL(dev_change_proto_down); * dev_change_xdp_fd - set or clear a bpf program for a device rx path * @dev: device * @fd: new program fd or negative value to clear + * @flags: xdp-related flags * * Set or clear a bpf program for a device */ -int dev_change_xdp_fd(struct net_device *dev, int fd) +int dev_change_xdp_fd(struct net_device *dev, int fd, u32 flags) { const struct net_device_ops *ops = dev->netdev_ops; struct bpf_prog *prog = NULL; - struct netdev_xdp xdp = {}; + struct netdev_xdp xdp; int err; + ASSERT_RTNL(); + if (!ops->ndo_xdp) return -EOPNOTSUPP; if (fd >= 0) { + if (flags & XDP_FLAGS_UPDATE_IF_NOEXIST) { + memset(&xdp, 0, sizeof(xdp)); + xdp.command = XDP_QUERY_PROG; + + err = ops->ndo_xdp(dev, &xdp); + if (err < 0) + return err; + if (xdp.prog_attached) + return -EBUSY; + } + prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP); if (IS_ERR(prog)) return PTR_ERR(prog); } + memset(&xdp, 0, sizeof(xdp)); xdp.command = XDP_SETUP_PROG; xdp.prog = prog; + err = ops->ndo_xdp(dev, &xdp); if (err < 0 && prog) bpf_prog_put(prog); @@ -6777,6 +6836,7 @@ static void rollback_registered_many(struct list_head *head) /* Notifier chain MUST detach us all upper devices. */ WARN_ON(netdev_has_any_upper_dev(dev)); + WARN_ON(netdev_has_any_lower_dev(dev)); /* Remove entries from kobject tree */ netdev_unregister_kobject(dev); @@ -7655,8 +7715,6 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, INIT_LIST_HEAD(&dev->link_watch_list); INIT_LIST_HEAD(&dev->adj_list.upper); INIT_LIST_HEAD(&dev->adj_list.lower); - INIT_LIST_HEAD(&dev->all_adj_list.upper); - INIT_LIST_HEAD(&dev->all_adj_list.lower); INIT_LIST_HEAD(&dev->ptype_all); INIT_LIST_HEAD(&dev->ptype_specific); #ifdef CONFIG_NET_SCHED @@ -7667,7 +7725,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, if (!dev->tx_queue_len) { dev->priv_flags |= IFF_NO_QUEUE; - dev->tx_queue_len = 1; + dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN; } dev->num_tx_queues = txqs; diff --git a/net/core/devlink.c b/net/core/devlink.c index 1b5063088f1a..2b5bf9efa720 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -341,15 +341,7 @@ static void devlink_nl_post_doit(const struct genl_ops *ops, mutex_unlock(&devlink_mutex); } -static struct genl_family devlink_nl_family = { - .id = GENL_ID_GENERATE, - .name = DEVLINK_GENL_NAME, - .version = DEVLINK_GENL_VERSION, - .maxattr = DEVLINK_ATTR_MAX, - .netnsok = true, - .pre_doit = devlink_nl_pre_doit, - .post_doit = devlink_nl_post_doit, -}; +static struct genl_family devlink_nl_family; enum devlink_multicast_groups { DEVLINK_MCGRP_CONFIG, @@ -608,6 +600,8 @@ static int devlink_port_type_set(struct devlink *devlink, if (devlink->ops && devlink->ops->port_type_set) { if (port_type == DEVLINK_PORT_TYPE_NOTSET) return -EINVAL; + if (port_type == devlink_port->type) + return 0; err = devlink->ops->port_type_set(devlink_port, port_type); if (err) return err; @@ -1400,26 +1394,45 @@ static int devlink_nl_cmd_sb_occ_max_clear_doit(struct sk_buff *skb, static int devlink_eswitch_fill(struct sk_buff *msg, struct devlink *devlink, enum devlink_command cmd, u32 portid, - u32 seq, int flags, u16 mode) + u32 seq, int flags) { + const struct devlink_ops *ops = devlink->ops; void *hdr; + int err = 0; + u16 mode; + u8 inline_mode; hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); if (!hdr) return -EMSGSIZE; - if (devlink_nl_put_handle(msg, devlink)) - goto nla_put_failure; + err = devlink_nl_put_handle(msg, devlink); + if (err) + goto out; - if (nla_put_u16(msg, DEVLINK_ATTR_ESWITCH_MODE, mode)) - goto nla_put_failure; + err = ops->eswitch_mode_get(devlink, &mode); + if (err) + goto out; + err = nla_put_u16(msg, DEVLINK_ATTR_ESWITCH_MODE, mode); + if (err) + goto out; + + if (ops->eswitch_inline_mode_get) { + err = ops->eswitch_inline_mode_get(devlink, &inline_mode); + if (err) + goto out; + err = nla_put_u8(msg, DEVLINK_ATTR_ESWITCH_INLINE_MODE, + inline_mode); + if (err) + goto out; + } genlmsg_end(msg, hdr); return 0; -nla_put_failure: +out: genlmsg_cancel(msg, hdr); - return -EMSGSIZE; + return err; } static int devlink_nl_cmd_eswitch_mode_get_doit(struct sk_buff *skb, @@ -1428,22 +1441,17 @@ static int devlink_nl_cmd_eswitch_mode_get_doit(struct sk_buff *skb, struct devlink *devlink = info->user_ptr[0]; const struct devlink_ops *ops = devlink->ops; struct sk_buff *msg; - u16 mode; int err; if (!ops || !ops->eswitch_mode_get) return -EOPNOTSUPP; - err = ops->eswitch_mode_get(devlink, &mode); - if (err) - return err; - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; err = devlink_eswitch_fill(msg, devlink, DEVLINK_CMD_ESWITCH_MODE_GET, - info->snd_portid, info->snd_seq, 0, mode); + info->snd_portid, info->snd_seq, 0); if (err) { nlmsg_free(msg); @@ -1459,15 +1467,32 @@ static int devlink_nl_cmd_eswitch_mode_set_doit(struct sk_buff *skb, struct devlink *devlink = info->user_ptr[0]; const struct devlink_ops *ops = devlink->ops; u16 mode; + u8 inline_mode; + int err = 0; - if (!info->attrs[DEVLINK_ATTR_ESWITCH_MODE]) - return -EINVAL; + if (!ops) + return -EOPNOTSUPP; - mode = nla_get_u16(info->attrs[DEVLINK_ATTR_ESWITCH_MODE]); + if (info->attrs[DEVLINK_ATTR_ESWITCH_MODE]) { + if (!ops->eswitch_mode_set) + return -EOPNOTSUPP; + mode = nla_get_u16(info->attrs[DEVLINK_ATTR_ESWITCH_MODE]); + err = ops->eswitch_mode_set(devlink, mode); + if (err) + return err; + } - if (ops && ops->eswitch_mode_set) - return ops->eswitch_mode_set(devlink, mode); - return -EOPNOTSUPP; + if (info->attrs[DEVLINK_ATTR_ESWITCH_INLINE_MODE]) { + if (!ops->eswitch_inline_mode_set) + return -EOPNOTSUPP; + inline_mode = nla_get_u8( + info->attrs[DEVLINK_ATTR_ESWITCH_INLINE_MODE]); + err = ops->eswitch_inline_mode_set(devlink, inline_mode); + if (err) + return err; + } + + return 0; } static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { @@ -1484,6 +1509,7 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_SB_THRESHOLD] = { .type = NLA_U32 }, [DEVLINK_ATTR_SB_TC_INDEX] = { .type = NLA_U16 }, [DEVLINK_ATTR_ESWITCH_MODE] = { .type = NLA_U16 }, + [DEVLINK_ATTR_ESWITCH_INLINE_MODE] = { .type = NLA_U8 }, }; static const struct genl_ops devlink_nl_ops[] = { @@ -1618,6 +1644,20 @@ static const struct genl_ops devlink_nl_ops[] = { }, }; +static struct genl_family devlink_nl_family __ro_after_init = { + .name = DEVLINK_GENL_NAME, + .version = DEVLINK_GENL_VERSION, + .maxattr = DEVLINK_ATTR_MAX, + .netnsok = true, + .pre_doit = devlink_nl_pre_doit, + .post_doit = devlink_nl_post_doit, + .module = THIS_MODULE, + .ops = devlink_nl_ops, + .n_ops = ARRAY_SIZE(devlink_nl_ops), + .mcgrps = devlink_nl_mcgrps, + .n_mcgrps = ARRAY_SIZE(devlink_nl_mcgrps), +}; + /** * devlink_alloc - Allocate new devlink instance resources * @@ -1840,9 +1880,7 @@ EXPORT_SYMBOL_GPL(devlink_sb_unregister); static int __init devlink_module_init(void) { - return genl_register_family_with_ops_groups(&devlink_nl_family, - devlink_nl_ops, - devlink_nl_mcgrps); + return genl_register_family(&devlink_nl_family); } static void __exit devlink_module_exit(void) diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 72cfb0c61125..8e0c0635ee97 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -59,12 +59,7 @@ struct dm_hw_stat_delta { unsigned long last_drop_val; }; -static struct genl_family net_drop_monitor_family = { - .id = GENL_ID_GENERATE, - .hdrsize = 0, - .name = "NET_DM", - .version = 2, -}; +static struct genl_family net_drop_monitor_family; static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data); @@ -351,6 +346,17 @@ static const struct genl_ops dropmon_ops[] = { }, }; +static struct genl_family net_drop_monitor_family __ro_after_init = { + .hdrsize = 0, + .name = "NET_DM", + .version = 2, + .module = THIS_MODULE, + .ops = dropmon_ops, + .n_ops = ARRAY_SIZE(dropmon_ops), + .mcgrps = dropmon_mcgrps, + .n_mcgrps = ARRAY_SIZE(dropmon_mcgrps), +}; + static struct notifier_block dropmon_net_notifier = { .notifier_call = dropmon_net_event }; @@ -367,8 +373,7 @@ static int __init init_net_drop_monitor(void) return -ENOSPC; } - rc = genl_register_family_with_ops_groups(&net_drop_monitor_family, - dropmon_ops, dropmon_mcgrps); + rc = genl_register_family(&net_drop_monitor_family); if (rc) { pr_err("Could not create drop monitor netlink family\n"); return rc; diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 047a1752ece1..e23766c7e3ba 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -119,6 +119,12 @@ tunable_strings[__ETHTOOL_TUNABLE_COUNT][ETH_GSTRING_LEN] = { [ETHTOOL_TX_COPYBREAK] = "tx-copybreak", }; +static const char +phy_tunable_strings[__ETHTOOL_PHY_TUNABLE_COUNT][ETH_GSTRING_LEN] = { + [ETHTOOL_ID_UNSPEC] = "Unspec", + [ETHTOOL_PHY_DOWNSHIFT] = "phy-downshift", +}; + static int ethtool_get_features(struct net_device *dev, void __user *useraddr) { struct ethtool_gfeatures cmd = { @@ -227,6 +233,9 @@ static int __ethtool_get_sset_count(struct net_device *dev, int sset) if (sset == ETH_SS_TUNABLES) return ARRAY_SIZE(tunable_strings); + if (sset == ETH_SS_PHY_TUNABLES) + return ARRAY_SIZE(phy_tunable_strings); + if (sset == ETH_SS_PHY_STATS) { if (dev->phydev) return phy_get_sset_count(dev->phydev); @@ -253,6 +262,8 @@ static void __ethtool_get_strings(struct net_device *dev, sizeof(rss_hash_func_strings)); else if (stringset == ETH_SS_TUNABLES) memcpy(data, tunable_strings, sizeof(tunable_strings)); + else if (stringset == ETH_SS_PHY_TUNABLES) + memcpy(data, phy_tunable_strings, sizeof(phy_tunable_strings)); else if (stringset == ETH_SS_PHY_STATS) { struct phy_device *phydev = dev->phydev; @@ -2422,6 +2433,85 @@ static int ethtool_set_per_queue(struct net_device *dev, void __user *useraddr) }; } +static int ethtool_phy_tunable_valid(const struct ethtool_tunable *tuna) +{ + switch (tuna->id) { + case ETHTOOL_PHY_DOWNSHIFT: + if (tuna->len != sizeof(u8) || + tuna->type_id != ETHTOOL_TUNABLE_U8) + return -EINVAL; + break; + default: + return -EINVAL; + } + + return 0; +} + +static int get_phy_tunable(struct net_device *dev, void __user *useraddr) +{ + int ret; + struct ethtool_tunable tuna; + struct phy_device *phydev = dev->phydev; + void *data; + + if (!(phydev && phydev->drv && phydev->drv->get_tunable)) + return -EOPNOTSUPP; + + if (copy_from_user(&tuna, useraddr, sizeof(tuna))) + return -EFAULT; + ret = ethtool_phy_tunable_valid(&tuna); + if (ret) + return ret; + data = kmalloc(tuna.len, GFP_USER); + if (!data) + return -ENOMEM; + mutex_lock(&phydev->lock); + ret = phydev->drv->get_tunable(phydev, &tuna, data); + mutex_unlock(&phydev->lock); + if (ret) + goto out; + useraddr += sizeof(tuna); + ret = -EFAULT; + if (copy_to_user(useraddr, data, tuna.len)) + goto out; + ret = 0; + +out: + kfree(data); + return ret; +} + +static int set_phy_tunable(struct net_device *dev, void __user *useraddr) +{ + int ret; + struct ethtool_tunable tuna; + struct phy_device *phydev = dev->phydev; + void *data; + + if (!(phydev && phydev->drv && phydev->drv->set_tunable)) + return -EOPNOTSUPP; + if (copy_from_user(&tuna, useraddr, sizeof(tuna))) + return -EFAULT; + ret = ethtool_phy_tunable_valid(&tuna); + if (ret) + return ret; + data = kmalloc(tuna.len, GFP_USER); + if (!data) + return -ENOMEM; + useraddr += sizeof(tuna); + ret = -EFAULT; + if (copy_from_user(data, useraddr, tuna.len)) + goto out; + mutex_lock(&phydev->lock); + ret = phydev->drv->set_tunable(phydev, &tuna, data); + mutex_unlock(&phydev->lock); + +out: + kfree(data); + return ret; +} + /* The main entry point in this file. Called from net/core/dev_ioctl.c */ int dev_ethtool(struct net *net, struct ifreq *ifr) @@ -2479,6 +2569,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_GET_TS_INFO: case ETHTOOL_GEEE: case ETHTOOL_GTUNABLE: + case ETHTOOL_PHY_GTUNABLE: case ETHTOOL_GLINKSETTINGS: break; default: @@ -2685,6 +2776,12 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_SLINKSETTINGS: rc = ethtool_set_link_ksettings(dev, useraddr); break; + case ETHTOOL_PHY_GTUNABLE: + rc = get_phy_tunable(dev, useraddr); + break; + case ETHTOOL_PHY_STUNABLE: + rc = set_phy_tunable(dev, useraddr); + break; default: rc = -EOPNOTSUPP; } diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index be4629c344a6..b6791d94841d 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -18,6 +18,11 @@ #include <net/fib_rules.h> #include <net/ip_tunnels.h> +static const struct fib_kuid_range fib_kuid_range_unset = { + KUIDT_INIT(0), + KUIDT_INIT(~0), +}; + int fib_default_rule_add(struct fib_rules_ops *ops, u32 pref, u32 table, u32 flags) { @@ -33,6 +38,7 @@ int fib_default_rule_add(struct fib_rules_ops *ops, r->table = table; r->flags = flags; r->fr_net = ops->fro_net; + r->uid_range = fib_kuid_range_unset; r->suppress_prefixlen = -1; r->suppress_ifgroup = -1; @@ -172,6 +178,34 @@ void fib_rules_unregister(struct fib_rules_ops *ops) } EXPORT_SYMBOL_GPL(fib_rules_unregister); +static int uid_range_set(struct fib_kuid_range *range) +{ + return uid_valid(range->start) && uid_valid(range->end); +} + +static struct fib_kuid_range nla_get_kuid_range(struct nlattr **tb) +{ + struct fib_rule_uid_range *in; + struct fib_kuid_range out; + + in = (struct fib_rule_uid_range *)nla_data(tb[FRA_UID_RANGE]); + + out.start = make_kuid(current_user_ns(), in->start); + out.end = make_kuid(current_user_ns(), in->end); + + return out; +} + +static int nla_put_uid_range(struct sk_buff *skb, struct fib_kuid_range *range) +{ + struct fib_rule_uid_range out = { + from_kuid_munged(current_user_ns(), range->start), + from_kuid_munged(current_user_ns(), range->end) + }; + + return nla_put(skb, FRA_UID_RANGE, sizeof(out), &out); +} + static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops, struct flowi *fl, int flags, struct fib_lookup_arg *arg) @@ -193,6 +227,10 @@ static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops, if (rule->l3mdev && !l3mdev_fib_rule_match(rule->fr_net, fl, arg)) goto out; + if (uid_lt(fl->flowi_uid, rule->uid_range.start) || + uid_gt(fl->flowi_uid, rule->uid_range.end)) + goto out; + ret = ops->match(rule, fl, flags); out: return (rule->flags & FIB_RULE_INVERT) ? !ret : ret; @@ -305,6 +343,10 @@ static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh, if (r->l3mdev != rule->l3mdev) continue; + if (!uid_eq(r->uid_range.start, rule->uid_range.start) || + !uid_eq(r->uid_range.end, rule->uid_range.end)) + continue; + if (!ops->compare(r, frh, tb)) continue; return 1; @@ -429,6 +471,21 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh) if (rule->l3mdev && rule->table) goto errout_free; + if (tb[FRA_UID_RANGE]) { + if (current_user_ns() != net->user_ns) { + err = -EPERM; + goto errout_free; + } + + rule->uid_range = nla_get_kuid_range(tb); + + if (!uid_range_set(&rule->uid_range) || + !uid_lte(rule->uid_range.start, rule->uid_range.end)) + goto errout_free; + } else { + rule->uid_range = fib_kuid_range_unset; + } + if ((nlh->nlmsg_flags & NLM_F_EXCL) && rule_exists(ops, frh, tb, rule)) { err = -EEXIST; @@ -497,6 +554,7 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh) struct fib_rules_ops *ops = NULL; struct fib_rule *rule, *tmp; struct nlattr *tb[FRA_MAX+1]; + struct fib_kuid_range range; int err = -EINVAL; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) @@ -516,6 +574,14 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh) if (err < 0) goto errout; + if (tb[FRA_UID_RANGE]) { + range = nla_get_kuid_range(tb); + if (!uid_range_set(&range)) + goto errout; + } else { + range = fib_kuid_range_unset; + } + list_for_each_entry(rule, &ops->rules_list, list) { if (frh->action && (frh->action != rule->action)) continue; @@ -552,6 +618,11 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh) (rule->l3mdev != nla_get_u8(tb[FRA_L3MDEV]))) continue; + if (uid_range_set(&range) && + (!uid_eq(rule->uid_range.start, range.start) || + !uid_eq(rule->uid_range.end, range.end))) + continue; + if (!ops->compare(rule, frh, tb)) continue; @@ -619,7 +690,8 @@ static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops, + nla_total_size(4) /* FRA_SUPPRESS_IFGROUP */ + nla_total_size(4) /* FRA_FWMARK */ + nla_total_size(4) /* FRA_FWMASK */ - + nla_total_size_64bit(8); /* FRA_TUN_ID */ + + nla_total_size_64bit(8) /* FRA_TUN_ID */ + + nla_total_size(sizeof(struct fib_kuid_range)); if (ops->nlmsg_payload) payload += ops->nlmsg_payload(rule); @@ -679,7 +751,9 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, (rule->tun_id && nla_put_be64(skb, FRA_TUN_ID, rule->tun_id, FRA_PAD)) || (rule->l3mdev && - nla_put_u8(skb, FRA_L3MDEV, rule->l3mdev))) + nla_put_u8(skb, FRA_L3MDEV, rule->l3mdev)) || + (uid_range_set(&rule->uid_range) && + nla_put_uid_range(skb, &rule->uid_range))) goto nla_put_failure; if (rule->suppress_ifgroup != -1) { diff --git a/net/core/filter.c b/net/core/filter.c index b391209838ef..56b43587d200 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -30,6 +30,7 @@ #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/if_packet.h> +#include <linux/if_arp.h> #include <linux/gfp.h> #include <net/ip.h> #include <net/protocol.h> @@ -78,6 +79,10 @@ int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap) if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) return -ENOMEM; + err = BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb); + if (err) + return err; + err = security_sock_rcv_skb(sk, skb); if (err) return err; @@ -1684,6 +1689,12 @@ static int __bpf_redirect_no_mac(struct sk_buff *skb, struct net_device *dev, static int __bpf_redirect_common(struct sk_buff *skb, struct net_device *dev, u32 flags) { + /* Verify that a link layer header is carried */ + if (unlikely(skb->mac_header >= skb->network_header)) { + kfree_skb(skb); + return -ERANGE; + } + bpf_push_mac_rcsum(skb); return flags & BPF_F_INGRESS ? __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb); @@ -1692,17 +1703,10 @@ static int __bpf_redirect_common(struct sk_buff *skb, struct net_device *dev, static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev, u32 flags) { - switch (dev->type) { - case ARPHRD_TUNNEL: - case ARPHRD_TUNNEL6: - case ARPHRD_SIT: - case ARPHRD_IPGRE: - case ARPHRD_VOID: - case ARPHRD_NONE: - return __bpf_redirect_no_mac(skb, dev, flags); - default: + if (dev_is_mac_header_xmit(dev)) return __bpf_redirect_common(skb, dev, flags); - } + else + return __bpf_redirect_no_mac(skb, dev, flags); } BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags) @@ -2190,12 +2194,53 @@ static const struct bpf_func_proto bpf_skb_change_tail_proto = { .arg3_type = ARG_ANYTHING, }; +BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room, + u64, flags) +{ + u32 max_len = __bpf_skb_max_len(skb); + u32 new_len = skb->len + head_room; + int ret; + + if (unlikely(flags || (!skb_is_gso(skb) && new_len > max_len) || + new_len < skb->len)) + return -EINVAL; + + ret = skb_cow(skb, head_room); + if (likely(!ret)) { + /* Idea for this helper is that we currently only + * allow to expand on mac header. This means that + * skb->protocol network header, etc, stay as is. + * Compared to bpf_skb_change_tail(), we're more + * flexible due to not needing to linearize or + * reset GSO. Intention for this helper is to be + * used by an L3 skb that needs to push mac header + * for redirection into L2 device. + */ + __skb_push(skb, head_room); + memset(skb->data, 0, head_room); + skb_reset_mac_header(skb); + } + + bpf_compute_data_end(skb); + return 0; +} + +static const struct bpf_func_proto bpf_skb_change_head_proto = { + .func = bpf_skb_change_head, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; + bool bpf_helper_changes_skb_data(void *func) { if (func == bpf_skb_vlan_push || func == bpf_skb_vlan_pop || func == bpf_skb_store_bytes || func == bpf_skb_change_proto || + func == bpf_skb_change_head || func == bpf_skb_change_tail || func == bpf_skb_pull_data || func == bpf_l3_csum_replace || @@ -2544,6 +2589,8 @@ sk_filter_func_proto(enum bpf_func_id func_id) return &bpf_get_prandom_u32_proto; case BPF_FUNC_get_smp_processor_id: return &bpf_get_raw_smp_processor_id_proto; + case BPF_FUNC_get_numa_node_id: + return &bpf_get_numa_node_id_proto; case BPF_FUNC_tail_call: return &bpf_tail_call_proto; case BPF_FUNC_ktime_get_ns: @@ -2628,6 +2675,79 @@ xdp_func_proto(enum bpf_func_id func_id) } } +static const struct bpf_func_proto * +cg_skb_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_skb_load_bytes: + return &bpf_skb_load_bytes_proto; + default: + return sk_filter_func_proto(func_id); + } +} + +static const struct bpf_func_proto * +lwt_inout_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_skb_load_bytes: + return &bpf_skb_load_bytes_proto; + case BPF_FUNC_skb_pull_data: + return &bpf_skb_pull_data_proto; + case BPF_FUNC_csum_diff: + return &bpf_csum_diff_proto; + case BPF_FUNC_get_cgroup_classid: + return &bpf_get_cgroup_classid_proto; + case BPF_FUNC_get_route_realm: + return &bpf_get_route_realm_proto; + case BPF_FUNC_get_hash_recalc: + return &bpf_get_hash_recalc_proto; + case BPF_FUNC_perf_event_output: + return &bpf_skb_event_output_proto; + case BPF_FUNC_get_smp_processor_id: + return &bpf_get_smp_processor_id_proto; + case BPF_FUNC_skb_under_cgroup: + return &bpf_skb_under_cgroup_proto; + default: + return sk_filter_func_proto(func_id); + } +} + +static const struct bpf_func_proto * +lwt_xmit_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_skb_get_tunnel_key: + return &bpf_skb_get_tunnel_key_proto; + case BPF_FUNC_skb_set_tunnel_key: + return bpf_get_skb_set_tunnel_proto(func_id); + case BPF_FUNC_skb_get_tunnel_opt: + return &bpf_skb_get_tunnel_opt_proto; + case BPF_FUNC_skb_set_tunnel_opt: + return bpf_get_skb_set_tunnel_proto(func_id); + case BPF_FUNC_redirect: + return &bpf_redirect_proto; + case BPF_FUNC_clone_redirect: + return &bpf_clone_redirect_proto; + case BPF_FUNC_skb_change_tail: + return &bpf_skb_change_tail_proto; + case BPF_FUNC_skb_change_head: + return &bpf_skb_change_head_proto; + case BPF_FUNC_skb_store_bytes: + return &bpf_skb_store_bytes_proto; + case BPF_FUNC_csum_update: + return &bpf_csum_update_proto; + case BPF_FUNC_l3_csum_replace: + return &bpf_l3_csum_replace_proto; + case BPF_FUNC_l4_csum_replace: + return &bpf_l4_csum_replace_proto; + case BPF_FUNC_set_hash_invalid: + return &bpf_set_hash_invalid_proto; + default: + return lwt_inout_func_proto(func_id); + } +} + static bool __is_valid_access(int off, int size, enum bpf_access_type type) { if (off < 0 || off >= sizeof(struct __sk_buff)) @@ -2665,6 +2785,65 @@ static bool sk_filter_is_valid_access(int off, int size, return __is_valid_access(off, size, type); } +static bool lwt_is_valid_access(int off, int size, + enum bpf_access_type type, + enum bpf_reg_type *reg_type) +{ + switch (off) { + case offsetof(struct __sk_buff, tc_classid): + return false; + } + + if (type == BPF_WRITE) { + switch (off) { + case offsetof(struct __sk_buff, mark): + case offsetof(struct __sk_buff, priority): + case offsetof(struct __sk_buff, cb[0]) ... + offsetof(struct __sk_buff, cb[4]): + break; + default: + return false; + } + } + + switch (off) { + case offsetof(struct __sk_buff, data): + *reg_type = PTR_TO_PACKET; + break; + case offsetof(struct __sk_buff, data_end): + *reg_type = PTR_TO_PACKET_END; + break; + } + + return __is_valid_access(off, size, type); +} + +static bool sock_filter_is_valid_access(int off, int size, + enum bpf_access_type type, + enum bpf_reg_type *reg_type) +{ + if (type == BPF_WRITE) { + switch (off) { + case offsetof(struct bpf_sock, bound_dev_if): + break; + default: + return false; + } + } + + if (off < 0 || off + size > sizeof(struct bpf_sock)) + return false; + + /* The verifier guarantees that size > 0. */ + if (off % size != 0) + return false; + + if (size != sizeof(__u32)) + return false; + + return true; +} + static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write, const struct bpf_prog *prog) { @@ -2923,6 +3102,51 @@ static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg, return insn - insn_buf; } +static u32 sock_filter_convert_ctx_access(enum bpf_access_type type, + int dst_reg, int src_reg, + int ctx_off, + struct bpf_insn *insn_buf, + struct bpf_prog *prog) +{ + struct bpf_insn *insn = insn_buf; + + switch (ctx_off) { + case offsetof(struct bpf_sock, bound_dev_if): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_bound_dev_if) != 4); + + if (type == BPF_WRITE) + *insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg, + offsetof(struct sock, sk_bound_dev_if)); + else + *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + offsetof(struct sock, sk_bound_dev_if)); + break; + + case offsetof(struct bpf_sock, family): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_family) != 2); + + *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, + offsetof(struct sock, sk_family)); + break; + + case offsetof(struct bpf_sock, type): + *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + offsetof(struct sock, __sk_flags_offset)); + *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, SK_FL_TYPE_MASK); + *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, SK_FL_TYPE_SHIFT); + break; + + case offsetof(struct bpf_sock, protocol): + *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + offsetof(struct sock, __sk_flags_offset)); + *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, SK_FL_PROTO_MASK); + *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, SK_FL_PROTO_SHIFT); + break; + } + + return insn - insn_buf; +} + static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type, int dst_reg, int src_reg, int ctx_off, struct bpf_insn *insn_buf, @@ -2990,6 +3214,31 @@ static const struct bpf_verifier_ops xdp_ops = { .convert_ctx_access = xdp_convert_ctx_access, }; +static const struct bpf_verifier_ops cg_skb_ops = { + .get_func_proto = cg_skb_func_proto, + .is_valid_access = sk_filter_is_valid_access, + .convert_ctx_access = sk_filter_convert_ctx_access, +}; + +static const struct bpf_verifier_ops lwt_inout_ops = { + .get_func_proto = lwt_inout_func_proto, + .is_valid_access = lwt_is_valid_access, + .convert_ctx_access = sk_filter_convert_ctx_access, +}; + +static const struct bpf_verifier_ops lwt_xmit_ops = { + .get_func_proto = lwt_xmit_func_proto, + .is_valid_access = lwt_is_valid_access, + .convert_ctx_access = sk_filter_convert_ctx_access, + .gen_prologue = tc_cls_act_prologue, +}; + +static const struct bpf_verifier_ops cg_sock_ops = { + .get_func_proto = sk_filter_func_proto, + .is_valid_access = sock_filter_is_valid_access, + .convert_ctx_access = sock_filter_convert_ctx_access, +}; + static struct bpf_prog_type_list sk_filter_type __read_mostly = { .ops = &sk_filter_ops, .type = BPF_PROG_TYPE_SOCKET_FILTER, @@ -3010,12 +3259,42 @@ static struct bpf_prog_type_list xdp_type __read_mostly = { .type = BPF_PROG_TYPE_XDP, }; +static struct bpf_prog_type_list cg_skb_type __read_mostly = { + .ops = &cg_skb_ops, + .type = BPF_PROG_TYPE_CGROUP_SKB, +}; + +static struct bpf_prog_type_list lwt_in_type __read_mostly = { + .ops = &lwt_inout_ops, + .type = BPF_PROG_TYPE_LWT_IN, +}; + +static struct bpf_prog_type_list lwt_out_type __read_mostly = { + .ops = &lwt_inout_ops, + .type = BPF_PROG_TYPE_LWT_OUT, +}; + +static struct bpf_prog_type_list lwt_xmit_type __read_mostly = { + .ops = &lwt_xmit_ops, + .type = BPF_PROG_TYPE_LWT_XMIT, +}; + +static struct bpf_prog_type_list cg_sock_type __read_mostly = { + .ops = &cg_sock_ops, + .type = BPF_PROG_TYPE_CGROUP_SOCK +}; + static int __init register_sk_filter_ops(void) { bpf_register_prog_type(&sk_filter_type); bpf_register_prog_type(&sched_cls_type); bpf_register_prog_type(&sched_act_type); bpf_register_prog_type(&xdp_type); + bpf_register_prog_type(&cg_skb_type); + bpf_register_prog_type(&cg_sock_type); + bpf_register_prog_type(&lwt_in_type); + bpf_register_prog_type(&lwt_out_type); + bpf_register_prog_type(&lwt_xmit_type); return 0; } diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index c6d8207ffa7e..1eb6f949e5b2 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -726,7 +726,7 @@ EXPORT_SYMBOL(make_flow_keys_digest); static struct flow_dissector flow_keys_dissector_symmetric __read_mostly; -u32 __skb_get_hash_symmetric(struct sk_buff *skb) +u32 __skb_get_hash_symmetric(const struct sk_buff *skb) { struct flow_keys keys; diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c new file mode 100644 index 000000000000..71bb3e2eca08 --- /dev/null +++ b/net/core/lwt_bpf.c @@ -0,0 +1,396 @@ +/* Copyright (c) 2016 Thomas Graf <tgraf@tgraf.ch> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/types.h> +#include <linux/bpf.h> +#include <net/lwtunnel.h> + +struct bpf_lwt_prog { + struct bpf_prog *prog; + char *name; +}; + +struct bpf_lwt { + struct bpf_lwt_prog in; + struct bpf_lwt_prog out; + struct bpf_lwt_prog xmit; + int family; +}; + +#define MAX_PROG_NAME 256 + +static inline struct bpf_lwt *bpf_lwt_lwtunnel(struct lwtunnel_state *lwt) +{ + return (struct bpf_lwt *)lwt->data; +} + +#define NO_REDIRECT false +#define CAN_REDIRECT true + +static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt, + struct dst_entry *dst, bool can_redirect) +{ + int ret; + + /* Preempt disable is needed to protect per-cpu redirect_info between + * BPF prog and skb_do_redirect(). The call_rcu in bpf_prog_put() and + * access to maps strictly require a rcu_read_lock() for protection, + * mixing with BH RCU lock doesn't work. + */ + preempt_disable(); + rcu_read_lock(); + bpf_compute_data_end(skb); + ret = bpf_prog_run_save_cb(lwt->prog, skb); + rcu_read_unlock(); + + switch (ret) { + case BPF_OK: + break; + + case BPF_REDIRECT: + if (unlikely(!can_redirect)) { + pr_warn_once("Illegal redirect return code in prog %s\n", + lwt->name ? : "<unknown>"); + ret = BPF_OK; + } else { + ret = skb_do_redirect(skb); + if (ret == 0) + ret = BPF_REDIRECT; + } + break; + + case BPF_DROP: + kfree_skb(skb); + ret = -EPERM; + break; + + default: + pr_warn_once("bpf-lwt: Illegal return value %u, expect packet loss\n", ret); + kfree_skb(skb); + ret = -EINVAL; + break; + } + + preempt_enable(); + + return ret; +} + +static int bpf_input(struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct bpf_lwt *bpf; + int ret; + + bpf = bpf_lwt_lwtunnel(dst->lwtstate); + if (bpf->in.prog) { + ret = run_lwt_bpf(skb, &bpf->in, dst, NO_REDIRECT); + if (ret < 0) + return ret; + } + + if (unlikely(!dst->lwtstate->orig_input)) { + pr_warn_once("orig_input not set on dst for prog %s\n", + bpf->out.name); + kfree_skb(skb); + return -EINVAL; + } + + return dst->lwtstate->orig_input(skb); +} + +static int bpf_output(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct bpf_lwt *bpf; + int ret; + + bpf = bpf_lwt_lwtunnel(dst->lwtstate); + if (bpf->out.prog) { + ret = run_lwt_bpf(skb, &bpf->out, dst, NO_REDIRECT); + if (ret < 0) + return ret; + } + + if (unlikely(!dst->lwtstate->orig_output)) { + pr_warn_once("orig_output not set on dst for prog %s\n", + bpf->out.name); + kfree_skb(skb); + return -EINVAL; + } + + return dst->lwtstate->orig_output(net, sk, skb); +} + +static int xmit_check_hhlen(struct sk_buff *skb) +{ + int hh_len = skb_dst(skb)->dev->hard_header_len; + + if (skb_headroom(skb) < hh_len) { + int nhead = HH_DATA_ALIGN(hh_len - skb_headroom(skb)); + + if (pskb_expand_head(skb, nhead, 0, GFP_ATOMIC)) + return -ENOMEM; + } + + return 0; +} + +static int bpf_xmit(struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct bpf_lwt *bpf; + + bpf = bpf_lwt_lwtunnel(dst->lwtstate); + if (bpf->xmit.prog) { + int ret; + + ret = run_lwt_bpf(skb, &bpf->xmit, dst, CAN_REDIRECT); + switch (ret) { + case BPF_OK: + /* If the header was expanded, headroom might be too + * small for L2 header to come, expand as needed. + */ + ret = xmit_check_hhlen(skb); + if (unlikely(ret)) + return ret; + + return LWTUNNEL_XMIT_CONTINUE; + case BPF_REDIRECT: + return LWTUNNEL_XMIT_DONE; + default: + return ret; + } + } + + return LWTUNNEL_XMIT_CONTINUE; +} + +static void bpf_lwt_prog_destroy(struct bpf_lwt_prog *prog) +{ + if (prog->prog) + bpf_prog_put(prog->prog); + + kfree(prog->name); +} + +static void bpf_destroy_state(struct lwtunnel_state *lwt) +{ + struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt); + + bpf_lwt_prog_destroy(&bpf->in); + bpf_lwt_prog_destroy(&bpf->out); + bpf_lwt_prog_destroy(&bpf->xmit); +} + +static const struct nla_policy bpf_prog_policy[LWT_BPF_PROG_MAX + 1] = { + [LWT_BPF_PROG_FD] = { .type = NLA_U32, }, + [LWT_BPF_PROG_NAME] = { .type = NLA_NUL_STRING, + .len = MAX_PROG_NAME }, +}; + +static int bpf_parse_prog(struct nlattr *attr, struct bpf_lwt_prog *prog, + enum bpf_prog_type type) +{ + struct nlattr *tb[LWT_BPF_PROG_MAX + 1]; + struct bpf_prog *p; + int ret; + u32 fd; + + ret = nla_parse_nested(tb, LWT_BPF_PROG_MAX, attr, bpf_prog_policy); + if (ret < 0) + return ret; + + if (!tb[LWT_BPF_PROG_FD] || !tb[LWT_BPF_PROG_NAME]) + return -EINVAL; + + prog->name = nla_memdup(tb[LWT_BPF_PROG_NAME], GFP_KERNEL); + if (!prog->name) + return -ENOMEM; + + fd = nla_get_u32(tb[LWT_BPF_PROG_FD]); + p = bpf_prog_get_type(fd, type); + if (IS_ERR(p)) + return PTR_ERR(p); + + prog->prog = p; + + return 0; +} + +static const struct nla_policy bpf_nl_policy[LWT_BPF_MAX + 1] = { + [LWT_BPF_IN] = { .type = NLA_NESTED, }, + [LWT_BPF_OUT] = { .type = NLA_NESTED, }, + [LWT_BPF_XMIT] = { .type = NLA_NESTED, }, + [LWT_BPF_XMIT_HEADROOM] = { .type = NLA_U32 }, +}; + +static int bpf_build_state(struct net_device *dev, struct nlattr *nla, + unsigned int family, const void *cfg, + struct lwtunnel_state **ts) +{ + struct nlattr *tb[LWT_BPF_MAX + 1]; + struct lwtunnel_state *newts; + struct bpf_lwt *bpf; + int ret; + + if (family != AF_INET && family != AF_INET6) + return -EAFNOSUPPORT; + + ret = nla_parse_nested(tb, LWT_BPF_MAX, nla, bpf_nl_policy); + if (ret < 0) + return ret; + + if (!tb[LWT_BPF_IN] && !tb[LWT_BPF_OUT] && !tb[LWT_BPF_XMIT]) + return -EINVAL; + + newts = lwtunnel_state_alloc(sizeof(*bpf)); + if (!newts) + return -ENOMEM; + + newts->type = LWTUNNEL_ENCAP_BPF; + bpf = bpf_lwt_lwtunnel(newts); + + if (tb[LWT_BPF_IN]) { + newts->flags |= LWTUNNEL_STATE_INPUT_REDIRECT; + ret = bpf_parse_prog(tb[LWT_BPF_IN], &bpf->in, + BPF_PROG_TYPE_LWT_IN); + if (ret < 0) + goto errout; + } + + if (tb[LWT_BPF_OUT]) { + newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT; + ret = bpf_parse_prog(tb[LWT_BPF_OUT], &bpf->out, + BPF_PROG_TYPE_LWT_OUT); + if (ret < 0) + goto errout; + } + + if (tb[LWT_BPF_XMIT]) { + newts->flags |= LWTUNNEL_STATE_XMIT_REDIRECT; + ret = bpf_parse_prog(tb[LWT_BPF_XMIT], &bpf->xmit, + BPF_PROG_TYPE_LWT_XMIT); + if (ret < 0) + goto errout; + } + + if (tb[LWT_BPF_XMIT_HEADROOM]) { + u32 headroom = nla_get_u32(tb[LWT_BPF_XMIT_HEADROOM]); + + if (headroom > LWT_BPF_MAX_HEADROOM) { + ret = -ERANGE; + goto errout; + } + + newts->headroom = headroom; + } + + bpf->family = family; + *ts = newts; + + return 0; + +errout: + bpf_destroy_state(newts); + kfree(newts); + return ret; +} + +static int bpf_fill_lwt_prog(struct sk_buff *skb, int attr, + struct bpf_lwt_prog *prog) +{ + struct nlattr *nest; + + if (!prog->prog) + return 0; + + nest = nla_nest_start(skb, attr); + if (!nest) + return -EMSGSIZE; + + if (prog->name && + nla_put_string(skb, LWT_BPF_PROG_NAME, prog->name)) + return -EMSGSIZE; + + return nla_nest_end(skb, nest); +} + +static int bpf_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwt) +{ + struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt); + + if (bpf_fill_lwt_prog(skb, LWT_BPF_IN, &bpf->in) < 0 || + bpf_fill_lwt_prog(skb, LWT_BPF_OUT, &bpf->out) < 0 || + bpf_fill_lwt_prog(skb, LWT_BPF_XMIT, &bpf->xmit) < 0) + return -EMSGSIZE; + + return 0; +} + +static int bpf_encap_nlsize(struct lwtunnel_state *lwtstate) +{ + int nest_len = nla_total_size(sizeof(struct nlattr)) + + nla_total_size(MAX_PROG_NAME) + /* LWT_BPF_PROG_NAME */ + 0; + + return nest_len + /* LWT_BPF_IN */ + nest_len + /* LWT_BPF_OUT */ + nest_len + /* LWT_BPF_XMIT */ + 0; +} + +int bpf_lwt_prog_cmp(struct bpf_lwt_prog *a, struct bpf_lwt_prog *b) +{ + /* FIXME: + * The LWT state is currently rebuilt for delete requests which + * results in a new bpf_prog instance. Comparing names for now. + */ + if (!a->name && !b->name) + return 0; + + if (!a->name || !b->name) + return 1; + + return strcmp(a->name, b->name); +} + +static int bpf_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) +{ + struct bpf_lwt *a_bpf = bpf_lwt_lwtunnel(a); + struct bpf_lwt *b_bpf = bpf_lwt_lwtunnel(b); + + return bpf_lwt_prog_cmp(&a_bpf->in, &b_bpf->in) || + bpf_lwt_prog_cmp(&a_bpf->out, &b_bpf->out) || + bpf_lwt_prog_cmp(&a_bpf->xmit, &b_bpf->xmit); +} + +static const struct lwtunnel_encap_ops bpf_encap_ops = { + .build_state = bpf_build_state, + .destroy_state = bpf_destroy_state, + .input = bpf_input, + .output = bpf_output, + .xmit = bpf_xmit, + .fill_encap = bpf_fill_encap_info, + .get_encap_size = bpf_encap_nlsize, + .cmp_encap = bpf_encap_cmp, +}; + +static int __init bpf_lwt_init(void) +{ + return lwtunnel_encap_add_ops(&bpf_encap_ops, LWTUNNEL_ENCAP_BPF); +} + +subsys_initcall(bpf_lwt_init) diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c index e5f84c26ba1a..a5d4e866ce88 100644 --- a/net/core/lwtunnel.c +++ b/net/core/lwtunnel.c @@ -39,6 +39,10 @@ static const char *lwtunnel_encap_str(enum lwtunnel_encap_types encap_type) return "MPLS"; case LWTUNNEL_ENCAP_ILA: return "ILA"; + case LWTUNNEL_ENCAP_SEG6: + return "SEG6"; + case LWTUNNEL_ENCAP_BPF: + return "BPF"; case LWTUNNEL_ENCAP_IP6: case LWTUNNEL_ENCAP_IP: case LWTUNNEL_ENCAP_NONE: @@ -130,6 +134,19 @@ int lwtunnel_build_state(struct net_device *dev, u16 encap_type, } EXPORT_SYMBOL(lwtunnel_build_state); +void lwtstate_free(struct lwtunnel_state *lws) +{ + const struct lwtunnel_encap_ops *ops = lwtun_encaps[lws->type]; + + if (ops->destroy_state) { + ops->destroy_state(lws); + kfree_rcu(lws, rcu); + } else { + kfree(lws); + } +} +EXPORT_SYMBOL(lwtstate_free); + int lwtunnel_fill_encap(struct sk_buff *skb, struct lwtunnel_state *lwtstate) { const struct lwtunnel_encap_ops *ops; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 2ae929f9bd06..782dd8663665 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -2291,13 +2291,10 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, for (n = rcu_dereference_bh(nht->hash_buckets[h]), idx = 0; n != NULL; n = rcu_dereference_bh(n->next)) { - if (!net_eq(dev_net(n->dev), net)) - continue; - if (neigh_ifindex_filtered(n->dev, filter_idx)) - continue; - if (neigh_master_filtered(n->dev, filter_master_idx)) - continue; - if (idx < s_idx) + if (idx < s_idx || !net_eq(dev_net(n->dev), net)) + goto next; + if (neigh_ifindex_filtered(n->dev, filter_idx) || + neigh_master_filtered(n->dev, filter_master_idx)) goto next; if (neigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, @@ -2332,9 +2329,7 @@ static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, if (h > s_h) s_idx = 0; for (n = tbl->phash_buckets[h], idx = 0; n; n = n->next) { - if (pneigh_net(n) != net) - continue; - if (idx < s_idx) + if (idx < s_idx || pneigh_net(n) != net) goto next; if (pneigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 6e4f34721080..b0c04cf4851d 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -950,10 +950,13 @@ net_rx_queue_update_kobjects(struct net_device *dev, int old_num, int new_num) } while (--i >= new_num) { + struct kobject *kobj = &dev->_rx[i].kobj; + + if (!list_empty(&dev_net(dev)->exit_list)) + kobj->uevent_suppress = 1; if (dev->sysfs_rx_queue_group) - sysfs_remove_group(&dev->_rx[i].kobj, - dev->sysfs_rx_queue_group); - kobject_put(&dev->_rx[i].kobj); + sysfs_remove_group(kobj, dev->sysfs_rx_queue_group); + kobject_put(kobj); } return error; @@ -1021,7 +1024,6 @@ static ssize_t show_trans_timeout(struct netdev_queue *queue, return sprintf(buf, "%lu", trans_timeout); } -#ifdef CONFIG_XPS static unsigned int get_netdev_queue_index(struct netdev_queue *queue) { struct net_device *dev = queue->dev; @@ -1033,6 +1035,21 @@ static unsigned int get_netdev_queue_index(struct netdev_queue *queue) return i; } +static ssize_t show_traffic_class(struct netdev_queue *queue, + struct netdev_queue_attribute *attribute, + char *buf) +{ + struct net_device *dev = queue->dev; + int index = get_netdev_queue_index(queue); + int tc = netdev_txq_to_tc(dev, index); + + if (tc < 0) + return -EINVAL; + + return sprintf(buf, "%u\n", tc); +} + +#ifdef CONFIG_XPS static ssize_t show_tx_maxrate(struct netdev_queue *queue, struct netdev_queue_attribute *attribute, char *buf) @@ -1075,6 +1092,9 @@ static struct netdev_queue_attribute queue_tx_maxrate = static struct netdev_queue_attribute queue_trans_timeout = __ATTR(tx_timeout, S_IRUGO, show_trans_timeout, NULL); +static struct netdev_queue_attribute queue_traffic_class = + __ATTR(traffic_class, S_IRUGO, show_traffic_class, NULL); + #ifdef CONFIG_BQL /* * Byte queue limits sysfs structures and functions. @@ -1190,29 +1210,38 @@ static ssize_t show_xps_map(struct netdev_queue *queue, struct netdev_queue_attribute *attribute, char *buf) { struct net_device *dev = queue->dev; + int cpu, len, num_tc = 1, tc = 0; struct xps_dev_maps *dev_maps; cpumask_var_t mask; unsigned long index; - int i, len; if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) return -ENOMEM; index = get_netdev_queue_index(queue); + if (dev->num_tc) { + num_tc = dev->num_tc; + tc = netdev_txq_to_tc(dev, index); + if (tc < 0) + return -EINVAL; + } + rcu_read_lock(); dev_maps = rcu_dereference(dev->xps_maps); if (dev_maps) { - for_each_possible_cpu(i) { - struct xps_map *map = - rcu_dereference(dev_maps->cpu_map[i]); - if (map) { - int j; - for (j = 0; j < map->len; j++) { - if (map->queues[j] == index) { - cpumask_set_cpu(i, mask); - break; - } + for_each_possible_cpu(cpu) { + int i, tci = cpu * num_tc + tc; + struct xps_map *map; + + map = rcu_dereference(dev_maps->cpu_map[tci]); + if (!map) + continue; + + for (i = map->len; i--;) { + if (map->queues[i] == index) { + cpumask_set_cpu(cpu, mask); + break; } } } @@ -1260,6 +1289,7 @@ static struct netdev_queue_attribute xps_cpus_attribute = static struct attribute *netdev_queue_default_attrs[] = { &queue_trans_timeout.attr, + &queue_traffic_class.attr, #ifdef CONFIG_XPS &xps_cpus_attribute.attr, &queue_tx_maxrate.attr, @@ -1340,6 +1370,8 @@ netdev_queue_update_kobjects(struct net_device *dev, int old_num, int new_num) while (--i >= new_num) { struct netdev_queue *queue = dev->_tx + i; + if (!list_empty(&dev_net(dev)->exit_list)) + queue->kobj.uevent_suppress = 1; #ifdef CONFIG_BQL sysfs_remove_group(&queue->kobj, &dql_group); #endif @@ -1525,6 +1557,9 @@ void netdev_unregister_kobject(struct net_device *ndev) { struct device *dev = &(ndev->dev); + if (!list_empty(&dev_net(ndev)->exit_list)) + dev_set_uevent_suppress(dev, 1); + kobject_get(&dev->kobj); remove_queue_kobjects(ndev); diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 7001da910c6b..a38feac547d7 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -55,7 +55,7 @@ static struct net_generic *net_alloc_generic(void) return ng; } -static int net_assign_generic(struct net *net, int id, void *data) +static int net_assign_generic(struct net *net, unsigned int id, void *data) { struct net_generic *ng, *old_ng; @@ -122,8 +122,7 @@ out: static void ops_free(const struct pernet_operations *ops, struct net *net) { if (ops->id && ops->size) { - int id = *ops->id; - kfree(net_generic(net, id)); + kfree(net_generic(net, *ops->id)); } } @@ -384,7 +383,14 @@ struct net *copy_net_ns(unsigned long flags, get_user_ns(user_ns); - mutex_lock(&net_mutex); + rv = mutex_lock_killable(&net_mutex); + if (rv < 0) { + net_free(net); + dec_net_namespaces(ucounts); + put_user_ns(user_ns); + return ERR_PTR(rv); + } + net->ucounts = ucounts; rv = setup_net(net, user_ns); if (rv == 0) { @@ -876,7 +882,7 @@ again: } return error; } - max_gen_ptrs = max_t(unsigned int, max_gen_ptrs, *ops->id); + max_gen_ptrs = max(max_gen_ptrs, *ops->id); } error = __register_pernet_operations(list, ops); if (error) { diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 53599bd0c82d..9424673009c1 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -171,12 +171,12 @@ static void poll_one_napi(struct napi_struct *napi) static void poll_napi(struct net_device *dev) { struct napi_struct *napi; + int cpu = smp_processor_id(); list_for_each_entry(napi, &dev->napi_list, dev_list) { - if (napi->poll_owner != smp_processor_id() && - spin_trylock(&napi->poll_lock)) { + if (cmpxchg(&napi->poll_owner, -1, cpu) == -1) { poll_one_napi(napi); - spin_unlock(&napi->poll_lock); + smp_store_release(&napi->poll_owner, -1); } } } diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 306b8f0e03c1..8e69ce472236 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -413,7 +413,7 @@ struct pktgen_hdr { }; -static int pg_net_id __read_mostly; +static unsigned int pg_net_id __read_mostly; struct pktgen_net { struct net *net; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index a6196cf844f6..c482491a63d8 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1505,6 +1505,7 @@ static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = { static const struct nla_policy ifla_xdp_policy[IFLA_XDP_MAX + 1] = { [IFLA_XDP_FD] = { .type = NLA_S32 }, [IFLA_XDP_ATTACHED] = { .type = NLA_U8 }, + [IFLA_XDP_FLAGS] = { .type = NLA_U32 }, }; static const struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla) @@ -2164,6 +2165,7 @@ static int do_setlink(const struct sk_buff *skb, if (tb[IFLA_XDP]) { struct nlattr *xdp[IFLA_XDP_MAX + 1]; + u32 xdp_flags = 0; err = nla_parse_nested(xdp, IFLA_XDP_MAX, tb[IFLA_XDP], ifla_xdp_policy); @@ -2174,9 +2176,19 @@ static int do_setlink(const struct sk_buff *skb, err = -EINVAL; goto errout; } + + if (xdp[IFLA_XDP_FLAGS]) { + xdp_flags = nla_get_u32(xdp[IFLA_XDP_FLAGS]); + if (xdp_flags & ~XDP_FLAGS_MASK) { + err = -EINVAL; + goto errout; + } + } + if (xdp[IFLA_XDP_FD]) { err = dev_change_xdp_fd(dev, - nla_get_s32(xdp[IFLA_XDP_FD])); + nla_get_s32(xdp[IFLA_XDP_FD]), + xdp_flags); if (err) goto errout; status |= DO_SETLINK_NOTIFY; @@ -3165,7 +3177,7 @@ int ndo_dflt_fdb_dump(struct sk_buff *skb, err = nlmsg_populate_fdb(skb, cb, dev, idx, &dev->uc); if (err) goto out; - nlmsg_populate_fdb(skb, cb, dev, idx, &dev->mc); + err = nlmsg_populate_fdb(skb, cb, dev, idx, &dev->mc); out: netif_addr_unlock_bh(dev); return err; @@ -3671,7 +3683,7 @@ static int rtnl_get_offload_stats(struct sk_buff *skb, struct net_device *dev, if (!size) continue; - if (!dev->netdev_ops->ndo_has_offload_stats(attr_id)) + if (!dev->netdev_ops->ndo_has_offload_stats(dev, attr_id)) continue; attr = nla_reserve_64bit(skb, attr_id, size, @@ -3712,7 +3724,7 @@ static int rtnl_get_offload_stats_size(const struct net_device *dev) for (attr_id = IFLA_OFFLOAD_XSTATS_FIRST; attr_id <= IFLA_OFFLOAD_XSTATS_MAX; attr_id++) { - if (!dev->netdev_ops->ndo_has_offload_stats(attr_id)) + if (!dev->netdev_ops->ndo_has_offload_stats(dev, attr_id)) continue; size = rtnl_get_offload_stats_attr_size(attr_id); nla_size += nla_total_size_64bit(size); diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c index fd3ce461fbe6..88a8e429fc3e 100644 --- a/net/core/secure_seq.c +++ b/net/core/secure_seq.c @@ -12,6 +12,7 @@ #include <net/secure_seq.h> #if IS_ENABLED(CONFIG_IPV6) || IS_ENABLED(CONFIG_INET) +#include <net/tcp.h> #define NET_SECRET_SIZE (MD5_MESSAGE_BYTES / 4) static u32 net_secret[NET_SECRET_SIZE] ____cacheline_aligned; @@ -40,8 +41,8 @@ static u32 seq_scale(u32 seq) #endif #if IS_ENABLED(CONFIG_IPV6) -__u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr, - __be16 sport, __be16 dport) +u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr, + __be16 sport, __be16 dport, u32 *tsoff) { u32 secret[MD5_MESSAGE_BYTES / 4]; u32 hash[MD5_DIGEST_WORDS]; @@ -58,6 +59,7 @@ __u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr, md5_transform(hash, secret); + *tsoff = sysctl_tcp_timestamps == 1 ? hash[1] : 0; return seq_scale(hash[0]); } EXPORT_SYMBOL(secure_tcpv6_sequence_number); @@ -86,8 +88,8 @@ EXPORT_SYMBOL(secure_ipv6_port_ephemeral); #ifdef CONFIG_INET -__u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr, - __be16 sport, __be16 dport) +u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr, + __be16 sport, __be16 dport, u32 *tsoff) { u32 hash[MD5_DIGEST_WORDS]; @@ -99,6 +101,7 @@ __u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr, md5_transform(hash, net_secret); + *tsoff = sysctl_tcp_timestamps == 1 ? hash[1] : 0; return seq_scale(hash[0]); } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 1e3e0087245b..b45cd1494243 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -354,7 +354,7 @@ EXPORT_SYMBOL(build_skb); struct napi_alloc_cache { struct page_frag_cache page; - size_t skb_count; + unsigned int skb_count; void *skb_cache[NAPI_SKB_CACHE_SIZE]; }; @@ -2656,7 +2656,9 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen) struct skb_frag_struct *fragfrom, *fragto; BUG_ON(shiftlen > skb->len); - BUG_ON(skb_headlen(skb)); /* Would corrupt stream */ + + if (skb_headlen(skb)) + return 0; todo = shiftlen; from = 0; @@ -3712,21 +3714,29 @@ int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb) } EXPORT_SYMBOL(sock_queue_err_skb); +static bool is_icmp_err_skb(const struct sk_buff *skb) +{ + return skb && (SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP || + SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP6); +} + struct sk_buff *sock_dequeue_err_skb(struct sock *sk) { struct sk_buff_head *q = &sk->sk_error_queue; - struct sk_buff *skb, *skb_next; + struct sk_buff *skb, *skb_next = NULL; + bool icmp_next = false; unsigned long flags; - int err = 0; spin_lock_irqsave(&q->lock, flags); skb = __skb_dequeue(q); if (skb && (skb_next = skb_peek(q))) - err = SKB_EXT_ERR(skb_next)->ee.ee_errno; + icmp_next = is_icmp_err_skb(skb_next); spin_unlock_irqrestore(&q->lock, flags); - sk->sk_err = err; - if (err) + if (is_icmp_err_skb(skb) && !icmp_next) + sk->sk_err = 0; + + if (skb_next) sk->sk_error_report(sk); return skb; @@ -3838,10 +3848,18 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb, if (!skb_may_tx_timestamp(sk, tsonly)) return; - if (tsonly) - skb = alloc_skb(0, GFP_ATOMIC); - else + if (tsonly) { +#ifdef CONFIG_INET + if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS) && + sk->sk_protocol == IPPROTO_TCP && + sk->sk_type == SOCK_STREAM) + skb = tcp_get_timestamping_opt_stats(sk); + else +#endif + skb = alloc_skb(0, GFP_ATOMIC); + } else { skb = skb_clone(orig_skb, GFP_ATOMIC); + } if (!skb) return; diff --git a/net/core/sock.c b/net/core/sock.c index 00a074dbfe9b..9fa46b956bdc 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -854,6 +854,13 @@ set_rcvbuf: sk->sk_tskey = 0; } } + + if (val & SOF_TIMESTAMPING_OPT_STATS && + !(val & SOF_TIMESTAMPING_OPT_TSONLY)) { + ret = -EINVAL; + break; + } + sk->sk_tsflags = val; if (val & SOF_TIMESTAMPING_RX_SOFTWARE) sock_enable_timestamp(sk, @@ -2080,37 +2087,31 @@ void __sk_flush_backlog(struct sock *sk) */ int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb) { + DEFINE_WAIT_FUNC(wait, woken_wake_function); int rc; - DEFINE_WAIT(wait); - prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + add_wait_queue(sk_sleep(sk), &wait); sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); - rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb); + rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait); sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); - finish_wait(sk_sleep(sk), &wait); + remove_wait_queue(sk_sleep(sk), &wait); return rc; } EXPORT_SYMBOL(sk_wait_data); /** - * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated + * __sk_mem_raise_allocated - increase memory_allocated * @sk: socket * @size: memory size to allocate + * @amt: pages to allocate * @kind: allocation type * - * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means - * rmem allocation. This function assumes that protocols which have - * memory_pressure use sk_wmem_queued as write buffer accounting. + * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc */ -int __sk_mem_schedule(struct sock *sk, int size, int kind) +int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind) { struct proto *prot = sk->sk_prot; - int amt = sk_mem_pages(size); - long allocated; - - sk->sk_forward_alloc += amt * SK_MEM_QUANTUM; - - allocated = sk_memory_allocated_add(sk, amt); + long allocated = sk_memory_allocated_add(sk, amt); if (mem_cgroup_sockets_enabled && sk->sk_memcg && !mem_cgroup_charge_skmem(sk->sk_memcg, amt)) @@ -2171,9 +2172,6 @@ suppress_allocation: trace_sock_exceed_buf_limit(sk, prot, allocated); - /* Alas. Undo changes. */ - sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM; - sk_memory_allocated_sub(sk, amt); if (mem_cgroup_sockets_enabled && sk->sk_memcg) @@ -2181,18 +2179,40 @@ suppress_allocation: return 0; } +EXPORT_SYMBOL(__sk_mem_raise_allocated); + +/** + * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated + * @sk: socket + * @size: memory size to allocate + * @kind: allocation type + * + * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means + * rmem allocation. This function assumes that protocols which have + * memory_pressure use sk_wmem_queued as write buffer accounting. + */ +int __sk_mem_schedule(struct sock *sk, int size, int kind) +{ + int ret, amt = sk_mem_pages(size); + + sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT; + ret = __sk_mem_raise_allocated(sk, size, amt, kind); + if (!ret) + sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT; + return ret; +} EXPORT_SYMBOL(__sk_mem_schedule); /** - * __sk_mem_reclaim - reclaim memory_allocated + * __sk_mem_reduce_allocated - reclaim memory_allocated * @sk: socket - * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple) + * @amount: number of quanta + * + * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc */ -void __sk_mem_reclaim(struct sock *sk, int amount) +void __sk_mem_reduce_allocated(struct sock *sk, int amount) { - amount >>= SK_MEM_QUANTUM_SHIFT; sk_memory_allocated_sub(sk, amount); - sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT; if (mem_cgroup_sockets_enabled && sk->sk_memcg) mem_cgroup_uncharge_skmem(sk->sk_memcg, amount); @@ -2201,6 +2221,19 @@ void __sk_mem_reclaim(struct sock *sk, int amount) (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))) sk_leave_memory_pressure(sk); } +EXPORT_SYMBOL(__sk_mem_reduce_allocated); + +/** + * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated + * @sk: socket + * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple) + */ +void __sk_mem_reclaim(struct sock *sk, int amount) +{ + amount >>= SK_MEM_QUANTUM_SHIFT; + sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT; + __sk_mem_reduce_allocated(sk, amount); +} EXPORT_SYMBOL(__sk_mem_reclaim); int sk_set_peek_off(struct sock *sk, int val) @@ -2436,8 +2469,11 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_type = sock->type; sk->sk_wq = sock->wq; sock->sk = sk; - } else + sk->sk_uid = SOCK_INODE(sock)->i_uid; + } else { sk->sk_wq = NULL; + sk->sk_uid = make_kuid(sock_net(sk)->user_ns, 0); + } rwlock_init(&sk->sk_callback_lock); lockdep_set_class_and_name(&sk->sk_callback_lock, diff --git a/net/core/stream.c b/net/core/stream.c index 1086c8b280a8..f575bcf64af2 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -53,8 +53,8 @@ void sk_stream_write_space(struct sock *sk) */ int sk_stream_wait_connect(struct sock *sk, long *timeo_p) { + DEFINE_WAIT_FUNC(wait, woken_wake_function); struct task_struct *tsk = current; - DEFINE_WAIT(wait); int done; do { @@ -68,13 +68,13 @@ int sk_stream_wait_connect(struct sock *sk, long *timeo_p) if (signal_pending(tsk)) return sock_intr_errno(*timeo_p); - prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + add_wait_queue(sk_sleep(sk), &wait); sk->sk_write_pending++; done = sk_wait_event(sk, timeo_p, !sk->sk_err && !((1 << sk->sk_state) & - ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))); - finish_wait(sk_sleep(sk), &wait); + ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)), &wait); + remove_wait_queue(sk_sleep(sk), &wait); sk->sk_write_pending--; } while (!done); return 0; @@ -94,16 +94,16 @@ static inline int sk_stream_closing(struct sock *sk) void sk_stream_wait_close(struct sock *sk, long timeout) { if (timeout) { - DEFINE_WAIT(wait); + DEFINE_WAIT_FUNC(wait, woken_wake_function); + + add_wait_queue(sk_sleep(sk), &wait); do { - prepare_to_wait(sk_sleep(sk), &wait, - TASK_INTERRUPTIBLE); - if (sk_wait_event(sk, &timeout, !sk_stream_closing(sk))) + if (sk_wait_event(sk, &timeout, !sk_stream_closing(sk), &wait)) break; } while (!signal_pending(current) && timeout); - finish_wait(sk_sleep(sk), &wait); + remove_wait_queue(sk_sleep(sk), &wait); } } EXPORT_SYMBOL(sk_stream_wait_close); @@ -119,16 +119,16 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p) long vm_wait = 0; long current_timeo = *timeo_p; bool noblock = (*timeo_p ? false : true); - DEFINE_WAIT(wait); + DEFINE_WAIT_FUNC(wait, woken_wake_function); if (sk_stream_memory_free(sk)) current_timeo = vm_wait = (prandom_u32() % (HZ / 5)) + 2; + add_wait_queue(sk_sleep(sk), &wait); + while (1) { sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); - prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); - if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) goto do_error; if (!*timeo_p) { @@ -147,7 +147,7 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p) sk_wait_event(sk, ¤t_timeo, sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN) || (sk_stream_memory_free(sk) && - !vm_wait)); + !vm_wait), &wait); sk->sk_write_pending--; if (vm_wait) { @@ -161,7 +161,7 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p) *timeo_p = current_timeo; } out: - finish_wait(sk_sleep(sk), &wait); + remove_wait_queue(sk_sleep(sk), &wait); return err; do_error: |