summaryrefslogtreecommitdiff
path: root/net/core
diff options
context:
space:
mode:
Diffstat (limited to 'net/core')
-rw-r--r--net/core/Makefile1
-rw-r--r--net/core/dev.c182
-rw-r--r--net/core/dev.h6
-rw-r--r--net/core/dev_ioctl.c2
-rw-r--r--net/core/dst.c10
-rw-r--r--net/core/filter.c83
-rw-r--r--net/core/gso_test.c278
-rw-r--r--net/core/netclassid_cgroup.c6
-rw-r--r--net/core/netdev-genl.c12
-rw-r--r--net/core/page_pool.c31
-rw-r--r--net/core/pktgen.c102
-rw-r--r--net/core/rtnetlink.c152
-rw-r--r--net/core/selftests.c9
-rw-r--r--net/core/skbuff.c27
-rw-r--r--net/core/sock.c228
-rw-r--r--net/core/xdp.c4
16 files changed, 868 insertions, 265 deletions
diff --git a/net/core/Makefile b/net/core/Makefile
index 731db2eaa610..0cb734cbc24b 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -40,3 +40,4 @@ obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o
obj-$(CONFIG_BPF_SYSCALL) += sock_map.o
obj-$(CONFIG_BPF_SYSCALL) += bpf_sk_storage.o
obj-$(CONFIG_OF) += of_net.o
+obj-$(CONFIG_NET_TEST) += gso_test.o
diff --git a/net/core/dev.c b/net/core/dev.c
index 9f3f8930c691..0d548431f3fa 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1057,7 +1057,7 @@ EXPORT_SYMBOL(dev_valid_name);
* __dev_alloc_name - allocate a name for a device
* @net: network namespace to allocate the device name in
* @name: name format string
- * @buf: scratch buffer and result name string
+ * @res: result name string
*
* Passed a format string - eg "lt%d" it will try and find a suitable
* id. It scans list of devices to build up a free map, then chooses
@@ -1068,106 +1068,79 @@ EXPORT_SYMBOL(dev_valid_name);
* Returns the number of the unit assigned or a negative errno code.
*/
-static int __dev_alloc_name(struct net *net, const char *name, char *buf)
+static int __dev_alloc_name(struct net *net, const char *name, char *res)
{
int i = 0;
const char *p;
const int max_netdevices = 8*PAGE_SIZE;
unsigned long *inuse;
struct net_device *d;
+ char buf[IFNAMSIZ];
- if (!dev_valid_name(name))
- return -EINVAL;
-
+ /* Verify the string as this thing may have come from the user.
+ * There must be one "%d" and no other "%" characters.
+ */
p = strchr(name, '%');
- if (p) {
- /*
- * Verify the string as this thing may have come from
- * the user. There must be either one "%d" and no other "%"
- * characters.
- */
- if (p[1] != 'd' || strchr(p + 2, '%'))
- return -EINVAL;
-
- /* Use one page as a bit array of possible slots */
- inuse = bitmap_zalloc(max_netdevices, GFP_ATOMIC);
- if (!inuse)
- return -ENOMEM;
+ if (!p || p[1] != 'd' || strchr(p + 2, '%'))
+ return -EINVAL;
- for_each_netdev(net, d) {
- struct netdev_name_node *name_node;
+ /* Use one page as a bit array of possible slots */
+ inuse = bitmap_zalloc(max_netdevices, GFP_ATOMIC);
+ if (!inuse)
+ return -ENOMEM;
- netdev_for_each_altname(d, name_node) {
- if (!sscanf(name_node->name, name, &i))
- continue;
- if (i < 0 || i >= max_netdevices)
- continue;
+ for_each_netdev(net, d) {
+ struct netdev_name_node *name_node;
- /* avoid cases where sscanf is not exact inverse of printf */
- snprintf(buf, IFNAMSIZ, name, i);
- if (!strncmp(buf, name_node->name, IFNAMSIZ))
- __set_bit(i, inuse);
- }
- if (!sscanf(d->name, name, &i))
+ netdev_for_each_altname(d, name_node) {
+ if (!sscanf(name_node->name, name, &i))
continue;
if (i < 0 || i >= max_netdevices)
continue;
- /* avoid cases where sscanf is not exact inverse of printf */
+ /* avoid cases where sscanf is not exact inverse of printf */
snprintf(buf, IFNAMSIZ, name, i);
- if (!strncmp(buf, d->name, IFNAMSIZ))
+ if (!strncmp(buf, name_node->name, IFNAMSIZ))
__set_bit(i, inuse);
}
+ if (!sscanf(d->name, name, &i))
+ continue;
+ if (i < 0 || i >= max_netdevices)
+ continue;
- i = find_first_zero_bit(inuse, max_netdevices);
- bitmap_free(inuse);
+ /* avoid cases where sscanf is not exact inverse of printf */
+ snprintf(buf, IFNAMSIZ, name, i);
+ if (!strncmp(buf, d->name, IFNAMSIZ))
+ __set_bit(i, inuse);
}
- snprintf(buf, IFNAMSIZ, name, i);
- if (!netdev_name_in_use(net, buf))
- return i;
+ i = find_first_zero_bit(inuse, max_netdevices);
+ bitmap_free(inuse);
+ if (i == max_netdevices)
+ return -ENFILE;
- /* It is possible to run out of possible slots
- * when the name is long and there isn't enough space left
- * for the digits, or if all bits are used.
- */
- return -ENFILE;
+ snprintf(res, IFNAMSIZ, name, i);
+ return i;
}
+/* Returns negative errno or allocated unit id (see __dev_alloc_name()) */
static int dev_prep_valid_name(struct net *net, struct net_device *dev,
- const char *want_name, char *out_name)
+ const char *want_name, char *out_name,
+ int dup_errno)
{
- int ret;
-
if (!dev_valid_name(want_name))
return -EINVAL;
- if (strchr(want_name, '%')) {
- ret = __dev_alloc_name(net, want_name, out_name);
- return ret < 0 ? ret : 0;
- } else if (netdev_name_in_use(net, want_name)) {
- return -EEXIST;
- } else if (out_name != want_name) {
- strscpy(out_name, want_name, IFNAMSIZ);
- }
+ if (strchr(want_name, '%'))
+ return __dev_alloc_name(net, want_name, out_name);
+ if (netdev_name_in_use(net, want_name))
+ return -dup_errno;
+ if (out_name != want_name)
+ strscpy(out_name, want_name, IFNAMSIZ);
return 0;
}
-static int dev_alloc_name_ns(struct net *net,
- struct net_device *dev,
- const char *name)
-{
- char buf[IFNAMSIZ];
- int ret;
-
- BUG_ON(!net);
- ret = __dev_alloc_name(net, name, buf);
- if (ret >= 0)
- strscpy(dev->name, buf, IFNAMSIZ);
- return ret;
-}
-
/**
* dev_alloc_name - allocate a name for a device
* @dev: device
@@ -1184,20 +1157,17 @@ static int dev_alloc_name_ns(struct net *net,
int dev_alloc_name(struct net_device *dev, const char *name)
{
- return dev_alloc_name_ns(dev_net(dev), dev, name);
+ return dev_prep_valid_name(dev_net(dev), dev, name, dev->name, ENFILE);
}
EXPORT_SYMBOL(dev_alloc_name);
static int dev_get_valid_name(struct net *net, struct net_device *dev,
const char *name)
{
- char buf[IFNAMSIZ];
int ret;
- ret = dev_prep_valid_name(net, dev, name, buf);
- if (ret >= 0)
- strscpy(dev->name, buf, IFNAMSIZ);
- return ret;
+ ret = dev_prep_valid_name(net, dev, name, dev->name, EEXIST);
+ return ret < 0 ? ret : 0;
}
/**
@@ -3939,7 +3909,8 @@ EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue);
#endif /* CONFIG_NET_EGRESS */
#ifdef CONFIG_NET_XGRESS
-static int tc_run(struct tcx_entry *entry, struct sk_buff *skb)
+static int tc_run(struct tcx_entry *entry, struct sk_buff *skb,
+ enum skb_drop_reason *drop_reason)
{
int ret = TC_ACT_UNSPEC;
#ifdef CONFIG_NET_CLS_ACT
@@ -3951,12 +3922,14 @@ static int tc_run(struct tcx_entry *entry, struct sk_buff *skb)
tc_skb_cb(skb)->mru = 0;
tc_skb_cb(skb)->post_ct = false;
+ res.drop_reason = *drop_reason;
mini_qdisc_bstats_cpu_update(miniq, skb);
ret = tcf_classify(skb, miniq->block, miniq->filter_list, &res, false);
/* Only tcf related quirks below. */
switch (ret) {
case TC_ACT_SHOT:
+ *drop_reason = res.drop_reason;
mini_qdisc_qstats_cpu_drop(miniq);
break;
case TC_ACT_OK:
@@ -4006,6 +3979,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
struct net_device *orig_dev, bool *another)
{
struct bpf_mprog_entry *entry = rcu_dereference_bh(skb->dev->tcx_ingress);
+ enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_INGRESS;
int sch_ret;
if (!entry)
@@ -4023,7 +3997,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
if (sch_ret != TC_ACT_UNSPEC)
goto ingress_verdict;
}
- sch_ret = tc_run(tcx_entry(entry), skb);
+ sch_ret = tc_run(tcx_entry(entry), skb, &drop_reason);
ingress_verdict:
switch (sch_ret) {
case TC_ACT_REDIRECT:
@@ -4040,7 +4014,7 @@ ingress_verdict:
*ret = NET_RX_SUCCESS;
return NULL;
case TC_ACT_SHOT:
- kfree_skb_reason(skb, SKB_DROP_REASON_TC_INGRESS);
+ kfree_skb_reason(skb, drop_reason);
*ret = NET_RX_DROP;
return NULL;
/* used by tc_run */
@@ -4061,6 +4035,7 @@ static __always_inline struct sk_buff *
sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
{
struct bpf_mprog_entry *entry = rcu_dereference_bh(dev->tcx_egress);
+ enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_EGRESS;
int sch_ret;
if (!entry)
@@ -4074,7 +4049,7 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
if (sch_ret != TC_ACT_UNSPEC)
goto egress_verdict;
}
- sch_ret = tc_run(tcx_entry(entry), skb);
+ sch_ret = tc_run(tcx_entry(entry), skb, &drop_reason);
egress_verdict:
switch (sch_ret) {
case TC_ACT_REDIRECT:
@@ -4083,7 +4058,7 @@ egress_verdict:
*ret = NET_XMIT_SUCCESS;
return NULL;
case TC_ACT_SHOT:
- kfree_skb_reason(skb, SKB_DROP_REASON_TC_EGRESS);
+ kfree_skb_reason(skb, drop_reason);
*ret = NET_XMIT_DROP;
return NULL;
/* used by tc_run */
@@ -6552,9 +6527,11 @@ static int __napi_poll(struct napi_struct *n, bool *repoll)
* accidentally calling ->poll() when NAPI is not scheduled.
*/
work = 0;
- if (test_bit(NAPI_STATE_SCHED, &n->state)) {
+ if (napi_is_scheduled(n)) {
work = n->poll(n, weight);
trace_napi_poll(n, work, weight);
+
+ xdp_do_check_flushed(n);
}
if (unlikely(work > weight))
@@ -9052,6 +9029,28 @@ bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b)
}
EXPORT_SYMBOL(netdev_port_same_parent_id);
+static void netdev_dpll_pin_assign(struct net_device *dev, struct dpll_pin *dpll_pin)
+{
+#if IS_ENABLED(CONFIG_DPLL)
+ rtnl_lock();
+ dev->dpll_pin = dpll_pin;
+ rtnl_unlock();
+#endif
+}
+
+void netdev_dpll_pin_set(struct net_device *dev, struct dpll_pin *dpll_pin)
+{
+ WARN_ON(!dpll_pin);
+ netdev_dpll_pin_assign(dev, dpll_pin);
+}
+EXPORT_SYMBOL(netdev_dpll_pin_set);
+
+void netdev_dpll_pin_clear(struct net_device *dev)
+{
+ netdev_dpll_pin_assign(dev, NULL);
+}
+EXPORT_SYMBOL(netdev_dpll_pin_clear);
+
/**
* dev_change_proto_down - set carrier according to proto_down.
*
@@ -10504,7 +10503,8 @@ void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
}
EXPORT_SYMBOL(netdev_stats_to_stats64);
-struct net_device_core_stats __percpu *netdev_core_stats_alloc(struct net_device *dev)
+static __cold struct net_device_core_stats __percpu *netdev_core_stats_alloc(
+ struct net_device *dev)
{
struct net_device_core_stats __percpu *p;
@@ -10517,7 +10517,23 @@ struct net_device_core_stats __percpu *netdev_core_stats_alloc(struct net_device
/* This READ_ONCE() pairs with the cmpxchg() above */
return READ_ONCE(dev->core_stats);
}
-EXPORT_SYMBOL(netdev_core_stats_alloc);
+
+noinline void netdev_core_stats_inc(struct net_device *dev, u32 offset)
+{
+ /* This READ_ONCE() pairs with the write in netdev_core_stats_alloc() */
+ struct net_device_core_stats __percpu *p = READ_ONCE(dev->core_stats);
+ unsigned long __percpu *field;
+
+ if (unlikely(!p)) {
+ p = netdev_core_stats_alloc(dev);
+ if (!p)
+ return;
+ }
+
+ field = (__force unsigned long __percpu *)((__force void *)p + offset);
+ this_cpu_inc(*field);
+}
+EXPORT_SYMBOL_GPL(netdev_core_stats_inc);
/**
* dev_get_stats - get network device statistics
@@ -11091,7 +11107,7 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net,
/* We get here if we can't use the current device name */
if (!pat)
goto out;
- err = dev_prep_valid_name(net, dev, pat, new_name);
+ err = dev_prep_valid_name(net, dev, pat, new_name, EEXIST);
if (err < 0)
goto out;
}
diff --git a/net/core/dev.h b/net/core/dev.h
index fa2e9c5c4122..5aa45f0fd4ae 100644
--- a/net/core/dev.h
+++ b/net/core/dev.h
@@ -139,4 +139,10 @@ static inline void netif_set_gro_ipv4_max_size(struct net_device *dev,
}
int rps_cpumask_housekeeping(struct cpumask *mask);
+
+#if defined(CONFIG_DEBUG_NET) && defined(CONFIG_BPF_SYSCALL)
+void xdp_do_check_flushed(struct napi_struct *napi);
+#else
+static inline void xdp_do_check_flushed(struct napi_struct *napi) { }
+#endif
#endif
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index b46aedc36939..feeddf95f450 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -382,7 +382,7 @@ static int dev_set_hwtstamp(struct net_device *dev, struct ifreq *ifr)
if (err)
return err;
- err = dsa_master_hwtstamp_validate(dev, &kernel_cfg, &extack);
+ err = dsa_conduit_hwtstamp_validate(dev, &kernel_cfg, &extack);
if (err) {
if (extack._msg)
netdev_err(dev, "%s\n", extack._msg);
diff --git a/net/core/dst.c b/net/core/dst.c
index 980e2fd2f013..6838d3212c37 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -45,7 +45,7 @@ const struct dst_metrics dst_default_metrics = {
EXPORT_SYMBOL(dst_default_metrics);
void dst_init(struct dst_entry *dst, struct dst_ops *ops,
- struct net_device *dev, int initial_ref, int initial_obsolete,
+ struct net_device *dev, int initial_obsolete,
unsigned short flags)
{
dst->dev = dev;
@@ -66,7 +66,7 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops,
dst->tclassid = 0;
#endif
dst->lwtstate = NULL;
- rcuref_init(&dst->__rcuref, initial_ref);
+ rcuref_init(&dst->__rcuref, 1);
INIT_LIST_HEAD(&dst->rt_uncached);
dst->__use = 0;
dst->lastuse = jiffies;
@@ -77,7 +77,7 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops,
EXPORT_SYMBOL(dst_init);
void *dst_alloc(struct dst_ops *ops, struct net_device *dev,
- int initial_ref, int initial_obsolete, unsigned short flags)
+ int initial_obsolete, unsigned short flags)
{
struct dst_entry *dst;
@@ -90,7 +90,7 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev,
if (!dst)
return NULL;
- dst_init(dst, ops, dev, initial_ref, initial_obsolete, flags);
+ dst_init(dst, ops, dev, initial_obsolete, flags);
return dst;
}
@@ -270,7 +270,7 @@ static void __metadata_dst_init(struct metadata_dst *md_dst,
struct dst_entry *dst;
dst = &md_dst->dst;
- dst_init(dst, &dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE,
+ dst_init(dst, &dst_blackhole_ops, NULL, DST_OBSOLETE_NONE,
DST_METADATA | DST_NOCOUNT);
memset(dst + 1, 0, sizeof(*md_dst) + optslen - sizeof(*dst));
md_dst->type = type;
diff --git a/net/core/filter.c b/net/core/filter.c
index a094694899c9..21d75108c2e9 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -81,6 +81,9 @@
#include <net/xdp.h>
#include <net/mptcp.h>
#include <net/netfilter/nf_conntrack_bpf.h>
+#include <linux/un.h>
+
+#include "dev.h"
static const struct bpf_func_proto *
bpf_sk_base_func_proto(enum bpf_func_id func_id);
@@ -4207,6 +4210,20 @@ void xdp_do_flush(void)
}
EXPORT_SYMBOL_GPL(xdp_do_flush);
+#if defined(CONFIG_DEBUG_NET) && defined(CONFIG_BPF_SYSCALL)
+void xdp_do_check_flushed(struct napi_struct *napi)
+{
+ bool ret;
+
+ ret = dev_check_flush();
+ ret |= cpu_map_check_flush();
+ ret |= xsk_map_check_flush();
+
+ WARN_ONCE(ret, "Missing xdp_do_flush() invocation after NAPI by %ps\n",
+ napi->poll);
+}
+#endif
+
void bpf_clear_redirect_map(struct bpf_map *map)
{
struct bpf_redirect_info *ri;
@@ -5850,6 +5867,9 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
params->rt_metric = res.fi->fib_priority;
params->ifindex = dev->ifindex;
+ if (flags & BPF_FIB_LOOKUP_SRC)
+ params->ipv4_src = fib_result_prefsrc(net, &res);
+
/* xdp and cls_bpf programs are run in RCU-bh so
* rcu_read_lock_bh is not needed here
*/
@@ -5992,6 +6012,18 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
params->rt_metric = res.f6i->fib6_metric;
params->ifindex = dev->ifindex;
+ if (flags & BPF_FIB_LOOKUP_SRC) {
+ if (res.f6i->fib6_prefsrc.plen) {
+ *src = res.f6i->fib6_prefsrc.addr;
+ } else {
+ err = ipv6_bpf_stub->ipv6_dev_get_saddr(net, dev,
+ &fl6.daddr, 0,
+ src);
+ if (err)
+ return BPF_FIB_LKUP_RET_NO_SRC_ADDR;
+ }
+ }
+
if (flags & BPF_FIB_LOOKUP_SKIP_NEIGH)
goto set_fwd_params;
@@ -6010,7 +6042,8 @@ set_fwd_params:
#endif
#define BPF_FIB_LOOKUP_MASK (BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT | \
- BPF_FIB_LOOKUP_SKIP_NEIGH | BPF_FIB_LOOKUP_TBID)
+ BPF_FIB_LOOKUP_SKIP_NEIGH | BPF_FIB_LOOKUP_TBID | \
+ BPF_FIB_LOOKUP_SRC)
BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx,
struct bpf_fib_lookup *, params, int, plen, u32, flags)
@@ -7858,14 +7891,19 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_CGROUP_INET6_BIND:
case BPF_CGROUP_INET4_CONNECT:
case BPF_CGROUP_INET6_CONNECT:
+ case BPF_CGROUP_UNIX_CONNECT:
case BPF_CGROUP_UDP4_RECVMSG:
case BPF_CGROUP_UDP6_RECVMSG:
+ case BPF_CGROUP_UNIX_RECVMSG:
case BPF_CGROUP_UDP4_SENDMSG:
case BPF_CGROUP_UDP6_SENDMSG:
+ case BPF_CGROUP_UNIX_SENDMSG:
case BPF_CGROUP_INET4_GETPEERNAME:
case BPF_CGROUP_INET6_GETPEERNAME:
+ case BPF_CGROUP_UNIX_GETPEERNAME:
case BPF_CGROUP_INET4_GETSOCKNAME:
case BPF_CGROUP_INET6_GETSOCKNAME:
+ case BPF_CGROUP_UNIX_GETSOCKNAME:
return &bpf_sock_addr_setsockopt_proto;
default:
return NULL;
@@ -7876,14 +7914,19 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_CGROUP_INET6_BIND:
case BPF_CGROUP_INET4_CONNECT:
case BPF_CGROUP_INET6_CONNECT:
+ case BPF_CGROUP_UNIX_CONNECT:
case BPF_CGROUP_UDP4_RECVMSG:
case BPF_CGROUP_UDP6_RECVMSG:
+ case BPF_CGROUP_UNIX_RECVMSG:
case BPF_CGROUP_UDP4_SENDMSG:
case BPF_CGROUP_UDP6_SENDMSG:
+ case BPF_CGROUP_UNIX_SENDMSG:
case BPF_CGROUP_INET4_GETPEERNAME:
case BPF_CGROUP_INET6_GETPEERNAME:
+ case BPF_CGROUP_UNIX_GETPEERNAME:
case BPF_CGROUP_INET4_GETSOCKNAME:
case BPF_CGROUP_INET6_GETSOCKNAME:
+ case BPF_CGROUP_UNIX_GETSOCKNAME:
return &bpf_sock_addr_getsockopt_proto;
default:
return NULL;
@@ -8931,8 +8974,8 @@ static bool sock_addr_is_valid_access(int off, int size,
if (off % size != 0)
return false;
- /* Disallow access to IPv6 fields from IPv4 contex and vise
- * versa.
+ /* Disallow access to fields not belonging to the attach type's address
+ * family.
*/
switch (off) {
case bpf_ctx_range(struct bpf_sock_addr, user_ip4):
@@ -11752,6 +11795,27 @@ __bpf_kfunc int bpf_dynptr_from_xdp(struct xdp_buff *xdp, u64 flags,
return 0;
}
+
+__bpf_kfunc int bpf_sock_addr_set_sun_path(struct bpf_sock_addr_kern *sa_kern,
+ const u8 *sun_path, u32 sun_path__sz)
+{
+ struct sockaddr_un *un;
+
+ if (sa_kern->sk->sk_family != AF_UNIX)
+ return -EINVAL;
+
+ /* We do not allow changing the address to unnamed or larger than the
+ * maximum allowed address size for a unix sockaddr.
+ */
+ if (sun_path__sz == 0 || sun_path__sz > UNIX_PATH_MAX)
+ return -EINVAL;
+
+ un = (struct sockaddr_un *)sa_kern->uaddr;
+ memcpy(un->sun_path, sun_path, sun_path__sz);
+ sa_kern->uaddrlen = offsetof(struct sockaddr_un, sun_path) + sun_path__sz;
+
+ return 0;
+}
__diag_pop();
int bpf_dynptr_from_skb_rdonly(struct sk_buff *skb, u64 flags,
@@ -11776,6 +11840,10 @@ BTF_SET8_START(bpf_kfunc_check_set_xdp)
BTF_ID_FLAGS(func, bpf_dynptr_from_xdp)
BTF_SET8_END(bpf_kfunc_check_set_xdp)
+BTF_SET8_START(bpf_kfunc_check_set_sock_addr)
+BTF_ID_FLAGS(func, bpf_sock_addr_set_sun_path)
+BTF_SET8_END(bpf_kfunc_check_set_sock_addr)
+
static const struct btf_kfunc_id_set bpf_kfunc_set_skb = {
.owner = THIS_MODULE,
.set = &bpf_kfunc_check_set_skb,
@@ -11786,6 +11854,11 @@ static const struct btf_kfunc_id_set bpf_kfunc_set_xdp = {
.set = &bpf_kfunc_check_set_xdp,
};
+static const struct btf_kfunc_id_set bpf_kfunc_set_sock_addr = {
+ .owner = THIS_MODULE,
+ .set = &bpf_kfunc_check_set_sock_addr,
+};
+
static int __init bpf_kfunc_init(void)
{
int ret;
@@ -11800,7 +11873,9 @@ static int __init bpf_kfunc_init(void)
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_XMIT, &bpf_kfunc_set_skb);
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_SEG6LOCAL, &bpf_kfunc_set_skb);
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_NETFILTER, &bpf_kfunc_set_skb);
- return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &bpf_kfunc_set_xdp);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &bpf_kfunc_set_xdp);
+ return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
+ &bpf_kfunc_set_sock_addr);
}
late_initcall(bpf_kfunc_init);
diff --git a/net/core/gso_test.c b/net/core/gso_test.c
new file mode 100644
index 000000000000..ceb684be4cbf
--- /dev/null
+++ b/net/core/gso_test.c
@@ -0,0 +1,278 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <kunit/test.h>
+#include <linux/skbuff.h>
+
+static const char hdr[] = "abcdefgh";
+#define GSO_TEST_SIZE 1000
+
+static void __init_skb(struct sk_buff *skb)
+{
+ skb_reset_mac_header(skb);
+ memcpy(skb_mac_header(skb), hdr, sizeof(hdr));
+
+ /* skb_segment expects skb->data at start of payload */
+ skb_pull(skb, sizeof(hdr));
+ skb_reset_network_header(skb);
+ skb_reset_transport_header(skb);
+
+ /* proto is arbitrary, as long as not ETH_P_TEB or vlan */
+ skb->protocol = htons(ETH_P_ATALK);
+ skb_shinfo(skb)->gso_size = GSO_TEST_SIZE;
+}
+
+enum gso_test_nr {
+ GSO_TEST_LINEAR,
+ GSO_TEST_NO_GSO,
+ GSO_TEST_FRAGS,
+ GSO_TEST_FRAGS_PURE,
+ GSO_TEST_GSO_PARTIAL,
+ GSO_TEST_FRAG_LIST,
+ GSO_TEST_FRAG_LIST_PURE,
+ GSO_TEST_FRAG_LIST_NON_UNIFORM,
+ GSO_TEST_GSO_BY_FRAGS,
+};
+
+struct gso_test_case {
+ enum gso_test_nr id;
+ const char *name;
+
+ /* input */
+ unsigned int linear_len;
+ unsigned int nr_frags;
+ const unsigned int *frags;
+ unsigned int nr_frag_skbs;
+ const unsigned int *frag_skbs;
+
+ /* output as expected */
+ unsigned int nr_segs;
+ const unsigned int *segs;
+};
+
+static struct gso_test_case cases[] = {
+ {
+ .id = GSO_TEST_NO_GSO,
+ .name = "no_gso",
+ .linear_len = GSO_TEST_SIZE,
+ .nr_segs = 1,
+ .segs = (const unsigned int[]) { GSO_TEST_SIZE },
+ },
+ {
+ .id = GSO_TEST_LINEAR,
+ .name = "linear",
+ .linear_len = GSO_TEST_SIZE + GSO_TEST_SIZE + 1,
+ .nr_segs = 3,
+ .segs = (const unsigned int[]) { GSO_TEST_SIZE, GSO_TEST_SIZE, 1 },
+ },
+ {
+ .id = GSO_TEST_FRAGS,
+ .name = "frags",
+ .linear_len = GSO_TEST_SIZE,
+ .nr_frags = 2,
+ .frags = (const unsigned int[]) { GSO_TEST_SIZE, 1 },
+ .nr_segs = 3,
+ .segs = (const unsigned int[]) { GSO_TEST_SIZE, GSO_TEST_SIZE, 1 },
+ },
+ {
+ .id = GSO_TEST_FRAGS_PURE,
+ .name = "frags_pure",
+ .nr_frags = 3,
+ .frags = (const unsigned int[]) { GSO_TEST_SIZE, GSO_TEST_SIZE, 2 },
+ .nr_segs = 3,
+ .segs = (const unsigned int[]) { GSO_TEST_SIZE, GSO_TEST_SIZE, 2 },
+ },
+ {
+ .id = GSO_TEST_GSO_PARTIAL,
+ .name = "gso_partial",
+ .linear_len = GSO_TEST_SIZE,
+ .nr_frags = 2,
+ .frags = (const unsigned int[]) { GSO_TEST_SIZE, 3 },
+ .nr_segs = 2,
+ .segs = (const unsigned int[]) { 2 * GSO_TEST_SIZE, 3 },
+ },
+ {
+ /* commit 89319d3801d1: frag_list on mss boundaries */
+ .id = GSO_TEST_FRAG_LIST,
+ .name = "frag_list",
+ .linear_len = GSO_TEST_SIZE,
+ .nr_frag_skbs = 2,
+ .frag_skbs = (const unsigned int[]) { GSO_TEST_SIZE, GSO_TEST_SIZE },
+ .nr_segs = 3,
+ .segs = (const unsigned int[]) { GSO_TEST_SIZE, GSO_TEST_SIZE, GSO_TEST_SIZE },
+ },
+ {
+ .id = GSO_TEST_FRAG_LIST_PURE,
+ .name = "frag_list_pure",
+ .nr_frag_skbs = 2,
+ .frag_skbs = (const unsigned int[]) { GSO_TEST_SIZE, GSO_TEST_SIZE },
+ .nr_segs = 2,
+ .segs = (const unsigned int[]) { GSO_TEST_SIZE, GSO_TEST_SIZE },
+ },
+ {
+ /* commit 43170c4e0ba7: GRO of frag_list trains */
+ .id = GSO_TEST_FRAG_LIST_NON_UNIFORM,
+ .name = "frag_list_non_uniform",
+ .linear_len = GSO_TEST_SIZE,
+ .nr_frag_skbs = 4,
+ .frag_skbs = (const unsigned int[]) { GSO_TEST_SIZE, 1, GSO_TEST_SIZE, 2 },
+ .nr_segs = 4,
+ .segs = (const unsigned int[]) { GSO_TEST_SIZE, GSO_TEST_SIZE, GSO_TEST_SIZE, 3 },
+ },
+ {
+ /* commit 3953c46c3ac7 ("sk_buff: allow segmenting based on frag sizes") and
+ * commit 90017accff61 ("sctp: Add GSO support")
+ *
+ * "there will be a cover skb with protocol headers and
+ * children ones containing the actual segments"
+ */
+ .id = GSO_TEST_GSO_BY_FRAGS,
+ .name = "gso_by_frags",
+ .nr_frag_skbs = 4,
+ .frag_skbs = (const unsigned int[]) { 100, 200, 300, 400 },
+ .nr_segs = 4,
+ .segs = (const unsigned int[]) { 100, 200, 300, 400 },
+ },
+};
+
+static void gso_test_case_to_desc(struct gso_test_case *t, char *desc)
+{
+ sprintf(desc, "%s", t->name);
+}
+
+KUNIT_ARRAY_PARAM(gso_test, cases, gso_test_case_to_desc);
+
+static void gso_test_func(struct kunit *test)
+{
+ const int shinfo_size = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+ struct sk_buff *skb, *segs, *cur, *next, *last;
+ const struct gso_test_case *tcase;
+ netdev_features_t features;
+ struct page *page;
+ int i;
+
+ tcase = test->param_value;
+
+ page = alloc_page(GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, page);
+ skb = build_skb(page_address(page), sizeof(hdr) + tcase->linear_len + shinfo_size);
+ KUNIT_ASSERT_NOT_NULL(test, skb);
+ __skb_put(skb, sizeof(hdr) + tcase->linear_len);
+
+ __init_skb(skb);
+
+ if (tcase->nr_frags) {
+ unsigned int pg_off = 0;
+
+ page = alloc_page(GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, page);
+ page_ref_add(page, tcase->nr_frags - 1);
+
+ for (i = 0; i < tcase->nr_frags; i++) {
+ skb_fill_page_desc(skb, i, page, pg_off, tcase->frags[i]);
+ pg_off += tcase->frags[i];
+ }
+
+ KUNIT_ASSERT_LE(test, pg_off, PAGE_SIZE);
+
+ skb->data_len = pg_off;
+ skb->len += skb->data_len;
+ skb->truesize += skb->data_len;
+ }
+
+ if (tcase->frag_skbs) {
+ unsigned int total_size = 0, total_true_size = 0, alloc_size = 0;
+ struct sk_buff *frag_skb, *prev = NULL;
+
+ page = alloc_page(GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, page);
+ page_ref_add(page, tcase->nr_frag_skbs - 1);
+
+ for (i = 0; i < tcase->nr_frag_skbs; i++) {
+ unsigned int frag_size;
+
+ frag_size = tcase->frag_skbs[i];
+ frag_skb = build_skb(page_address(page) + alloc_size,
+ frag_size + shinfo_size);
+ KUNIT_ASSERT_NOT_NULL(test, frag_skb);
+ __skb_put(frag_skb, frag_size);
+
+ if (prev)
+ prev->next = frag_skb;
+ else
+ skb_shinfo(skb)->frag_list = frag_skb;
+ prev = frag_skb;
+
+ total_size += frag_size;
+ total_true_size += frag_skb->truesize;
+ alloc_size += frag_size + shinfo_size;
+ }
+
+ KUNIT_ASSERT_LE(test, alloc_size, PAGE_SIZE);
+
+ skb->len += total_size;
+ skb->data_len += total_size;
+ skb->truesize += total_true_size;
+
+ if (tcase->id == GSO_TEST_GSO_BY_FRAGS)
+ skb_shinfo(skb)->gso_size = GSO_BY_FRAGS;
+ }
+
+ features = NETIF_F_SG | NETIF_F_HW_CSUM;
+ if (tcase->id == GSO_TEST_GSO_PARTIAL)
+ features |= NETIF_F_GSO_PARTIAL;
+
+ /* TODO: this should also work with SG,
+ * rather than hit BUG_ON(i >= nfrags)
+ */
+ if (tcase->id == GSO_TEST_FRAG_LIST_NON_UNIFORM)
+ features &= ~NETIF_F_SG;
+
+ segs = skb_segment(skb, features);
+ if (IS_ERR(segs)) {
+ KUNIT_FAIL(test, "segs error %lld", PTR_ERR(segs));
+ goto free_gso_skb;
+ } else if (!segs) {
+ KUNIT_FAIL(test, "no segments");
+ goto free_gso_skb;
+ }
+
+ last = segs->prev;
+ for (cur = segs, i = 0; cur; cur = next, i++) {
+ next = cur->next;
+
+ KUNIT_ASSERT_EQ(test, cur->len, sizeof(hdr) + tcase->segs[i]);
+
+ /* segs have skb->data pointing to the mac header */
+ KUNIT_ASSERT_PTR_EQ(test, skb_mac_header(cur), cur->data);
+ KUNIT_ASSERT_PTR_EQ(test, skb_network_header(cur), cur->data + sizeof(hdr));
+
+ /* header was copied to all segs */
+ KUNIT_ASSERT_EQ(test, memcmp(skb_mac_header(cur), hdr, sizeof(hdr)), 0);
+
+ /* last seg can be found through segs->prev pointer */
+ if (!next)
+ KUNIT_ASSERT_PTR_EQ(test, cur, last);
+
+ consume_skb(cur);
+ }
+
+ KUNIT_ASSERT_EQ(test, i, tcase->nr_segs);
+
+free_gso_skb:
+ consume_skb(skb);
+}
+
+static struct kunit_case gso_test_cases[] = {
+ KUNIT_CASE_PARAM(gso_test_func, gso_test_gen_params),
+ {}
+};
+
+static struct kunit_suite gso_test_suite = {
+ .name = "net_core_gso",
+ .test_cases = gso_test_cases,
+};
+
+kunit_test_suite(gso_test_suite);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("KUnit tests for segmentation offload");
diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c
index d6a70aeaa503..d22f0919821e 100644
--- a/net/core/netclassid_cgroup.c
+++ b/net/core/netclassid_cgroup.c
@@ -88,6 +88,12 @@ static void update_classid_task(struct task_struct *p, u32 classid)
};
unsigned int fd = 0;
+ /* Only update the leader task, when many threads in this task,
+ * so it can avoid the useless traversal.
+ */
+ if (p != p->group_leader)
+ return;
+
do {
task_lock(p);
fd = iterate_fd(p->files, fd, update_classid_sock, &ctx);
diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c
index c1aea8b756b6..fe61f85bcf33 100644
--- a/net/core/netdev-genl.c
+++ b/net/core/netdev-genl.c
@@ -5,6 +5,7 @@
#include <linux/rtnetlink.h>
#include <net/net_namespace.h>
#include <net/sock.h>
+#include <net/xdp.h>
#include "netdev-genl-gen.h"
@@ -12,15 +13,24 @@ static int
netdev_nl_dev_fill(struct net_device *netdev, struct sk_buff *rsp,
const struct genl_info *info)
{
+ u64 xdp_rx_meta = 0;
void *hdr;
hdr = genlmsg_iput(rsp, info);
if (!hdr)
return -EMSGSIZE;
+#define XDP_METADATA_KFUNC(_, flag, __, xmo) \
+ if (netdev->xdp_metadata_ops && netdev->xdp_metadata_ops->xmo) \
+ xdp_rx_meta |= flag;
+XDP_METADATA_KFUNC_xxx
+#undef XDP_METADATA_KFUNC
+
if (nla_put_u32(rsp, NETDEV_A_DEV_IFINDEX, netdev->ifindex) ||
nla_put_u64_64bit(rsp, NETDEV_A_DEV_XDP_FEATURES,
- netdev->xdp_features, NETDEV_A_DEV_PAD)) {
+ netdev->xdp_features, NETDEV_A_DEV_PAD) ||
+ nla_put_u64_64bit(rsp, NETDEV_A_DEV_XDP_RX_METADATA_FEATURES,
+ xdp_rx_meta, NETDEV_A_DEV_PAD)) {
genlmsg_cancel(rsp, hdr);
return -EINVAL;
}
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 77cb75e63aca..5e409b98aba0 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -211,10 +211,6 @@ static int page_pool_init(struct page_pool *pool,
*/
}
- if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT &&
- pool->p.flags & PP_FLAG_PAGE_FRAG)
- return -EINVAL;
-
#ifdef CONFIG_PAGE_POOL_STATS
pool->recycle_stats = alloc_percpu(struct page_pool_recycle_stats);
if (!pool->recycle_stats)
@@ -359,12 +355,20 @@ static bool page_pool_dma_map(struct page_pool *pool, struct page *page)
if (dma_mapping_error(pool->p.dev, dma))
return false;
- page_pool_set_dma_addr(page, dma);
+ if (page_pool_set_dma_addr(page, dma))
+ goto unmap_failed;
if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
page_pool_dma_sync_for_device(pool, page, pool->p.max_len);
return true;
+
+unmap_failed:
+ WARN_ON_ONCE("unexpected DMA address, please report to netdev@");
+ dma_unmap_page_attrs(pool->p.dev, dma,
+ PAGE_SIZE << pool->p.order, pool->p.dma_dir,
+ DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
+ return false;
}
static void page_pool_set_pp_info(struct page_pool *pool,
@@ -372,6 +376,14 @@ static void page_pool_set_pp_info(struct page_pool *pool,
{
page->pp = pool;
page->pp_magic |= PP_SIGNATURE;
+
+ /* Ensuring all pages have been split into one fragment initially:
+ * page_pool_set_pp_info() is only called once for every page when it
+ * is allocated from the page allocator and page_pool_fragment_page()
+ * is dirtying the same cache line as the page->pp_magic above, so
+ * the overhead is negligible.
+ */
+ page_pool_fragment_page(page, 1);
if (pool->p.init_callback)
pool->p.init_callback(page, pool->p.init_arg);
}
@@ -668,7 +680,7 @@ void page_pool_put_page_bulk(struct page_pool *pool, void **data,
struct page *page = virt_to_head_page(data[i]);
/* It is not the last user for the page frag case */
- if (!page_pool_is_last_frag(pool, page))
+ if (!page_pool_is_last_frag(page))
continue;
page = __page_pool_put_page(pool, page, -1, false);
@@ -744,8 +756,7 @@ struct page *page_pool_alloc_frag(struct page_pool *pool,
unsigned int max_size = PAGE_SIZE << pool->p.order;
struct page *page = pool->frag_page;
- if (WARN_ON(!(pool->p.flags & PP_FLAG_PAGE_FRAG) ||
- size > max_size))
+ if (WARN_ON(size > max_size))
return NULL;
size = ALIGN(size, dma_get_cache_alignment());
@@ -798,7 +809,7 @@ static void page_pool_empty_ring(struct page_pool *pool)
}
}
-static void page_pool_free(struct page_pool *pool)
+static void __page_pool_destroy(struct page_pool *pool)
{
if (pool->disconnect)
pool->disconnect(pool);
@@ -849,7 +860,7 @@ static int page_pool_release(struct page_pool *pool)
page_pool_scrub(pool);
inflight = page_pool_inflight(pool);
if (!inflight)
- page_pool_free(pool);
+ __page_pool_destroy(pool);
return inflight;
}
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 4d1696677c48..8afcfadf8d5a 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -200,6 +200,7 @@
pf(VID_RND) /* Random VLAN ID */ \
pf(SVID_RND) /* Random SVLAN ID */ \
pf(NODE) /* Node memory alloc*/ \
+ pf(SHARED) /* Shared SKB */ \
#define pf(flag) flag##_SHIFT,
enum pkt_flags {
@@ -1198,7 +1199,8 @@ static ssize_t pktgen_if_write(struct file *file,
((pkt_dev->xmit_mode == M_NETIF_RECEIVE) ||
!(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING)))
return -ENOTSUPP;
- if (value > 0 && pkt_dev->n_imix_entries > 0)
+ if (value > 0 && (pkt_dev->n_imix_entries > 0 ||
+ !(pkt_dev->flags & F_SHARED)))
return -EINVAL;
i += len;
@@ -1257,6 +1259,10 @@ static ssize_t pktgen_if_write(struct file *file,
((pkt_dev->xmit_mode == M_START_XMIT) &&
(!(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING)))))
return -ENOTSUPP;
+
+ if (value > 1 && !(pkt_dev->flags & F_SHARED))
+ return -EINVAL;
+
pkt_dev->burst = value < 1 ? 1 : value;
sprintf(pg_result, "OK: burst=%u", pkt_dev->burst);
return count;
@@ -1318,9 +1324,10 @@ static ssize_t pktgen_if_write(struct file *file,
return count;
}
if (!strcmp(name, "flag")) {
+ bool disable = false;
__u32 flag;
char f[32];
- bool disable = false;
+ char *end;
memset(f, 0, 32);
len = strn_len(&user_buffer[i], sizeof(f) - 1);
@@ -1332,28 +1339,42 @@ static ssize_t pktgen_if_write(struct file *file,
i += len;
flag = pktgen_read_flag(f, &disable);
-
if (flag) {
- if (disable)
+ if (disable) {
+ /* If "clone_skb", or "burst" parameters are
+ * configured, it means that the skb still
+ * needs to be referenced by the pktgen, so
+ * the skb must be shared.
+ */
+ if (flag == F_SHARED && (pkt_dev->clone_skb ||
+ pkt_dev->burst > 1))
+ return -EINVAL;
pkt_dev->flags &= ~flag;
- else
+ } else {
pkt_dev->flags |= flag;
- } else {
- sprintf(pg_result,
- "Flag -:%s:- unknown\nAvailable flags, (prepend ! to un-set flag):\n%s",
- f,
- "IPSRC_RND, IPDST_RND, UDPSRC_RND, UDPDST_RND, "
- "MACSRC_RND, MACDST_RND, TXSIZE_RND, IPV6, "
- "MPLS_RND, VID_RND, SVID_RND, FLOW_SEQ, "
- "QUEUE_MAP_RND, QUEUE_MAP_CPU, UDPCSUM, "
- "NO_TIMESTAMP, "
-#ifdef CONFIG_XFRM
- "IPSEC, "
-#endif
- "NODE_ALLOC\n");
+ }
+
+ sprintf(pg_result, "OK: flags=0x%x", pkt_dev->flags);
return count;
}
- sprintf(pg_result, "OK: flags=0x%x", pkt_dev->flags);
+
+ /* Unknown flag */
+ end = pkt_dev->result + sizeof(pkt_dev->result);
+ pg_result += sprintf(pg_result,
+ "Flag -:%s:- unknown\n"
+ "Available flags, (prepend ! to un-set flag):\n", f);
+
+ for (int n = 0; n < NR_PKT_FLAGS && pg_result < end; n++) {
+ if (!IS_ENABLED(CONFIG_XFRM) && n == IPSEC_SHIFT)
+ continue;
+ pg_result += snprintf(pg_result, end - pg_result,
+ "%s, ", pkt_flag_names[n]);
+ }
+ if (!WARN_ON_ONCE(pg_result >= end)) {
+ /* Remove the comma and whitespace at the end */
+ *(pg_result - 2) = '\0';
+ }
+
return count;
}
if (!strcmp(name, "dst_min") || !strcmp(name, "dst")) {
@@ -3440,12 +3461,24 @@ static void pktgen_wait_for_skb(struct pktgen_dev *pkt_dev)
static void pktgen_xmit(struct pktgen_dev *pkt_dev)
{
- unsigned int burst = READ_ONCE(pkt_dev->burst);
+ bool skb_shared = !!(READ_ONCE(pkt_dev->flags) & F_SHARED);
struct net_device *odev = pkt_dev->odev;
struct netdev_queue *txq;
+ unsigned int burst = 1;
struct sk_buff *skb;
+ int clone_skb = 0;
int ret;
+ /* If 'skb_shared' is false, the read of possible
+ * new values (if any) for 'burst' and 'clone_skb' will be skipped to
+ * prevent some concurrent changes from slipping in. And the stabilized
+ * config will be read in during the next run of pktgen_xmit.
+ */
+ if (skb_shared) {
+ burst = READ_ONCE(pkt_dev->burst);
+ clone_skb = READ_ONCE(pkt_dev->clone_skb);
+ }
+
/* If device is offline, then don't send */
if (unlikely(!netif_running(odev) || !netif_carrier_ok(odev))) {
pktgen_stop_device(pkt_dev);
@@ -3462,7 +3495,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
/* If no skb or clone count exhausted then get new one */
if (!pkt_dev->skb || (pkt_dev->last_ok &&
- ++pkt_dev->clone_count >= pkt_dev->clone_skb)) {
+ ++pkt_dev->clone_count >= clone_skb)) {
/* build a new pkt */
kfree_skb(pkt_dev->skb);
@@ -3483,7 +3516,8 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
if (pkt_dev->xmit_mode == M_NETIF_RECEIVE) {
skb = pkt_dev->skb;
skb->protocol = eth_type_trans(skb, skb->dev);
- refcount_add(burst, &skb->users);
+ if (skb_shared)
+ refcount_add(burst, &skb->users);
local_bh_disable();
do {
ret = netif_receive_skb(skb);
@@ -3491,6 +3525,10 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
pkt_dev->errors++;
pkt_dev->sofar++;
pkt_dev->seq_num++;
+ if (unlikely(!skb_shared)) {
+ pkt_dev->skb = NULL;
+ break;
+ }
if (refcount_read(&skb->users) != burst) {
/* skb was queued by rps/rfs or taps,
* so cannot reuse this skb
@@ -3509,9 +3547,14 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
goto out; /* Skips xmit_mode M_START_XMIT */
} else if (pkt_dev->xmit_mode == M_QUEUE_XMIT) {
local_bh_disable();
- refcount_inc(&pkt_dev->skb->users);
+ if (skb_shared)
+ refcount_inc(&pkt_dev->skb->users);
ret = dev_queue_xmit(pkt_dev->skb);
+
+ if (!skb_shared && dev_xmit_complete(ret))
+ pkt_dev->skb = NULL;
+
switch (ret) {
case NET_XMIT_SUCCESS:
pkt_dev->sofar++;
@@ -3549,11 +3592,15 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
pkt_dev->last_ok = 0;
goto unlock;
}
- refcount_add(burst, &pkt_dev->skb->users);
+ if (skb_shared)
+ refcount_add(burst, &pkt_dev->skb->users);
xmit_more:
ret = netdev_start_xmit(pkt_dev->skb, odev, txq, --burst > 0);
+ if (!skb_shared && dev_xmit_complete(ret))
+ pkt_dev->skb = NULL;
+
switch (ret) {
case NETDEV_TX_OK:
pkt_dev->last_ok = 1;
@@ -3575,7 +3622,8 @@ xmit_more:
fallthrough;
case NETDEV_TX_BUSY:
/* Retry it next time */
- refcount_dec(&(pkt_dev->skb->users));
+ if (skb_shared)
+ refcount_dec(&pkt_dev->skb->users);
pkt_dev->last_ok = 0;
}
if (unlikely(burst))
@@ -3588,7 +3636,8 @@ out:
/* If pkt_dev->count is zero, then run forever */
if ((pkt_dev->count != 0) && (pkt_dev->sofar >= pkt_dev->count)) {
- pktgen_wait_for_skb(pkt_dev);
+ if (pkt_dev->skb)
+ pktgen_wait_for_skb(pkt_dev);
/* Done with this */
pktgen_stop_device(pkt_dev);
@@ -3771,6 +3820,7 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname)
pkt_dev->svlan_id = 0xffff;
pkt_dev->burst = 1;
pkt_dev->node = NUMA_NO_NODE;
+ pkt_dev->flags = F_SHARED; /* SKB shared by default */
err = pktgen_setup_dev(t->net, pkt_dev, ifname);
if (err)
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 53c377d054f0..e8431c6c8490 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -57,6 +57,7 @@
#if IS_ENABLED(CONFIG_IPV6)
#include <net/addrconf.h>
#endif
+#include <linux/dpll.h>
#include "dev.h"
@@ -1055,6 +1056,15 @@ static size_t rtnl_devlink_port_size(const struct net_device *dev)
return size;
}
+static size_t rtnl_dpll_pin_size(const struct net_device *dev)
+{
+ size_t size = nla_total_size(0); /* nest IFLA_DPLL_PIN */
+
+ size += dpll_msg_pin_handle_size(netdev_dpll_pin(dev));
+
+ return size;
+}
+
static noinline size_t if_nlmsg_size(const struct net_device *dev,
u32 ext_filter_mask)
{
@@ -1111,6 +1121,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,
+ rtnl_prop_list_size(dev)
+ nla_total_size(MAX_ADDR_LEN) /* IFLA_PERM_ADDRESS */
+ rtnl_devlink_port_size(dev)
+ + rtnl_dpll_pin_size(dev)
+ 0;
}
@@ -1774,6 +1785,28 @@ nest_cancel:
return ret;
}
+static int rtnl_fill_dpll_pin(struct sk_buff *skb,
+ const struct net_device *dev)
+{
+ struct nlattr *dpll_pin_nest;
+ int ret;
+
+ dpll_pin_nest = nla_nest_start(skb, IFLA_DPLL_PIN);
+ if (!dpll_pin_nest)
+ return -EMSGSIZE;
+
+ ret = dpll_msg_add_pin_handle(skb, netdev_dpll_pin(dev));
+ if (ret < 0)
+ goto nest_cancel;
+
+ nla_nest_end(skb, dpll_pin_nest);
+ return 0;
+
+nest_cancel:
+ nla_nest_cancel(skb, dpll_pin_nest);
+ return ret;
+}
+
static int rtnl_fill_ifinfo(struct sk_buff *skb,
struct net_device *dev, struct net *src_net,
int type, u32 pid, u32 seq, u32 change,
@@ -1916,6 +1949,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,
if (rtnl_fill_devlink_port(skb, dev))
goto nla_put_failure;
+ if (rtnl_fill_dpll_pin(skb, dev))
+ goto nla_put_failure;
+
nlmsg_end(skb, nlh);
return 0;
@@ -4331,13 +4367,6 @@ int ndo_dflt_fdb_del(struct ndmsg *ndm,
}
EXPORT_SYMBOL(ndo_dflt_fdb_del);
-static const struct nla_policy fdb_del_bulk_policy[NDA_MAX + 1] = {
- [NDA_VLAN] = { .type = NLA_U16 },
- [NDA_IFINDEX] = NLA_POLICY_MIN(NLA_S32, 1),
- [NDA_NDM_STATE_MASK] = { .type = NLA_U16 },
- [NDA_NDM_FLAGS_MASK] = { .type = NLA_U8 },
-};
-
static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
@@ -4358,8 +4387,10 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
err = nlmsg_parse_deprecated(nlh, sizeof(*ndm), tb, NDA_MAX,
NULL, extack);
} else {
- err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX,
- fdb_del_bulk_policy, extack);
+ /* For bulk delete, the drivers will parse the message with
+ * policy.
+ */
+ err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL, extack);
}
if (err < 0)
return err;
@@ -4382,6 +4413,10 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
return -EINVAL;
}
addr = nla_data(tb[NDA_LLADDR]);
+
+ err = fdb_vid_parse(tb[NDA_VLAN], &vid, extack);
+ if (err)
+ return err;
}
if (dev->type != ARPHRD_ETHER) {
@@ -4389,10 +4424,6 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
return -EINVAL;
}
- err = fdb_vid_parse(tb[NDA_VLAN], &vid, extack);
- if (err)
- return err;
-
err = -EOPNOTSUPP;
/* Support fdb on master device the net/bridge default case */
@@ -4406,8 +4437,7 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid, extack);
} else {
if (ops->ndo_fdb_del_bulk)
- err = ops->ndo_fdb_del_bulk(ndm, tb, dev, vid,
- extack);
+ err = ops->ndo_fdb_del_bulk(nlh, dev, extack);
}
if (err)
@@ -4428,8 +4458,7 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
/* in case err was cleared by NTF_MASTER call */
err = -EOPNOTSUPP;
if (ops->ndo_fdb_del_bulk)
- err = ops->ndo_fdb_del_bulk(ndm, tb, dev, vid,
- extack);
+ err = ops->ndo_fdb_del_bulk(nlh, dev, extack);
}
if (!err) {
@@ -6190,6 +6219,93 @@ out:
return skb->len;
}
+static int rtnl_validate_mdb_entry_get(const struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ struct br_mdb_entry *entry = nla_data(attr);
+
+ if (nla_len(attr) != sizeof(struct br_mdb_entry)) {
+ NL_SET_ERR_MSG_ATTR(extack, attr, "Invalid attribute length");
+ return -EINVAL;
+ }
+
+ if (entry->ifindex) {
+ NL_SET_ERR_MSG(extack, "Entry ifindex cannot be specified");
+ return -EINVAL;
+ }
+
+ if (entry->state) {
+ NL_SET_ERR_MSG(extack, "Entry state cannot be specified");
+ return -EINVAL;
+ }
+
+ if (entry->flags) {
+ NL_SET_ERR_MSG(extack, "Entry flags cannot be specified");
+ return -EINVAL;
+ }
+
+ if (entry->vid >= VLAN_VID_MASK) {
+ NL_SET_ERR_MSG(extack, "Invalid entry VLAN id");
+ return -EINVAL;
+ }
+
+ if (entry->addr.proto != htons(ETH_P_IP) &&
+ entry->addr.proto != htons(ETH_P_IPV6) &&
+ entry->addr.proto != 0) {
+ NL_SET_ERR_MSG(extack, "Unknown entry protocol");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static const struct nla_policy mdba_get_policy[MDBA_GET_ENTRY_MAX + 1] = {
+ [MDBA_GET_ENTRY] = NLA_POLICY_VALIDATE_FN(NLA_BINARY,
+ rtnl_validate_mdb_entry_get,
+ sizeof(struct br_mdb_entry)),
+ [MDBA_GET_ENTRY_ATTRS] = { .type = NLA_NESTED },
+};
+
+static int rtnl_mdb_get(struct sk_buff *in_skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[MDBA_GET_ENTRY_MAX + 1];
+ struct net *net = sock_net(in_skb->sk);
+ struct br_port_msg *bpm;
+ struct net_device *dev;
+ int err;
+
+ err = nlmsg_parse(nlh, sizeof(struct br_port_msg), tb,
+ MDBA_GET_ENTRY_MAX, mdba_get_policy, extack);
+ if (err)
+ return err;
+
+ bpm = nlmsg_data(nlh);
+ if (!bpm->ifindex) {
+ NL_SET_ERR_MSG(extack, "Invalid ifindex");
+ return -EINVAL;
+ }
+
+ dev = __dev_get_by_index(net, bpm->ifindex);
+ if (!dev) {
+ NL_SET_ERR_MSG(extack, "Device doesn't exist");
+ return -ENODEV;
+ }
+
+ if (NL_REQ_ATTR_CHECK(extack, NULL, tb, MDBA_GET_ENTRY)) {
+ NL_SET_ERR_MSG(extack, "Missing MDBA_GET_ENTRY attribute");
+ return -EINVAL;
+ }
+
+ if (!dev->netdev_ops->ndo_mdb_get) {
+ NL_SET_ERR_MSG(extack, "Device does not support MDB operations");
+ return -EOPNOTSUPP;
+ }
+
+ return dev->netdev_ops->ndo_mdb_get(dev, tb, NETLINK_CB(in_skb).portid,
+ nlh->nlmsg_seq, extack);
+}
+
static int rtnl_validate_mdb_entry(const struct nlattr *attr,
struct netlink_ext_ack *extack)
{
@@ -6566,7 +6682,7 @@ void __init rtnetlink_init(void)
0);
rtnl_register(PF_UNSPEC, RTM_SETSTATS, rtnl_stats_set, NULL, 0);
- rtnl_register(PF_BRIDGE, RTM_GETMDB, NULL, rtnl_mdb_dump, 0);
+ rtnl_register(PF_BRIDGE, RTM_GETMDB, rtnl_mdb_get, rtnl_mdb_dump, 0);
rtnl_register(PF_BRIDGE, RTM_NEWMDB, rtnl_mdb_add, NULL, 0);
rtnl_register(PF_BRIDGE, RTM_DELMDB, rtnl_mdb_del, NULL, 0);
}
diff --git a/net/core/selftests.c b/net/core/selftests.c
index acb1ee97bbd3..94fe3146a959 100644
--- a/net/core/selftests.c
+++ b/net/core/selftests.c
@@ -397,14 +397,11 @@ EXPORT_SYMBOL_GPL(net_selftest_get_count);
void net_selftest_get_strings(u8 *data)
{
- u8 *p = data;
int i;
- for (i = 0; i < net_selftest_get_count(); i++) {
- snprintf(p, ETH_GSTRING_LEN, "%2d. %s", i + 1,
- net_selftests[i].name);
- p += ETH_GSTRING_LEN;
- }
+ for (i = 0; i < net_selftest_get_count(); i++)
+ ethtool_sprintf(&data, "%2d. %s", i + 1,
+ net_selftests[i].name);
}
EXPORT_SYMBOL_GPL(net_selftest_get_strings);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 2bfa6a7ba244..b157efea5dea 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -848,6 +848,8 @@ EXPORT_SYMBOL(__napi_alloc_skb);
void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
int size, unsigned int truesize)
{
+ DEBUG_NET_WARN_ON_ONCE(size > truesize);
+
skb_fill_page_desc(skb, i, page, off, size);
skb->len += size;
skb->data_len += size;
@@ -860,6 +862,8 @@ void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size,
{
skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+ DEBUG_NET_WARN_ON_ONCE(size > truesize);
+
skb_frag_size_add(frag, size);
skb->len += size;
skb->data_len += size;
@@ -3719,10 +3723,19 @@ EXPORT_SYMBOL(skb_dequeue_tail);
void skb_queue_purge_reason(struct sk_buff_head *list,
enum skb_drop_reason reason)
{
- struct sk_buff *skb;
+ struct sk_buff_head tmp;
+ unsigned long flags;
+
+ if (skb_queue_empty_lockless(list))
+ return;
- while ((skb = skb_dequeue(list)) != NULL)
- kfree_skb_reason(skb, reason);
+ __skb_queue_head_init(&tmp);
+
+ spin_lock_irqsave(&list->lock, flags);
+ skb_queue_splice_init(list, &tmp);
+ spin_unlock_irqrestore(&list->lock, flags);
+
+ __skb_queue_purge_reason(&tmp, reason);
}
EXPORT_SYMBOL(skb_queue_purge_reason);
@@ -4255,6 +4268,7 @@ static void skb_ts_finish(struct ts_config *conf, struct ts_state *state)
unsigned int skb_find_text(struct sk_buff *skb, unsigned int from,
unsigned int to, struct ts_config *config)
{
+ unsigned int patlen = config->ops->get_pattern_len(config);
struct ts_state state;
unsigned int ret;
@@ -4266,7 +4280,7 @@ unsigned int skb_find_text(struct sk_buff *skb, unsigned int from,
skb_prepare_seq_read(skb, from, to, TS_SKB_CB(&state));
ret = textsearch_find(config, &state);
- return (ret <= to - from ? ret : UINT_MAX);
+ return (ret + patlen <= to - from ? ret : UINT_MAX);
}
EXPORT_SYMBOL(skb_find_text);
@@ -5150,6 +5164,9 @@ struct sk_buff *sock_dequeue_err_skb(struct sock *sk)
bool icmp_next = false;
unsigned long flags;
+ if (skb_queue_empty_lockless(q))
+ return NULL;
+
spin_lock_irqsave(&q->lock, flags);
skb = __skb_dequeue(q);
if (skb && (skb_next = skb_peek(q))) {
@@ -5749,7 +5766,7 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from,
/* In general, avoid mixing page_pool and non-page_pool allocated
* pages within the same SKB. Additionally avoid dealing with clones
* with page_pool pages, in case the SKB is using page_pool fragment
- * references (PP_FLAG_PAGE_FRAG). Since we only take full page
+ * references (page_pool_alloc_frag()). Since we only take full page
* references for cloned SKBs at the moment that would result in
* inconsistent reference counts.
* In theory we could take full references if @from is cloned and
diff --git a/net/core/sock.c b/net/core/sock.c
index 16584e2dd648..1d28e3e87970 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -600,7 +600,7 @@ struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
dst, cookie) == NULL) {
sk_tx_queue_clear(sk);
- sk->sk_dst_pending_confirm = 0;
+ WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
dst_release(dst);
return NULL;
@@ -759,7 +759,7 @@ out:
return ret;
}
-bool sk_mc_loop(struct sock *sk)
+bool sk_mc_loop(const struct sock *sk)
{
if (dev_recursion_level())
return false;
@@ -771,7 +771,7 @@ bool sk_mc_loop(struct sock *sk)
return inet_test_bit(MC_LOOP, sk);
#if IS_ENABLED(CONFIG_IPV6)
case AF_INET6:
- return inet6_sk(sk)->mc_loop;
+ return inet6_test_bit(MC6_LOOP, sk);
#endif
}
WARN_ON_ONCE(1);
@@ -806,9 +806,7 @@ EXPORT_SYMBOL(sock_no_linger);
void sock_set_priority(struct sock *sk, u32 priority)
{
- lock_sock(sk);
WRITE_ONCE(sk->sk_priority, priority);
- release_sock(sk);
}
EXPORT_SYMBOL(sock_set_priority);
@@ -1118,6 +1116,83 @@ int sk_setsockopt(struct sock *sk, int level, int optname,
valbool = val ? 1 : 0;
+ /* handle options which do not require locking the socket. */
+ switch (optname) {
+ case SO_PRIORITY:
+ if ((val >= 0 && val <= 6) ||
+ sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
+ sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
+ sock_set_priority(sk, val);
+ return 0;
+ }
+ return -EPERM;
+ case SO_PASSSEC:
+ assign_bit(SOCK_PASSSEC, &sock->flags, valbool);
+ return 0;
+ case SO_PASSCRED:
+ assign_bit(SOCK_PASSCRED, &sock->flags, valbool);
+ return 0;
+ case SO_PASSPIDFD:
+ assign_bit(SOCK_PASSPIDFD, &sock->flags, valbool);
+ return 0;
+ case SO_TYPE:
+ case SO_PROTOCOL:
+ case SO_DOMAIN:
+ case SO_ERROR:
+ return -ENOPROTOOPT;
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ case SO_BUSY_POLL:
+ if (val < 0)
+ return -EINVAL;
+ WRITE_ONCE(sk->sk_ll_usec, val);
+ return 0;
+ case SO_PREFER_BUSY_POLL:
+ if (valbool && !sockopt_capable(CAP_NET_ADMIN))
+ return -EPERM;
+ WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
+ return 0;
+ case SO_BUSY_POLL_BUDGET:
+ if (val > READ_ONCE(sk->sk_busy_poll_budget) &&
+ !sockopt_capable(CAP_NET_ADMIN))
+ return -EPERM;
+ if (val < 0 || val > U16_MAX)
+ return -EINVAL;
+ WRITE_ONCE(sk->sk_busy_poll_budget, val);
+ return 0;
+#endif
+ case SO_MAX_PACING_RATE:
+ {
+ unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
+ unsigned long pacing_rate;
+
+ if (sizeof(ulval) != sizeof(val) &&
+ optlen >= sizeof(ulval) &&
+ copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
+ return -EFAULT;
+ }
+ if (ulval != ~0UL)
+ cmpxchg(&sk->sk_pacing_status,
+ SK_PACING_NONE,
+ SK_PACING_NEEDED);
+ /* Pairs with READ_ONCE() from sk_getsockopt() */
+ WRITE_ONCE(sk->sk_max_pacing_rate, ulval);
+ pacing_rate = READ_ONCE(sk->sk_pacing_rate);
+ if (ulval < pacing_rate)
+ WRITE_ONCE(sk->sk_pacing_rate, ulval);
+ return 0;
+ }
+ case SO_TXREHASH:
+ if (val < -1 || val > 1)
+ return -EINVAL;
+ if ((u8)val == SOCK_TXREHASH_DEFAULT)
+ val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash);
+ /* Paired with READ_ONCE() in tcp_rtx_synack()
+ * and sk_getsockopt().
+ */
+ WRITE_ONCE(sk->sk_txrehash, (u8)val);
+ return 0;
+ }
+
sockopt_lock_sock(sk);
switch (optname) {
@@ -1133,12 +1208,6 @@ int sk_setsockopt(struct sock *sk, int level, int optname,
case SO_REUSEPORT:
sk->sk_reuseport = valbool;
break;
- case SO_TYPE:
- case SO_PROTOCOL:
- case SO_DOMAIN:
- case SO_ERROR:
- ret = -ENOPROTOOPT;
- break;
case SO_DONTROUTE:
sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
sk_dst_reset(sk);
@@ -1213,15 +1282,6 @@ set_sndbuf:
sk->sk_no_check_tx = valbool;
break;
- case SO_PRIORITY:
- if ((val >= 0 && val <= 6) ||
- sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
- sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
- WRITE_ONCE(sk->sk_priority, val);
- else
- ret = -EPERM;
- break;
-
case SO_LINGER:
if (optlen < sizeof(ling)) {
ret = -EINVAL; /* 1003.1g */
@@ -1247,14 +1307,6 @@ set_sndbuf:
case SO_BSDCOMPAT:
break;
- case SO_PASSCRED:
- assign_bit(SOCK_PASSCRED, &sock->flags, valbool);
- break;
-
- case SO_PASSPIDFD:
- assign_bit(SOCK_PASSPIDFD, &sock->flags, valbool);
- break;
-
case SO_TIMESTAMP_OLD:
case SO_TIMESTAMP_NEW:
case SO_TIMESTAMPNS_OLD:
@@ -1360,9 +1412,6 @@ set_sndbuf:
sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
break;
- case SO_PASSSEC:
- assign_bit(SOCK_PASSSEC, &sock->flags, valbool);
- break;
case SO_MARK:
if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
@@ -1404,50 +1453,7 @@ set_sndbuf:
sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
break;
-#ifdef CONFIG_NET_RX_BUSY_POLL
- case SO_BUSY_POLL:
- if (val < 0)
- ret = -EINVAL;
- else
- WRITE_ONCE(sk->sk_ll_usec, val);
- break;
- case SO_PREFER_BUSY_POLL:
- if (valbool && !sockopt_capable(CAP_NET_ADMIN))
- ret = -EPERM;
- else
- WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
- break;
- case SO_BUSY_POLL_BUDGET:
- if (val > READ_ONCE(sk->sk_busy_poll_budget) && !sockopt_capable(CAP_NET_ADMIN)) {
- ret = -EPERM;
- } else {
- if (val < 0 || val > U16_MAX)
- ret = -EINVAL;
- else
- WRITE_ONCE(sk->sk_busy_poll_budget, val);
- }
- break;
-#endif
- case SO_MAX_PACING_RATE:
- {
- unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
-
- if (sizeof(ulval) != sizeof(val) &&
- optlen >= sizeof(ulval) &&
- copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
- ret = -EFAULT;
- break;
- }
- if (ulval != ~0UL)
- cmpxchg(&sk->sk_pacing_status,
- SK_PACING_NONE,
- SK_PACING_NEEDED);
- /* Pairs with READ_ONCE() from sk_getsockopt() */
- WRITE_ONCE(sk->sk_max_pacing_rate, ulval);
- sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
- break;
- }
case SO_INCOMING_CPU:
reuseport_update_incoming_cpu(sk, val);
break;
@@ -1532,19 +1538,6 @@ set_sndbuf:
break;
}
- case SO_TXREHASH:
- if (val < -1 || val > 1) {
- ret = -EINVAL;
- break;
- }
- if ((u8)val == SOCK_TXREHASH_DEFAULT)
- val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash);
- /* Paired with READ_ONCE() in tcp_rtx_synack()
- * and sk_getsockopt().
- */
- WRITE_ONCE(sk->sk_txrehash, (u8)val);
- break;
-
default:
ret = -ENOPROTOOPT;
break;
@@ -3001,6 +2994,11 @@ void __sk_flush_backlog(struct sock *sk)
{
spin_lock_bh(&sk->sk_lock.slock);
__release_sock(sk);
+
+ if (sk->sk_prot->release_cb)
+ INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
+ tcp_release_cb, sk);
+
spin_unlock_bh(&sk->sk_lock.slock);
}
EXPORT_SYMBOL_GPL(__sk_flush_backlog);
@@ -3037,21 +3035,29 @@ EXPORT_SYMBOL(sk_wait_data);
* @amt: pages to allocate
* @kind: allocation type
*
- * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
+ * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc.
+ *
+ * Unlike the globally shared limits among the sockets under same protocol,
+ * consuming the budget of a memcg won't have direct effect on other ones.
+ * So be optimistic about memcg's tolerance, and leave the callers to decide
+ * whether or not to raise allocated through sk_under_memory_pressure() or
+ * its variants.
*/
int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
{
- bool memcg_charge = mem_cgroup_sockets_enabled && sk->sk_memcg;
+ struct mem_cgroup *memcg = mem_cgroup_sockets_enabled ? sk->sk_memcg : NULL;
struct proto *prot = sk->sk_prot;
- bool charged = true;
+ bool charged = false;
long allocated;
sk_memory_allocated_add(sk, amt);
allocated = sk_memory_allocated(sk);
- if (memcg_charge &&
- !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt,
- gfp_memcg_charge())))
- goto suppress_allocation;
+
+ if (memcg) {
+ if (!mem_cgroup_charge_skmem(memcg, amt, gfp_memcg_charge()))
+ goto suppress_allocation;
+ charged = true;
+ }
/* Under limit. */
if (allocated <= sk_prot_mem_limits(sk, 0)) {
@@ -3067,7 +3073,14 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
if (allocated > sk_prot_mem_limits(sk, 2))
goto suppress_allocation;
- /* guarantee minimum buffer size under pressure */
+ /* Guarantee minimum buffer size under pressure (either global
+ * or memcg) to make sure features described in RFC 7323 (TCP
+ * Extensions for High Performance) work properly.
+ *
+ * This rule does NOT stand when exceeds global or memcg's hard
+ * limit, or else a DoS attack can be taken place by spawning
+ * lots of sockets whose usage are under minimum buffer size.
+ */
if (kind == SK_MEM_RECV) {
if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
return 1;
@@ -3086,8 +3099,17 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
if (sk_has_memory_pressure(sk)) {
u64 alloc;
- if (!sk_under_memory_pressure(sk))
+ /* The following 'average' heuristic is within the
+ * scope of global accounting, so it only makes
+ * sense for global memory pressure.
+ */
+ if (!sk_under_global_memory_pressure(sk))
return 1;
+
+ /* Try to be fair among all the sockets under global
+ * pressure by allowing the ones that below average
+ * usage to raise.
+ */
alloc = sk_sockets_allocated_read_positive(sk);
if (sk_prot_mem_limits(sk, 2) > alloc *
sk_mem_pages(sk->sk_wmem_queued +
@@ -3106,8 +3128,8 @@ suppress_allocation:
*/
if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
/* Force charge with __GFP_NOFAIL */
- if (memcg_charge && !charged) {
- mem_cgroup_charge_skmem(sk->sk_memcg, amt,
+ if (memcg && !charged) {
+ mem_cgroup_charge_skmem(memcg, amt,
gfp_memcg_charge() | __GFP_NOFAIL);
}
return 1;
@@ -3119,8 +3141,8 @@ suppress_allocation:
sk_memory_allocated_sub(sk, amt);
- if (memcg_charge && charged)
- mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
+ if (charged)
+ mem_cgroup_uncharge_skmem(memcg, amt);
return 0;
}
@@ -3519,11 +3541,9 @@ void release_sock(struct sock *sk)
if (sk->sk_backlog.tail)
__release_sock(sk);
- /* Warning : release_cb() might need to release sk ownership,
- * ie call sock_release_ownership(sk) before us.
- */
if (sk->sk_prot->release_cb)
- sk->sk_prot->release_cb(sk);
+ INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
+ tcp_release_cb, sk);
sock_release_ownership(sk);
if (waitqueue_active(&sk->sk_lock.wq))
diff --git a/net/core/xdp.c b/net/core/xdp.c
index a70670fe9a2d..df4789ab512d 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -741,7 +741,7 @@ __bpf_kfunc int bpf_xdp_metadata_rx_hash(const struct xdp_md *ctx, u32 *hash,
__diag_pop();
BTF_SET8_START(xdp_metadata_kfunc_ids)
-#define XDP_METADATA_KFUNC(_, name) BTF_ID_FLAGS(func, name, KF_TRUSTED_ARGS)
+#define XDP_METADATA_KFUNC(_, __, name, ___) BTF_ID_FLAGS(func, name, KF_TRUSTED_ARGS)
XDP_METADATA_KFUNC_xxx
#undef XDP_METADATA_KFUNC
BTF_SET8_END(xdp_metadata_kfunc_ids)
@@ -752,7 +752,7 @@ static const struct btf_kfunc_id_set xdp_metadata_kfunc_set = {
};
BTF_ID_LIST(xdp_metadata_kfunc_ids_unsorted)
-#define XDP_METADATA_KFUNC(name, str) BTF_ID(func, str)
+#define XDP_METADATA_KFUNC(name, _, str, __) BTF_ID(func, str)
XDP_METADATA_KFUNC_xxx
#undef XDP_METADATA_KFUNC