summaryrefslogtreecommitdiff
path: root/net/core
diff options
context:
space:
mode:
Diffstat (limited to 'net/core')
-rw-r--r--net/core/dev.c370
-rw-r--r--net/core/dev_ioctl.c187
-rw-r--r--net/core/dst.c2
-rw-r--r--net/core/filter.c15
-rw-r--r--net/core/flow_dissector.c55
-rw-r--r--net/core/flow_offload.c7
-rw-r--r--net/core/lwt_bpf.c7
-rw-r--r--net/core/net-sysfs.c1
-rw-r--r--net/core/netdev-genl.c54
-rw-r--r--net/core/of_net.c1
-rw-r--r--net/core/page_pool.c87
-rw-r--r--net/core/rtnetlink.c11
-rw-r--r--net/core/scm.c3
-rw-r--r--net/core/skbuff.c174
-rw-r--r--net/core/skmsg.c8
-rw-r--r--net/core/sock.c63
-rw-r--r--net/core/xdp.c2
17 files changed, 707 insertions, 340 deletions
diff --git a/net/core/dev.c b/net/core/dev.c
index 69a3e544676c..ccff2b6ef958 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -107,6 +107,7 @@
#include <net/pkt_cls.h>
#include <net/checksum.h>
#include <net/xfrm.h>
+#include <net/tcx.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/module.h>
@@ -132,6 +133,7 @@
#include <trace/events/net.h>
#include <trace/events/skb.h>
#include <trace/events/qdisc.h>
+#include <trace/events/xdp.h>
#include <linux/inetdevice.h>
#include <linux/cpu_rmap.h>
#include <linux/static_key.h>
@@ -150,11 +152,11 @@
#include <linux/pm_runtime.h>
#include <linux/prandom.h>
#include <linux/once_lite.h>
+#include <net/netdev_rx_queue.h>
#include "dev.h"
#include "net-sysfs.h"
-
static DEFINE_SPINLOCK(ptype_lock);
struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
struct list_head ptype_all __read_mostly; /* Taps */
@@ -388,6 +390,8 @@ static void list_netdevice(struct net_device *dev)
hlist_add_head_rcu(&dev->index_hlist,
dev_index_hash(net, dev->ifindex));
write_unlock(&dev_base_lock);
+ /* We reserved the ifindex, this can't fail */
+ WARN_ON(xa_store(&net->dev_by_index, dev->ifindex, dev, GFP_KERNEL));
dev_base_seq_inc(net);
}
@@ -397,8 +401,12 @@ static void list_netdevice(struct net_device *dev)
*/
static void unlist_netdevice(struct net_device *dev, bool lock)
{
+ struct net *net = dev_net(dev);
+
ASSERT_RTNL();
+ xa_erase(&net->dev_by_index, dev->ifindex);
+
/* Unlink dev from the device chain */
if (lock)
write_lock(&dev_base_lock);
@@ -2384,8 +2392,7 @@ static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
struct xps_map *map = NULL;
int pos;
- if (dev_maps)
- map = xmap_dereference(dev_maps->attr_map[tci]);
+ map = xmap_dereference(dev_maps->attr_map[tci]);
if (!map)
return false;
@@ -3882,69 +3889,201 @@ int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
EXPORT_SYMBOL(dev_loopback_xmit);
#ifdef CONFIG_NET_EGRESS
-static struct sk_buff *
-sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
+static struct netdev_queue *
+netdev_tx_queue_mapping(struct net_device *dev, struct sk_buff *skb)
{
+ int qm = skb_get_queue_mapping(skb);
+
+ return netdev_get_tx_queue(dev, netdev_cap_txqueue(dev, qm));
+}
+
+static bool netdev_xmit_txqueue_skipped(void)
+{
+ return __this_cpu_read(softnet_data.xmit.skip_txqueue);
+}
+
+void netdev_xmit_skip_txqueue(bool skip)
+{
+ __this_cpu_write(softnet_data.xmit.skip_txqueue, skip);
+}
+EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue);
+#endif /* CONFIG_NET_EGRESS */
+
+#ifdef CONFIG_NET_XGRESS
+static int tc_run(struct tcx_entry *entry, struct sk_buff *skb)
+{
+ int ret = TC_ACT_UNSPEC;
#ifdef CONFIG_NET_CLS_ACT
- struct mini_Qdisc *miniq = rcu_dereference_bh(dev->miniq_egress);
- struct tcf_result cl_res;
+ struct mini_Qdisc *miniq = rcu_dereference_bh(entry->miniq);
+ struct tcf_result res;
if (!miniq)
- return skb;
+ return ret;
- /* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */
tc_skb_cb(skb)->mru = 0;
tc_skb_cb(skb)->post_ct = false;
- mini_qdisc_bstats_cpu_update(miniq, skb);
- switch (tcf_classify(skb, miniq->block, miniq->filter_list, &cl_res, false)) {
+ mini_qdisc_bstats_cpu_update(miniq, skb);
+ ret = tcf_classify(skb, miniq->block, miniq->filter_list, &res, false);
+ /* Only tcf related quirks below. */
+ switch (ret) {
+ case TC_ACT_SHOT:
+ mini_qdisc_qstats_cpu_drop(miniq);
+ break;
case TC_ACT_OK:
case TC_ACT_RECLASSIFY:
- skb->tc_index = TC_H_MIN(cl_res.classid);
+ skb->tc_index = TC_H_MIN(res.classid);
break;
+ }
+#endif /* CONFIG_NET_CLS_ACT */
+ return ret;
+}
+
+static DEFINE_STATIC_KEY_FALSE(tcx_needed_key);
+
+void tcx_inc(void)
+{
+ static_branch_inc(&tcx_needed_key);
+}
+
+void tcx_dec(void)
+{
+ static_branch_dec(&tcx_needed_key);
+}
+
+static __always_inline enum tcx_action_base
+tcx_run(const struct bpf_mprog_entry *entry, struct sk_buff *skb,
+ const bool needs_mac)
+{
+ const struct bpf_mprog_fp *fp;
+ const struct bpf_prog *prog;
+ int ret = TCX_NEXT;
+
+ if (needs_mac)
+ __skb_push(skb, skb->mac_len);
+ bpf_mprog_foreach_prog(entry, fp, prog) {
+ bpf_compute_data_pointers(skb);
+ ret = bpf_prog_run(prog, skb);
+ if (ret != TCX_NEXT)
+ break;
+ }
+ if (needs_mac)
+ __skb_pull(skb, skb->mac_len);
+ return tcx_action_code(skb, ret);
+}
+
+static __always_inline struct sk_buff *
+sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
+ struct net_device *orig_dev, bool *another)
+{
+ struct bpf_mprog_entry *entry = rcu_dereference_bh(skb->dev->tcx_ingress);
+ int sch_ret;
+
+ if (!entry)
+ return skb;
+ if (*pt_prev) {
+ *ret = deliver_skb(skb, *pt_prev, orig_dev);
+ *pt_prev = NULL;
+ }
+
+ qdisc_skb_cb(skb)->pkt_len = skb->len;
+ tcx_set_ingress(skb, true);
+
+ if (static_branch_unlikely(&tcx_needed_key)) {
+ sch_ret = tcx_run(entry, skb, true);
+ if (sch_ret != TC_ACT_UNSPEC)
+ goto ingress_verdict;
+ }
+ sch_ret = tc_run(tcx_entry(entry), skb);
+ingress_verdict:
+ switch (sch_ret) {
+ case TC_ACT_REDIRECT:
+ /* skb_mac_header check was done by BPF, so we can safely
+ * push the L2 header back before redirecting to another
+ * netdev.
+ */
+ __skb_push(skb, skb->mac_len);
+ if (skb_do_redirect(skb) == -EAGAIN) {
+ __skb_pull(skb, skb->mac_len);
+ *another = true;
+ break;
+ }
+ *ret = NET_RX_SUCCESS;
+ return NULL;
case TC_ACT_SHOT:
- mini_qdisc_qstats_cpu_drop(miniq);
- *ret = NET_XMIT_DROP;
- kfree_skb_reason(skb, SKB_DROP_REASON_TC_EGRESS);
+ kfree_skb_reason(skb, SKB_DROP_REASON_TC_INGRESS);
+ *ret = NET_RX_DROP;
return NULL;
+ /* used by tc_run */
case TC_ACT_STOLEN:
case TC_ACT_QUEUED:
case TC_ACT_TRAP:
- *ret = NET_XMIT_SUCCESS;
consume_skb(skb);
+ fallthrough;
+ case TC_ACT_CONSUMED:
+ *ret = NET_RX_SUCCESS;
return NULL;
+ }
+
+ return skb;
+}
+
+static __always_inline struct sk_buff *
+sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
+{
+ struct bpf_mprog_entry *entry = rcu_dereference_bh(dev->tcx_egress);
+ int sch_ret;
+
+ if (!entry)
+ return skb;
+
+ /* qdisc_skb_cb(skb)->pkt_len & tcx_set_ingress() was
+ * already set by the caller.
+ */
+ if (static_branch_unlikely(&tcx_needed_key)) {
+ sch_ret = tcx_run(entry, skb, false);
+ if (sch_ret != TC_ACT_UNSPEC)
+ goto egress_verdict;
+ }
+ sch_ret = tc_run(tcx_entry(entry), skb);
+egress_verdict:
+ switch (sch_ret) {
case TC_ACT_REDIRECT:
/* No need to push/pop skb's mac_header here on egress! */
skb_do_redirect(skb);
*ret = NET_XMIT_SUCCESS;
return NULL;
- default:
- break;
+ case TC_ACT_SHOT:
+ kfree_skb_reason(skb, SKB_DROP_REASON_TC_EGRESS);
+ *ret = NET_XMIT_DROP;
+ return NULL;
+ /* used by tc_run */
+ case TC_ACT_STOLEN:
+ case TC_ACT_QUEUED:
+ case TC_ACT_TRAP:
+ consume_skb(skb);
+ fallthrough;
+ case TC_ACT_CONSUMED:
+ *ret = NET_XMIT_SUCCESS;
+ return NULL;
}
-#endif /* CONFIG_NET_CLS_ACT */
return skb;
}
-
-static struct netdev_queue *
-netdev_tx_queue_mapping(struct net_device *dev, struct sk_buff *skb)
-{
- int qm = skb_get_queue_mapping(skb);
-
- return netdev_get_tx_queue(dev, netdev_cap_txqueue(dev, qm));
-}
-
-static bool netdev_xmit_txqueue_skipped(void)
+#else
+static __always_inline struct sk_buff *
+sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
+ struct net_device *orig_dev, bool *another)
{
- return __this_cpu_read(softnet_data.xmit.skip_txqueue);
+ return skb;
}
-void netdev_xmit_skip_txqueue(bool skip)
+static __always_inline struct sk_buff *
+sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
{
- __this_cpu_write(softnet_data.xmit.skip_txqueue, skip);
+ return skb;
}
-EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue);
-#endif /* CONFIG_NET_EGRESS */
+#endif /* CONFIG_NET_XGRESS */
#ifdef CONFIG_XPS
static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb,
@@ -4128,9 +4267,7 @@ int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
skb_update_prio(skb);
qdisc_pkt_len_init(skb);
-#ifdef CONFIG_NET_CLS_ACT
- skb->tc_at_ingress = 0;
-#endif
+ tcx_set_ingress(skb, false);
#ifdef CONFIG_NET_EGRESS
if (static_branch_unlikely(&egress_needed_key)) {
if (nf_hook_egress_active()) {
@@ -5064,72 +5201,6 @@ int (*br_fdb_test_addr_hook)(struct net_device *dev,
EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
#endif
-static inline struct sk_buff *
-sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
- struct net_device *orig_dev, bool *another)
-{
-#ifdef CONFIG_NET_CLS_ACT
- struct mini_Qdisc *miniq = rcu_dereference_bh(skb->dev->miniq_ingress);
- struct tcf_result cl_res;
-
- /* If there's at least one ingress present somewhere (so
- * we get here via enabled static key), remaining devices
- * that are not configured with an ingress qdisc will bail
- * out here.
- */
- if (!miniq)
- return skb;
-
- if (*pt_prev) {
- *ret = deliver_skb(skb, *pt_prev, orig_dev);
- *pt_prev = NULL;
- }
-
- qdisc_skb_cb(skb)->pkt_len = skb->len;
- tc_skb_cb(skb)->mru = 0;
- tc_skb_cb(skb)->post_ct = false;
- skb->tc_at_ingress = 1;
- mini_qdisc_bstats_cpu_update(miniq, skb);
-
- switch (tcf_classify(skb, miniq->block, miniq->filter_list, &cl_res, false)) {
- case TC_ACT_OK:
- case TC_ACT_RECLASSIFY:
- skb->tc_index = TC_H_MIN(cl_res.classid);
- break;
- case TC_ACT_SHOT:
- mini_qdisc_qstats_cpu_drop(miniq);
- kfree_skb_reason(skb, SKB_DROP_REASON_TC_INGRESS);
- *ret = NET_RX_DROP;
- return NULL;
- case TC_ACT_STOLEN:
- case TC_ACT_QUEUED:
- case TC_ACT_TRAP:
- consume_skb(skb);
- *ret = NET_RX_SUCCESS;
- return NULL;
- case TC_ACT_REDIRECT:
- /* skb_mac_header check was done by cls/act_bpf, so
- * we can safely push the L2 header back before
- * redirecting to another netdev
- */
- __skb_push(skb, skb->mac_len);
- if (skb_do_redirect(skb) == -EAGAIN) {
- __skb_pull(skb, skb->mac_len);
- *another = true;
- break;
- }
- *ret = NET_RX_SUCCESS;
- return NULL;
- case TC_ACT_CONSUMED:
- *ret = NET_RX_SUCCESS;
- return NULL;
- default:
- break;
- }
-#endif /* CONFIG_NET_CLS_ACT */
- return skb;
-}
-
/**
* netdev_is_rx_handler_busy - check if receive handler is registered
* @dev: device to check
@@ -6316,12 +6387,8 @@ int dev_set_threaded(struct net_device *dev, bool threaded)
* softirq mode will happen in the next round of napi_schedule().
* This should not cause hiccups/stalls to the live traffic.
*/
- list_for_each_entry(napi, &dev->napi_list, dev_list) {
- if (threaded)
- set_bit(NAPI_STATE_THREADED, &napi->state);
- else
- clear_bit(NAPI_STATE_THREADED, &napi->state);
- }
+ list_for_each_entry(napi, &dev->napi_list, dev_list)
+ assign_bit(NAPI_STATE_THREADED, &napi->state, threaded);
return err;
}
@@ -9413,6 +9480,7 @@ int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
{
struct net *net = current->nsproxy->net_ns;
struct bpf_link_primer link_primer;
+ struct netlink_ext_ack extack = {};
struct bpf_xdp_link *link;
struct net_device *dev;
int err, fd;
@@ -9440,12 +9508,13 @@ int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
goto unlock;
}
- err = dev_xdp_attach_link(dev, NULL, link);
+ err = dev_xdp_attach_link(dev, &extack, link);
rtnl_unlock();
if (err) {
link->dev = NULL;
bpf_link_cleanup(&link_primer);
+ trace_bpf_xdp_link_attach_failed(extack._msg);
goto out_put_dev;
}
@@ -9509,23 +9578,40 @@ err_out:
}
/**
- * dev_new_index - allocate an ifindex
- * @net: the applicable net namespace
+ * dev_index_reserve() - allocate an ifindex in a namespace
+ * @net: the applicable net namespace
+ * @ifindex: requested ifindex, pass %0 to get one allocated
+ *
+ * Allocate a ifindex for a new device. Caller must either use the ifindex
+ * to store the device (via list_netdevice()) or call dev_index_release()
+ * to give the index up.
*
- * Returns a suitable unique value for a new device interface
- * number. The caller must hold the rtnl semaphore or the
- * dev_base_lock to be sure it remains unique.
+ * Return: a suitable unique value for a new device interface number or -errno.
*/
-static int dev_new_index(struct net *net)
+static int dev_index_reserve(struct net *net, u32 ifindex)
{
- int ifindex = net->ifindex;
+ int err;
- for (;;) {
- if (++ifindex <= 0)
- ifindex = 1;
- if (!__dev_get_by_index(net, ifindex))
- return net->ifindex = ifindex;
+ if (ifindex > INT_MAX) {
+ DEBUG_NET_WARN_ON_ONCE(1);
+ return -EINVAL;
}
+
+ if (!ifindex)
+ err = xa_alloc_cyclic(&net->dev_by_index, &ifindex, NULL,
+ xa_limit_31b, &net->ifindex, GFP_KERNEL);
+ else
+ err = xa_insert(&net->dev_by_index, ifindex, NULL, GFP_KERNEL);
+ if (err < 0)
+ return err;
+
+ return ifindex;
+}
+
+static void dev_index_release(struct net *net, int ifindex)
+{
+ /* Expect only unused indexes, unlist_netdevice() removes the used */
+ WARN_ON(xa_erase(&net->dev_by_index, ifindex));
}
/* Delayed registration/unregisteration */
@@ -9995,11 +10081,10 @@ int register_netdevice(struct net_device *dev)
goto err_uninit;
}
- ret = -EBUSY;
- if (!dev->ifindex)
- dev->ifindex = dev_new_index(net);
- else if (__dev_get_by_index(net, dev->ifindex))
+ ret = dev_index_reserve(net, dev->ifindex);
+ if (ret < 0)
goto err_uninit;
+ dev->ifindex = ret;
/* Transfer changeable features to wanted_features and enable
* software offloads (GSO and GRO).
@@ -10046,7 +10131,7 @@ int register_netdevice(struct net_device *dev)
ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
ret = notifier_to_errno(ret);
if (ret)
- goto err_uninit;
+ goto err_ifindex_release;
ret = netdev_register_kobject(dev);
write_lock(&dev_base_lock);
@@ -10102,6 +10187,8 @@ out:
err_uninit_notify:
call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev);
+err_ifindex_release:
+ dev_index_release(net, dev->ifindex);
err_uninit:
if (dev->netdev_ops->ndo_uninit)
dev->netdev_ops->ndo_uninit(dev);
@@ -10617,6 +10704,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
dev_net_set(dev, &init_net);
dev->gso_max_size = GSO_LEGACY_MAX_SIZE;
+ dev->xdp_zc_max_segs = 1;
dev->gso_max_segs = GSO_MAX_SEGS;
dev->gro_max_size = GRO_LEGACY_MAX_SIZE;
dev->gso_ipv4_max_size = GSO_LEGACY_MAX_SIZE;
@@ -10838,7 +10926,7 @@ void unregister_netdevice_many_notify(struct list_head *head,
/* Shutdown queueing discipline. */
dev_shutdown(dev);
-
+ dev_tcx_uninstall(dev);
dev_xdp_uninstall(dev);
bpf_dev_bound_netdev_unregister(dev);
@@ -10978,9 +11066,19 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net,
}
/* Check that new_ifindex isn't used yet. */
- err = -EBUSY;
- if (new_ifindex && __dev_get_by_index(net, new_ifindex))
- goto out;
+ if (new_ifindex) {
+ err = dev_index_reserve(net, new_ifindex);
+ if (err < 0)
+ goto out;
+ } else {
+ /* If there is an ifindex conflict assign a new one */
+ err = dev_index_reserve(net, dev->ifindex);
+ if (err == -EBUSY)
+ err = dev_index_reserve(net, 0);
+ if (err < 0)
+ goto out;
+ new_ifindex = err;
+ }
/*
* And now a mini version of register_netdevice unregister_netdevice.
@@ -11008,13 +11106,6 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net,
rcu_barrier();
new_nsid = peernet2id_alloc(dev_net(dev), net, GFP_KERNEL);
- /* If there is an ifindex conflict assign a new one */
- if (!new_ifindex) {
- if (__dev_get_by_index(net, dev->ifindex))
- new_ifindex = dev_new_index(net);
- else
- new_ifindex = dev->ifindex;
- }
rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid,
new_ifindex);
@@ -11192,6 +11283,8 @@ static int __net_init netdev_init(struct net *net)
if (net->dev_index_head == NULL)
goto err_idx;
+ xa_init_flags(&net->dev_by_index, XA_FLAGS_ALLOC1);
+
RAW_INIT_NOTIFIER_HEAD(&net->netdev_chain);
return 0;
@@ -11289,6 +11382,7 @@ static void __net_exit netdev_exit(struct net *net)
{
kfree(net->dev_name_head);
kfree(net->dev_index_head);
+ xa_destroy(&net->dev_by_index);
if (net != &init_net)
WARN_ON_ONCE(!list_empty(&net->dev_base_head));
}
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index 3730945ee294..b46aedc36939 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -5,6 +5,7 @@
#include <linux/etherdevice.h>
#include <linux/rtnetlink.h>
#include <linux/net_tstamp.h>
+#include <linux/phylib_stubs.h>
#include <linux/wireless.h>
#include <linux/if_bridge.h>
#include <net/dsa_stubs.h>
@@ -252,14 +253,121 @@ static int dev_eth_ioctl(struct net_device *dev,
return ops->ndo_eth_ioctl(dev, ifr, cmd);
}
+/**
+ * dev_get_hwtstamp_phylib() - Get hardware timestamping settings of NIC
+ * or of attached phylib PHY
+ * @dev: Network device
+ * @cfg: Timestamping configuration structure
+ *
+ * Helper for enforcing a common policy that phylib timestamping, if available,
+ * should take precedence in front of hardware timestamping provided by the
+ * netdev.
+ *
+ * Note: phy_mii_ioctl() only handles SIOCSHWTSTAMP (not SIOCGHWTSTAMP), and
+ * there only exists a phydev->mii_ts->hwtstamp() method. So this will return
+ * -EOPNOTSUPP for phylib for now, which is still more accurate than letting
+ * the netdev handle the GET request.
+ */
+static int dev_get_hwtstamp_phylib(struct net_device *dev,
+ struct kernel_hwtstamp_config *cfg)
+{
+ if (phy_has_hwtstamp(dev->phydev))
+ return phy_hwtstamp_get(dev->phydev, cfg);
+
+ return dev->netdev_ops->ndo_hwtstamp_get(dev, cfg);
+}
+
static int dev_get_hwtstamp(struct net_device *dev, struct ifreq *ifr)
{
- return dev_eth_ioctl(dev, ifr, SIOCGHWTSTAMP);
+ const struct net_device_ops *ops = dev->netdev_ops;
+ struct kernel_hwtstamp_config kernel_cfg = {};
+ struct hwtstamp_config cfg;
+ int err;
+
+ if (!ops->ndo_hwtstamp_get)
+ return dev_eth_ioctl(dev, ifr, SIOCGHWTSTAMP); /* legacy */
+
+ if (!netif_device_present(dev))
+ return -ENODEV;
+
+ kernel_cfg.ifr = ifr;
+ err = dev_get_hwtstamp_phylib(dev, &kernel_cfg);
+ if (err)
+ return err;
+
+ /* If the request was resolved through an unconverted driver, omit
+ * the copy_to_user(), since the implementation has already done that
+ */
+ if (!kernel_cfg.copied_to_user) {
+ hwtstamp_config_from_kernel(&cfg, &kernel_cfg);
+
+ if (copy_to_user(ifr->ifr_data, &cfg, sizeof(cfg)))
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
+/**
+ * dev_set_hwtstamp_phylib() - Change hardware timestamping of NIC
+ * or of attached phylib PHY
+ * @dev: Network device
+ * @cfg: Timestamping configuration structure
+ * @extack: Netlink extended ack message structure, for error reporting
+ *
+ * Helper for enforcing a common policy that phylib timestamping, if available,
+ * should take precedence in front of hardware timestamping provided by the
+ * netdev. If the netdev driver needs to perform specific actions even for PHY
+ * timestamping to work properly (a switch port must trap the timestamped
+ * frames and not forward them), it must set IFF_SEE_ALL_HWTSTAMP_REQUESTS in
+ * dev->priv_flags.
+ */
+static int dev_set_hwtstamp_phylib(struct net_device *dev,
+ struct kernel_hwtstamp_config *cfg,
+ struct netlink_ext_ack *extack)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+ bool phy_ts = phy_has_hwtstamp(dev->phydev);
+ struct kernel_hwtstamp_config old_cfg = {};
+ bool changed = false;
+ int err;
+
+ cfg->source = phy_ts ? HWTSTAMP_SOURCE_PHYLIB : HWTSTAMP_SOURCE_NETDEV;
+
+ if (phy_ts && (dev->priv_flags & IFF_SEE_ALL_HWTSTAMP_REQUESTS)) {
+ err = ops->ndo_hwtstamp_get(dev, &old_cfg);
+ if (err)
+ return err;
+ }
+
+ if (!phy_ts || (dev->priv_flags & IFF_SEE_ALL_HWTSTAMP_REQUESTS)) {
+ err = ops->ndo_hwtstamp_set(dev, cfg, extack);
+ if (err) {
+ if (extack->_msg)
+ netdev_err(dev, "%s\n", extack->_msg);
+ return err;
+ }
+ }
+
+ if (phy_ts && (dev->priv_flags & IFF_SEE_ALL_HWTSTAMP_REQUESTS))
+ changed = kernel_hwtstamp_config_changed(&old_cfg, cfg);
+
+ if (phy_ts) {
+ err = phy_hwtstamp_set(dev->phydev, cfg, extack);
+ if (err) {
+ if (changed)
+ ops->ndo_hwtstamp_set(dev, &old_cfg, NULL);
+ return err;
+ }
+ }
+
+ return 0;
}
static int dev_set_hwtstamp(struct net_device *dev, struct ifreq *ifr)
{
- struct kernel_hwtstamp_config kernel_cfg;
+ const struct net_device_ops *ops = dev->netdev_ops;
+ struct kernel_hwtstamp_config kernel_cfg = {};
struct netlink_ext_ack extack = {};
struct hwtstamp_config cfg;
int err;
@@ -268,6 +376,7 @@ static int dev_set_hwtstamp(struct net_device *dev, struct ifreq *ifr)
return -EFAULT;
hwtstamp_config_to_kernel(&kernel_cfg, &cfg);
+ kernel_cfg.ifr = ifr;
err = net_hwtstamp_validate(&kernel_cfg);
if (err)
@@ -280,8 +389,80 @@ static int dev_set_hwtstamp(struct net_device *dev, struct ifreq *ifr)
return err;
}
- return dev_eth_ioctl(dev, ifr, SIOCSHWTSTAMP);
+ if (!ops->ndo_hwtstamp_set)
+ return dev_eth_ioctl(dev, ifr, SIOCSHWTSTAMP); /* legacy */
+
+ if (!netif_device_present(dev))
+ return -ENODEV;
+
+ err = dev_set_hwtstamp_phylib(dev, &kernel_cfg, &extack);
+ if (err)
+ return err;
+
+ /* The driver may have modified the configuration, so copy the
+ * updated version of it back to user space
+ */
+ if (!kernel_cfg.copied_to_user) {
+ hwtstamp_config_from_kernel(&cfg, &kernel_cfg);
+
+ if (copy_to_user(ifr->ifr_data, &cfg, sizeof(cfg)))
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
+static int generic_hwtstamp_ioctl_lower(struct net_device *dev, int cmd,
+ struct kernel_hwtstamp_config *kernel_cfg)
+{
+ struct ifreq ifrr;
+ int err;
+
+ strscpy_pad(ifrr.ifr_name, dev->name, IFNAMSIZ);
+ ifrr.ifr_ifru = kernel_cfg->ifr->ifr_ifru;
+
+ err = dev_eth_ioctl(dev, &ifrr, cmd);
+ if (err)
+ return err;
+
+ kernel_cfg->ifr->ifr_ifru = ifrr.ifr_ifru;
+ kernel_cfg->copied_to_user = true;
+
+ return 0;
+}
+
+int generic_hwtstamp_get_lower(struct net_device *dev,
+ struct kernel_hwtstamp_config *kernel_cfg)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ if (!netif_device_present(dev))
+ return -ENODEV;
+
+ if (ops->ndo_hwtstamp_get)
+ return dev_get_hwtstamp_phylib(dev, kernel_cfg);
+
+ /* Legacy path: unconverted lower driver */
+ return generic_hwtstamp_ioctl_lower(dev, SIOCGHWTSTAMP, kernel_cfg);
+}
+EXPORT_SYMBOL(generic_hwtstamp_get_lower);
+
+int generic_hwtstamp_set_lower(struct net_device *dev,
+ struct kernel_hwtstamp_config *kernel_cfg,
+ struct netlink_ext_ack *extack)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ if (!netif_device_present(dev))
+ return -ENODEV;
+
+ if (ops->ndo_hwtstamp_set)
+ return dev_set_hwtstamp_phylib(dev, kernel_cfg, extack);
+
+ /* Legacy path: unconverted lower driver */
+ return generic_hwtstamp_ioctl_lower(dev, SIOCSHWTSTAMP, kernel_cfg);
}
+EXPORT_SYMBOL(generic_hwtstamp_set_lower);
static int dev_siocbond(struct net_device *dev,
struct ifreq *ifr, unsigned int cmd)
diff --git a/net/core/dst.c b/net/core/dst.c
index 79d9306ad1ee..980e2fd2f013 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -152,7 +152,7 @@ void dst_dev_put(struct dst_entry *dst)
dst->obsolete = DST_OBSOLETE_DEAD;
if (dst->ops->ifdown)
- dst->ops->ifdown(dst, dev, true);
+ dst->ops->ifdown(dst, dev);
dst->input = dst_discard;
dst->output = dst_discard_out;
dst->dev = blackhole_netdev;
diff --git a/net/core/filter.c b/net/core/filter.c
index 28a59596987a..a094694899c9 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4339,13 +4339,8 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
enum bpf_map_type map_type = ri->map_type;
- if (map_type == BPF_MAP_TYPE_XSKMAP) {
- /* XDP_REDIRECT is not supported AF_XDP yet. */
- if (unlikely(xdp_buff_has_frags(xdp)))
- return -EOPNOTSUPP;
-
+ if (map_type == BPF_MAP_TYPE_XSKMAP)
return __xdp_do_redirect_xsk(ri, dev, xdp, xdp_prog);
- }
return __xdp_do_redirect_frame(ri, dev, xdp_convert_buff_to_frame(xdp),
xdp_prog);
@@ -7350,8 +7345,8 @@ BPF_CALL_3(bpf_sk_assign, struct sk_buff *, skb, struct sock *, sk, u64, flags)
return -EOPNOTSUPP;
if (unlikely(dev_net(skb->dev) != sock_net(sk)))
return -ENETUNREACH;
- if (unlikely(sk_fullsock(sk) && sk->sk_reuseport))
- return -ESOCKTNOSUPPORT;
+ if (sk_unhashed(sk))
+ return -EOPNOTSUPP;
if (sk_is_refcounted(sk) &&
unlikely(!refcount_inc_not_zero(&sk->sk_refcnt)))
return -ENOENT;
@@ -9306,7 +9301,7 @@ static struct bpf_insn *bpf_convert_tstamp_read(const struct bpf_prog *prog,
__u8 value_reg = si->dst_reg;
__u8 skb_reg = si->src_reg;
-#ifdef CONFIG_NET_CLS_ACT
+#ifdef CONFIG_NET_XGRESS
/* If the tstamp_type is read,
* the bpf prog is aware the tstamp could have delivery time.
* Thus, read skb->tstamp as is if tstamp_type_access is true.
@@ -9340,7 +9335,7 @@ static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_prog *prog,
__u8 value_reg = si->src_reg;
__u8 skb_reg = si->dst_reg;
-#ifdef CONFIG_NET_CLS_ACT
+#ifdef CONFIG_NET_XGRESS
/* If the tstamp_type is read,
* the bpf prog is aware the tstamp could have delivery time.
* Thus, write skb->tstamp as is if tstamp_type_access is true.
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 85a2d0d9bd39..89d15ceaf9af 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -40,7 +40,7 @@
static void dissector_set_key(struct flow_dissector *flow_dissector,
enum flow_dissector_key_id key_id)
{
- flow_dissector->used_keys |= (1 << key_id);
+ flow_dissector->used_keys |= (1ULL << key_id);
}
void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
@@ -205,6 +205,50 @@ static void __skb_flow_dissect_icmp(const struct sk_buff *skb,
skb_flow_get_icmp_tci(skb, key_icmp, data, thoff, hlen);
}
+static void __skb_flow_dissect_ah(const struct sk_buff *skb,
+ struct flow_dissector *flow_dissector,
+ void *target_container, const void *data,
+ int nhoff, int hlen)
+{
+ struct flow_dissector_key_ipsec *key_ah;
+ struct ip_auth_hdr _hdr, *hdr;
+
+ if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPSEC))
+ return;
+
+ hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr);
+ if (!hdr)
+ return;
+
+ key_ah = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_IPSEC,
+ target_container);
+
+ key_ah->spi = hdr->spi;
+}
+
+static void __skb_flow_dissect_esp(const struct sk_buff *skb,
+ struct flow_dissector *flow_dissector,
+ void *target_container, const void *data,
+ int nhoff, int hlen)
+{
+ struct flow_dissector_key_ipsec *key_esp;
+ struct ip_esp_hdr _hdr, *hdr;
+
+ if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPSEC))
+ return;
+
+ hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr);
+ if (!hdr)
+ return;
+
+ key_esp = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_IPSEC,
+ target_container);
+
+ key_esp->spi = hdr->spi;
+}
+
static void __skb_flow_dissect_l2tpv3(const struct sk_buff *skb,
struct flow_dissector *flow_dissector,
void *target_container, const void *data,
@@ -1571,7 +1615,14 @@ ip_proto_again:
__skb_flow_dissect_l2tpv3(skb, flow_dissector, target_container,
data, nhoff, hlen);
break;
-
+ case IPPROTO_ESP:
+ __skb_flow_dissect_esp(skb, flow_dissector, target_container,
+ data, nhoff, hlen);
+ break;
+ case IPPROTO_AH:
+ __skb_flow_dissect_ah(skb, flow_dissector, target_container,
+ data, nhoff, hlen);
+ break;
default:
break;
}
diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c
index acfc1f88ea79..bc5169482710 100644
--- a/net/core/flow_offload.c
+++ b/net/core/flow_offload.c
@@ -146,6 +146,13 @@ void flow_rule_match_tcp(const struct flow_rule *rule,
}
EXPORT_SYMBOL(flow_rule_match_tcp);
+void flow_rule_match_ipsec(const struct flow_rule *rule,
+ struct flow_match_ipsec *out)
+{
+ FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_IPSEC, out);
+}
+EXPORT_SYMBOL(flow_rule_match_ipsec);
+
void flow_rule_match_icmp(const struct flow_rule *rule,
struct flow_match_icmp *out)
{
diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c
index 8b6b5e72b217..4a0797f0a154 100644
--- a/net/core/lwt_bpf.c
+++ b/net/core/lwt_bpf.c
@@ -60,9 +60,8 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
ret = BPF_OK;
} else {
skb_reset_mac_header(skb);
- ret = skb_do_redirect(skb);
- if (ret == 0)
- ret = BPF_REDIRECT;
+ skb_do_redirect(skb);
+ ret = BPF_REDIRECT;
}
break;
@@ -255,7 +254,7 @@ static int bpf_lwt_xmit_reroute(struct sk_buff *skb)
err = dst_output(dev_net(skb_dst(skb)->dev), skb->sk, skb);
if (unlikely(err))
- return err;
+ return net_xmit_errno(err);
/* ip[6]_finish_output2 understand LWTUNNEL_XMIT_DONE */
return LWTUNNEL_XMIT_DONE;
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 15e3f4606b5f..fccaa5bac0ed 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -23,6 +23,7 @@
#include <linux/of.h>
#include <linux/of_net.h>
#include <linux/cpu.h>
+#include <net/netdev_rx_queue.h>
#include "dev.h"
#include "net-sysfs.h"
diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c
index a4270fafdf11..c1aea8b756b6 100644
--- a/net/core/netdev-genl.c
+++ b/net/core/netdev-genl.c
@@ -10,11 +10,11 @@
static int
netdev_nl_dev_fill(struct net_device *netdev, struct sk_buff *rsp,
- u32 portid, u32 seq, int flags, u32 cmd)
+ const struct genl_info *info)
{
void *hdr;
- hdr = genlmsg_put(rsp, portid, seq, &netdev_nl_family, flags, cmd);
+ hdr = genlmsg_iput(rsp, info);
if (!hdr)
return -EMSGSIZE;
@@ -25,6 +25,14 @@ netdev_nl_dev_fill(struct net_device *netdev, struct sk_buff *rsp,
return -EINVAL;
}
+ if (netdev->xdp_features & NETDEV_XDP_ACT_XSK_ZEROCOPY) {
+ if (nla_put_u32(rsp, NETDEV_A_DEV_XDP_ZC_MAX_SEGS,
+ netdev->xdp_zc_max_segs)) {
+ genlmsg_cancel(rsp, hdr);
+ return -EINVAL;
+ }
+ }
+
genlmsg_end(rsp, hdr);
return 0;
@@ -33,17 +41,20 @@ netdev_nl_dev_fill(struct net_device *netdev, struct sk_buff *rsp,
static void
netdev_genl_dev_notify(struct net_device *netdev, int cmd)
{
+ struct genl_info info;
struct sk_buff *ntf;
if (!genl_has_listeners(&netdev_nl_family, dev_net(netdev),
NETDEV_NLGRP_MGMT))
return;
+ genl_info_init_ntf(&info, &netdev_nl_family, cmd);
+
ntf = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (!ntf)
return;
- if (netdev_nl_dev_fill(netdev, ntf, 0, 0, 0, cmd)) {
+ if (netdev_nl_dev_fill(netdev, ntf, &info)) {
nlmsg_free(ntf);
return;
}
@@ -72,8 +83,7 @@ int netdev_nl_dev_get_doit(struct sk_buff *skb, struct genl_info *info)
netdev = __dev_get_by_index(genl_info_net(info), ifindex);
if (netdev)
- err = netdev_nl_dev_fill(netdev, rsp, info->snd_portid,
- info->snd_seq, 0, info->genlhdr->cmd);
+ err = netdev_nl_dev_fill(netdev, rsp, info);
else
err = -ENODEV;
@@ -93,43 +103,19 @@ int netdev_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
{
struct net *net = sock_net(skb->sk);
struct net_device *netdev;
- int idx = 0, s_idx;
- int h, s_h;
- int err;
-
- s_h = cb->args[0];
- s_idx = cb->args[1];
+ int err = 0;
rtnl_lock();
-
- for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
- struct hlist_head *head;
-
- idx = 0;
- head = &net->dev_index_head[h];
- hlist_for_each_entry(netdev, head, index_hlist) {
- if (idx < s_idx)
- goto cont;
- err = netdev_nl_dev_fill(netdev, skb,
- NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq, 0,
- NETDEV_CMD_DEV_GET);
- if (err < 0)
- break;
-cont:
- idx++;
- }
+ for_each_netdev_dump(net, netdev, cb->args[0]) {
+ err = netdev_nl_dev_fill(netdev, skb, genl_info_dump(cb));
+ if (err < 0)
+ break;
}
-
rtnl_unlock();
if (err != -EMSGSIZE)
return err;
- cb->args[1] = idx;
- cb->args[0] = h;
- cb->seq = net->dev_base_seq;
-
return skb->len;
}
diff --git a/net/core/of_net.c b/net/core/of_net.c
index 55d3fe229269..93ea425b9248 100644
--- a/net/core/of_net.c
+++ b/net/core/of_net.c
@@ -8,6 +8,7 @@
#include <linux/kernel.h>
#include <linux/of_net.h>
#include <linux/of_platform.h>
+#include <linux/platform_device.h>
#include <linux/phy.h>
#include <linux/export.h>
#include <linux/device.h>
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index a3e12a61d456..77cb75e63aca 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -10,7 +10,7 @@
#include <linux/slab.h>
#include <linux/device.h>
-#include <net/page_pool.h>
+#include <net/page_pool/helpers.h>
#include <net/xdp.h>
#include <linux/dma-direction.h>
@@ -58,6 +58,17 @@ static const char pp_stats[][ETH_GSTRING_LEN] = {
"rx_pp_recycle_released_ref",
};
+/**
+ * page_pool_get_stats() - fetch page pool stats
+ * @pool: pool from which page was allocated
+ * @stats: struct page_pool_stats to fill in
+ *
+ * Retrieve statistics about the page_pool. This API is only available
+ * if the kernel has been configured with ``CONFIG_PAGE_POOL_STATS=y``.
+ * A pointer to a caller allocated struct page_pool_stats structure
+ * is passed to this API which is filled in. The caller can then report
+ * those stats to the user (perhaps via ethtool, debugfs, etc.).
+ */
bool page_pool_get_stats(struct page_pool *pool,
struct page_pool_stats *stats)
{
@@ -224,6 +235,10 @@ static int page_pool_init(struct page_pool *pool,
return 0;
}
+/**
+ * page_pool_create() - create a page pool.
+ * @params: parameters, see struct page_pool_params
+ */
struct page_pool *page_pool_create(const struct page_pool_params *params)
{
struct page_pool *pool;
@@ -492,7 +507,7 @@ static s32 page_pool_inflight(struct page_pool *pool)
* a regular page (that will eventually be returned to the normal
* page-allocator via put_page).
*/
-void page_pool_release_page(struct page_pool *pool, struct page *page)
+static void page_pool_return_page(struct page_pool *pool, struct page *page)
{
dma_addr_t dma;
int count;
@@ -518,13 +533,6 @@ skip_dma_unmap:
*/
count = atomic_inc_return_relaxed(&pool->pages_state_release_cnt);
trace_page_pool_state_release(pool, page, count);
-}
-EXPORT_SYMBOL(page_pool_release_page);
-
-/* Return a page to the page allocator, cleaning up our state */
-static void page_pool_return_page(struct page_pool *pool, struct page *page)
-{
- page_pool_release_page(pool, page);
put_page(page);
/* An optimization would be to call __free_pages(page, pool->p.order)
@@ -579,6 +587,8 @@ static __always_inline struct page *
__page_pool_put_page(struct page_pool *pool, struct page *page,
unsigned int dma_sync_size, bool allow_direct)
{
+ lockdep_assert_no_hardirq();
+
/* This allocator is optimized for the XDP mode that uses
* one-frame-per-page, but have fallbacks that act like the
* regular page allocator APIs.
@@ -616,9 +626,7 @@ __page_pool_put_page(struct page_pool *pool, struct page *page,
* will be invoking put_page.
*/
recycle_stat_inc(pool, released_refcnt);
- /* Do not replace this with page_pool_return_page() */
- page_pool_release_page(pool, page);
- put_page(page);
+ page_pool_return_page(pool, page);
return NULL;
}
@@ -635,7 +643,21 @@ void page_pool_put_defragged_page(struct page_pool *pool, struct page *page,
}
EXPORT_SYMBOL(page_pool_put_defragged_page);
-/* Caller must not use data area after call, as this function overwrites it */
+/**
+ * page_pool_put_page_bulk() - release references on multiple pages
+ * @pool: pool from which pages were allocated
+ * @data: array holding page pointers
+ * @count: number of pages in @data
+ *
+ * Tries to refill a number of pages into the ptr_ring cache holding ptr_ring
+ * producer lock. If the ptr_ring is full, page_pool_put_page_bulk()
+ * will release leftover pages to the page allocator.
+ * page_pool_put_page_bulk() is suitable to be run inside the driver NAPI tx
+ * completion loop for the XDP_REDIRECT use case.
+ *
+ * Please note the caller must not use data area after running
+ * page_pool_put_page_bulk(), as this function overwrites it.
+ */
void page_pool_put_page_bulk(struct page_pool *pool, void **data,
int count)
{
@@ -915,42 +937,3 @@ void page_pool_update_nid(struct page_pool *pool, int new_nid)
}
}
EXPORT_SYMBOL(page_pool_update_nid);
-
-bool page_pool_return_skb_page(struct page *page, bool napi_safe)
-{
- struct napi_struct *napi;
- struct page_pool *pp;
- bool allow_direct;
-
- page = compound_head(page);
-
- /* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation
- * in order to preserve any existing bits, such as bit 0 for the
- * head page of compound page and bit 1 for pfmemalloc page, so
- * mask those bits for freeing side when doing below checking,
- * and page_is_pfmemalloc() is checked in __page_pool_put_page()
- * to avoid recycling the pfmemalloc page.
- */
- if (unlikely((page->pp_magic & ~0x3UL) != PP_SIGNATURE))
- return false;
-
- pp = page->pp;
-
- /* Allow direct recycle if we have reasons to believe that we are
- * in the same context as the consumer would run, so there's
- * no possible race.
- */
- napi = READ_ONCE(pp->p.napi);
- allow_direct = napi_safe && napi &&
- READ_ONCE(napi->list_owner) == smp_processor_id();
-
- /* Driver set this to memory recycling info. Reset it on recycle.
- * This will *not* work for NIC using a split-page memory model.
- * The page will be returned to the pool here regardless of the
- * 'flipped' fragment being in use or not.
- */
- page_pool_put_full_page(pp, page, allow_direct);
-
- return true;
-}
-EXPORT_SYMBOL(page_pool_return_skb_page);
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 00c94d9622b4..4a2ec33bfb51 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -61,7 +61,7 @@
#include "dev.h"
#define RTNL_MAX_TYPE 50
-#define RTNL_SLAVE_MAX_TYPE 43
+#define RTNL_SLAVE_MAX_TYPE 44
struct rtnl_link {
rtnl_doit_func doit;
@@ -1273,7 +1273,6 @@ static noinline_for_stack int rtnl_fill_stats(struct sk_buff *skb,
static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
struct net_device *dev,
int vfs_num,
- struct nlattr *vfinfo,
u32 ext_filter_mask)
{
struct ifla_vf_rss_query_en vf_rss_query_en;
@@ -1343,7 +1342,7 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
vf_trust.setting = ivi.trusted;
vf = nla_nest_start_noflag(skb, IFLA_VF_INFO);
if (!vf)
- goto nla_put_vfinfo_failure;
+ return -EMSGSIZE;
if (nla_put(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac) ||
nla_put(skb, IFLA_VF_BROADCAST, sizeof(vf_broadcast), &vf_broadcast) ||
nla_put(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan) ||
@@ -1414,8 +1413,6 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
nla_put_vf_failure:
nla_nest_cancel(skb, vf);
-nla_put_vfinfo_failure:
- nla_nest_cancel(skb, vfinfo);
return -EMSGSIZE;
}
@@ -1441,8 +1438,10 @@ static noinline_for_stack int rtnl_fill_vf(struct sk_buff *skb,
return -EMSGSIZE;
for (i = 0; i < num_vfs; i++) {
- if (rtnl_fill_vfinfo(skb, dev, i, vfinfo, ext_filter_mask))
+ if (rtnl_fill_vfinfo(skb, dev, i, ext_filter_mask)) {
+ nla_nest_cancel(skb, vfinfo);
return -EMSGSIZE;
+ }
}
nla_nest_end(skb, vfinfo);
diff --git a/net/core/scm.c b/net/core/scm.c
index 3cd7dd377e53..880027ecf516 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -130,6 +130,7 @@ EXPORT_SYMBOL(__scm_destroy);
int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p)
{
+ const struct proto_ops *ops = READ_ONCE(sock->ops);
struct cmsghdr *cmsg;
int err;
@@ -153,7 +154,7 @@ int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p)
switch (cmsg->cmsg_type)
{
case SCM_RIGHTS:
- if (!sock->ops || sock->ops->family != PF_UNIX)
+ if (!ops || ops->family != PF_UNIX)
goto error;
err=scm_fp_copy(cmsg, &p->fp);
if (err<0)
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index a298992060e6..45707059082f 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -73,7 +73,7 @@
#include <net/mpls.h>
#include <net/mptcp.h>
#include <net/mctp.h>
-#include <net/page_pool.h>
+#include <net/page_pool/helpers.h>
#include <net/dropreason.h>
#include <linux/uaccess.h>
@@ -879,11 +879,56 @@ static void skb_clone_fraglist(struct sk_buff *skb)
skb_get(list);
}
+#if IS_ENABLED(CONFIG_PAGE_POOL)
+bool napi_pp_put_page(struct page *page, bool napi_safe)
+{
+ bool allow_direct = false;
+ struct page_pool *pp;
+
+ page = compound_head(page);
+
+ /* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation
+ * in order to preserve any existing bits, such as bit 0 for the
+ * head page of compound page and bit 1 for pfmemalloc page, so
+ * mask those bits for freeing side when doing below checking,
+ * and page_is_pfmemalloc() is checked in __page_pool_put_page()
+ * to avoid recycling the pfmemalloc page.
+ */
+ if (unlikely((page->pp_magic & ~0x3UL) != PP_SIGNATURE))
+ return false;
+
+ pp = page->pp;
+
+ /* Allow direct recycle if we have reasons to believe that we are
+ * in the same context as the consumer would run, so there's
+ * no possible race.
+ * __page_pool_put_page() makes sure we're not in hardirq context
+ * and interrupts are enabled prior to accessing the cache.
+ */
+ if (napi_safe || in_softirq()) {
+ const struct napi_struct *napi = READ_ONCE(pp->p.napi);
+
+ allow_direct = napi &&
+ READ_ONCE(napi->list_owner) == smp_processor_id();
+ }
+
+ /* Driver set this to memory recycling info. Reset it on recycle.
+ * This will *not* work for NIC using a split-page memory model.
+ * The page will be returned to the pool here regardless of the
+ * 'flipped' fragment being in use or not.
+ */
+ page_pool_put_full_page(pp, page, allow_direct);
+
+ return true;
+}
+EXPORT_SYMBOL(napi_pp_put_page);
+#endif
+
static bool skb_pp_recycle(struct sk_buff *skb, void *data, bool napi_safe)
{
if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle)
return false;
- return page_pool_return_skb_page(virt_to_page(data), napi_safe);
+ return napi_pp_put_page(virt_to_page(data), napi_safe);
}
static void skb_kfree_head(void *head, unsigned int end_offset)
@@ -3656,20 +3701,23 @@ struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list)
EXPORT_SYMBOL(skb_dequeue_tail);
/**
- * skb_queue_purge - empty a list
+ * skb_queue_purge_reason - empty a list
* @list: list to empty
+ * @reason: drop reason
*
* Delete all buffers on an &sk_buff list. Each buffer is removed from
* the list and one reference dropped. This function takes the list
* lock and is atomic with respect to other list locking functions.
*/
-void skb_queue_purge(struct sk_buff_head *list)
+void skb_queue_purge_reason(struct sk_buff_head *list,
+ enum skb_drop_reason reason)
{
struct sk_buff *skb;
+
while ((skb = skb_dequeue(list)) != NULL)
- kfree_skb(skb);
+ kfree_skb_reason(skb, reason);
}
-EXPORT_SYMBOL(skb_queue_purge);
+EXPORT_SYMBOL(skb_queue_purge_reason);
/**
* skb_rbtree_purge - empty a skb rbtree
@@ -3697,6 +3745,27 @@ unsigned int skb_rbtree_purge(struct rb_root *root)
return sum;
}
+void skb_errqueue_purge(struct sk_buff_head *list)
+{
+ struct sk_buff *skb, *next;
+ struct sk_buff_head kill;
+ unsigned long flags;
+
+ __skb_queue_head_init(&kill);
+
+ spin_lock_irqsave(&list->lock, flags);
+ skb_queue_walk_safe(list, skb, next) {
+ if (SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ZEROCOPY ||
+ SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_TIMESTAMPING)
+ continue;
+ __skb_unlink(skb, list);
+ __skb_queue_tail(&kill, skb);
+ }
+ spin_unlock_irqrestore(&list->lock, flags);
+ __skb_queue_purge(&kill);
+}
+EXPORT_SYMBOL(skb_errqueue_purge);
+
/**
* skb_queue_head - queue a buffer at the list head
* @list: list to use
@@ -4716,23 +4785,13 @@ static const u8 skb_ext_type_len[] = {
static __always_inline unsigned int skb_ext_total_length(void)
{
- return SKB_EXT_CHUNKSIZEOF(struct skb_ext) +
-#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
- skb_ext_type_len[SKB_EXT_BRIDGE_NF] +
-#endif
-#ifdef CONFIG_XFRM
- skb_ext_type_len[SKB_EXT_SEC_PATH] +
-#endif
-#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
- skb_ext_type_len[TC_SKB_EXT] +
-#endif
-#if IS_ENABLED(CONFIG_MPTCP)
- skb_ext_type_len[SKB_EXT_MPTCP] +
-#endif
-#if IS_ENABLED(CONFIG_MCTP_FLOWS)
- skb_ext_type_len[SKB_EXT_MCTP] +
-#endif
- 0;
+ unsigned int l = SKB_EXT_CHUNKSIZEOF(struct skb_ext);
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(skb_ext_type_len); i++)
+ l += skb_ext_type_len[i];
+
+ return l;
}
static void skb_extensions_init(void)
@@ -4750,12 +4809,23 @@ static void skb_extensions_init(void)
static void skb_extensions_init(void) {}
#endif
+/* The SKB kmem_cache slab is critical for network performance. Never
+ * merge/alias the slab with similar sized objects. This avoids fragmentation
+ * that hurts performance of kmem_cache_{alloc,free}_bulk APIs.
+ */
+#ifndef CONFIG_SLUB_TINY
+#define FLAG_SKB_NO_MERGE SLAB_NO_MERGE
+#else /* CONFIG_SLUB_TINY - simple loop in kmem_cache_alloc_bulk */
+#define FLAG_SKB_NO_MERGE 0
+#endif
+
void __init skb_init(void)
{
skbuff_cache = kmem_cache_create_usercopy("skbuff_head_cache",
sizeof(struct sk_buff),
0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|
+ FLAG_SKB_NO_MERGE,
offsetof(struct sk_buff, cb),
sizeof_field(struct sk_buff, cb),
NULL);
@@ -6204,7 +6274,7 @@ EXPORT_SYMBOL_GPL(skb_mpls_dec_ttl);
*
* @header_len: size of linear part
* @data_len: needed length in frags
- * @max_page_order: max page order desired.
+ * @order: max page order desired.
* @errcode: pointer to error code if any
* @gfp_mask: allocation mask
*
@@ -6212,21 +6282,17 @@ EXPORT_SYMBOL_GPL(skb_mpls_dec_ttl);
*/
struct sk_buff *alloc_skb_with_frags(unsigned long header_len,
unsigned long data_len,
- int max_page_order,
+ int order,
int *errcode,
gfp_t gfp_mask)
{
- int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
unsigned long chunk;
struct sk_buff *skb;
struct page *page;
- int i;
+ int nr_frags = 0;
*errcode = -EMSGSIZE;
- /* Note this test could be relaxed, if we succeed to allocate
- * high order pages...
- */
- if (npages > MAX_SKB_FRAGS)
+ if (unlikely(data_len > MAX_SKB_FRAGS * (PAGE_SIZE << order)))
return NULL;
*errcode = -ENOBUFS;
@@ -6234,34 +6300,32 @@ struct sk_buff *alloc_skb_with_frags(unsigned long header_len,
if (!skb)
return NULL;
- skb->truesize += npages << PAGE_SHIFT;
-
- for (i = 0; npages > 0; i++) {
- int order = max_page_order;
-
- while (order) {
- if (npages >= 1 << order) {
- page = alloc_pages((gfp_mask & ~__GFP_DIRECT_RECLAIM) |
- __GFP_COMP |
- __GFP_NOWARN,
- order);
- if (page)
- goto fill_page;
- /* Do not retry other high order allocations */
- order = 1;
- max_page_order = 0;
- }
+ while (data_len) {
+ if (nr_frags == MAX_SKB_FRAGS - 1)
+ goto failure;
+ while (order && PAGE_ALIGN(data_len) < (PAGE_SIZE << order))
order--;
+
+ if (order) {
+ page = alloc_pages((gfp_mask & ~__GFP_DIRECT_RECLAIM) |
+ __GFP_COMP |
+ __GFP_NOWARN,
+ order);
+ if (!page) {
+ order--;
+ continue;
+ }
+ } else {
+ page = alloc_page(gfp_mask);
+ if (!page)
+ goto failure;
}
- page = alloc_page(gfp_mask);
- if (!page)
- goto failure;
-fill_page:
chunk = min_t(unsigned long, data_len,
PAGE_SIZE << order);
- skb_fill_page_desc(skb, i, page, 0, chunk);
+ skb_fill_page_desc(skb, nr_frags, page, 0, chunk);
+ nr_frags++;
+ skb->truesize += (PAGE_SIZE << order);
data_len -= chunk;
- npages -= 1 << order;
}
return skb;
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index ef1a2eb6520b..a0659fc29bcc 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -1204,13 +1204,17 @@ out:
static void sk_psock_verdict_data_ready(struct sock *sk)
{
struct socket *sock = sk->sk_socket;
+ const struct proto_ops *ops;
int copied;
trace_sk_data_ready(sk);
- if (unlikely(!sock || !sock->ops || !sock->ops->read_skb))
+ if (unlikely(!sock))
return;
- copied = sock->ops->read_skb(sk, sk_psock_verdict_recv);
+ ops = READ_ONCE(sock->ops);
+ if (!ops || !ops->read_skb)
+ return;
+ copied = ops->read_skb(sk, sk_psock_verdict_recv);
if (copied >= 0) {
struct sk_psock *psock;
diff --git a/net/core/sock.c b/net/core/sock.c
index c9cffb7acbea..666a17cab4f5 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -767,7 +767,7 @@ bool sk_mc_loop(struct sock *sk)
return true;
switch (sk->sk_family) {
case AF_INET:
- return inet_sk(sk)->mc_loop;
+ return inet_test_bit(MC_LOOP, sk);
#if IS_ENABLED(CONFIG_IPV6)
case AF_INET6:
return inet6_sk(sk)->mc_loop;
@@ -797,7 +797,7 @@ EXPORT_SYMBOL(sock_set_reuseport);
void sock_no_linger(struct sock *sk)
{
lock_sock(sk);
- sk->sk_lingertime = 0;
+ WRITE_ONCE(sk->sk_lingertime, 0);
sock_set_flag(sk, SOCK_LINGER);
release_sock(sk);
}
@@ -1230,15 +1230,15 @@ set_sndbuf:
ret = -EFAULT;
break;
}
- if (!ling.l_onoff)
+ if (!ling.l_onoff) {
sock_reset_flag(sk, SOCK_LINGER);
- else {
-#if (BITS_PER_LONG == 32)
- if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
- sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
+ } else {
+ unsigned long t_sec = ling.l_linger;
+
+ if (t_sec >= MAX_SCHEDULE_TIMEOUT / HZ)
+ WRITE_ONCE(sk->sk_lingertime, MAX_SCHEDULE_TIMEOUT);
else
-#endif
- sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
+ WRITE_ONCE(sk->sk_lingertime, t_sec * HZ);
sock_set_flag(sk, SOCK_LINGER);
}
break;
@@ -1247,17 +1247,11 @@ set_sndbuf:
break;
case SO_PASSCRED:
- if (valbool)
- set_bit(SOCK_PASSCRED, &sock->flags);
- else
- clear_bit(SOCK_PASSCRED, &sock->flags);
+ assign_bit(SOCK_PASSCRED, &sock->flags, valbool);
break;
case SO_PASSPIDFD:
- if (valbool)
- set_bit(SOCK_PASSPIDFD, &sock->flags);
- else
- clear_bit(SOCK_PASSPIDFD, &sock->flags);
+ assign_bit(SOCK_PASSPIDFD, &sock->flags, valbool);
break;
case SO_TIMESTAMP_OLD:
@@ -1283,14 +1277,19 @@ set_sndbuf:
break;
case SO_RCVLOWAT:
+ {
+ int (*set_rcvlowat)(struct sock *sk, int val) = NULL;
+
if (val < 0)
val = INT_MAX;
- if (sock && sock->ops->set_rcvlowat)
- ret = sock->ops->set_rcvlowat(sk, val);
+ if (sock)
+ set_rcvlowat = READ_ONCE(sock->ops)->set_rcvlowat;
+ if (set_rcvlowat)
+ ret = set_rcvlowat(sk, val);
else
WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
break;
-
+ }
case SO_RCVTIMEO_OLD:
case SO_RCVTIMEO_NEW:
ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
@@ -1361,10 +1360,7 @@ set_sndbuf:
break;
case SO_PASSSEC:
- if (valbool)
- set_bit(SOCK_PASSSEC, &sock->flags);
- else
- clear_bit(SOCK_PASSSEC, &sock->flags);
+ assign_bit(SOCK_PASSSEC, &sock->flags, valbool);
break;
case SO_MARK:
if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
@@ -1388,11 +1384,16 @@ set_sndbuf:
break;
case SO_PEEK_OFF:
- if (sock->ops->set_peek_off)
- ret = sock->ops->set_peek_off(sk, val);
+ {
+ int (*set_peek_off)(struct sock *sk, int val);
+
+ set_peek_off = READ_ONCE(sock->ops)->set_peek_off;
+ if (set_peek_off)
+ ret = set_peek_off(sk, val);
else
ret = -EOPNOTSUPP;
break;
+ }
case SO_NOFCS:
sock_valbool_flag(sk, SOCK_NOFCS, valbool);
@@ -1691,7 +1692,7 @@ int sk_getsockopt(struct sock *sk, int level, int optname,
case SO_LINGER:
lv = sizeof(v.ling);
v.ling.l_onoff = sock_flag(sk, SOCK_LINGER);
- v.ling.l_linger = sk->sk_lingertime / HZ;
+ v.ling.l_linger = READ_ONCE(sk->sk_lingertime) / HZ;
break;
case SO_BSDCOMPAT:
@@ -1823,14 +1824,14 @@ int sk_getsockopt(struct sock *sk, int level, int optname,
case SO_PEERNAME:
{
- char address[128];
+ struct sockaddr_storage address;
- lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
+ lv = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 2);
if (lv < 0)
return -ENOTCONN;
if (lv < len)
return -EINVAL;
- if (copy_to_sockptr(optval, address, len))
+ if (copy_to_sockptr(optval, &address, len))
return -EFAULT;
goto lenout;
}
@@ -1867,7 +1868,7 @@ int sk_getsockopt(struct sock *sk, int level, int optname,
break;
case SO_PEEK_OFF:
- if (!sock->ops->set_peek_off)
+ if (!READ_ONCE(sock->ops)->set_peek_off)
return -EOPNOTSUPP;
v.val = READ_ONCE(sk->sk_peek_off);
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 8362130bf085..a70670fe9a2d 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -14,7 +14,7 @@
#include <linux/idr.h>
#include <linux/rhashtable.h>
#include <linux/bug.h>
-#include <net/page_pool.h>
+#include <net/page_pool/helpers.h>
#include <net/xdp.h>
#include <net/xdp_priv.h> /* struct xdp_mem_allocator */