Diffstat (limited to 'net/core')
 net/core/dev.c            | 370
 net/core/dev_ioctl.c      | 187
 net/core/dst.c            |   2
 net/core/filter.c         |  15
 net/core/flow_dissector.c |  55
 net/core/flow_offload.c   |   7
 net/core/lwt_bpf.c        |   7
 net/core/net-sysfs.c      |   1
 net/core/netdev-genl.c    |  54
 net/core/of_net.c         |   1
 net/core/page_pool.c      |  87
 net/core/rtnetlink.c      |  11
 net/core/scm.c            |   3
 net/core/skbuff.c         | 174
 net/core/skmsg.c          |   8
 net/core/sock.c           |  63
 net/core/xdp.c            |   2
 17 files changed, 707 insertions(+), 340 deletions(-)
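The net/core/dev.c changes below replace dev_new_index() with an xarray-backed ifindex reservation (net->dev_by_index plus dev_index_reserve()/dev_index_release()). The following is a small, self-contained sketch of that reserve-then-store pattern using hypothetical demo_* names; only the xa_alloc_cyclic()/xa_insert()/xa_store()/xa_erase() usage mirrors the diff.

/* Sketch of the ifindex reservation pattern: reserve a slot with a NULL
 * entry, store the real pointer later (as list_netdevice() does), or give
 * the slot back on error. Demo only; not the kernel code paths themselves.
 */
#include <linux/bug.h>
#include <linux/xarray.h>

static DEFINE_XARRAY_ALLOC1(demo_by_index);	/* like net->dev_by_index */
static u32 demo_next_index;			/* like net->ifindex */

static int demo_index_reserve(u32 ifindex)
{
	int err;

	if (!ifindex)	/* 0 means "allocate one", cycling through 1..INT_MAX */
		err = xa_alloc_cyclic(&demo_by_index, &ifindex, NULL,
				      xa_limit_31b, &demo_next_index, GFP_KERNEL);
	else		/* caller asked for a specific index; -EBUSY if taken */
		err = xa_insert(&demo_by_index, ifindex, NULL, GFP_KERNEL);

	return err < 0 ? err : ifindex;
}

static void demo_index_use(u32 ifindex, void *obj)
{
	/* the slot was already reserved, so this cannot fail */
	WARN_ON(xa_store(&demo_by_index, ifindex, obj, GFP_KERNEL));
}

static void demo_index_release(u32 ifindex)
{
	/* expect only unused (still NULL) slots here */
	WARN_ON(xa_erase(&demo_by_index, ifindex));
}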
diff --git a/net/core/dev.c b/net/core/dev.c index 69a3e544676c..ccff2b6ef958 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -107,6 +107,7 @@ #include <net/pkt_cls.h> #include <net/checksum.h> #include <net/xfrm.h> +#include <net/tcx.h> #include <linux/highmem.h> #include <linux/init.h> #include <linux/module.h> @@ -132,6 +133,7 @@ #include <trace/events/net.h> #include <trace/events/skb.h> #include <trace/events/qdisc.h> +#include <trace/events/xdp.h> #include <linux/inetdevice.h> #include <linux/cpu_rmap.h> #include <linux/static_key.h> @@ -150,11 +152,11 @@ #include <linux/pm_runtime.h> #include <linux/prandom.h> #include <linux/once_lite.h> +#include <net/netdev_rx_queue.h> #include "dev.h" #include "net-sysfs.h" - static DEFINE_SPINLOCK(ptype_lock); struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; struct list_head ptype_all __read_mostly; /* Taps */ @@ -388,6 +390,8 @@ static void list_netdevice(struct net_device *dev) hlist_add_head_rcu(&dev->index_hlist, dev_index_hash(net, dev->ifindex)); write_unlock(&dev_base_lock); + /* We reserved the ifindex, this can't fail */ + WARN_ON(xa_store(&net->dev_by_index, dev->ifindex, dev, GFP_KERNEL)); dev_base_seq_inc(net); } @@ -397,8 +401,12 @@ static void list_netdevice(struct net_device *dev) */ static void unlist_netdevice(struct net_device *dev, bool lock) { + struct net *net = dev_net(dev); + ASSERT_RTNL(); + xa_erase(&net->dev_by_index, dev->ifindex); + /* Unlink dev from the device chain */ if (lock) write_lock(&dev_base_lock); @@ -2384,8 +2392,7 @@ static bool remove_xps_queue(struct xps_dev_maps *dev_maps, struct xps_map *map = NULL; int pos; - if (dev_maps) - map = xmap_dereference(dev_maps->attr_map[tci]); + map = xmap_dereference(dev_maps->attr_map[tci]); if (!map) return false; @@ -3882,69 +3889,201 @@ int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) EXPORT_SYMBOL(dev_loopback_xmit); #ifdef CONFIG_NET_EGRESS -static struct sk_buff * -sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) +static struct netdev_queue * +netdev_tx_queue_mapping(struct net_device *dev, struct sk_buff *skb) { + int qm = skb_get_queue_mapping(skb); + + return netdev_get_tx_queue(dev, netdev_cap_txqueue(dev, qm)); +} + +static bool netdev_xmit_txqueue_skipped(void) +{ + return __this_cpu_read(softnet_data.xmit.skip_txqueue); +} + +void netdev_xmit_skip_txqueue(bool skip) +{ + __this_cpu_write(softnet_data.xmit.skip_txqueue, skip); +} +EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue); +#endif /* CONFIG_NET_EGRESS */ + +#ifdef CONFIG_NET_XGRESS +static int tc_run(struct tcx_entry *entry, struct sk_buff *skb) +{ + int ret = TC_ACT_UNSPEC; #ifdef CONFIG_NET_CLS_ACT - struct mini_Qdisc *miniq = rcu_dereference_bh(dev->miniq_egress); - struct tcf_result cl_res; + struct mini_Qdisc *miniq = rcu_dereference_bh(entry->miniq); + struct tcf_result res; if (!miniq) - return skb; + return ret; - /* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */ tc_skb_cb(skb)->mru = 0; tc_skb_cb(skb)->post_ct = false; - mini_qdisc_bstats_cpu_update(miniq, skb); - switch (tcf_classify(skb, miniq->block, miniq->filter_list, &cl_res, false)) { + mini_qdisc_bstats_cpu_update(miniq, skb); + ret = tcf_classify(skb, miniq->block, miniq->filter_list, &res, false); + /* Only tcf related quirks below. 
*/ + switch (ret) { + case TC_ACT_SHOT: + mini_qdisc_qstats_cpu_drop(miniq); + break; case TC_ACT_OK: case TC_ACT_RECLASSIFY: - skb->tc_index = TC_H_MIN(cl_res.classid); + skb->tc_index = TC_H_MIN(res.classid); break; + } +#endif /* CONFIG_NET_CLS_ACT */ + return ret; +} + +static DEFINE_STATIC_KEY_FALSE(tcx_needed_key); + +void tcx_inc(void) +{ + static_branch_inc(&tcx_needed_key); +} + +void tcx_dec(void) +{ + static_branch_dec(&tcx_needed_key); +} + +static __always_inline enum tcx_action_base +tcx_run(const struct bpf_mprog_entry *entry, struct sk_buff *skb, + const bool needs_mac) +{ + const struct bpf_mprog_fp *fp; + const struct bpf_prog *prog; + int ret = TCX_NEXT; + + if (needs_mac) + __skb_push(skb, skb->mac_len); + bpf_mprog_foreach_prog(entry, fp, prog) { + bpf_compute_data_pointers(skb); + ret = bpf_prog_run(prog, skb); + if (ret != TCX_NEXT) + break; + } + if (needs_mac) + __skb_pull(skb, skb->mac_len); + return tcx_action_code(skb, ret); +} + +static __always_inline struct sk_buff * +sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, + struct net_device *orig_dev, bool *another) +{ + struct bpf_mprog_entry *entry = rcu_dereference_bh(skb->dev->tcx_ingress); + int sch_ret; + + if (!entry) + return skb; + if (*pt_prev) { + *ret = deliver_skb(skb, *pt_prev, orig_dev); + *pt_prev = NULL; + } + + qdisc_skb_cb(skb)->pkt_len = skb->len; + tcx_set_ingress(skb, true); + + if (static_branch_unlikely(&tcx_needed_key)) { + sch_ret = tcx_run(entry, skb, true); + if (sch_ret != TC_ACT_UNSPEC) + goto ingress_verdict; + } + sch_ret = tc_run(tcx_entry(entry), skb); +ingress_verdict: + switch (sch_ret) { + case TC_ACT_REDIRECT: + /* skb_mac_header check was done by BPF, so we can safely + * push the L2 header back before redirecting to another + * netdev. + */ + __skb_push(skb, skb->mac_len); + if (skb_do_redirect(skb) == -EAGAIN) { + __skb_pull(skb, skb->mac_len); + *another = true; + break; + } + *ret = NET_RX_SUCCESS; + return NULL; case TC_ACT_SHOT: - mini_qdisc_qstats_cpu_drop(miniq); - *ret = NET_XMIT_DROP; - kfree_skb_reason(skb, SKB_DROP_REASON_TC_EGRESS); + kfree_skb_reason(skb, SKB_DROP_REASON_TC_INGRESS); + *ret = NET_RX_DROP; return NULL; + /* used by tc_run */ case TC_ACT_STOLEN: case TC_ACT_QUEUED: case TC_ACT_TRAP: - *ret = NET_XMIT_SUCCESS; consume_skb(skb); + fallthrough; + case TC_ACT_CONSUMED: + *ret = NET_RX_SUCCESS; return NULL; + } + + return skb; +} + +static __always_inline struct sk_buff * +sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) +{ + struct bpf_mprog_entry *entry = rcu_dereference_bh(dev->tcx_egress); + int sch_ret; + + if (!entry) + return skb; + + /* qdisc_skb_cb(skb)->pkt_len & tcx_set_ingress() was + * already set by the caller. + */ + if (static_branch_unlikely(&tcx_needed_key)) { + sch_ret = tcx_run(entry, skb, false); + if (sch_ret != TC_ACT_UNSPEC) + goto egress_verdict; + } + sch_ret = tc_run(tcx_entry(entry), skb); +egress_verdict: + switch (sch_ret) { case TC_ACT_REDIRECT: /* No need to push/pop skb's mac_header here on egress! 
*/ skb_do_redirect(skb); *ret = NET_XMIT_SUCCESS; return NULL; - default: - break; + case TC_ACT_SHOT: + kfree_skb_reason(skb, SKB_DROP_REASON_TC_EGRESS); + *ret = NET_XMIT_DROP; + return NULL; + /* used by tc_run */ + case TC_ACT_STOLEN: + case TC_ACT_QUEUED: + case TC_ACT_TRAP: + consume_skb(skb); + fallthrough; + case TC_ACT_CONSUMED: + *ret = NET_XMIT_SUCCESS; + return NULL; } -#endif /* CONFIG_NET_CLS_ACT */ return skb; } - -static struct netdev_queue * -netdev_tx_queue_mapping(struct net_device *dev, struct sk_buff *skb) -{ - int qm = skb_get_queue_mapping(skb); - - return netdev_get_tx_queue(dev, netdev_cap_txqueue(dev, qm)); -} - -static bool netdev_xmit_txqueue_skipped(void) +#else +static __always_inline struct sk_buff * +sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, + struct net_device *orig_dev, bool *another) { - return __this_cpu_read(softnet_data.xmit.skip_txqueue); + return skb; } -void netdev_xmit_skip_txqueue(bool skip) +static __always_inline struct sk_buff * +sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) { - __this_cpu_write(softnet_data.xmit.skip_txqueue, skip); + return skb; } -EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue); -#endif /* CONFIG_NET_EGRESS */ +#endif /* CONFIG_NET_XGRESS */ #ifdef CONFIG_XPS static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb, @@ -4128,9 +4267,7 @@ int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) skb_update_prio(skb); qdisc_pkt_len_init(skb); -#ifdef CONFIG_NET_CLS_ACT - skb->tc_at_ingress = 0; -#endif + tcx_set_ingress(skb, false); #ifdef CONFIG_NET_EGRESS if (static_branch_unlikely(&egress_needed_key)) { if (nf_hook_egress_active()) { @@ -5064,72 +5201,6 @@ int (*br_fdb_test_addr_hook)(struct net_device *dev, EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook); #endif -static inline struct sk_buff * -sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, - struct net_device *orig_dev, bool *another) -{ -#ifdef CONFIG_NET_CLS_ACT - struct mini_Qdisc *miniq = rcu_dereference_bh(skb->dev->miniq_ingress); - struct tcf_result cl_res; - - /* If there's at least one ingress present somewhere (so - * we get here via enabled static key), remaining devices - * that are not configured with an ingress qdisc will bail - * out here. 
- */ - if (!miniq) - return skb; - - if (*pt_prev) { - *ret = deliver_skb(skb, *pt_prev, orig_dev); - *pt_prev = NULL; - } - - qdisc_skb_cb(skb)->pkt_len = skb->len; - tc_skb_cb(skb)->mru = 0; - tc_skb_cb(skb)->post_ct = false; - skb->tc_at_ingress = 1; - mini_qdisc_bstats_cpu_update(miniq, skb); - - switch (tcf_classify(skb, miniq->block, miniq->filter_list, &cl_res, false)) { - case TC_ACT_OK: - case TC_ACT_RECLASSIFY: - skb->tc_index = TC_H_MIN(cl_res.classid); - break; - case TC_ACT_SHOT: - mini_qdisc_qstats_cpu_drop(miniq); - kfree_skb_reason(skb, SKB_DROP_REASON_TC_INGRESS); - *ret = NET_RX_DROP; - return NULL; - case TC_ACT_STOLEN: - case TC_ACT_QUEUED: - case TC_ACT_TRAP: - consume_skb(skb); - *ret = NET_RX_SUCCESS; - return NULL; - case TC_ACT_REDIRECT: - /* skb_mac_header check was done by cls/act_bpf, so - * we can safely push the L2 header back before - * redirecting to another netdev - */ - __skb_push(skb, skb->mac_len); - if (skb_do_redirect(skb) == -EAGAIN) { - __skb_pull(skb, skb->mac_len); - *another = true; - break; - } - *ret = NET_RX_SUCCESS; - return NULL; - case TC_ACT_CONSUMED: - *ret = NET_RX_SUCCESS; - return NULL; - default: - break; - } -#endif /* CONFIG_NET_CLS_ACT */ - return skb; -} - /** * netdev_is_rx_handler_busy - check if receive handler is registered * @dev: device to check @@ -6316,12 +6387,8 @@ int dev_set_threaded(struct net_device *dev, bool threaded) * softirq mode will happen in the next round of napi_schedule(). * This should not cause hiccups/stalls to the live traffic. */ - list_for_each_entry(napi, &dev->napi_list, dev_list) { - if (threaded) - set_bit(NAPI_STATE_THREADED, &napi->state); - else - clear_bit(NAPI_STATE_THREADED, &napi->state); - } + list_for_each_entry(napi, &dev->napi_list, dev_list) + assign_bit(NAPI_STATE_THREADED, &napi->state, threaded); return err; } @@ -9413,6 +9480,7 @@ int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { struct net *net = current->nsproxy->net_ns; struct bpf_link_primer link_primer; + struct netlink_ext_ack extack = {}; struct bpf_xdp_link *link; struct net_device *dev; int err, fd; @@ -9440,12 +9508,13 @@ int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) goto unlock; } - err = dev_xdp_attach_link(dev, NULL, link); + err = dev_xdp_attach_link(dev, &extack, link); rtnl_unlock(); if (err) { link->dev = NULL; bpf_link_cleanup(&link_primer); + trace_bpf_xdp_link_attach_failed(extack._msg); goto out_put_dev; } @@ -9509,23 +9578,40 @@ err_out: } /** - * dev_new_index - allocate an ifindex - * @net: the applicable net namespace + * dev_index_reserve() - allocate an ifindex in a namespace + * @net: the applicable net namespace + * @ifindex: requested ifindex, pass %0 to get one allocated + * + * Allocate a ifindex for a new device. Caller must either use the ifindex + * to store the device (via list_netdevice()) or call dev_index_release() + * to give the index up. * - * Returns a suitable unique value for a new device interface - * number. The caller must hold the rtnl semaphore or the - * dev_base_lock to be sure it remains unique. + * Return: a suitable unique value for a new device interface number or -errno. 
*/ -static int dev_new_index(struct net *net) +static int dev_index_reserve(struct net *net, u32 ifindex) { - int ifindex = net->ifindex; + int err; - for (;;) { - if (++ifindex <= 0) - ifindex = 1; - if (!__dev_get_by_index(net, ifindex)) - return net->ifindex = ifindex; + if (ifindex > INT_MAX) { + DEBUG_NET_WARN_ON_ONCE(1); + return -EINVAL; } + + if (!ifindex) + err = xa_alloc_cyclic(&net->dev_by_index, &ifindex, NULL, + xa_limit_31b, &net->ifindex, GFP_KERNEL); + else + err = xa_insert(&net->dev_by_index, ifindex, NULL, GFP_KERNEL); + if (err < 0) + return err; + + return ifindex; +} + +static void dev_index_release(struct net *net, int ifindex) +{ + /* Expect only unused indexes, unlist_netdevice() removes the used */ + WARN_ON(xa_erase(&net->dev_by_index, ifindex)); } /* Delayed registration/unregisteration */ @@ -9995,11 +10081,10 @@ int register_netdevice(struct net_device *dev) goto err_uninit; } - ret = -EBUSY; - if (!dev->ifindex) - dev->ifindex = dev_new_index(net); - else if (__dev_get_by_index(net, dev->ifindex)) + ret = dev_index_reserve(net, dev->ifindex); + if (ret < 0) goto err_uninit; + dev->ifindex = ret; /* Transfer changeable features to wanted_features and enable * software offloads (GSO and GRO). @@ -10046,7 +10131,7 @@ int register_netdevice(struct net_device *dev) ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev); ret = notifier_to_errno(ret); if (ret) - goto err_uninit; + goto err_ifindex_release; ret = netdev_register_kobject(dev); write_lock(&dev_base_lock); @@ -10102,6 +10187,8 @@ out: err_uninit_notify: call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev); +err_ifindex_release: + dev_index_release(net, dev->ifindex); err_uninit: if (dev->netdev_ops->ndo_uninit) dev->netdev_ops->ndo_uninit(dev); @@ -10617,6 +10704,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev_net_set(dev, &init_net); dev->gso_max_size = GSO_LEGACY_MAX_SIZE; + dev->xdp_zc_max_segs = 1; dev->gso_max_segs = GSO_MAX_SEGS; dev->gro_max_size = GRO_LEGACY_MAX_SIZE; dev->gso_ipv4_max_size = GSO_LEGACY_MAX_SIZE; @@ -10838,7 +10926,7 @@ void unregister_netdevice_many_notify(struct list_head *head, /* Shutdown queueing discipline. */ dev_shutdown(dev); - + dev_tcx_uninstall(dev); dev_xdp_uninstall(dev); bpf_dev_bound_netdev_unregister(dev); @@ -10978,9 +11066,19 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, } /* Check that new_ifindex isn't used yet. */ - err = -EBUSY; - if (new_ifindex && __dev_get_by_index(net, new_ifindex)) - goto out; + if (new_ifindex) { + err = dev_index_reserve(net, new_ifindex); + if (err < 0) + goto out; + } else { + /* If there is an ifindex conflict assign a new one */ + err = dev_index_reserve(net, dev->ifindex); + if (err == -EBUSY) + err = dev_index_reserve(net, 0); + if (err < 0) + goto out; + new_ifindex = err; + } /* * And now a mini version of register_netdevice unregister_netdevice. 
@@ -11008,13 +11106,6 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, rcu_barrier(); new_nsid = peernet2id_alloc(dev_net(dev), net, GFP_KERNEL); - /* If there is an ifindex conflict assign a new one */ - if (!new_ifindex) { - if (__dev_get_by_index(net, dev->ifindex)) - new_ifindex = dev_new_index(net); - else - new_ifindex = dev->ifindex; - } rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid, new_ifindex); @@ -11192,6 +11283,8 @@ static int __net_init netdev_init(struct net *net) if (net->dev_index_head == NULL) goto err_idx; + xa_init_flags(&net->dev_by_index, XA_FLAGS_ALLOC1); + RAW_INIT_NOTIFIER_HEAD(&net->netdev_chain); return 0; @@ -11289,6 +11382,7 @@ static void __net_exit netdev_exit(struct net *net) { kfree(net->dev_name_head); kfree(net->dev_index_head); + xa_destroy(&net->dev_by_index); if (net != &init_net) WARN_ON_ONCE(!list_empty(&net->dev_base_head)); } diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 3730945ee294..b46aedc36939 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -5,6 +5,7 @@ #include <linux/etherdevice.h> #include <linux/rtnetlink.h> #include <linux/net_tstamp.h> +#include <linux/phylib_stubs.h> #include <linux/wireless.h> #include <linux/if_bridge.h> #include <net/dsa_stubs.h> @@ -252,14 +253,121 @@ static int dev_eth_ioctl(struct net_device *dev, return ops->ndo_eth_ioctl(dev, ifr, cmd); } +/** + * dev_get_hwtstamp_phylib() - Get hardware timestamping settings of NIC + * or of attached phylib PHY + * @dev: Network device + * @cfg: Timestamping configuration structure + * + * Helper for enforcing a common policy that phylib timestamping, if available, + * should take precedence in front of hardware timestamping provided by the + * netdev. + * + * Note: phy_mii_ioctl() only handles SIOCSHWTSTAMP (not SIOCGHWTSTAMP), and + * there only exists a phydev->mii_ts->hwtstamp() method. So this will return + * -EOPNOTSUPP for phylib for now, which is still more accurate than letting + * the netdev handle the GET request. 
+ */ +static int dev_get_hwtstamp_phylib(struct net_device *dev, + struct kernel_hwtstamp_config *cfg) +{ + if (phy_has_hwtstamp(dev->phydev)) + return phy_hwtstamp_get(dev->phydev, cfg); + + return dev->netdev_ops->ndo_hwtstamp_get(dev, cfg); +} + static int dev_get_hwtstamp(struct net_device *dev, struct ifreq *ifr) { - return dev_eth_ioctl(dev, ifr, SIOCGHWTSTAMP); + const struct net_device_ops *ops = dev->netdev_ops; + struct kernel_hwtstamp_config kernel_cfg = {}; + struct hwtstamp_config cfg; + int err; + + if (!ops->ndo_hwtstamp_get) + return dev_eth_ioctl(dev, ifr, SIOCGHWTSTAMP); /* legacy */ + + if (!netif_device_present(dev)) + return -ENODEV; + + kernel_cfg.ifr = ifr; + err = dev_get_hwtstamp_phylib(dev, &kernel_cfg); + if (err) + return err; + + /* If the request was resolved through an unconverted driver, omit + * the copy_to_user(), since the implementation has already done that + */ + if (!kernel_cfg.copied_to_user) { + hwtstamp_config_from_kernel(&cfg, &kernel_cfg); + + if (copy_to_user(ifr->ifr_data, &cfg, sizeof(cfg))) + return -EFAULT; + } + + return 0; +} + +/** + * dev_set_hwtstamp_phylib() - Change hardware timestamping of NIC + * or of attached phylib PHY + * @dev: Network device + * @cfg: Timestamping configuration structure + * @extack: Netlink extended ack message structure, for error reporting + * + * Helper for enforcing a common policy that phylib timestamping, if available, + * should take precedence in front of hardware timestamping provided by the + * netdev. If the netdev driver needs to perform specific actions even for PHY + * timestamping to work properly (a switch port must trap the timestamped + * frames and not forward them), it must set IFF_SEE_ALL_HWTSTAMP_REQUESTS in + * dev->priv_flags. + */ +static int dev_set_hwtstamp_phylib(struct net_device *dev, + struct kernel_hwtstamp_config *cfg, + struct netlink_ext_ack *extack) +{ + const struct net_device_ops *ops = dev->netdev_ops; + bool phy_ts = phy_has_hwtstamp(dev->phydev); + struct kernel_hwtstamp_config old_cfg = {}; + bool changed = false; + int err; + + cfg->source = phy_ts ? 
HWTSTAMP_SOURCE_PHYLIB : HWTSTAMP_SOURCE_NETDEV; + + if (phy_ts && (dev->priv_flags & IFF_SEE_ALL_HWTSTAMP_REQUESTS)) { + err = ops->ndo_hwtstamp_get(dev, &old_cfg); + if (err) + return err; + } + + if (!phy_ts || (dev->priv_flags & IFF_SEE_ALL_HWTSTAMP_REQUESTS)) { + err = ops->ndo_hwtstamp_set(dev, cfg, extack); + if (err) { + if (extack->_msg) + netdev_err(dev, "%s\n", extack->_msg); + return err; + } + } + + if (phy_ts && (dev->priv_flags & IFF_SEE_ALL_HWTSTAMP_REQUESTS)) + changed = kernel_hwtstamp_config_changed(&old_cfg, cfg); + + if (phy_ts) { + err = phy_hwtstamp_set(dev->phydev, cfg, extack); + if (err) { + if (changed) + ops->ndo_hwtstamp_set(dev, &old_cfg, NULL); + return err; + } + } + + return 0; } static int dev_set_hwtstamp(struct net_device *dev, struct ifreq *ifr) { - struct kernel_hwtstamp_config kernel_cfg; + const struct net_device_ops *ops = dev->netdev_ops; + struct kernel_hwtstamp_config kernel_cfg = {}; struct netlink_ext_ack extack = {}; struct hwtstamp_config cfg; int err; @@ -268,6 +376,7 @@ static int dev_set_hwtstamp(struct net_device *dev, struct ifreq *ifr) return -EFAULT; hwtstamp_config_to_kernel(&kernel_cfg, &cfg); + kernel_cfg.ifr = ifr; err = net_hwtstamp_validate(&kernel_cfg); if (err) @@ -280,8 +389,80 @@ static int dev_set_hwtstamp(struct net_device *dev, struct ifreq *ifr) return err; } - return dev_eth_ioctl(dev, ifr, SIOCSHWTSTAMP); + if (!ops->ndo_hwtstamp_set) + return dev_eth_ioctl(dev, ifr, SIOCSHWTSTAMP); /* legacy */ + + if (!netif_device_present(dev)) + return -ENODEV; + + err = dev_set_hwtstamp_phylib(dev, &kernel_cfg, &extack); + if (err) + return err; + + /* The driver may have modified the configuration, so copy the + * updated version of it back to user space + */ + if (!kernel_cfg.copied_to_user) { + hwtstamp_config_from_kernel(&cfg, &kernel_cfg); + + if (copy_to_user(ifr->ifr_data, &cfg, sizeof(cfg))) + return -EFAULT; + } + + return 0; +} + +static int generic_hwtstamp_ioctl_lower(struct net_device *dev, int cmd, + struct kernel_hwtstamp_config *kernel_cfg) +{ + struct ifreq ifrr; + int err; + + strscpy_pad(ifrr.ifr_name, dev->name, IFNAMSIZ); + ifrr.ifr_ifru = kernel_cfg->ifr->ifr_ifru; + + err = dev_eth_ioctl(dev, &ifrr, cmd); + if (err) + return err; + + kernel_cfg->ifr->ifr_ifru = ifrr.ifr_ifru; + kernel_cfg->copied_to_user = true; + + return 0; +} + +int generic_hwtstamp_get_lower(struct net_device *dev, + struct kernel_hwtstamp_config *kernel_cfg) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (!netif_device_present(dev)) + return -ENODEV; + + if (ops->ndo_hwtstamp_get) + return dev_get_hwtstamp_phylib(dev, kernel_cfg); + + /* Legacy path: unconverted lower driver */ + return generic_hwtstamp_ioctl_lower(dev, SIOCGHWTSTAMP, kernel_cfg); +} +EXPORT_SYMBOL(generic_hwtstamp_get_lower); + +int generic_hwtstamp_set_lower(struct net_device *dev, + struct kernel_hwtstamp_config *kernel_cfg, + struct netlink_ext_ack *extack) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (!netif_device_present(dev)) + return -ENODEV; + + if (ops->ndo_hwtstamp_set) + return dev_set_hwtstamp_phylib(dev, kernel_cfg, extack); + + /* Legacy path: unconverted lower driver */ + return generic_hwtstamp_ioctl_lower(dev, SIOCSHWTSTAMP, kernel_cfg); } +EXPORT_SYMBOL(generic_hwtstamp_set_lower); static int dev_siocbond(struct net_device *dev, struct ifreq *ifr, unsigned int cmd) diff --git a/net/core/dst.c b/net/core/dst.c index 79d9306ad1ee..980e2fd2f013 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -152,7 +152,7 @@ void 
dst_dev_put(struct dst_entry *dst) dst->obsolete = DST_OBSOLETE_DEAD; if (dst->ops->ifdown) - dst->ops->ifdown(dst, dev, true); + dst->ops->ifdown(dst, dev); dst->input = dst_discard; dst->output = dst_discard_out; dst->dev = blackhole_netdev; diff --git a/net/core/filter.c b/net/core/filter.c index 28a59596987a..a094694899c9 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4339,13 +4339,8 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); enum bpf_map_type map_type = ri->map_type; - if (map_type == BPF_MAP_TYPE_XSKMAP) { - /* XDP_REDIRECT is not supported AF_XDP yet. */ - if (unlikely(xdp_buff_has_frags(xdp))) - return -EOPNOTSUPP; - + if (map_type == BPF_MAP_TYPE_XSKMAP) return __xdp_do_redirect_xsk(ri, dev, xdp, xdp_prog); - } return __xdp_do_redirect_frame(ri, dev, xdp_convert_buff_to_frame(xdp), xdp_prog); @@ -7350,8 +7345,8 @@ BPF_CALL_3(bpf_sk_assign, struct sk_buff *, skb, struct sock *, sk, u64, flags) return -EOPNOTSUPP; if (unlikely(dev_net(skb->dev) != sock_net(sk))) return -ENETUNREACH; - if (unlikely(sk_fullsock(sk) && sk->sk_reuseport)) - return -ESOCKTNOSUPPORT; + if (sk_unhashed(sk)) + return -EOPNOTSUPP; if (sk_is_refcounted(sk) && unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) return -ENOENT; @@ -9306,7 +9301,7 @@ static struct bpf_insn *bpf_convert_tstamp_read(const struct bpf_prog *prog, __u8 value_reg = si->dst_reg; __u8 skb_reg = si->src_reg; -#ifdef CONFIG_NET_CLS_ACT +#ifdef CONFIG_NET_XGRESS /* If the tstamp_type is read, * the bpf prog is aware the tstamp could have delivery time. * Thus, read skb->tstamp as is if tstamp_type_access is true. @@ -9340,7 +9335,7 @@ static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_prog *prog, __u8 value_reg = si->src_reg; __u8 skb_reg = si->dst_reg; -#ifdef CONFIG_NET_CLS_ACT +#ifdef CONFIG_NET_XGRESS /* If the tstamp_type is read, * the bpf prog is aware the tstamp could have delivery time. * Thus, write skb->tstamp as is if tstamp_type_access is true. 
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 85a2d0d9bd39..89d15ceaf9af 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -40,7 +40,7 @@ static void dissector_set_key(struct flow_dissector *flow_dissector, enum flow_dissector_key_id key_id) { - flow_dissector->used_keys |= (1 << key_id); + flow_dissector->used_keys |= (1ULL << key_id); } void skb_flow_dissector_init(struct flow_dissector *flow_dissector, @@ -205,6 +205,50 @@ static void __skb_flow_dissect_icmp(const struct sk_buff *skb, skb_flow_get_icmp_tci(skb, key_icmp, data, thoff, hlen); } +static void __skb_flow_dissect_ah(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container, const void *data, + int nhoff, int hlen) +{ + struct flow_dissector_key_ipsec *key_ah; + struct ip_auth_hdr _hdr, *hdr; + + if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPSEC)) + return; + + hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); + if (!hdr) + return; + + key_ah = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_IPSEC, + target_container); + + key_ah->spi = hdr->spi; +} + +static void __skb_flow_dissect_esp(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container, const void *data, + int nhoff, int hlen) +{ + struct flow_dissector_key_ipsec *key_esp; + struct ip_esp_hdr _hdr, *hdr; + + if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPSEC)) + return; + + hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); + if (!hdr) + return; + + key_esp = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_IPSEC, + target_container); + + key_esp->spi = hdr->spi; +} + static void __skb_flow_dissect_l2tpv3(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, @@ -1571,7 +1615,14 @@ ip_proto_again: __skb_flow_dissect_l2tpv3(skb, flow_dissector, target_container, data, nhoff, hlen); break; - + case IPPROTO_ESP: + __skb_flow_dissect_esp(skb, flow_dissector, target_container, + data, nhoff, hlen); + break; + case IPPROTO_AH: + __skb_flow_dissect_ah(skb, flow_dissector, target_container, + data, nhoff, hlen); + break; default: break; } diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c index acfc1f88ea79..bc5169482710 100644 --- a/net/core/flow_offload.c +++ b/net/core/flow_offload.c @@ -146,6 +146,13 @@ void flow_rule_match_tcp(const struct flow_rule *rule, } EXPORT_SYMBOL(flow_rule_match_tcp); +void flow_rule_match_ipsec(const struct flow_rule *rule, + struct flow_match_ipsec *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_IPSEC, out); +} +EXPORT_SYMBOL(flow_rule_match_ipsec); + void flow_rule_match_icmp(const struct flow_rule *rule, struct flow_match_icmp *out) { diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c index 8b6b5e72b217..4a0797f0a154 100644 --- a/net/core/lwt_bpf.c +++ b/net/core/lwt_bpf.c @@ -60,9 +60,8 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt, ret = BPF_OK; } else { skb_reset_mac_header(skb); - ret = skb_do_redirect(skb); - if (ret == 0) - ret = BPF_REDIRECT; + skb_do_redirect(skb); + ret = BPF_REDIRECT; } break; @@ -255,7 +254,7 @@ static int bpf_lwt_xmit_reroute(struct sk_buff *skb) err = dst_output(dev_net(skb_dst(skb)->dev), skb->sk, skb); if (unlikely(err)) - return err; + return net_xmit_errno(err); /* ip[6]_finish_output2 understand LWTUNNEL_XMIT_DONE */ return LWTUNNEL_XMIT_DONE; diff --git a/net/core/net-sysfs.c 
b/net/core/net-sysfs.c index 15e3f4606b5f..fccaa5bac0ed 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -23,6 +23,7 @@ #include <linux/of.h> #include <linux/of_net.h> #include <linux/cpu.h> +#include <net/netdev_rx_queue.h> #include "dev.h" #include "net-sysfs.h" diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index a4270fafdf11..c1aea8b756b6 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -10,11 +10,11 @@ static int netdev_nl_dev_fill(struct net_device *netdev, struct sk_buff *rsp, - u32 portid, u32 seq, int flags, u32 cmd) + const struct genl_info *info) { void *hdr; - hdr = genlmsg_put(rsp, portid, seq, &netdev_nl_family, flags, cmd); + hdr = genlmsg_iput(rsp, info); if (!hdr) return -EMSGSIZE; @@ -25,6 +25,14 @@ netdev_nl_dev_fill(struct net_device *netdev, struct sk_buff *rsp, return -EINVAL; } + if (netdev->xdp_features & NETDEV_XDP_ACT_XSK_ZEROCOPY) { + if (nla_put_u32(rsp, NETDEV_A_DEV_XDP_ZC_MAX_SEGS, + netdev->xdp_zc_max_segs)) { + genlmsg_cancel(rsp, hdr); + return -EINVAL; + } + } + genlmsg_end(rsp, hdr); return 0; @@ -33,17 +41,20 @@ netdev_nl_dev_fill(struct net_device *netdev, struct sk_buff *rsp, static void netdev_genl_dev_notify(struct net_device *netdev, int cmd) { + struct genl_info info; struct sk_buff *ntf; if (!genl_has_listeners(&netdev_nl_family, dev_net(netdev), NETDEV_NLGRP_MGMT)) return; + genl_info_init_ntf(&info, &netdev_nl_family, cmd); + ntf = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!ntf) return; - if (netdev_nl_dev_fill(netdev, ntf, 0, 0, 0, cmd)) { + if (netdev_nl_dev_fill(netdev, ntf, &info)) { nlmsg_free(ntf); return; } @@ -72,8 +83,7 @@ int netdev_nl_dev_get_doit(struct sk_buff *skb, struct genl_info *info) netdev = __dev_get_by_index(genl_info_net(info), ifindex); if (netdev) - err = netdev_nl_dev_fill(netdev, rsp, info->snd_portid, - info->snd_seq, 0, info->genlhdr->cmd); + err = netdev_nl_dev_fill(netdev, rsp, info); else err = -ENODEV; @@ -93,43 +103,19 @@ int netdev_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); struct net_device *netdev; - int idx = 0, s_idx; - int h, s_h; - int err; - - s_h = cb->args[0]; - s_idx = cb->args[1]; + int err = 0; rtnl_lock(); - - for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { - struct hlist_head *head; - - idx = 0; - head = &net->dev_index_head[h]; - hlist_for_each_entry(netdev, head, index_hlist) { - if (idx < s_idx) - goto cont; - err = netdev_nl_dev_fill(netdev, skb, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, 0, - NETDEV_CMD_DEV_GET); - if (err < 0) - break; -cont: - idx++; - } + for_each_netdev_dump(net, netdev, cb->args[0]) { + err = netdev_nl_dev_fill(netdev, skb, genl_info_dump(cb)); + if (err < 0) + break; } - rtnl_unlock(); if (err != -EMSGSIZE) return err; - cb->args[1] = idx; - cb->args[0] = h; - cb->seq = net->dev_base_seq; - return skb->len; } diff --git a/net/core/of_net.c b/net/core/of_net.c index 55d3fe229269..93ea425b9248 100644 --- a/net/core/of_net.c +++ b/net/core/of_net.c @@ -8,6 +8,7 @@ #include <linux/kernel.h> #include <linux/of_net.h> #include <linux/of_platform.h> +#include <linux/platform_device.h> #include <linux/phy.h> #include <linux/export.h> #include <linux/device.h> diff --git a/net/core/page_pool.c b/net/core/page_pool.c index a3e12a61d456..77cb75e63aca 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -10,7 +10,7 @@ #include <linux/slab.h> #include <linux/device.h> -#include <net/page_pool.h> +#include <net/page_pool/helpers.h> 
#include <net/xdp.h> #include <linux/dma-direction.h> @@ -58,6 +58,17 @@ static const char pp_stats[][ETH_GSTRING_LEN] = { "rx_pp_recycle_released_ref", }; +/** + * page_pool_get_stats() - fetch page pool stats + * @pool: pool from which page was allocated + * @stats: struct page_pool_stats to fill in + * + * Retrieve statistics about the page_pool. This API is only available + * if the kernel has been configured with ``CONFIG_PAGE_POOL_STATS=y``. + * A pointer to a caller allocated struct page_pool_stats structure + * is passed to this API which is filled in. The caller can then report + * those stats to the user (perhaps via ethtool, debugfs, etc.). + */ bool page_pool_get_stats(struct page_pool *pool, struct page_pool_stats *stats) { @@ -224,6 +235,10 @@ static int page_pool_init(struct page_pool *pool, return 0; } +/** + * page_pool_create() - create a page pool. + * @params: parameters, see struct page_pool_params + */ struct page_pool *page_pool_create(const struct page_pool_params *params) { struct page_pool *pool; @@ -492,7 +507,7 @@ static s32 page_pool_inflight(struct page_pool *pool) * a regular page (that will eventually be returned to the normal * page-allocator via put_page). */ -void page_pool_release_page(struct page_pool *pool, struct page *page) +static void page_pool_return_page(struct page_pool *pool, struct page *page) { dma_addr_t dma; int count; @@ -518,13 +533,6 @@ skip_dma_unmap: */ count = atomic_inc_return_relaxed(&pool->pages_state_release_cnt); trace_page_pool_state_release(pool, page, count); -} -EXPORT_SYMBOL(page_pool_release_page); - -/* Return a page to the page allocator, cleaning up our state */ -static void page_pool_return_page(struct page_pool *pool, struct page *page) -{ - page_pool_release_page(pool, page); put_page(page); /* An optimization would be to call __free_pages(page, pool->p.order) @@ -579,6 +587,8 @@ static __always_inline struct page * __page_pool_put_page(struct page_pool *pool, struct page *page, unsigned int dma_sync_size, bool allow_direct) { + lockdep_assert_no_hardirq(); + /* This allocator is optimized for the XDP mode that uses * one-frame-per-page, but have fallbacks that act like the * regular page allocator APIs. @@ -616,9 +626,7 @@ __page_pool_put_page(struct page_pool *pool, struct page *page, * will be invoking put_page. */ recycle_stat_inc(pool, released_refcnt); - /* Do not replace this with page_pool_return_page() */ - page_pool_release_page(pool, page); - put_page(page); + page_pool_return_page(pool, page); return NULL; } @@ -635,7 +643,21 @@ void page_pool_put_defragged_page(struct page_pool *pool, struct page *page, } EXPORT_SYMBOL(page_pool_put_defragged_page); -/* Caller must not use data area after call, as this function overwrites it */ +/** + * page_pool_put_page_bulk() - release references on multiple pages + * @pool: pool from which pages were allocated + * @data: array holding page pointers + * @count: number of pages in @data + * + * Tries to refill a number of pages into the ptr_ring cache holding ptr_ring + * producer lock. If the ptr_ring is full, page_pool_put_page_bulk() + * will release leftover pages to the page allocator. + * page_pool_put_page_bulk() is suitable to be run inside the driver NAPI tx + * completion loop for the XDP_REDIRECT use case. + * + * Please note the caller must not use data area after running + * page_pool_put_page_bulk(), as this function overwrites it. 
+ */ void page_pool_put_page_bulk(struct page_pool *pool, void **data, int count) { @@ -915,42 +937,3 @@ void page_pool_update_nid(struct page_pool *pool, int new_nid) } } EXPORT_SYMBOL(page_pool_update_nid); - -bool page_pool_return_skb_page(struct page *page, bool napi_safe) -{ - struct napi_struct *napi; - struct page_pool *pp; - bool allow_direct; - - page = compound_head(page); - - /* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation - * in order to preserve any existing bits, such as bit 0 for the - * head page of compound page and bit 1 for pfmemalloc page, so - * mask those bits for freeing side when doing below checking, - * and page_is_pfmemalloc() is checked in __page_pool_put_page() - * to avoid recycling the pfmemalloc page. - */ - if (unlikely((page->pp_magic & ~0x3UL) != PP_SIGNATURE)) - return false; - - pp = page->pp; - - /* Allow direct recycle if we have reasons to believe that we are - * in the same context as the consumer would run, so there's - * no possible race. - */ - napi = READ_ONCE(pp->p.napi); - allow_direct = napi_safe && napi && - READ_ONCE(napi->list_owner) == smp_processor_id(); - - /* Driver set this to memory recycling info. Reset it on recycle. - * This will *not* work for NIC using a split-page memory model. - * The page will be returned to the pool here regardless of the - * 'flipped' fragment being in use or not. - */ - page_pool_put_full_page(pp, page, allow_direct); - - return true; -} -EXPORT_SYMBOL(page_pool_return_skb_page); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 00c94d9622b4..4a2ec33bfb51 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -61,7 +61,7 @@ #include "dev.h" #define RTNL_MAX_TYPE 50 -#define RTNL_SLAVE_MAX_TYPE 43 +#define RTNL_SLAVE_MAX_TYPE 44 struct rtnl_link { rtnl_doit_func doit; @@ -1273,7 +1273,6 @@ static noinline_for_stack int rtnl_fill_stats(struct sk_buff *skb, static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, struct net_device *dev, int vfs_num, - struct nlattr *vfinfo, u32 ext_filter_mask) { struct ifla_vf_rss_query_en vf_rss_query_en; @@ -1343,7 +1342,7 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, vf_trust.setting = ivi.trusted; vf = nla_nest_start_noflag(skb, IFLA_VF_INFO); if (!vf) - goto nla_put_vfinfo_failure; + return -EMSGSIZE; if (nla_put(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac) || nla_put(skb, IFLA_VF_BROADCAST, sizeof(vf_broadcast), &vf_broadcast) || nla_put(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan) || @@ -1414,8 +1413,6 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, nla_put_vf_failure: nla_nest_cancel(skb, vf); -nla_put_vfinfo_failure: - nla_nest_cancel(skb, vfinfo); return -EMSGSIZE; } @@ -1441,8 +1438,10 @@ static noinline_for_stack int rtnl_fill_vf(struct sk_buff *skb, return -EMSGSIZE; for (i = 0; i < num_vfs; i++) { - if (rtnl_fill_vfinfo(skb, dev, i, vfinfo, ext_filter_mask)) + if (rtnl_fill_vfinfo(skb, dev, i, ext_filter_mask)) { + nla_nest_cancel(skb, vfinfo); return -EMSGSIZE; + } } nla_nest_end(skb, vfinfo); diff --git a/net/core/scm.c b/net/core/scm.c index 3cd7dd377e53..880027ecf516 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -130,6 +130,7 @@ EXPORT_SYMBOL(__scm_destroy); int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p) { + const struct proto_ops *ops = READ_ONCE(sock->ops); struct cmsghdr *cmsg; int err; @@ -153,7 +154,7 @@ int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p) switch (cmsg->cmsg_type) { case 
SCM_RIGHTS: - if (!sock->ops || sock->ops->family != PF_UNIX) + if (!ops || ops->family != PF_UNIX) goto error; err=scm_fp_copy(cmsg, &p->fp); if (err<0) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index a298992060e6..45707059082f 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -73,7 +73,7 @@ #include <net/mpls.h> #include <net/mptcp.h> #include <net/mctp.h> -#include <net/page_pool.h> +#include <net/page_pool/helpers.h> #include <net/dropreason.h> #include <linux/uaccess.h> @@ -879,11 +879,56 @@ static void skb_clone_fraglist(struct sk_buff *skb) skb_get(list); } +#if IS_ENABLED(CONFIG_PAGE_POOL) +bool napi_pp_put_page(struct page *page, bool napi_safe) +{ + bool allow_direct = false; + struct page_pool *pp; + + page = compound_head(page); + + /* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation + * in order to preserve any existing bits, such as bit 0 for the + * head page of compound page and bit 1 for pfmemalloc page, so + * mask those bits for freeing side when doing below checking, + * and page_is_pfmemalloc() is checked in __page_pool_put_page() + * to avoid recycling the pfmemalloc page. + */ + if (unlikely((page->pp_magic & ~0x3UL) != PP_SIGNATURE)) + return false; + + pp = page->pp; + + /* Allow direct recycle if we have reasons to believe that we are + * in the same context as the consumer would run, so there's + * no possible race. + * __page_pool_put_page() makes sure we're not in hardirq context + * and interrupts are enabled prior to accessing the cache. + */ + if (napi_safe || in_softirq()) { + const struct napi_struct *napi = READ_ONCE(pp->p.napi); + + allow_direct = napi && + READ_ONCE(napi->list_owner) == smp_processor_id(); + } + + /* Driver set this to memory recycling info. Reset it on recycle. + * This will *not* work for NIC using a split-page memory model. + * The page will be returned to the pool here regardless of the + * 'flipped' fragment being in use or not. + */ + page_pool_put_full_page(pp, page, allow_direct); + + return true; +} +EXPORT_SYMBOL(napi_pp_put_page); +#endif + static bool skb_pp_recycle(struct sk_buff *skb, void *data, bool napi_safe) { if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle) return false; - return page_pool_return_skb_page(virt_to_page(data), napi_safe); + return napi_pp_put_page(virt_to_page(data), napi_safe); } static void skb_kfree_head(void *head, unsigned int end_offset) @@ -3656,20 +3701,23 @@ struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list) EXPORT_SYMBOL(skb_dequeue_tail); /** - * skb_queue_purge - empty a list + * skb_queue_purge_reason - empty a list * @list: list to empty + * @reason: drop reason * * Delete all buffers on an &sk_buff list. Each buffer is removed from * the list and one reference dropped. This function takes the list * lock and is atomic with respect to other list locking functions. 
*/ -void skb_queue_purge(struct sk_buff_head *list) +void skb_queue_purge_reason(struct sk_buff_head *list, + enum skb_drop_reason reason) { struct sk_buff *skb; + while ((skb = skb_dequeue(list)) != NULL) - kfree_skb(skb); + kfree_skb_reason(skb, reason); } -EXPORT_SYMBOL(skb_queue_purge); +EXPORT_SYMBOL(skb_queue_purge_reason); /** * skb_rbtree_purge - empty a skb rbtree @@ -3697,6 +3745,27 @@ unsigned int skb_rbtree_purge(struct rb_root *root) return sum; } +void skb_errqueue_purge(struct sk_buff_head *list) +{ + struct sk_buff *skb, *next; + struct sk_buff_head kill; + unsigned long flags; + + __skb_queue_head_init(&kill); + + spin_lock_irqsave(&list->lock, flags); + skb_queue_walk_safe(list, skb, next) { + if (SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ZEROCOPY || + SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_TIMESTAMPING) + continue; + __skb_unlink(skb, list); + __skb_queue_tail(&kill, skb); + } + spin_unlock_irqrestore(&list->lock, flags); + __skb_queue_purge(&kill); +} +EXPORT_SYMBOL(skb_errqueue_purge); + /** * skb_queue_head - queue a buffer at the list head * @list: list to use @@ -4716,23 +4785,13 @@ static const u8 skb_ext_type_len[] = { static __always_inline unsigned int skb_ext_total_length(void) { - return SKB_EXT_CHUNKSIZEOF(struct skb_ext) + -#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - skb_ext_type_len[SKB_EXT_BRIDGE_NF] + -#endif -#ifdef CONFIG_XFRM - skb_ext_type_len[SKB_EXT_SEC_PATH] + -#endif -#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) - skb_ext_type_len[TC_SKB_EXT] + -#endif -#if IS_ENABLED(CONFIG_MPTCP) - skb_ext_type_len[SKB_EXT_MPTCP] + -#endif -#if IS_ENABLED(CONFIG_MCTP_FLOWS) - skb_ext_type_len[SKB_EXT_MCTP] + -#endif - 0; + unsigned int l = SKB_EXT_CHUNKSIZEOF(struct skb_ext); + int i; + + for (i = 0; i < ARRAY_SIZE(skb_ext_type_len); i++) + l += skb_ext_type_len[i]; + + return l; } static void skb_extensions_init(void) @@ -4750,12 +4809,23 @@ static void skb_extensions_init(void) static void skb_extensions_init(void) {} #endif +/* The SKB kmem_cache slab is critical for network performance. Never + * merge/alias the slab with similar sized objects. This avoids fragmentation + * that hurts performance of kmem_cache_{alloc,free}_bulk APIs. + */ +#ifndef CONFIG_SLUB_TINY +#define FLAG_SKB_NO_MERGE SLAB_NO_MERGE +#else /* CONFIG_SLUB_TINY - simple loop in kmem_cache_alloc_bulk */ +#define FLAG_SKB_NO_MERGE 0 +#endif + void __init skb_init(void) { skbuff_cache = kmem_cache_create_usercopy("skbuff_head_cache", sizeof(struct sk_buff), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, + SLAB_HWCACHE_ALIGN|SLAB_PANIC| + FLAG_SKB_NO_MERGE, offsetof(struct sk_buff, cb), sizeof_field(struct sk_buff, cb), NULL); @@ -6204,7 +6274,7 @@ EXPORT_SYMBOL_GPL(skb_mpls_dec_ttl); * * @header_len: size of linear part * @data_len: needed length in frags - * @max_page_order: max page order desired. + * @order: max page order desired. * @errcode: pointer to error code if any * @gfp_mask: allocation mask * @@ -6212,21 +6282,17 @@ EXPORT_SYMBOL_GPL(skb_mpls_dec_ttl); */ struct sk_buff *alloc_skb_with_frags(unsigned long header_len, unsigned long data_len, - int max_page_order, + int order, int *errcode, gfp_t gfp_mask) { - int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT; unsigned long chunk; struct sk_buff *skb; struct page *page; - int i; + int nr_frags = 0; *errcode = -EMSGSIZE; - /* Note this test could be relaxed, if we succeed to allocate - * high order pages... 
- */ - if (npages > MAX_SKB_FRAGS) + if (unlikely(data_len > MAX_SKB_FRAGS * (PAGE_SIZE << order))) return NULL; *errcode = -ENOBUFS; @@ -6234,34 +6300,32 @@ struct sk_buff *alloc_skb_with_frags(unsigned long header_len, if (!skb) return NULL; - skb->truesize += npages << PAGE_SHIFT; - - for (i = 0; npages > 0; i++) { - int order = max_page_order; - - while (order) { - if (npages >= 1 << order) { - page = alloc_pages((gfp_mask & ~__GFP_DIRECT_RECLAIM) | - __GFP_COMP | - __GFP_NOWARN, - order); - if (page) - goto fill_page; - /* Do not retry other high order allocations */ - order = 1; - max_page_order = 0; - } + while (data_len) { + if (nr_frags == MAX_SKB_FRAGS - 1) + goto failure; + while (order && PAGE_ALIGN(data_len) < (PAGE_SIZE << order)) order--; + + if (order) { + page = alloc_pages((gfp_mask & ~__GFP_DIRECT_RECLAIM) | + __GFP_COMP | + __GFP_NOWARN, + order); + if (!page) { + order--; + continue; + } + } else { + page = alloc_page(gfp_mask); + if (!page) + goto failure; } - page = alloc_page(gfp_mask); - if (!page) - goto failure; -fill_page: chunk = min_t(unsigned long, data_len, PAGE_SIZE << order); - skb_fill_page_desc(skb, i, page, 0, chunk); + skb_fill_page_desc(skb, nr_frags, page, 0, chunk); + nr_frags++; + skb->truesize += (PAGE_SIZE << order); data_len -= chunk; - npages -= 1 << order; } return skb; diff --git a/net/core/skmsg.c b/net/core/skmsg.c index ef1a2eb6520b..a0659fc29bcc 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -1204,13 +1204,17 @@ out: static void sk_psock_verdict_data_ready(struct sock *sk) { struct socket *sock = sk->sk_socket; + const struct proto_ops *ops; int copied; trace_sk_data_ready(sk); - if (unlikely(!sock || !sock->ops || !sock->ops->read_skb)) + if (unlikely(!sock)) return; - copied = sock->ops->read_skb(sk, sk_psock_verdict_recv); + ops = READ_ONCE(sock->ops); + if (!ops || !ops->read_skb) + return; + copied = ops->read_skb(sk, sk_psock_verdict_recv); if (copied >= 0) { struct sk_psock *psock; diff --git a/net/core/sock.c b/net/core/sock.c index c9cffb7acbea..666a17cab4f5 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -767,7 +767,7 @@ bool sk_mc_loop(struct sock *sk) return true; switch (sk->sk_family) { case AF_INET: - return inet_sk(sk)->mc_loop; + return inet_test_bit(MC_LOOP, sk); #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: return inet6_sk(sk)->mc_loop; @@ -797,7 +797,7 @@ EXPORT_SYMBOL(sock_set_reuseport); void sock_no_linger(struct sock *sk) { lock_sock(sk); - sk->sk_lingertime = 0; + WRITE_ONCE(sk->sk_lingertime, 0); sock_set_flag(sk, SOCK_LINGER); release_sock(sk); } @@ -1230,15 +1230,15 @@ set_sndbuf: ret = -EFAULT; break; } - if (!ling.l_onoff) + if (!ling.l_onoff) { sock_reset_flag(sk, SOCK_LINGER); - else { -#if (BITS_PER_LONG == 32) - if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ) - sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT; + } else { + unsigned long t_sec = ling.l_linger; + + if (t_sec >= MAX_SCHEDULE_TIMEOUT / HZ) + WRITE_ONCE(sk->sk_lingertime, MAX_SCHEDULE_TIMEOUT); else -#endif - sk->sk_lingertime = (unsigned int)ling.l_linger * HZ; + WRITE_ONCE(sk->sk_lingertime, t_sec * HZ); sock_set_flag(sk, SOCK_LINGER); } break; @@ -1247,17 +1247,11 @@ set_sndbuf: break; case SO_PASSCRED: - if (valbool) - set_bit(SOCK_PASSCRED, &sock->flags); - else - clear_bit(SOCK_PASSCRED, &sock->flags); + assign_bit(SOCK_PASSCRED, &sock->flags, valbool); break; case SO_PASSPIDFD: - if (valbool) - set_bit(SOCK_PASSPIDFD, &sock->flags); - else - clear_bit(SOCK_PASSPIDFD, &sock->flags); + assign_bit(SOCK_PASSPIDFD, 
&sock->flags, valbool); break; case SO_TIMESTAMP_OLD: @@ -1283,14 +1277,19 @@ set_sndbuf: break; case SO_RCVLOWAT: + { + int (*set_rcvlowat)(struct sock *sk, int val) = NULL; + if (val < 0) val = INT_MAX; - if (sock && sock->ops->set_rcvlowat) - ret = sock->ops->set_rcvlowat(sk, val); + if (sock) + set_rcvlowat = READ_ONCE(sock->ops)->set_rcvlowat; + if (set_rcvlowat) + ret = set_rcvlowat(sk, val); else WRITE_ONCE(sk->sk_rcvlowat, val ? : 1); break; - + } case SO_RCVTIMEO_OLD: case SO_RCVTIMEO_NEW: ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, @@ -1361,10 +1360,7 @@ set_sndbuf: break; case SO_PASSSEC: - if (valbool) - set_bit(SOCK_PASSSEC, &sock->flags); - else - clear_bit(SOCK_PASSSEC, &sock->flags); + assign_bit(SOCK_PASSSEC, &sock->flags, valbool); break; case SO_MARK: if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && @@ -1388,11 +1384,16 @@ set_sndbuf: break; case SO_PEEK_OFF: - if (sock->ops->set_peek_off) - ret = sock->ops->set_peek_off(sk, val); + { + int (*set_peek_off)(struct sock *sk, int val); + + set_peek_off = READ_ONCE(sock->ops)->set_peek_off; + if (set_peek_off) + ret = set_peek_off(sk, val); else ret = -EOPNOTSUPP; break; + } case SO_NOFCS: sock_valbool_flag(sk, SOCK_NOFCS, valbool); @@ -1691,7 +1692,7 @@ int sk_getsockopt(struct sock *sk, int level, int optname, case SO_LINGER: lv = sizeof(v.ling); v.ling.l_onoff = sock_flag(sk, SOCK_LINGER); - v.ling.l_linger = sk->sk_lingertime / HZ; + v.ling.l_linger = READ_ONCE(sk->sk_lingertime) / HZ; break; case SO_BSDCOMPAT: @@ -1823,14 +1824,14 @@ int sk_getsockopt(struct sock *sk, int level, int optname, case SO_PEERNAME: { - char address[128]; + struct sockaddr_storage address; - lv = sock->ops->getname(sock, (struct sockaddr *)address, 2); + lv = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 2); if (lv < 0) return -ENOTCONN; if (lv < len) return -EINVAL; - if (copy_to_sockptr(optval, address, len)) + if (copy_to_sockptr(optval, &address, len)) return -EFAULT; goto lenout; } @@ -1867,7 +1868,7 @@ int sk_getsockopt(struct sock *sk, int level, int optname, break; case SO_PEEK_OFF: - if (!sock->ops->set_peek_off) + if (!READ_ONCE(sock->ops)->set_peek_off) return -EOPNOTSUPP; v.val = READ_ONCE(sk->sk_peek_off); diff --git a/net/core/xdp.c b/net/core/xdp.c index 8362130bf085..a70670fe9a2d 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -14,7 +14,7 @@ #include <linux/idr.h> #include <linux/rhashtable.h> #include <linux/bug.h> -#include <net/page_pool.h> +#include <net/page_pool/helpers.h> #include <net/xdp.h> #include <net/xdp_priv.h> /* struct xdp_mem_allocator */ |
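For reference, a hypothetical driver-side sketch of the ndo_hwtstamp_get()/ndo_hwtstamp_set() interface that the reworked net/core/dev_ioctl.c dispatches to; unconverted drivers keep falling back to ndo_eth_ioctl() with SIOC[GS]HWTSTAMP. The "foo" names and priv layout are invented; only the callback signatures and the struct kernel_hwtstamp_config usage follow the diff above, and the core handles the copy_{from,to}_user() around these calls.

#include <linux/net_tstamp.h>
#include <linux/netdevice.h>

struct foo_priv {
	int tstamp_tx_type;		/* HWTSTAMP_TX_* last programmed */
	int tstamp_rx_filter;		/* HWTSTAMP_FILTER_* last programmed */
};

static int foo_hwtstamp_get(struct net_device *dev,
			    struct kernel_hwtstamp_config *cfg)
{
	struct foo_priv *priv = netdev_priv(dev);

	/* report what was last programmed into the hardware */
	cfg->tx_type = priv->tstamp_tx_type;
	cfg->rx_filter = priv->tstamp_rx_filter;
	return 0;
}

static int foo_hwtstamp_set(struct net_device *dev,
			    struct kernel_hwtstamp_config *cfg,
			    struct netlink_ext_ack *extack)
{
	struct foo_priv *priv = netdev_priv(dev);

	if (cfg->tx_type != HWTSTAMP_TX_OFF && cfg->tx_type != HWTSTAMP_TX_ON) {
		NL_SET_ERR_MSG(extack, "TX timestamping mode not supported");
		return -ERANGE;
	}
	/* ...program the hardware here, then remember what was applied... */
	priv->tstamp_tx_type = cfg->tx_type;
	priv->tstamp_rx_filter = cfg->rx_filter;
	return 0;
}

static const struct net_device_ops foo_netdev_ops = {
	.ndo_hwtstamp_get	= foo_hwtstamp_get,
	.ndo_hwtstamp_set	= foo_hwtstamp_set,
};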