summaryrefslogtreecommitdiff
path: root/net/sched
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2023-08-29 11:33:01 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2023-08-29 11:33:01 -0700
commitbd6c11bc43c496cddfc6cf603b5d45365606dbd5 (patch)
tree36318fa68f784d397111991177d65bd6325189c4 /net/sched
parent68cf01760bc0891074e813b9bb06d2696cac1c01 (diff)
parentc873512ef3a39cc1a605b7a5ff2ad0a33d619aa8 (diff)
Merge tag 'net-next-6.6' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next
Pull networking updates from Paolo Abeni: "Core: - Increase size limits for to-be-sent skb frag allocations. This allows tun, tap devices and packet sockets to better cope with large writes operations - Store netdevs in an xarray, to simplify iterating over netdevs - Refactor nexthop selection for multipath routes - Improve sched class lifetime handling - Add backup nexthop ID support for bridge - Implement drop reasons support in openvswitch - Several data races annotations and fixes - Constify the sk parameter of routing functions - Prepend kernel version to netconsole message Protocols: - Implement support for TCP probing the peer being under memory pressure - Remove hard coded limitation on IPv6 specific info placement inside the socket struct - Get rid of sysctl_tcp_adv_win_scale and use an auto-estimated per socket scaling factor - Scaling-up the IPv6 expired route GC via a separated list of expiring routes - In-kernel support for the TLS alert protocol - Better support for UDP reuseport with connected sockets - Add NEXT-C-SID support for SRv6 End.X behavior, reducing the SR header size - Get rid of additional ancillary per MPTCP connection struct socket - Implement support for BPF-based MPTCP packet schedulers - Format MPTCP subtests selftests results in TAP - Several new SMC 2.1 features including unique experimental options, max connections per lgr negotiation, max links per lgr negotiation BPF: - Multi-buffer support in AF_XDP - Add multi uprobe BPF links for attaching multiple uprobes and usdt probes, which is significantly faster and saves extra fds - Implement an fd-based tc BPF attach API (TCX) and BPF link support on top of it - Add SO_REUSEPORT support for TC bpf_sk_assign - Support new instructions from cpu v4 to simplify the generated code and feature completeness, for x86, arm64, riscv64 - Support defragmenting IPv(4|6) packets in BPF - Teach verifier actual bounds of bpf_get_smp_processor_id() and fix perf+libbpf issue related to custom section handling - Introduce bpf map element count and enable it for all program types - Add a BPF hook in sys_socket() to change the protocol ID from IPPROTO_TCP to IPPROTO_MPTCP to cover migration for legacy - Introduce bpf_me_mcache_free_rcu() and fix OOM under stress - Add uprobe support for the bpf_get_func_ip helper - Check skb ownership against full socket - Support for up to 12 arguments in BPF trampoline - Extend link_info for kprobe_multi and perf_event links Netfilter: - Speed-up process exit by aborting ruleset validation if a fatal signal is pending - Allow NLA_POLICY_MASK to be used with BE16/BE32 types Driver API: - Page pool optimizations, to improve data locality and cache usage - Introduce ndo_hwtstamp_get() and ndo_hwtstamp_set() to avoid the need for raw ioctl() handling in drivers - Simplify genetlink dump operations (doit/dumpit) providing them the common information already populated in struct genl_info - Extend and use the yaml devlink specs to [re]generate the split ops - Introduce devlink selective dumps, to allow SF filtering SF based on handle and other attributes - Add yaml netlink spec for netlink-raw families, allow route, link and address related queries via the ynl tool - Remove phylink legacy mode support - Support offload LED blinking to phy - Add devlink port function attributes for IPsec New hardware / drivers: - Ethernet: - Broadcom ASP 2.0 (72165) ethernet controller - MediaTek MT7988 SoC - Texas Instruments AM654 SoC - Texas Instruments IEP driver - Atheros qca8081 phy - Marvell 88Q2110 phy - NXP TJA1120 phy - WiFi: - MediaTek mt7981 support - Can: - Kvaser SmartFusion2 PCI Express devices - Allwinner T113 controllers - Texas Instruments tcan4552/4553 chips - Bluetooth: - Intel Gale Peak - Qualcomm WCN3988 and WCN7850 - NXP AW693 and IW624 - Mediatek MT2925 Drivers: - Ethernet NICs: - nVidia/Mellanox: - mlx5: - support UDP encapsulation in packet offload mode - IPsec packet offload support in eswitch mode - improve aRFS observability by adding new set of counters - extends MACsec offload support to cover RoCE traffic - dynamic completion EQs - mlx4: - convert to use auxiliary bus instead of custom interface logic - Intel - ice: - implement switchdev bridge offload, even for LAG interfaces - implement SRIOV support for LAG interfaces - igc: - add support for multiple in-flight TX timestamps - Broadcom: - bnxt: - use the unified RX page pool buffers for XDP and non-XDP - use the NAPI skb allocation cache - OcteonTX2: - support Round Robin scheduling HTB offload - TC flower offload support for SPI field - Freescale: - add XDP_TX feature support - AMD: - ionic: add support for PCI FLR event - sfc: - basic conntrack offload - introduce eth, ipv4 and ipv6 pedit offloads - ST Microelectronics: - stmmac: maximze PTP timestamping resolution - Virtual NICs: - Microsoft vNIC: - batch ringing RX queue doorbell on receiving packets - add page pool for RX buffers - Virtio vNIC: - add per queue interrupt coalescing support - Google vNIC: - add queue-page-list mode support - Ethernet high-speed switches: - nVidia/Mellanox (mlxsw): - add port range matching tc-flower offload - permit enslavement to netdevices with uppers - Ethernet embedded switches: - Marvell (mv88e6xxx): - convert to phylink_pcs - Renesas: - r8A779fx: add speed change support - rzn1: enables vlan support - Ethernet PHYs: - convert mv88e6xxx to phylink_pcs - WiFi: - Qualcomm Wi-Fi 7 (ath12k): - extremely High Throughput (EHT) PHY support - RealTek (rtl8xxxu): - enable AP mode for: RTL8192FU, RTL8710BU (RTL8188GU), RTL8192EU and RTL8723BU - RealTek (rtw89): - Introduce Time Averaged SAR (TAS) support - Connector: - support for event filtering" * tag 'net-next-6.6' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (1806 commits) net: ethernet: mtk_wed: minor change in wed_{tx,rx}info_show net: ethernet: mtk_wed: add some more info in wed_txinfo_show handler net: stmmac: clarify difference between "interface" and "phy_interface" r8152: add vendor/device ID pair for D-Link DUB-E250 devlink: move devlink_notify_register/unregister() to dev.c devlink: move small_ops definition into netlink.c devlink: move tracepoint definitions into core.c devlink: push linecard related code into separate file devlink: push rate related code into separate file devlink: push trap related code into separate file devlink: use tracepoint_enabled() helper devlink: push region related code into separate file devlink: push param related code into separate file devlink: push resource related code into separate file devlink: push dpipe related code into separate file devlink: move and rename devlink_dpipe_send_and_alloc_skb() helper devlink: push shared buffer related code into separate file devlink: push port related code into separate file devlink: push object register/unregister notifications into separate helpers inet: fix IP_TRANSPARENT error handling ...
Diffstat (limited to 'net/sched')
-rw-r--r--net/sched/Kconfig4
-rw-r--r--net/sched/act_ct.c3
-rw-r--r--net/sched/cls_flower.c35
-rw-r--r--net/sched/em_meta.c2
-rw-r--r--net/sched/sch_drr.c11
-rw-r--r--net/sched/sch_hfsc.c14
-rw-r--r--net/sched/sch_htb.c17
-rw-r--r--net/sched/sch_ingress.c61
-rw-r--r--net/sched/sch_netem.c49
-rw-r--r--net/sched/sch_qfq.c12
-rw-r--r--net/sched/sch_taprio.c68
11 files changed, 202 insertions, 74 deletions
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 4b95cb1ac435..470c70deffe2 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -347,8 +347,7 @@ config NET_SCH_FQ_PIE
config NET_SCH_INGRESS
tristate "Ingress/classifier-action Qdisc"
depends on NET_CLS_ACT
- select NET_INGRESS
- select NET_EGRESS
+ select NET_XGRESS
help
Say Y here if you want to use classifiers for incoming and/or outgoing
packets. This qdisc doesn't do anything else besides running classifiers,
@@ -679,6 +678,7 @@ config NET_EMATCH_IPT
config NET_CLS_ACT
bool "Actions"
select NET_CLS
+ select NET_XGRESS
help
Say Y here if you want to use traffic control actions. Actions
get attached to classifiers and are invoked after a successful
diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
index abc71a06d634..7c652d14528b 100644
--- a/net/sched/act_ct.c
+++ b/net/sched/act_ct.c
@@ -1238,7 +1238,8 @@ static int tcf_ct_fill_params(struct net *net,
}
}
- __set_bit(IPS_CONFIRMED_BIT, &tmpl->status);
+ if (p->ct_action & TCA_CT_ACT_COMMIT)
+ __set_bit(IPS_CONFIRMED_BIT, &tmpl->status);
return 0;
err:
nf_ct_put(p->tmpl);
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 9f0711da9c95..e5314a31f75a 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -72,6 +72,7 @@ struct fl_flow_key {
struct flow_dissector_key_num_of_vlans num_of_vlans;
struct flow_dissector_key_pppoe pppoe;
struct flow_dissector_key_l2tpv3 l2tpv3;
+ struct flow_dissector_key_ipsec ipsec;
struct flow_dissector_key_cfm cfm;
} __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */
@@ -726,6 +727,8 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = {
[TCA_FLOWER_KEY_PPPOE_SID] = { .type = NLA_U16 },
[TCA_FLOWER_KEY_PPP_PROTO] = { .type = NLA_U16 },
[TCA_FLOWER_KEY_L2TPV3_SID] = { .type = NLA_U32 },
+ [TCA_FLOWER_KEY_SPI] = { .type = NLA_U32 },
+ [TCA_FLOWER_KEY_SPI_MASK] = { .type = NLA_U32 },
[TCA_FLOWER_L2_MISS] = NLA_POLICY_MAX(NLA_U8, 1),
[TCA_FLOWER_KEY_CFM] = { .type = NLA_NESTED },
};
@@ -796,6 +799,24 @@ static void fl_set_key_val(struct nlattr **tb,
nla_memcpy(mask, tb[mask_type], len);
}
+static int fl_set_key_spi(struct nlattr **tb, struct fl_flow_key *key,
+ struct fl_flow_key *mask,
+ struct netlink_ext_ack *extack)
+{
+ if (key->basic.ip_proto != IPPROTO_ESP &&
+ key->basic.ip_proto != IPPROTO_AH) {
+ NL_SET_ERR_MSG(extack,
+ "Protocol must be either ESP or AH");
+ return -EINVAL;
+ }
+
+ fl_set_key_val(tb, &key->ipsec.spi,
+ TCA_FLOWER_KEY_SPI,
+ &mask->ipsec.spi, TCA_FLOWER_KEY_SPI_MASK,
+ sizeof(key->ipsec.spi));
+ return 0;
+}
+
static int fl_set_key_port_range(struct nlattr **tb, struct fl_flow_key *key,
struct fl_flow_key *mask,
struct netlink_ext_ack *extack)
@@ -1895,6 +1916,12 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
return ret;
}
+ if (tb[TCA_FLOWER_KEY_SPI]) {
+ ret = fl_set_key_spi(tb, key, mask, extack);
+ if (ret)
+ return ret;
+ }
+
if (tb[TCA_FLOWER_KEY_ENC_IPV4_SRC] ||
tb[TCA_FLOWER_KEY_ENC_IPV4_DST]) {
key->enc_control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
@@ -2068,6 +2095,8 @@ static void fl_init_dissector(struct flow_dissector *dissector,
FL_KEY_SET_IF_MASKED(mask, keys, cnt,
FLOW_DISSECTOR_KEY_L2TPV3, l2tpv3);
FL_KEY_SET_IF_MASKED(mask, keys, cnt,
+ FLOW_DISSECTOR_KEY_IPSEC, ipsec);
+ FL_KEY_SET_IF_MASKED(mask, keys, cnt,
FLOW_DISSECTOR_KEY_CFM, cfm);
skb_flow_dissector_init(dissector, keys, cnt);
@@ -3365,6 +3394,12 @@ static int fl_dump_key(struct sk_buff *skb, struct net *net,
sizeof(key->l2tpv3.session_id)))
goto nla_put_failure;
+ if (key->ipsec.spi &&
+ fl_dump_key_val(skb, &key->ipsec.spi, TCA_FLOWER_KEY_SPI,
+ &mask->ipsec.spi, TCA_FLOWER_KEY_SPI_MASK,
+ sizeof(key->ipsec.spi)))
+ goto nla_put_failure;
+
if ((key->basic.ip_proto == IPPROTO_TCP ||
key->basic.ip_proto == IPPROTO_UDP ||
key->basic.ip_proto == IPPROTO_SCTP) &&
diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c
index 6fdba069f6bf..da34fd4c9269 100644
--- a/net/sched/em_meta.c
+++ b/net/sched/em_meta.c
@@ -502,7 +502,7 @@ META_COLLECTOR(int_sk_lingertime)
*err = -1;
return;
}
- dst->value = sk->sk_lingertime / HZ;
+ dst->value = READ_ONCE(sk->sk_lingertime) / HZ;
}
META_COLLECTOR(int_sk_err_qlen)
diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c
index e35a4e90f4e6..19901e77cd3b 100644
--- a/net/sched/sch_drr.c
+++ b/net/sched/sch_drr.c
@@ -17,7 +17,6 @@
struct drr_class {
struct Qdisc_class_common common;
- unsigned int filter_cnt;
struct gnet_stats_basic_sync bstats;
struct gnet_stats_queue qstats;
@@ -150,8 +149,10 @@ static int drr_delete_class(struct Qdisc *sch, unsigned long arg,
struct drr_sched *q = qdisc_priv(sch);
struct drr_class *cl = (struct drr_class *)arg;
- if (cl->filter_cnt > 0)
+ if (qdisc_class_in_use(&cl->common)) {
+ NL_SET_ERR_MSG(extack, "DRR class is in use");
return -EBUSY;
+ }
sch_tree_lock(sch);
@@ -187,8 +188,8 @@ static unsigned long drr_bind_tcf(struct Qdisc *sch, unsigned long parent,
{
struct drr_class *cl = drr_find_class(sch, classid);
- if (cl != NULL)
- cl->filter_cnt++;
+ if (cl)
+ qdisc_class_get(&cl->common);
return (unsigned long)cl;
}
@@ -197,7 +198,7 @@ static void drr_unbind_tcf(struct Qdisc *sch, unsigned long arg)
{
struct drr_class *cl = (struct drr_class *)arg;
- cl->filter_cnt--;
+ qdisc_class_put(&cl->common);
}
static int drr_graft_class(struct Qdisc *sch, unsigned long arg,
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index 70b0c5873d32..3554085bc2be 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -116,7 +116,6 @@ struct hfsc_class {
struct net_rate_estimator __rcu *rate_est;
struct tcf_proto __rcu *filter_list; /* filter list */
struct tcf_block *block;
- unsigned int filter_cnt; /* filter count */
unsigned int level; /* class level in hierarchy */
struct hfsc_sched *sched; /* scheduler data */
@@ -1012,6 +1011,10 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
if (parent == NULL)
return -ENOENT;
}
+ if (!(parent->cl_flags & HFSC_FSC) && parent != &q->root) {
+ NL_SET_ERR_MSG(extack, "Invalid parent - parent class must have FSC");
+ return -EINVAL;
+ }
if (classid == 0 || TC_H_MAJ(classid ^ sch->handle) != 0)
return -EINVAL;
@@ -1094,8 +1097,11 @@ hfsc_delete_class(struct Qdisc *sch, unsigned long arg,
struct hfsc_sched *q = qdisc_priv(sch);
struct hfsc_class *cl = (struct hfsc_class *)arg;
- if (cl->level > 0 || cl->filter_cnt > 0 || cl == &q->root)
+ if (cl->level > 0 || qdisc_class_in_use(&cl->cl_common) ||
+ cl == &q->root) {
+ NL_SET_ERR_MSG(extack, "HFSC class in use");
return -EBUSY;
+ }
sch_tree_lock(sch);
@@ -1223,7 +1229,7 @@ hfsc_bind_tcf(struct Qdisc *sch, unsigned long parent, u32 classid)
if (cl != NULL) {
if (p != NULL && p->level <= cl->level)
return 0;
- cl->filter_cnt++;
+ qdisc_class_get(&cl->cl_common);
}
return (unsigned long)cl;
@@ -1234,7 +1240,7 @@ hfsc_unbind_tcf(struct Qdisc *sch, unsigned long arg)
{
struct hfsc_class *cl = (struct hfsc_class *)arg;
- cl->filter_cnt--;
+ qdisc_class_put(&cl->cl_common);
}
static struct tcf_block *hfsc_tcf_block(struct Qdisc *sch, unsigned long arg,
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 325c29041c7d..0d947414e616 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -102,7 +102,6 @@ struct htb_class {
struct tcf_proto __rcu *filter_list; /* class attached filters */
struct tcf_block *block;
- int filter_cnt;
int level; /* our level (see above) */
unsigned int children;
@@ -1710,8 +1709,10 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg,
* tc subsys guarantee us that in htb_destroy it holds no class
* refs so that we can remove children safely there ?
*/
- if (cl->children || cl->filter_cnt)
+ if (cl->children || qdisc_class_in_use(&cl->common)) {
+ NL_SET_ERR_MSG(extack, "HTB class in use");
return -EBUSY;
+ }
if (!cl->level && htb_parent_last_child(cl))
last_child = 1;
@@ -1810,10 +1811,6 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
NL_SET_ERR_MSG(extack, "HTB offload doesn't support the mpu parameter");
goto failure;
}
- if (hopt->quantum) {
- NL_SET_ERR_MSG(extack, "HTB offload doesn't support the quantum parameter");
- goto failure;
- }
}
/* Keeping backward compatible with rate_table based iproute2 tc */
@@ -1910,6 +1907,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
.rate = max_t(u64, hopt->rate.rate, rate64),
.ceil = max_t(u64, hopt->ceil.rate, ceil64),
.prio = hopt->prio,
+ .quantum = hopt->quantum,
.extack = extack,
};
err = htb_offload(dev, &offload_opt);
@@ -1931,6 +1929,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
.rate = max_t(u64, hopt->rate.rate, rate64),
.ceil = max_t(u64, hopt->ceil.rate, ceil64),
.prio = hopt->prio,
+ .quantum = hopt->quantum,
.extack = extack,
};
err = htb_offload(dev, &offload_opt);
@@ -2017,6 +2016,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
.rate = max_t(u64, hopt->rate.rate, rate64),
.ceil = max_t(u64, hopt->ceil.rate, ceil64),
.prio = hopt->prio,
+ .quantum = hopt->quantum,
.extack = extack,
};
err = htb_offload(dev, &offload_opt);
@@ -2108,7 +2108,7 @@ static unsigned long htb_bind_filter(struct Qdisc *sch, unsigned long parent,
* be broken by class during destroy IIUC.
*/
if (cl)
- cl->filter_cnt++;
+ qdisc_class_get(&cl->common);
return (unsigned long)cl;
}
@@ -2116,8 +2116,7 @@ static void htb_unbind_filter(struct Qdisc *sch, unsigned long arg)
{
struct htb_class *cl = (struct htb_class *)arg;
- if (cl)
- cl->filter_cnt--;
+ qdisc_class_put(&cl->common);
}
static void htb_walk(struct Qdisc *sch, struct qdisc_walker *arg)
diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c
index e43a45499372..a463a63192c3 100644
--- a/net/sched/sch_ingress.c
+++ b/net/sched/sch_ingress.c
@@ -13,6 +13,7 @@
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
+#include <net/tcx.h>
struct ingress_sched_data {
struct tcf_block *block;
@@ -78,6 +79,8 @@ static int ingress_init(struct Qdisc *sch, struct nlattr *opt,
{
struct ingress_sched_data *q = qdisc_priv(sch);
struct net_device *dev = qdisc_dev(sch);
+ struct bpf_mprog_entry *entry;
+ bool created;
int err;
if (sch->parent != TC_H_INGRESS)
@@ -85,7 +88,13 @@ static int ingress_init(struct Qdisc *sch, struct nlattr *opt,
net_inc_ingress_queue();
- mini_qdisc_pair_init(&q->miniqp, sch, &dev->miniq_ingress);
+ entry = tcx_entry_fetch_or_create(dev, true, &created);
+ if (!entry)
+ return -ENOMEM;
+ tcx_miniq_set_active(entry, true);
+ mini_qdisc_pair_init(&q->miniqp, sch, &tcx_entry(entry)->miniq);
+ if (created)
+ tcx_entry_update(dev, entry, true);
q->block_info.binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS;
q->block_info.chain_head_change = clsact_chain_head_change;
@@ -103,11 +112,22 @@ static int ingress_init(struct Qdisc *sch, struct nlattr *opt,
static void ingress_destroy(struct Qdisc *sch)
{
struct ingress_sched_data *q = qdisc_priv(sch);
+ struct net_device *dev = qdisc_dev(sch);
+ struct bpf_mprog_entry *entry = rtnl_dereference(dev->tcx_ingress);
if (sch->parent != TC_H_INGRESS)
return;
tcf_block_put_ext(q->block, sch, &q->block_info);
+
+ if (entry) {
+ tcx_miniq_set_active(entry, false);
+ if (!tcx_entry_is_active(entry)) {
+ tcx_entry_update(dev, NULL, true);
+ tcx_entry_free(entry);
+ }
+ }
+
net_dec_ingress_queue();
}
@@ -223,6 +243,8 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt,
{
struct clsact_sched_data *q = qdisc_priv(sch);
struct net_device *dev = qdisc_dev(sch);
+ struct bpf_mprog_entry *entry;
+ bool created;
int err;
if (sch->parent != TC_H_CLSACT)
@@ -231,7 +253,13 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt,
net_inc_ingress_queue();
net_inc_egress_queue();
- mini_qdisc_pair_init(&q->miniqp_ingress, sch, &dev->miniq_ingress);
+ entry = tcx_entry_fetch_or_create(dev, true, &created);
+ if (!entry)
+ return -ENOMEM;
+ tcx_miniq_set_active(entry, true);
+ mini_qdisc_pair_init(&q->miniqp_ingress, sch, &tcx_entry(entry)->miniq);
+ if (created)
+ tcx_entry_update(dev, entry, true);
q->ingress_block_info.binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS;
q->ingress_block_info.chain_head_change = clsact_chain_head_change;
@@ -244,7 +272,13 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt,
mini_qdisc_pair_block_init(&q->miniqp_ingress, q->ingress_block);
- mini_qdisc_pair_init(&q->miniqp_egress, sch, &dev->miniq_egress);
+ entry = tcx_entry_fetch_or_create(dev, false, &created);
+ if (!entry)
+ return -ENOMEM;
+ tcx_miniq_set_active(entry, true);
+ mini_qdisc_pair_init(&q->miniqp_egress, sch, &tcx_entry(entry)->miniq);
+ if (created)
+ tcx_entry_update(dev, entry, false);
q->egress_block_info.binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS;
q->egress_block_info.chain_head_change = clsact_chain_head_change;
@@ -256,12 +290,31 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt,
static void clsact_destroy(struct Qdisc *sch)
{
struct clsact_sched_data *q = qdisc_priv(sch);
+ struct net_device *dev = qdisc_dev(sch);
+ struct bpf_mprog_entry *ingress_entry = rtnl_dereference(dev->tcx_ingress);
+ struct bpf_mprog_entry *egress_entry = rtnl_dereference(dev->tcx_egress);
if (sch->parent != TC_H_CLSACT)
return;
- tcf_block_put_ext(q->egress_block, sch, &q->egress_block_info);
tcf_block_put_ext(q->ingress_block, sch, &q->ingress_block_info);
+ tcf_block_put_ext(q->egress_block, sch, &q->egress_block_info);
+
+ if (ingress_entry) {
+ tcx_miniq_set_active(ingress_entry, false);
+ if (!tcx_entry_is_active(ingress_entry)) {
+ tcx_entry_update(dev, NULL, true);
+ tcx_entry_free(ingress_entry);
+ }
+ }
+
+ if (egress_entry) {
+ tcx_miniq_set_active(egress_entry, false);
+ if (!tcx_entry_is_active(egress_entry)) {
+ tcx_entry_update(dev, NULL, false);
+ tcx_entry_free(egress_entry);
+ }
+ }
net_dec_ingress_queue();
net_dec_egress_queue();
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 38d9aa0cd30e..4ad39a4a3cf5 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -105,6 +105,11 @@ struct netem_sched_data {
u32 rho;
} delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor;
+ struct prng {
+ u64 seed;
+ struct rnd_state prng_state;
+ } prng;
+
struct disttable *delay_dist;
enum {
@@ -179,15 +184,16 @@ static void init_crandom(struct crndstate *state, unsigned long rho)
* Next number depends on last value.
* rho is scaled to avoid floating point.
*/
-static u32 get_crandom(struct crndstate *state)
+static u32 get_crandom(struct crndstate *state, struct prng *p)
{
u64 value, rho;
unsigned long answer;
+ struct rnd_state *s = &p->prng_state;
if (!state || state->rho == 0) /* no correlation */
- return get_random_u32();
+ return prandom_u32_state(s);
- value = get_random_u32();
+ value = prandom_u32_state(s);
rho = (u64)state->rho + 1;
answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
state->last = answer;
@@ -201,7 +207,7 @@ static u32 get_crandom(struct crndstate *state)
static bool loss_4state(struct netem_sched_data *q)
{
struct clgstate *clg = &q->clg;
- u32 rnd = get_random_u32();
+ u32 rnd = prandom_u32_state(&q->prng.prng_state);
/*
* Makes a comparison between rnd and the transition
@@ -266,18 +272,19 @@ static bool loss_4state(struct netem_sched_data *q)
static bool loss_gilb_ell(struct netem_sched_data *q)
{
struct clgstate *clg = &q->clg;
+ struct rnd_state *s = &q->prng.prng_state;
switch (clg->state) {
case GOOD_STATE:
- if (get_random_u32() < clg->a1)
+ if (prandom_u32_state(s) < clg->a1)
clg->state = BAD_STATE;
- if (get_random_u32() < clg->a4)
+ if (prandom_u32_state(s) < clg->a4)
return true;
break;
case BAD_STATE:
- if (get_random_u32() < clg->a2)
+ if (prandom_u32_state(s) < clg->a2)
clg->state = GOOD_STATE;
- if (get_random_u32() > clg->a3)
+ if (prandom_u32_state(s) > clg->a3)
return true;
}
@@ -289,7 +296,7 @@ static bool loss_event(struct netem_sched_data *q)
switch (q->loss_model) {
case CLG_RANDOM:
/* Random packet drop 0 => none, ~0 => all */
- return q->loss && q->loss >= get_crandom(&q->loss_cor);
+ return q->loss && q->loss >= get_crandom(&q->loss_cor, &q->prng);
case CLG_4_STATES:
/* 4state loss model algorithm (used also for GI model)
@@ -318,6 +325,7 @@ static bool loss_event(struct netem_sched_data *q)
*/
static s64 tabledist(s64 mu, s32 sigma,
struct crndstate *state,
+ struct prng *prng,
const struct disttable *dist)
{
s64 x;
@@ -327,7 +335,7 @@ static s64 tabledist(s64 mu, s32 sigma,
if (sigma == 0)
return mu;
- rnd = get_crandom(state);
+ rnd = get_crandom(state, prng);
/* default uniform distribution */
if (dist == NULL)
@@ -449,7 +457,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
skb->prev = NULL;
/* Random duplication */
- if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor))
+ if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor, &q->prng))
++count;
/* Drop packet? */
@@ -492,7 +500,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
* If packet is going to be hardware checksummed, then
* do it now in software before we mangle it.
*/
- if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
+ if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor, &q->prng)) {
if (skb_is_gso(skb)) {
skb = netem_segment(skb, sch, to_free);
if (!skb)
@@ -530,12 +538,12 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
cb = netem_skb_cb(skb);
if (q->gap == 0 || /* not doing reordering */
q->counter < q->gap - 1 || /* inside last reordering gap */
- q->reorder < get_crandom(&q->reorder_cor)) {
+ q->reorder < get_crandom(&q->reorder_cor, &q->prng)) {
u64 now;
s64 delay;
delay = tabledist(q->latency, q->jitter,
- &q->delay_cor, q->delay_dist);
+ &q->delay_cor, &q->prng, q->delay_dist);
now = ktime_get_ns();
@@ -639,7 +647,7 @@ static void get_slot_next(struct netem_sched_data *q, u64 now)
else
next_delay = tabledist(q->slot_config.dist_delay,
(s32)(q->slot_config.dist_jitter),
- NULL, q->slot_dist);
+ NULL, &q->prng, q->slot_dist);
q->slot.slot_next = now + next_delay;
q->slot.packets_left = q->slot_config.max_packets;
@@ -922,6 +930,7 @@ static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = {
[TCA_NETEM_LATENCY64] = { .type = NLA_S64 },
[TCA_NETEM_JITTER64] = { .type = NLA_S64 },
[TCA_NETEM_SLOT] = { .len = sizeof(struct tc_netem_slot) },
+ [TCA_NETEM_PRNG_SEED] = { .type = NLA_U64 },
};
static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
@@ -1040,6 +1049,12 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt,
/* capping jitter to the range acceptable by tabledist() */
q->jitter = min_t(s64, abs(q->jitter), INT_MAX);
+ if (tb[TCA_NETEM_PRNG_SEED])
+ q->prng.seed = nla_get_u64(tb[TCA_NETEM_PRNG_SEED]);
+ else
+ q->prng.seed = get_random_u64();
+ prandom_seed_state(&q->prng.prng_state, q->prng.seed);
+
unlock:
sch_tree_unlock(sch);
@@ -1203,6 +1218,10 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
goto nla_put_failure;
}
+ if (nla_put_u64_64bit(skb, TCA_NETEM_PRNG_SEED, q->prng.seed,
+ TCA_NETEM_PAD))
+ goto nla_put_failure;
+
return nla_nest_end(skb, nla);
nla_put_failure:
diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
index befaf74b33ca..1a25752f1a9a 100644
--- a/net/sched/sch_qfq.c
+++ b/net/sched/sch_qfq.c
@@ -130,8 +130,6 @@ struct qfq_aggregate;
struct qfq_class {
struct Qdisc_class_common common;
- unsigned int filter_cnt;
-
struct gnet_stats_basic_sync bstats;
struct gnet_stats_queue qstats;
struct net_rate_estimator __rcu *rate_est;
@@ -545,8 +543,10 @@ static int qfq_delete_class(struct Qdisc *sch, unsigned long arg,
struct qfq_sched *q = qdisc_priv(sch);
struct qfq_class *cl = (struct qfq_class *)arg;
- if (cl->filter_cnt > 0)
+ if (qdisc_class_in_use(&cl->common)) {
+ NL_SET_ERR_MSG_MOD(extack, "QFQ class in use");
return -EBUSY;
+ }
sch_tree_lock(sch);
@@ -580,8 +580,8 @@ static unsigned long qfq_bind_tcf(struct Qdisc *sch, unsigned long parent,
{
struct qfq_class *cl = qfq_find_class(sch, classid);
- if (cl != NULL)
- cl->filter_cnt++;
+ if (cl)
+ qdisc_class_get(&cl->common);
return (unsigned long)cl;
}
@@ -590,7 +590,7 @@ static void qfq_unbind_tcf(struct Qdisc *sch, unsigned long arg)
{
struct qfq_class *cl = (struct qfq_class *)arg;
- cl->filter_cnt--;
+ qdisc_class_put(&cl->common);
}
static int qfq_graft_class(struct Qdisc *sch, unsigned long arg,
diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c
index 8c9cfff7fd05..1cb5e41c0ec7 100644
--- a/net/sched/sch_taprio.c
+++ b/net/sched/sch_taprio.c
@@ -2099,11 +2099,8 @@ static int taprio_init(struct Qdisc *sch, struct nlattr *opt,
return -EOPNOTSUPP;
}
- /* pre-allocate qdisc, attachment can't fail */
- q->qdiscs = kcalloc(dev->num_tx_queues,
- sizeof(q->qdiscs[0]),
+ q->qdiscs = kcalloc(dev->num_tx_queues, sizeof(q->qdiscs[0]),
GFP_KERNEL);
-
if (!q->qdiscs)
return -ENOMEM;
@@ -2145,25 +2142,32 @@ static void taprio_attach(struct Qdisc *sch)
/* Attach underlying qdisc */
for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
- struct Qdisc *qdisc = q->qdiscs[ntx];
- struct Qdisc *old;
+ struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, ntx);
+ struct Qdisc *old, *dev_queue_qdisc;
if (FULL_OFFLOAD_IS_ENABLED(q->flags)) {
+ struct Qdisc *qdisc = q->qdiscs[ntx];
+
+ /* In offload mode, the root taprio qdisc is bypassed
+ * and the netdev TX queues see the children directly
+ */
qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
- old = dev_graft_qdisc(qdisc->dev_queue, qdisc);
+ dev_queue_qdisc = qdisc;
} else {
- old = dev_graft_qdisc(qdisc->dev_queue, sch);
- qdisc_refcount_inc(sch);
+ /* In software mode, attach the root taprio qdisc
+ * to all netdev TX queues, so that dev_qdisc_enqueue()
+ * goes through taprio_enqueue().
+ */
+ dev_queue_qdisc = sch;
}
+ old = dev_graft_qdisc(dev_queue, dev_queue_qdisc);
+ /* The qdisc's refcount requires to be elevated once
+ * for each netdev TX queue it is grafted onto
+ */
+ qdisc_refcount_inc(dev_queue_qdisc);
if (old)
qdisc_put(old);
}
-
- /* access to the child qdiscs is not needed in offload mode */
- if (FULL_OFFLOAD_IS_ENABLED(q->flags)) {
- kfree(q->qdiscs);
- q->qdiscs = NULL;
- }
}
static struct netdev_queue *taprio_queue_get(struct Qdisc *sch,
@@ -2192,13 +2196,23 @@ static int taprio_graft(struct Qdisc *sch, unsigned long cl,
if (dev->flags & IFF_UP)
dev_deactivate(dev);
+ /* In offload mode, the child Qdisc is directly attached to the netdev
+ * TX queue, and thus, we need to keep its refcount elevated in order
+ * to counteract qdisc_graft()'s call to qdisc_put() once per TX queue.
+ * However, save the reference to the new qdisc in the private array in
+ * both software and offload cases, to have an up-to-date reference to
+ * our children.
+ */
+ *old = q->qdiscs[cl - 1];
if (FULL_OFFLOAD_IS_ENABLED(q->flags)) {
- *old = dev_graft_qdisc(dev_queue, new);
- } else {
- *old = q->qdiscs[cl - 1];
- q->qdiscs[cl - 1] = new;
+ WARN_ON_ONCE(dev_graft_qdisc(dev_queue, new) != *old);
+ if (new)
+ qdisc_refcount_inc(new);
+ if (*old)
+ qdisc_put(*old);
}
+ q->qdiscs[cl - 1] = new;
if (new)
new->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
@@ -2436,12 +2450,14 @@ start_error:
static struct Qdisc *taprio_leaf(struct Qdisc *sch, unsigned long cl)
{
- struct netdev_queue *dev_queue = taprio_queue_get(sch, cl);
+ struct taprio_sched *q = qdisc_priv(sch);
+ struct net_device *dev = qdisc_dev(sch);
+ unsigned int ntx = cl - 1;
- if (!dev_queue)
+ if (ntx >= dev->num_tx_queues)
return NULL;
- return rtnl_dereference(dev_queue->qdisc_sleeping);
+ return q->qdiscs[ntx];
}
static unsigned long taprio_find(struct Qdisc *sch, u32 classid)
@@ -2456,11 +2472,11 @@ static unsigned long taprio_find(struct Qdisc *sch, u32 classid)
static int taprio_dump_class(struct Qdisc *sch, unsigned long cl,
struct sk_buff *skb, struct tcmsg *tcm)
{
- struct netdev_queue *dev_queue = taprio_queue_get(sch, cl);
+ struct Qdisc *child = taprio_leaf(sch, cl);
tcm->tcm_parent = TC_H_ROOT;
tcm->tcm_handle |= TC_H_MIN(cl);
- tcm->tcm_info = rtnl_dereference(dev_queue->qdisc_sleeping)->handle;
+ tcm->tcm_info = child->handle;
return 0;
}
@@ -2470,16 +2486,14 @@ static int taprio_dump_class_stats(struct Qdisc *sch, unsigned long cl,
__releases(d->lock)
__acquires(d->lock)
{
- struct netdev_queue *dev_queue = taprio_queue_get(sch, cl);
+ struct Qdisc *child = taprio_leaf(sch, cl);
struct tc_taprio_qopt_offload offload = {
.cmd = TAPRIO_CMD_QUEUE_STATS,
.queue_stats = {
.queue = cl - 1,
},
};
- struct Qdisc *child;
- child = rtnl_dereference(dev_queue->qdisc_sleeping);
if (gnet_stats_copy_basic(d, NULL, &child->bstats, true) < 0 ||
qdisc_qstats_copy(d, child) < 0)
return -1;