From f62af20bed2d9e824f51cfc97ff01bc261f40e58 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 11 Apr 2023 21:01:54 +0300 Subject: net/sched: mqprio: allow per-TC user input of FP adminStatus IEEE 802.1Q-2018 clause 6.7.2 Frame preemption specifies that each packet priority can be assigned to a "frame preemption status" value of either "express" or "preemptible". Express priorities are transmitted by the local device through the eMAC, and preemptible priorities through the pMAC (the concepts of eMAC and pMAC come from the 802.3 MAC Merge layer). The FP adminStatus is defined per packet priority, but 802.1Q clause 12.30.1.1.1 framePreemptionAdminStatus also says that: | Priorities that all map to the same traffic class should be | constrained to use the same value of preemption status. It is impossible to ignore the cognitive dissonance in the standard here, because it practically means that the FP adminStatus only takes distinct values per traffic class, even though it is defined per priority. I can see no valid use case which is prevented by having the kernel take the FP adminStatus as input per traffic class (what we do here). In addition, this also enforces the above constraint by construction. User space network managers which wish to expose FP adminStatus per priority are free to do so; they must only observe the prio_tc_map of the netdev (which presumably is also under their control, when constructing the mqprio netlink attributes). The reason for configuring frame preemption as a property of the Qdisc layer is that the information about "preemptible TCs" is closest to the place which handles the num_tc and prio_tc_map of the netdev. If the UAPI would have been any other layer, it would be unclear what to do with the FP information when num_tc collapses to 0. A key assumption is that only mqprio/taprio change the num_tc and prio_tc_map of the netdev. Not sure if that's a great assumption to make. Having FP in tc-mqprio can be seen as an implementation of the use case defined in 802.1Q Annex S.2 "Preemption used in isolation". There will be a separate implementation of FP in tc-taprio, for the other use cases. Signed-off-by: Vladimir Oltean Reviewed-by: Ferenc Fejes Reviewed-by: Simon Horman Signed-off-by: Jakub Kicinski --- include/uapi/linux/pkt_sched.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/uapi/linux/pkt_sched.h') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 000eec106856..b8d29be91b62 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -719,6 +719,11 @@ enum { #define __TC_MQPRIO_SHAPER_MAX (__TC_MQPRIO_SHAPER_MAX - 1) +enum { + TC_FP_EXPRESS = 1, + TC_FP_PREEMPTIBLE = 2, +}; + struct tc_mqprio_qopt { __u8 num_tc; __u8 prio_tc_map[TC_QOPT_BITMASK + 1]; @@ -732,12 +737,23 @@ struct tc_mqprio_qopt { #define TC_MQPRIO_F_MIN_RATE 0x4 #define TC_MQPRIO_F_MAX_RATE 0x8 +enum { + TCA_MQPRIO_TC_ENTRY_UNSPEC, + TCA_MQPRIO_TC_ENTRY_INDEX, /* u32 */ + TCA_MQPRIO_TC_ENTRY_FP, /* u32 */ + + /* add new constants above here */ + __TCA_MQPRIO_TC_ENTRY_CNT, + TCA_MQPRIO_TC_ENTRY_MAX = (__TCA_MQPRIO_TC_ENTRY_CNT - 1) +}; + enum { TCA_MQPRIO_UNSPEC, TCA_MQPRIO_MODE, TCA_MQPRIO_SHAPER, TCA_MQPRIO_MIN_RATE64, TCA_MQPRIO_MAX_RATE64, + TCA_MQPRIO_TC_ENTRY, __TCA_MQPRIO_MAX, }; -- cgit v1.2.3-70-g09d2 From a721c3e54b80e45cd9202e7fca29ef018bed9069 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 11 Apr 2023 21:01:55 +0300 Subject: net/sched: taprio: allow per-TC user input of FP adminStatus This is a duplication of the FP adminStatus logic introduced for tc-mqprio. Offloading is done through the tc_mqprio_qopt_offload structure embedded within tc_taprio_qopt_offload. So practically, if a device driver is written to treat the mqprio portion of taprio just like standalone mqprio, it gets unified handling of frame preemption. I would have reused more code with taprio, but this is mostly netlink attribute parsing, which is hard to transform into generic code without having something that stinks as a result. We have the same variables with the same semantics, just different nlattr type values (TCA_MQPRIO_TC_ENTRY=5 vs TCA_TAPRIO_ATTR_TC_ENTRY=12; TCA_MQPRIO_TC_ENTRY_FP=2 vs TCA_TAPRIO_TC_ENTRY_FP=3, etc) and consequently, different policies for the nest. Every time nla_parse_nested() is called, an on-stack table "tb" of nlattr pointers is allocated statically, up to the maximum understood nlattr type. That array size is hardcoded as a constant, but when transforming this into a common parsing function, it would become either a VLA (which the Linux kernel rightfully doesn't like) or a call to the allocator. Having FP adminStatus in tc-taprio can be seen as addressing the 802.1Q Annex S.3 "Scheduling and preemption used in combination, no HOLD/RELEASE" and S.4 "Scheduling and preemption used in combination with HOLD/RELEASE" use cases. HOLD and RELEASE events are emitted towards the underlying MAC Merge layer when the schedule hits a Set-And-Hold-MAC or a Set-And-Release-MAC gate operation. So within the tc-taprio UAPI space, one can distinguish between the 2 use cases by choosing whether to use the TC_TAPRIO_CMD_SET_AND_HOLD and TC_TAPRIO_CMD_SET_AND_RELEASE gate operations within the schedule, or just TC_TAPRIO_CMD_SET_GATES. A small part of the change is dedicated to refactoring the max_sdu nlattr parsing to put all logic under the "if" that tests for presence of that nlattr. Signed-off-by: Vladimir Oltean Reviewed-by: Ferenc Fejes Reviewed-by: Simon Horman Signed-off-by: Jakub Kicinski --- include/uapi/linux/pkt_sched.h | 1 + net/sched/sch_taprio.c | 65 +++++++++++++++++++++++++++++++++--------- 2 files changed, 53 insertions(+), 13 deletions(-) (limited to 'include/uapi/linux/pkt_sched.h') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index b8d29be91b62..51a7addc56c6 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -1252,6 +1252,7 @@ enum { TCA_TAPRIO_TC_ENTRY_UNSPEC, TCA_TAPRIO_TC_ENTRY_INDEX, /* u32 */ TCA_TAPRIO_TC_ENTRY_MAX_SDU, /* u32 */ + TCA_TAPRIO_TC_ENTRY_FP, /* u32 */ /* add new constants above here */ __TCA_TAPRIO_TC_ENTRY_CNT, diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index cbad43019172..76db9a10ef50 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -7,6 +7,7 @@ */ #include +#include #include #include #include @@ -96,6 +97,7 @@ struct taprio_sched { struct list_head taprio_list; int cur_txq[TC_MAX_QUEUE]; u32 max_sdu[TC_MAX_QUEUE]; /* save info from the user */ + u32 fp[TC_QOPT_MAX_QUEUE]; /* only for dump and offloading */ u32 txtime_delay; }; @@ -1002,6 +1004,9 @@ static const struct nla_policy entry_policy[TCA_TAPRIO_SCHED_ENTRY_MAX + 1] = { static const struct nla_policy taprio_tc_policy[TCA_TAPRIO_TC_ENTRY_MAX + 1] = { [TCA_TAPRIO_TC_ENTRY_INDEX] = { .type = NLA_U32 }, [TCA_TAPRIO_TC_ENTRY_MAX_SDU] = { .type = NLA_U32 }, + [TCA_TAPRIO_TC_ENTRY_FP] = NLA_POLICY_RANGE(NLA_U32, + TC_FP_EXPRESS, + TC_FP_PREEMPTIBLE), }; static const struct nla_policy taprio_policy[TCA_TAPRIO_ATTR_MAX + 1] = { @@ -1524,6 +1529,7 @@ static int taprio_enable_offload(struct net_device *dev, mqprio_qopt_reconstruct(dev, &offload->mqprio.qopt); offload->mqprio.extack = extack; taprio_sched_to_offload(dev, sched, offload, &caps); + mqprio_fp_to_offload(q->fp, &offload->mqprio); for (tc = 0; tc < TC_MAX_QUEUE; tc++) offload->max_sdu[tc] = q->max_sdu[tc]; @@ -1671,13 +1677,14 @@ out: static int taprio_parse_tc_entry(struct Qdisc *sch, struct nlattr *opt, u32 max_sdu[TC_QOPT_MAX_QUEUE], + u32 fp[TC_QOPT_MAX_QUEUE], unsigned long *seen_tcs, struct netlink_ext_ack *extack) { struct nlattr *tb[TCA_TAPRIO_TC_ENTRY_MAX + 1] = { }; struct net_device *dev = qdisc_dev(sch); - u32 val = 0; int err, tc; + u32 val; err = nla_parse_nested(tb, TCA_TAPRIO_TC_ENTRY_MAX, opt, taprio_tc_policy, extack); @@ -1702,15 +1709,18 @@ static int taprio_parse_tc_entry(struct Qdisc *sch, *seen_tcs |= BIT(tc); - if (tb[TCA_TAPRIO_TC_ENTRY_MAX_SDU]) + if (tb[TCA_TAPRIO_TC_ENTRY_MAX_SDU]) { val = nla_get_u32(tb[TCA_TAPRIO_TC_ENTRY_MAX_SDU]); + if (val > dev->max_mtu) { + NL_SET_ERR_MSG_MOD(extack, "TC max SDU exceeds device max MTU"); + return -ERANGE; + } - if (val > dev->max_mtu) { - NL_SET_ERR_MSG_MOD(extack, "TC max SDU exceeds device max MTU"); - return -ERANGE; + max_sdu[tc] = val; } - max_sdu[tc] = val; + if (tb[TCA_TAPRIO_TC_ENTRY_FP]) + fp[tc] = nla_get_u32(tb[TCA_TAPRIO_TC_ENTRY_FP]); return 0; } @@ -1720,29 +1730,51 @@ static int taprio_parse_tc_entries(struct Qdisc *sch, struct netlink_ext_ack *extack) { struct taprio_sched *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); u32 max_sdu[TC_QOPT_MAX_QUEUE]; + bool have_preemption = false; unsigned long seen_tcs = 0; + u32 fp[TC_QOPT_MAX_QUEUE]; struct nlattr *n; int tc, rem; int err = 0; - for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) + for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) { max_sdu[tc] = q->max_sdu[tc]; + fp[tc] = q->fp[tc]; + } nla_for_each_nested(n, opt, rem) { if (nla_type(n) != TCA_TAPRIO_ATTR_TC_ENTRY) continue; - err = taprio_parse_tc_entry(sch, n, max_sdu, &seen_tcs, + err = taprio_parse_tc_entry(sch, n, max_sdu, fp, &seen_tcs, extack); if (err) - goto out; + return err; } - for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) + for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) { q->max_sdu[tc] = max_sdu[tc]; + q->fp[tc] = fp[tc]; + if (fp[tc] != TC_FP_EXPRESS) + have_preemption = true; + } + + if (have_preemption) { + if (!FULL_OFFLOAD_IS_ENABLED(q->flags)) { + NL_SET_ERR_MSG(extack, + "Preemption only supported with full offload"); + return -EOPNOTSUPP; + } + + if (!ethtool_dev_mm_supported(dev)) { + NL_SET_ERR_MSG(extack, + "Device does not support preemption"); + return -EOPNOTSUPP; + } + } -out: return err; } @@ -2023,7 +2055,7 @@ static int taprio_init(struct Qdisc *sch, struct nlattr *opt, { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); - int i; + int i, tc; spin_lock_init(&q->current_entry_lock); @@ -2080,6 +2112,9 @@ static int taprio_init(struct Qdisc *sch, struct nlattr *opt, q->qdiscs[i] = qdisc; } + for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) + q->fp[tc] = TC_FP_EXPRESS; + taprio_detect_broken_mqprio(q); return taprio_change(sch, opt, extack); @@ -2223,6 +2258,7 @@ error_nest: } static int taprio_dump_tc_entries(struct sk_buff *skb, + struct taprio_sched *q, struct sched_gate_list *sched) { struct nlattr *n; @@ -2240,6 +2276,9 @@ static int taprio_dump_tc_entries(struct sk_buff *skb, sched->max_sdu[tc])) goto nla_put_failure; + if (nla_put_u32(skb, TCA_TAPRIO_TC_ENTRY_FP, q->fp[tc])) + goto nla_put_failure; + nla_nest_end(skb, n); } @@ -2281,7 +2320,7 @@ static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb) nla_put_u32(skb, TCA_TAPRIO_ATTR_TXTIME_DELAY, q->txtime_delay)) goto options_error; - if (oper && taprio_dump_tc_entries(skb, oper)) + if (oper && taprio_dump_tc_entries(skb, q, oper)) goto options_error; if (oper && dump_schedule(skb, oper)) -- cgit v1.2.3-70-g09d2